NonlinearMinimizer using Projection and Proximal Operators #364
Changes to `ProjectedQuasiNewton.scala` (package `breeze.optimize`):

```diff
@@ -1,8 +1,24 @@
 package breeze.optimize

+/*
+ Copyright 2015 David Hall, Debasish Das
+
+ Licensed under the Apache License, Version 2.0 (the "License")
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
 import breeze.linalg._
 import breeze.collection.mutable.RingBuffer
-import breeze.math.{MutableInnerProductModule, MutableVectorField}
+import breeze.math.MutableInnerProductModule
 import breeze.util.SerializableLogging

 // Compact representation of an n x n Hessian, maintained via L-BFGS updates
@@ -66,32 +82,32 @@ class ProjectedQuasiNewton(tolerance: Double = 1e-6,
                            val m: Int = 10,
                            val initFeas: Boolean = false,
                            val testOpt: Boolean = true,
-                           val maxNumIt: Int = 500,
+                           maxIter: Int = -1,
                            val maxSrchIt: Int = 50,
                            val gamma: Double = 1e-4,
                            val projection: DenseVector[Double] => DenseVector[Double] = identity)
                           (implicit space: MutableInnerProductModule[DenseVector[Double],Double])
-  extends FirstOrderMinimizer[DenseVector[Double], DiffFunction[DenseVector[Double]]](maxIter = maxNumIt, tolerance = tolerance) with Projecting[DenseVector[Double]] with SerializableLogging {
-  val innerOptimizer = new SpectralProjectedGradient[DenseVector[Double], DiffFunction[DenseVector[Double]]](
-    testOpt = true,
+  extends FirstOrderMinimizer[DenseVector[Double], DiffFunction[DenseVector[Double]]](maxIter = maxIter, tolerance = tolerance) with Projecting[DenseVector[Double]] with SerializableLogging {
+  type BDV = DenseVector[Double]
+
+  val innerOptimizer = new SpectralProjectedGradient[BDV](
     tolerance = tolerance,
-    maxIter = 500,
+    maxIter = 50,
     bbMemory = 5,
     initFeas = true,
     minImprovementWindow = 10,
     projection = projection
   )

   type History = CompactHessian

   protected def initialHistory(f: DiffFunction[DenseVector[Double]], init: DenseVector[Double]): History = {
     new CompactHessian(m)
   }

-  override protected def adjust(newX: DenseVector[Double], newGrad: DenseVector[Double], newVal: Double):(Double,DenseVector[Double]) = (newVal,-projectedVector(newX, -newGrad))
+  override protected def adjust(newX: DenseVector[Double], newGrad: DenseVector[Double], newVal: Double):(Double,DenseVector[Double]) = (newVal,projectedVector(newX, -newGrad))

   private def computeGradient(x: DenseVector[Double], g: DenseVector[Double]): DenseVector[Double] = projectedVector(x, -g)
   private def computeGradientNorm(x: DenseVector[Double], g: DenseVector[Double]): Double = norm(computeGradient(x, g),Double.PositiveInfinity)

   protected def chooseDescentDirection(state: State, fn: DiffFunction[DenseVector[Double]]): DenseVector[Double] = {
     import state._
@@ -101,72 +117,54 @@ class ProjectedQuasiNewton(tolerance: Double = 1e-6,
       // Update the limited-memory BFGS approximation to the Hessian
       //B.update(y, s)
       // Solve subproblem; we use the current iterate x as a guess
-      val subprob = new ProjectedQuasiNewton.QuadraticSubproblem(fn, state.adjustedValue, x, grad, history)
-      val p = innerOptimizer.minimize(new CachedDiffFunction(subprob), x)
-      p - x
+      val subprob = new ProjectedQuasiNewton.QuadraticSubproblem(state.adjustedValue, x, grad, history)
+      val spgResult = innerOptimizer.minimizeAndReturnState(new CachedDiffFunction(subprob), x)
+      logger.info(f"ProjectedQuasiNewton: outerIter ${state.iter} innerIters ${spgResult.iter}")
+      spgResult.x - x
      // time += subprob.time
    }
  }

-  protected def determineStepSize(state: State, fn: DiffFunction[DenseVector[Double]], dir: DenseVector[Double]): Double = {
-    if (state.iter == 0)
-      return scala.math.min(1.0, 1.0 / norm(state.grad,1.0))
-    val dirnorm = norm(dir, Double.PositiveInfinity)
-    if(dirnorm < 1E-10) return 0.0
-    import state._
-    // Backtracking line-search
-    var accepted = false
-    var lambda = 1.0
-    val gTd = grad dot dir
-    var srchit = 0
-
-    do {
-      val candx = x + dir * lambda
-      val candf = fn.valueAt(candx)
-      val suffdec = gamma * lambda * gTd
-
-      if (testOpt && srchit > 0) {
-        logger.debug(f"PQN: SrchIt $srchit%4d: f $candf%-10.4f t $lambda%-10.4f\n")
-      }
-
-      if (candf < state.adjustedValue + suffdec) {
-        accepted = true
-      } else if (srchit >= maxSrchIt) {
-        accepted = true
-      } else {
-        lambda *= 0.5
-        srchit = srchit + 1
-      }
-    } while (!accepted)
-
-    if (srchit >= maxSrchIt) {
-      logger.info("PQN: Line search cannot make further progress")
-      throw new LineSearchFailed(norm(state.grad,Double.PositiveInfinity), norm(dir, Double.PositiveInfinity))
-    }
-    lambda
+  /**
+   * Given a direction, perform a Strong Wolfe Line Search
+   *
+   * TO DO: Compare performance with Cubic Interpolation based line search from Mark's PQN paper
+   *
+   * @param state the current state
+   * @param f The objective
+   * @param dir The step direction
+   * @return stepSize
+   */
+  protected def determineStepSize(state: State, f: DiffFunction[DenseVector[Double]], dir: DenseVector[Double]) = {
+    val x = state.x
+    val grad = state.grad
+
+    val ff = LineSearch.functionFromSearchDirection(f, x, dir)
```
Member: do we not need to project inside the line search?

Contributor (Author): Matlab code and the paper did not project inside PQN line search...

Member: ah. Right, alpha is probably always <= 1, so it's safe.
```diff
+    val search = new BacktrackingLineSearch(state.value, maxIterations = maxSrchIt, shrinkStep= if(state.iter < 1) 0.1 else 0.5)
+    var alpha = if(state.iter == 0.0) min(1.0, 1.0/norm(dir)) else 1.0
+    alpha = search.minimize(ff, alpha)
+
+    if(alpha * norm(grad) < 1E-10) throw new StepSizeUnderflow
+
+    alpha
   }

   protected def takeStep(state: State, dir: DenseVector[Double], stepSize: Double): DenseVector[Double] = {
     projection(state.x + dir * stepSize)
   }

   protected def updateHistory(newX: DenseVector[Double], newGrad: DenseVector[Double], newVal: Double, f: DiffFunction[DenseVector[Double]], oldState: State): History = {
     import oldState._
-    val s = newX - oldState.x
-    val y = newGrad - oldState.grad
+    val s = newX - x
+    val y = newGrad - grad
     oldState.history.updated(y, s)
   }

 }

-object ProjectedQuasiNewton {
+object ProjectedQuasiNewton extends SerializableLogging {
   // Forms a quadratic model around fun, the argmin of which is then a feasible
   // quasi-Newton descent direction
-  class QuadraticSubproblem(fun: DiffFunction[DenseVector[Double]],
-                            fk: Double,
+  class QuadraticSubproblem(fk: Double,
                             xk: DenseVector[Double],
                             gk: DenseVector[Double],
                             B: CompactHessian) extends DiffFunction[DenseVector[Double]] {
```
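For orientation, here is a minimal usage sketch of the class as it appears in this diff. The quadratic objective, the nonnegativity projection, and the chosen parameter values are illustrative assumptions, not code from the PR:

```scala
import breeze.linalg._
import breeze.optimize.{DiffFunction, ProjectedQuasiNewton}

object PQNUsageSketch {
  def main(args: Array[String]): Unit = {
    // Smooth objective f(x) = ||x - c||^2; c has negative entries, so the
    // unconstrained minimizer violates a nonnegativity constraint.
    val c = DenseVector(-1.0, 2.0, -3.0, 4.0)
    val f = new DiffFunction[DenseVector[Double]] {
      def calculate(x: DenseVector[Double]) = {
        val r = x - c
        (r dot r, r * 2.0)
      }
    }

    // Constraint expressed as a Euclidean projection onto the nonnegative orthant.
    val projectNonNegative =
      (x: DenseVector[Double]) => x.map(v => math.max(v, 0.0))

    // Constructor arguments follow the signature shown in the diff above.
    val pqn = new ProjectedQuasiNewton(projection = projectNonNegative, maxIter = 100)
    val xopt = pqn.minimize(f, DenseVector.zeros[Double](c.length))
    println(s"constrained solution: $xopt") // roughly (0, 2, 0, 4)
  }
}
```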
FWIW, "all rights reserved" has no legal meaning in modern copyright law.
PQN can do the simplex projection, but it does not separate f(x) and g(z)... what we really want to do is solve f(x) through blocked cyclic coordinate descent using BFGS and satisfy g(z) through a proximal operator... that's our version of a parameter server (distributed solver)... I am still thinking about whether we can replace coordinate descent with distributed BFGS using some tricks... if I use OWLQN or PQN I have to change code in all of them...
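To make the f(x) + g(z) splitting concrete, here is a rough, hypothetical sketch of that idea in breeze terms: the smooth part f is minimized with breeze's LBFGS on an augmented objective, and g is enforced through a proximal operator in an ADMM-style loop. The object and parameter names and the scaled-dual formulation are my assumptions, not the PR's NonlinearMinimizer:

```scala
import breeze.linalg._
import breeze.optimize.{DiffFunction, LBFGS}

// Hypothetical sketch of the f(x) + g(z) splitting described above:
// the smooth block f is handled by a quasi-Newton solver (LBFGS here),
// the constraint/regularizer g by a proximal operator, tied together
// with ADMM-style updates on a scaled dual variable u.
object SplittingSketch {
  // proximal operator: v => argmin_z { g(z) + (rho / 2) * ||z - v||^2 }
  type Prox = (DenseVector[Double], Double) => DenseVector[Double]

  def minimize(f: DiffFunction[DenseVector[Double]],
               prox: Prox,
               init: DenseVector[Double],
               rho: Double = 1.0,
               outerIters: Int = 50): DenseVector[Double] = {
    val lbfgs = new LBFGS[DenseVector[Double]](maxIter = 100, m = 7)
    var x = init.copy
    var z = init.copy
    var u = DenseVector.zeros[Double](init.length)

    for (_ <- 0 until outerIters) {
      // x-update: smooth subproblem f(x) + (rho / 2) * ||x - z + u||^2
      val augmented = new DiffFunction[DenseVector[Double]] {
        def calculate(xx: DenseVector[Double]) = {
          val (fx, gx) = f.calculate(xx)
          val r = xx - z + u
          (fx + 0.5 * rho * (r dot r), gx + r * rho)
        }
      }
      x = lbfgs.minimize(augmented, x)
      // z-update: proximal step that enforces g
      z = prox(x + u, rho)
      // dual update
      u = u + x - z
    }
    z
  }
}
```

In a distributed setting, the x-update is the piece that would be split across blocks or partitions, which is where the coordinate-descent versus distributed-BFGS question above comes in.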
Also, I looked at the paper... that's the same algorithm I am implementing... it is a proximal algorithm... maybe we can plug it into PQN as well... I don't know PQN that well...
I read the PQN paper and it is possible to stick all the proximal operators inside PQN... I have tested bounds with PQN already and it works well compared to the default ADMM.
If it holds up for ProximalL1() compared to OWLQN, then I am good to use PQN as the core of NonlinearMinimizer... PQN right now accepts a closure of the form DenseVector[Double] => DenseVector[Double]. Would it be fine if I change the signature to (x: DenseVector[Double], rho: Double) => DenseVector[Double]?
This is in tune with generic proximal algorithms:
minimize g(x) + rho * ||x - v||_2^2
g(x) can be a constraint here: x in C
g(x) can be L1 here as well, through soft-thresholding I think
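For concreteness, two hypothetical operators matching that proposed (v, rho) signature, with the threshold worked out from the g(x) + rho * ||x - v||_2^2 form above (the names are mine, not from the PR):

```scala
import breeze.linalg.DenseVector

// Hypothetical illustrations of the proposed closure signature
//   (v: DenseVector[Double], rho: Double) => DenseVector[Double]
// returning argmin_x { g(x) + rho * ||x - v||_2^2 }.

// g = indicator of the nonnegative orthant: the prox is plain projection,
// independent of rho.
val projectNonNegative: (DenseVector[Double], Double) => DenseVector[Double] =
  (v, _) => v.map(x => math.max(x, 0.0))

// g(x) = lambda * ||x||_1: the prox is soft-thresholding; with the scaling
// above the threshold works out to lambda / (2 * rho).
def softThreshold(lambda: Double): (DenseVector[Double], Double) => DenseVector[Double] =
  (v, rho) => {
    val t = lambda / (2.0 * rho)
    v.map(x => math.signum(x) * math.max(math.abs(x) - t, 0.0))
  }
```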
I'm totally open to that, but I'm not quite sure how it will be used. PQN won't manipulate it, right? Why not have your driver specify it in a new instance of PQN?
-- David
On Tue, Feb 17, 2015 at 12:01 PM, Debasish Das [email protected] wrote:
Why not an interface like:

```scala
trait SeparableDiffFunction[T] {
  def apply(x: T): IndexedSeq[(Double, T)]
}
```

? Maybe with a method to turn it into a normal DiffFunction given an OpAdd for T?
On Sat, Feb 21, 2015 at 7:53 AM, Debasish Das [email protected] wrote:
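One hedged reading of that suggestion, specialized to DenseVector[Double] so the blocks can simply be summed (a generic version would take the addition, the OpAdd for T, as an implicit); the method names here are hypothetical:

```scala
import breeze.linalg.DenseVector
import breeze.optimize.DiffFunction

// Sketch of the suggested interface: each block contributes its own
// (value, gradient) pair, and the pieces are summed to recover an
// ordinary DiffFunction.
trait SeparableDiffFunction {
  // one (value, gradient) contribution per block/partition
  // (plays the role of `apply` in the suggestion above)
  def blocks(x: DenseVector[Double]): IndexedSeq[(Double, DenseVector[Double])]

  // Collapse the blocks into a normal DiffFunction by summing values and gradients.
  def toDiffFunction: DiffFunction[DenseVector[Double]] =
    new DiffFunction[DenseVector[Double]] {
      def calculate(x: DenseVector[Double]): (Double, DenseVector[Double]) = {
        val parts = blocks(x)
        val totalValue = parts.map(_._1).sum
        val totalGrad = parts.map(_._2).reduce(_ + _)
        (totalValue, totalGrad)
      }
    }
}
```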
Let me add the experimental version with the RDD in it, and you can help define the clean interface... I feel we will need different interfaces for separable and non-separable functions.
I added the PQN driver and the projection operators for L1(x) <= c and ProbabilitySimplex... Somehow, in my experiments I am unable to reproduce the results from Figure 1 of the PQN paper... OWLQN is run first to extract the parameter lambda_L1(x_), which generates the constraint L1(x) <= c for PQN.

runMain breeze.optimize.proximal.NonlinearMinimizer 500 0.4 0.99

Elastic Net with L1 and L2 regularization.

Linear Regression:
Issues:
owlqn 678.072 ms, iters 173
pqnSparseTime 30600.365 ms, iters 500
owlqnObj -145.38669700980395, pqnSparseObj -135.15057488775597

Logistic Regression:
Cons:
owlqn 187.894 ms, iters 74; pqn 758.073 ms, iters 28
objective owlqnLogistic 52.32713379333781, pqnLogistic 81.37098398138012

I am debugging the code further, but any pointers would be great. I don't think this is the expected behavior from PQN on the L1/ProbabilitySimplex constraints as per the paper.
Random question: does it seem to be line searching a lot (relative to OWLQN, or to runs with a box constraint)?
On Mon, Feb 23, 2015 at 2:09 PM, Debasish Das [email protected] wrote:
I think my L1 projection has bugs in it... L1(x) is built using ProbabilitySimplex, and PQN is beating naive ADMM and is on par with QuadraticMinimizer for ProbabilitySimplex.
I will take a closer look at the L1 projection for bugs, but for ADMM-based multicore/distributed consensus I will most likely choose OWLQN for elastic net and PQN for the other constraints.
Here are the results with ProbabilitySimplex:

minimize f(x) s.t. 1'x = c, x >= 0, c = 1

Linear Regression with ProbabilitySimplex, f(x) = ||Ax - b||^2
Objective: pqn 85.34613832959954, nl 84.95320179604967, qp 85.33114863196366
Constraint: pqn 0.9999999999999997, nl 1.001707509961072, qp 1.000000000000004
Time: pqn 150.552 ms, nl 15847.125 ms, qp 96.105 ms

Logistic Regression with ProbabilitySimplex, f(x) = logistic loss from mllib
Objective: pqn 257.6058563777358, nl 257.4025971846134
Constraint: pqn 0.9999999999999998, nl 1.0007230450802203
Time: pqn 94.19 ms, nl 25160.317 ms