This repository was archived by the owner on Feb 19, 2020. It is now read-only.
Merged
75 commits
ebb3998
Reasonable first crack at neural epic with rule features and position…
gregdurrett Dec 16, 2014
a2f65eb
Additional speed optimizations, first groundwork for the caching firs…
gregdurrett Dec 19, 2014
cb4fe4e
First stuff for caching input layer
gregdurrett Dec 19, 2014
8c96e38
Finished caching implementation of the first layer
gregdurrett Dec 19, 2014
fa38219
Changed random initialization
gregdurrett Dec 21, 2014
4f9719b
Minor tweaks
gregdurrett Dec 25, 2014
7e168d7
Revamped how NNs are instantiated, can now load multiple input vector…
gregdurrett Dec 26, 2014
a85d0d9
Changed how weights are initialized to make backprop into embeddings …
gregdurrett Dec 28, 2014
5cbeff3
First crack at corefdense, additional improvements to the neural pars…
gregdurrett Jan 5, 2015
6ee25e5
More improvements: corefdense stuff, can do fully linear network, etc.
gregdurrett Jan 11, 2015
68bdf2f
Added support for multiple parallel layers
gregdurrett Jan 12, 2015
eec6834
Added support for dependencies
gregdurrett Jan 13, 2015
4aeb231
Various changes
gregdurrett Jan 22, 2015
c3250a5
Small bug fix so it compiles
gregdurrett Jan 22, 2015
9b8aba9
Testing optimization stuff
gregdurrett Jan 25, 2015
faf9269
More optimization playing around
gregdurrett Jan 26, 2015
9523bc2
Adadelta implementation
gregdurrett Jan 27, 2015
04896ad
First crack at NER
gregdurrett Jan 28, 2015
8eeed33
Small updates to dense NER
gregdurrett Jan 28, 2015
39beb0e
Parallel training
gregdurrett Jan 29, 2015
5ed3c63
Added LowRankQuadraticTransform
gregdurrett Jan 29, 2015
6fd0bdf
Added pruning to NER, other changes
gregdurrett Feb 4, 2015
5de8091
Overhauled how nonlinear transforms are handled
gregdurrett Feb 5, 2015
9fba451
Changed how the output layer is handled to allow for LRQT and output …
gregdurrett Feb 5, 2015
9a55ee8
Minor changes
gregdurrett Feb 6, 2015
6a2291d
Separated layer extraction for train and for test to allow dropout to…
gregdurrett Feb 6, 2015
f88adc1
Changes to NER
gregdurrett Feb 6, 2015
f63db36
Changes to set up for dropout in parser
gregdurrett Feb 7, 2015
78eabe9
Embedding clipping, dropout everywhere, etc.
gregdurrett Feb 8, 2015
a79c86e
Before tetra messing
gregdurrett Feb 8, 2015
c5d7986
Random changes
gregdurrett Feb 9, 2015
a7d83c3
Fixed output embedding to be fast in parser
gregdurrett Feb 10, 2015
c23e7b8
Flexibility in input words to the network
gregdurrett Feb 10, 2015
6549e4b
Ensembling
gregdurrett Feb 11, 2015
01eaa88
Expanded set of possible surface features
gregdurrett Feb 12, 2015
933cc97
Added ability to decouple parameters for unaries, binaries, and spans
gregdurrett Feb 13, 2015
b734995
Bug fix
gregdurrett Feb 13, 2015
8bfcd2a
SGD with momentum now supported
gregdurrett Feb 14, 2015
03044e1
Made things serializable, also fixed a bug where SGD wasn't being used
gregdurrett Feb 14, 2015
ff6e16a
Fixed some vector loading stuff
gregdurrett Feb 16, 2015
434b858
Added batch normalization support
gregdurrett Feb 17, 2015
f849f33
Improved output embedding initialization
gregdurrett Feb 17, 2015
1dd7fa9
Fixed initializer for oe
gregdurrett Feb 17, 2015
93f6257
Allowed for lowercased word vectors
gregdurrett Feb 18, 2015
5759b64
Added a way to get a subset of gradient features, also improvements t…
gregdurrett Feb 18, 2015
e3d5705
Added checking coverage of vectors
gregdurrett Feb 20, 2015
756857e
Combo adagrad and skipdep converter implementation
gregdurrett Feb 20, 2015
610bf51
Fixed SpikdepConverter; THIS SHOULD'VE BEEN COMMITTED BEFORE ACL
gregdurrett Mar 7, 2015
331e343
Modernization of neural architecture for coref
gregdurrett Mar 13, 2015
30d506a
Improved coref neural model
gregdurrett Mar 18, 2015
0b23d75
Some camera-ready verification fixes and experiments
gregdurrett May 5, 2015
10605c1
Removed unused classes from the dense package
gregdurrett May 5, 2015
f63bb58
Deleted unused class
gregdurrett May 5, 2015
e124fe4
Moved Word2Vec and Word2VecUtils
gregdurrett May 5, 2015
7f75d11
Moved to dense so we can exclude corefdense
gregdurrett May 5, 2015
3ec611b
Other changes in the move
gregdurrett May 5, 2015
56e4f79
Added NeuralParserTrainer so we can remove casts from ParserTrainer
gregdurrett May 5, 2015
f741202
Camera changes
gregdurrett May 11, 2015
2cd2e23
Improvements to neural coref
gregdurrett May 20, 2015
e7565e9
First crack at sparse net features
gregdurrett May 25, 2015
1411c97
Last of the neural coref stuff, beginning of refactoring
gregdurrett Jun 14, 2015
21894ea
A few more refactoring fixes
gregdurrett Jun 15, 2015
7fc3285
Updating to latest version of epic; minor modifications to trace remo…
gregdurrett Jun 18, 2015
7db377a
Removed batch normalization stuff
gregdurrett Jun 19, 2015
516f590
Mostly reverted ParserTrainer to what it was before neural epic came …
gregdurrett Jun 19, 2015
4a3d1bb
Latest epic commits
gregdurrett Jun 23, 2015
e20db9a
Moved PNMFactory to a better-named file
gregdurrett Jun 23, 2015
3ad6453
More renaming
gregdurrett Jun 23, 2015
6e8ae37
Renaming of files and refactoring so things have better names
gregdurrett Jun 23, 2015
c5d681d
Removed spurious import
gregdurrett Jun 23, 2015
e53a9d3
Readme for the neural stuff
gregdurrett Jun 23, 2015
e264cdd
Some cleanups to the PR
gregdurrett Jun 23, 2015
e70fc0b
Applying dlwh's changes, other removals of commented-out code and one…
gregdurrett Jul 2, 2015
37d2832
Straggler changes
gregdurrett Jul 2, 2015
08a873a
Modified README to point to the downloadable model
gregdurrett Jul 10, 2015
2 changes: 1 addition & 1 deletion .gitignore
@@ -16,7 +16,7 @@ tmp/
.idea*
.scratch/
java.hprof.txt

*.bbl
*.blg
*.aux
/bin/
106 changes: 106 additions & 0 deletions README-NEURAL.md
@@ -0,0 +1,106 @@
The neural CRF parser is a high-performing constituency parser.



## Preamble

The neural CRF parser is described in:

"Neural CRF Parsing" Greg Durrett and Dan Klein. ACL 2015.

It is an extension of the span parser described in

"Less Grammar, More Features" David Hall, Greg Durrett, and Dan Klein. ACL 2014.

and is based on the Epic parsing framework. See https://github.com/dlwh/epic
for more documentation about the span parser and the Epic framework.
See http://www.eecs.berkeley.edu/~gdurrett/ for papers and BibTeX.

Questions? Bugs? Email me at [email protected]



## Setup

You need three things to run the neural CRF parser:

1) The compiled .jar; run ```sbt assembly``` to produce this

2) A treebank: the Penn Treebank or one of the SPMRL treebanks

3) Some sort of word vectors. These can either be in the .bin format
of Mikolov et al. (2013) or the .txt format of Bansal et al. (ACL 2014). For
English, the best performance comes from using Bansal et al.'s vectors:

http://ttic.uchicago.edu/~mbansal/codedata/dependencyEmbeddings-skipdep.zip

For other languages, you can train suitable vectors on monolingual data using
```word2vec``` with the following arguments:

-cbow 0 -size 100 -window 1 -sample 1e-4 -threads 8 -binary 0 -iter 15

These are mildly tuned, and using a small window size is important, but other
settings are likely to work well too.
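
For reference, a complete invocation might look like the following; the corpus and output file names are placeholders, not files shipped with Epic:

word2vec -train corpus.txt -output vectors.txt -cbow 0 -size 100 -window 1 -sample 1e-4 -threads 8 -binary 0 -iter 15

(```-binary 0``` makes word2vec write the vectors as text rather than in .bin format.)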




## Usage

To run the parser on new text (tokenized, one-sentence-per-line), use the following command:
Owner review comment: ParseText actually tokenizes and sentence-segments by default. To get the behavior described above (input already tokenized, one sentence per line), add the flags "--tokenizer whitespace --sentences newline".


java -Xmx4g -cp path/to/assembly.jar epic.parser.ParseText --model neuralcrf.parser --nthreads 8 [files]

To reproduce the results in the neural CRF paper, run the following command
(note that you need to fill in paths for -cp, --treebank.path, and --word2vecPath):

Owner review comment: 47g lol

java -Xmx47g -cp path/to/assembly.jar epic.parser.models.NeuralParserTrainer \
--cache.path constraints.cache \
--opt.useStochastic \
--treebank.path path/to/wsj/ \
--evalOnTest \
--includeDevInTrain \
--trainer.modelFactory.annotator epic.trees.annotations.PipelineAnnotator \
--ann.0 epic.trees.annotations.FilterAnnotations \
--ann.1 epic.trees.annotations.ForgetHeadTag \
--ann.2 epic.trees.annotations.Markovize \
--ann.2.horizontal 0 \
--ann.2.vertical 0 \
--modelFactory epic.parser.models.PositionalNeuralModelFactory \
--opt.batchSize 200 \
--word2vecPath path/to/skipdep_embeddings.txt \
--threads 8

To run on the SPMRL treebanks, modify the arguments to the command above as follows:

1) Add the following arguments (replace ${LANG} as appropriate):

--treebankType spmrl \
--binarization head \
--supervisedHeadFinderPtbPath path/to/gold/ptb/train/train.${LANG}.gold.ptb \
--supervisedHeadFinderConllPath path/to/gold/conll/train/train.${LANG}.gold.conll \
--ann.3 epic.trees.annotations.SplitPunct

2) Modify --treebank.path to point to the X_SPMRL/gold/ptb directory.

Options to configure the neural network and training are largely defined in
```epic.parser.models.PositionalNeuralModel```.

### Miscellaneous Notes

To run on the development set, simply remove ```evalOnTest``` and
```includeDevInTrain``` from the arguments.

Note that you should use the official version of ```evalb``` on the output
files (gold and guess) rather than relying on the native scorer in the Epic
parser. For SPMRL, you should use the version distributed with the shared
task.

Also note that the X-bar grammar and coarse pruning masks (constraints) are
cached between runs in the same directory, which speeds up training and testing
time considerably as generating the masks is time-consuming.
Owner review comment: While you're at it, might add a note that multiple runs can't be from the same directory.






1 change: 1 addition & 0 deletions build.sbt
@@ -102,6 +102,7 @@ mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) =>
{
case PathList("org", "w3c", "dom", _) => MergeStrategy.first
case PathList("javax", "xml", "stream", _ *) => MergeStrategy.first
case PathList("scala", "xml", _ *) => MergeStrategy.first
case PathList("org", "cyberneko", "html", _ *) => MergeStrategy.first
case x => old(x)
}
71 changes: 71 additions & 0 deletions src/main/scala/epic/dense/AdadeltaGradientDescentDVD.scala
@@ -0,0 +1,71 @@
package epic.dense

import breeze.linalg._
import breeze.numerics._
import breeze.optimize.StochasticDiffFunction
import breeze.optimize.StochasticGradientDescent


class AdadeltaGradientDescentDVD(maxIter: Int,
rho: Double = 0.95,
tolerance: Double = 1E-5,
improvementTolerance: Double = 1E-4,
minImprovementWindow: Int = 50)
extends StochasticGradientDescent[DenseVector[Double]](1.0, maxIter, tolerance, improvementTolerance, minImprovementWindow) {

val delta = 1E-4
val epsilon = 1e-6
import vspace._

case class History(squaredGradientsHistory: DenseVector[Double], squaredUpdatesHistory: DenseVector[Double])
override def initialHistory(f: StochasticDiffFunction[DenseVector[Double]],init: DenseVector[Double]) = {
History(DenseVector(Array.tabulate(init.size)(i => 1e-6)), DenseVector(Array.tabulate(init.size)(i => 1e-6)))
}

override def updateHistory(newX: DenseVector[Double], newGrad: DenseVector[Double], newValue: Double, f: StochasticDiffFunction[DenseVector[Double]], oldState: State) = {
val oldHistory = oldState.history
// This is correct; the new gradient gets incorporated during the next round of takeStep,
// so this computation should lag by one
val newG = (oldState.grad :* oldState.grad) * (1 - rho)
axpy(rho, oldHistory.squaredGradientsHistory, newG)
val deltaX = newX - oldState.x
val newU = deltaX :* deltaX * (1 - rho);
axpy(rho, oldHistory.squaredUpdatesHistory, newU)
new History(newG, newU)
// val oldHistory = oldState.history
// val newG = (oldState.grad :* oldState.grad)
// val maxAge = 1000.0
// if(oldState.iter > maxAge) {
// newG *= 1/maxAge
// axpy((maxAge - 1)/maxAge, oldHistory.sumOfSquaredGradients, newG)
// } else {
// newG += oldHistory.sumOfSquaredGradients
// }
// new History(newG)
}

override protected def takeStep(state: State, dir: DenseVector[Double], stepSize: Double) = {
// gradient sum needs to
import state._
// Need to pre-emptively update the gradient since the history only has it through the
// last timestep
val rmsGt = sqrt((state.history.squaredGradientsHistory * rho) :+ ((state.grad :* state.grad) * (1-rho)) :+ epsilon)
val rmsDeltaXtm1 = sqrt(state.history.squaredUpdatesHistory :+ epsilon)
val step = dir :* rmsDeltaXtm1 :/ rmsGt
val newX = x
axpy(1.0, step, newX)
newX
}

override def determineStepSize(state: State, f: StochasticDiffFunction[DenseVector[Double]], dir: DenseVector[Double]) = {
defaultStepSize // pegged to 1.0 for this method
}

override protected def adjust(newX: DenseVector[Double], newGrad: DenseVector[Double], newVal: Double) = {
newVal -> newGrad
// val av = newVal + (newX dot newX) * regularizationConstant / 2.0
// val ag = newGrad + newX * regularizationConstant
// (av -> ag)
}

}
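
For reference, the class above implements the standard Adadelta update (Zeiler, 2012). Writing $g_t$ for the stochastic gradient, $\rho$ for the decay rate, and $\epsilon$ for the smoothing constant (1e-6 in the code), the two accumulators kept in `History` and the step taken in `takeStep` correspond to

$$E[g^2]_t = \rho\, E[g^2]_{t-1} + (1-\rho)\, g_t^2$$
$$\Delta x_t = -\frac{\sqrt{E[\Delta x^2]_{t-1} + \epsilon}}{\sqrt{E[g^2]_t + \epsilon}} \odot g_t$$
$$E[\Delta x^2]_t = \rho\, E[\Delta x^2]_{t-1} + (1-\rho)\, \Delta x_t^2$$
$$x_{t+1} = x_t + \Delta x_t$$

with all operations taken elementwise. In the code, `takeStep` folds the current gradient into $E[g^2]_t$ eagerly (the `rmsGt` line), while `updateHistory` lags one step behind, as its comment notes.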
108 changes: 108 additions & 0 deletions src/main/scala/epic/dense/AffineOutputTransform.scala
@@ -0,0 +1,108 @@
package epic.dense

import breeze.linalg._
import breeze.linalg.operators.OpMulMatrix
import epic.features.SegmentedIndex
import epic.framework.Feature

import scala.runtime.ScalaRunTime
import scala.util.Random

/**
* Used at the output layer when we're only going to need some of the possible ouputs;
* it exposes the penultimate layer and then the Layer allows you to pass the results
* from that back in (caching it elsewhere) and only compute certain cells in the
* output layer (activationsFromPenultimateDot).
*/
case class AffineOutputTransform[FV](numOutputs: Int, numInputs: Int, innerTransform: Transform[FV, DenseVector[Double]], includeBias: Boolean = true) extends OutputTransform[FV, DenseVector[Double]] {


val index = SegmentedIndex(new AffineTransform.Index(numOutputs, numInputs, includeBias), innerTransform.index)

def extractLayerAndPenultimateLayer(weights: DenseVector[Double], forTrain: Boolean) = {
val mat = weights(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require)
val bias = if(includeBias) {
weights(numOutputs * numInputs until index.componentOffset(1))
} else {
DenseVector.zeros[Double](numOutputs)
}
val inner = innerTransform.extractLayer(weights(index.componentOffset(1) to -1), forTrain)
new OutputLayer(mat, bias, inner) -> inner
}

/**
* N.B. Initialized to zero because this should *only* be used at the output layer, where
* zero initialization is appropriate
*/
def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = {
require(outputLayer)
DenseVector.vertcat(DenseVector.zeros(index.indices(0).size), innerTransform.initialWeightVector(initWeightsScale, rng, false, spec))
}

def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) {
innerTransform.clipHiddenWeightVectors(weights(index.componentOffset(1) to -1), norm, false)
}

def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] = {
(offset until offset + Math.min(10, index.indices(0).size)) ++ innerTransform.getInterestingWeightIndicesForGradientCheck(offset + index.indices(0).size)
}

case class OutputLayer(weights: DenseMatrix[Double], bias: DenseVector[Double], innerLayer: innerTransform.Layer) extends OutputTransform.OutputLayer[FV,DenseVector[Double]] {
override val index = AffineOutputTransform.this.index

val weightst = weights.t
// val weightst = weights.t.copy
// Owner review comment: this was originally added for better memory locality since mul transpose is pretty slow (but .copy makes it no longer transposed)



def activations(fv: FV) = {
val out = weights * innerLayer.activations(fv) += bias
out
}

def activationsDot(fv: FV, sparseIdx: Int) = {
activationsFromPenultimateDot(innerLayer.activations(fv), sparseIdx)
}

def activationsDot(fv: FV, sparseIndices: Array[Int]) = {
activationsFromPenultimateDot(innerLayer.activations(fv), sparseIndices)
}

def activationsFromPenultimateDot(innerLayerActivations: DenseVector[Double], sparseIdx: Int) = {
weights(sparseIdx, ::) * innerLayerActivations + bias(sparseIdx)
}

def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: FV) = {
val scale = _scale
val matDeriv = deriv(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require)
val biasDeriv = if(includeBias) {
deriv(numOutputs * numInputs until index.componentOffset(1))
} else {
DenseVector.zeros[Double](numOutputs)
}

// whole function is f(mat * inner(fv) + bias)
// scale(i) pushes in (f'(mat * inner(v) + bias))(i)
val innerAct = innerLayer.activations(fv)
// d/d(weights(::, i)) == scale(i) * innerAct
for (i <- 0 until weights.rows) {
val a: Double = scale(i)
if(a != 0.0) {
axpy(a, innerAct, matDeriv.t(::, i))
// so d/dbias(i) = scale(i)
biasDeriv(i) += a
}
}

// biasDeriv += scale

// scale is f'(mat * inner(v) + bias)
// d/dv is mat.t * f'(mat * inner(v) + bias)

innerLayer.tallyDerivative(deriv(index.componentOffset(1) to -1), weightst * scale, fv)
}

def applyBatchNormalization(inputs: scala.collection.GenTraversable[FV]) = innerLayer.applyBatchNormalization(inputs)

}

}
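
To make the caching pattern described in the class comment concrete, here is a minimal hypothetical sketch of the intended call sequence. The names `transform`, `weights`, `fv`, and `outputIdx` are stand-ins supplied by the caller, not identifiers defined in this diff:

```scala
import breeze.linalg.DenseVector

// Hypothetical usage sketch for AffineOutputTransform, using only the methods shown above.

// Extract the output layer together with its (penultimate) inner layer.
val (outputLayer, penultimateLayer) =
  transform.extractLayerAndPenultimateLayer(weights, forTrain = false)

// Compute the penultimate activations once per input and cache them on the caller's side.
val penultimate: DenseVector[Double] = penultimateLayer.activations(fv)

// Score only the output cells that are actually needed, instead of materializing
// the full output vector with activations(fv).
val cellScore: Double = outputLayer.activationsFromPenultimateDot(penultimate, outputIdx)
```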