diff --git a/.gitignore b/.gitignore index d99dd25e..e188c6ae 100644 --- a/.gitignore +++ b/.gitignore @@ -16,7 +16,7 @@ tmp/ .idea* .scratch/ java.hprof.txt - *.bbl *.blg *.aux +/bin/ diff --git a/README-NEURAL.md b/README-NEURAL.md new file mode 100644 index 00000000..36f14b16 --- /dev/null +++ b/README-NEURAL.md @@ -0,0 +1,114 @@ +The neural CRF parser is a high-performing constituency parser. + + + +##Preamble + +The neural CRF parser is described in: + +"Neural CRF Parsing" Greg Durrett and Dan Klein. ACL 2015. + +It is an extension of the span parser described in + +"Less Grammar, More Features" David Hall, Greg Durrett, and Dan Klein. ACL 2014. + +and is based on the Epic parsing framework. See https://github.com/dlwh/epic +for more documentation about the span parser and the Epic framework. +See http://www.eecs.berkeley.edu/~gdurrett/ for papers and BibTeX. + +Questions? Bugs? Email me at gdurrett@eecs.berkeley.edu + + + +##Setup + +You need three things to run the neural CRF parser: + +1) The compiled .jar; run ```sbt assembly``` to produce this + +2) A treebank: the Penn Treebank or one of the SPMRL treebanks + +3) Some sort of word vectors. These can either be in the .bin format +of Mikolov et al. (2013) or the .txt format of Bansal et al. (ACL 2014). For +English, the best performance comes from using Bansal et al.'s vectors: + +http://ttic.uchicago.edu/~mbansal/codedata/dependencyEmbeddings-skipdep.zip + +For other languages, you can train suitable vectors on monolingual data using +```word2vec``` with the following arguments: + + -cbow 0 -size 100 -window 1 -sample 1e-4 -threads 8 -binary 0 -iter 15 + +These are mildly tuned, and using a small window size is important, but other +settings are likely to work well too. + + + + +##Usage + +To run the parser on new text (tokenized, one-sentence-per-line), use the following command: + + java -Xmx4g -cp path/to/assembly.jar epic.parser.ParseText --model neuralcrf.parser \ + --tokenizer whitespace --sentences newline --nthreads 8 [files] + +You can download the ```neuralcrf.parser``` model from: + +http://nlp.cs.berkeley.edu/projects/neuralcrf.shtml + +Due to modifications to the code for the system release and randomness in the +training process, this model performs slightly worse than reported in the paper +(90.9 on WSJ Section 23). 
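+
+For example, given a file ```input.txt``` (the file name, and the exact tree below, are illustrative only) containing one tokenized sentence per line:
+
+    The cat sat on the mat .
+
+running
+
+    java -Xmx4g -cp path/to/assembly.jar epic.parser.ParseText --model neuralcrf.parser \
+        --tokenizer whitespace --sentences newline --nthreads 8 input.txt
+
+should print one PTB-style bracketed parse per input line, along the lines of:
+
+    (TOP (S (NP (DT The) (NN cat)) (VP (VBD sat) (PP (IN on) (NP (DT the) (NN mat)))) (. .)))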
+ +To train a new parser as described in the neural CRF paper, run the following command +(note that you need to fill in paths for -cp, --treebank.path, and --word2vecPath): + + java -Xmx47g -cp path/to/assembly.jar epic.parser.models.NeuralParserTrainer \ + --cache.path constraints.cache \ + --opt.useStochastic \ + --treebank.path path/to/wsj/ \ + --evalOnTest \ + --includeDevInTrain \ + --trainer.modelFactory.annotator epic.trees.annotations.PipelineAnnotator \ + --ann.0 epic.trees.annotations.FilterAnnotations \ + --ann.1 epic.trees.annotations.ForgetHeadTag \ + --ann.2 epic.trees.annotations.Markovize \ + --ann.2.horizontal 0 \ + --ann.2.vertical 0 \ + --modelFactory epic.parser.models.PositionalNeuralModelFactory \ + --opt.batchSize 200 \ + --word2vecPath path/to/skipdep_embeddings.txt \ + --threads 8 + +To run on SPMRL treebanks, modify the arguments to the command above as follows: + +1) Add the following arguments (replace ${LANG}$ as appropriate): + + --treebankType spmrl \ + --binarization head \ + --supervisedHeadFinderPtbPath path/to/gold/ptb/train/train.${LANG}.gold.ptb \ + --supervisedHeadFinderConllPath path/to/gold/conll/train/train.${LANG}.gold.conll \ + --ann.3 epic.trees.annotations.SplitPunct + +2) Modify --treebank.path to point to the X_SPMRL/gold/ptb directory. + +Options to configure the neural network and training are largely defined in +```epic.parser.models.PositionalNeuralModel``` + +###Miscellaneous Notes + +To run on the development set, simply remove ```evalOnTest``` and +```includeDevInTrain``` from the arguments. + +You should use the official version of ```evalb``` on the output files (gold +and guess) rather than relying on the native scorer in the Epic parser. For +SPMRL, you should use the version distributed with the shared task. + +Note that the X-bar grammar and coarse pruning masks (constraints) are cached +between runs in the same directory, which speeds up training and testing time +considerably as generating the masks is time-consuming. + +Finally, note that multiple parsers cannot be trained simultaneously in +the same directory, since certain files (such as pruning masks from the +coarse model) will collide. 
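+
+###AdaDelta update sketch
+
+Training uses a stochastic optimizer; this release adds an AdaDelta-style update
+(```epic.dense.AdadeltaGradientDescentDVD``` in the diff below). The following
+standalone Scala sketch shows the per-coordinate AdaDelta rule that class is built
+around; it is for exposition only (the object name and toy objective are made up)
+and does not use the Epic or Breeze classes:
+
+    object AdadeltaSketch {
+      def main(args: Array[String]): Unit = {
+        val rho = 0.95          // decay rate, matching AdadeltaGradientDescentDVD's default
+        val eps = 1e-6          // smoothing constant, as in that class
+        // Toy objective f(x) = (x - 3)^2 with gradient g = 2(x - 3).
+        var x = 0.0
+        var avgSqGrad = 0.0     // running average of squared gradients
+        var avgSqStep = 0.0     // running average of squared updates
+        for (_ <- 0 until 2000) {
+          val g = 2.0 * (x - 3.0)
+          avgSqGrad = rho * avgSqGrad + (1 - rho) * g * g
+          val step = -math.sqrt(avgSqStep + eps) / math.sqrt(avgSqGrad + eps) * g
+          avgSqStep = rho * avgSqStep + (1 - rho) * step * step
+          x += step
+        }
+        // x should have drifted toward the optimum at 3.0
+        println(s"x after 2000 AdaDelta steps: $x")
+      }
+    }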
+ diff --git a/build.sbt b/build.sbt index b50860ae..926c429d 100644 --- a/build.sbt +++ b/build.sbt @@ -102,6 +102,7 @@ mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => { case PathList("org", "w3c", "dom", _) => MergeStrategy.first case PathList("javax", "xml", "stream", _ *) => MergeStrategy.first + case PathList("scala", "xml", _ *) => MergeStrategy.first case PathList("org", "cyberneko", "html", _ *) => MergeStrategy.first case x => old(x) } diff --git a/src/main/scala/epic/dense/AdadeltaGradientDescentDVD.scala b/src/main/scala/epic/dense/AdadeltaGradientDescentDVD.scala new file mode 100644 index 00000000..b76098f0 --- /dev/null +++ b/src/main/scala/epic/dense/AdadeltaGradientDescentDVD.scala @@ -0,0 +1,57 @@ +package epic.dense + +import breeze.linalg._ +import breeze.numerics._ +import breeze.optimize.StochasticDiffFunction +import breeze.optimize.StochasticGradientDescent + + +class AdadeltaGradientDescentDVD(maxIter: Int, + rho: Double = 0.95, + tolerance: Double = 1E-5, + improvementTolerance: Double = 1E-4, + minImprovementWindow: Int = 50) + extends StochasticGradientDescent[DenseVector[Double]](1.0, maxIter, tolerance, improvementTolerance, minImprovementWindow) { + + val delta = 1E-4 + val epsilon = 1e-6 + import vspace._ + + case class History(squaredGradientsHistory: DenseVector[Double], squaredUpdatesHistory: DenseVector[Double]) + override def initialHistory(f: StochasticDiffFunction[DenseVector[Double]],init: DenseVector[Double]) = { + History(DenseVector(Array.tabulate(init.size)(i => 1e-6)), DenseVector(Array.tabulate(init.size)(i => 1e-6))) + } + + override def updateHistory(newX: DenseVector[Double], newGrad: DenseVector[Double], newValue: Double, f: StochasticDiffFunction[DenseVector[Double]], oldState: State) = { + val oldHistory = oldState.history + // The new gradient gets incorporated during the next round of takeStep, + // so this computation should lag by one + val newG = (oldState.grad :* oldState.grad) * (1 - rho) + axpy(rho, oldHistory.squaredGradientsHistory, newG) + val deltaX = newX - oldState.x + val newU = deltaX :* deltaX * (1 - rho); + axpy(rho, oldHistory.squaredUpdatesHistory, newU) + new History(newG, newU) + } + + override protected def takeStep(state: State, dir: DenseVector[Double], stepSize: Double) = { + import state._ + // Need to pre-emptively update the gradient since the history only has it through the + // last timestep + val rmsGt = sqrt((state.history.squaredGradientsHistory * rho) :+ ((state.grad :* state.grad) * (1-rho)) :+ epsilon) + val rmsDeltaXtm1 = sqrt(state.history.squaredUpdatesHistory :+ epsilon) + val step = dir :* rmsDeltaXtm1 :/ rmsGt + val newX = x + axpy(1.0, step, newX) + newX + } + + override def determineStepSize(state: State, f: StochasticDiffFunction[DenseVector[Double]], dir: DenseVector[Double]) = { + defaultStepSize // pegged to 1.0 for this method + } + + override protected def adjust(newX: DenseVector[Double], newGrad: DenseVector[Double], newVal: Double) = { + newVal -> newGrad + } + +} \ No newline at end of file diff --git a/src/main/scala/epic/dense/AffineOutputTransform.scala b/src/main/scala/epic/dense/AffineOutputTransform.scala new file mode 100644 index 00000000..167aa4df --- /dev/null +++ b/src/main/scala/epic/dense/AffineOutputTransform.scala @@ -0,0 +1,106 @@ +package epic.dense + +import breeze.linalg._ +import breeze.linalg.operators.OpMulMatrix +import epic.features.SegmentedIndex +import epic.framework.Feature + +import scala.runtime.ScalaRunTime +import scala.util.Random + 
+/** + * Used at the output layer when only some of the possible outputs will be needed; + * it exposes the penultimate layer, and the Layer lets you pass those activations + * back in (after caching them elsewhere) and compute only certain cells of the + * output layer (activationsFromPenultimateDot). + */ +case class AffineOutputTransform[FV](numOutputs: Int, numInputs: Int, innerTransform: Transform[FV, DenseVector[Double]], includeBias: Boolean = true) extends OutputTransform[FV, DenseVector[Double]] { + + + val index = SegmentedIndex(new AffineTransform.Index(numOutputs, numInputs, includeBias), innerTransform.index) + + def extractLayerAndPenultimateLayer(weights: DenseVector[Double], forTrain: Boolean) = { + val mat = weights(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) + val bias = if(includeBias) { + weights(numOutputs * numInputs until index.componentOffset(1)) + } else { + DenseVector.zeros[Double](numOutputs) + } + val inner = innerTransform.extractLayer(weights(index.componentOffset(1) to -1), forTrain) + new OutputLayer(mat, bias, inner) -> inner + } + + /** + * N.B. Initialized to zero because this should *only* be used at the output layer, where + * zero initialization is appropriate + */ + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = { + require(outputLayer) + DenseVector.vertcat(DenseVector.zeros(index.indices(0).size), innerTransform.initialWeightVector(initWeightsScale, rng, false, spec)) + } + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) { + innerTransform.clipHiddenWeightVectors(weights(index.componentOffset(1) to -1), norm, false) + } + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] = { + (offset until offset + Math.min(10, index.indices(0).size)) ++ innerTransform.getInterestingWeightIndicesForGradientCheck(offset + index.indices(0).size) + } + + case class OutputLayer(weights: DenseMatrix[Double], bias: DenseVector[Double], innerLayer: innerTransform.Layer) extends OutputTransform.OutputLayer[FV,DenseVector[Double]] { + override val index = AffineOutputTransform.this.index + + val weightst = weights.t +// val weightst = weights.t.copy + + + def activations(fv: FV) = { + val out = weights * innerLayer.activations(fv) += bias + out + } + + def activationsDot(fv: FV, sparseIdx: Int) = { + activationsFromPenultimateDot(innerLayer.activations(fv), sparseIdx) + } + + def activationsDot(fv: FV, sparseIndices: Array[Int]) = { + activationsFromPenultimateDot(innerLayer.activations(fv), sparseIndices) + } + + def activationsFromPenultimateDot(innerLayerActivations: DenseVector[Double], sparseIdx: Int) = { + weights(sparseIdx, ::) * innerLayerActivations + bias(sparseIdx) + } + + def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: FV) = { + val scale = _scale + val matDeriv = deriv(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) + val biasDeriv = if(includeBias) { + deriv(numOutputs * numInputs until index.componentOffset(1)) + } else { + DenseVector.zeros[Double](numOutputs) + } + + // whole function is f(mat * inner(fv) + bias) + // scale(i) pushes in (f'(mat * inner(v) + bias))(i) + val innerAct = innerLayer.activations(fv) + // d/d(weights(::, i)) == scale(i) * innerAct + for (i <- 0 until weights.rows) { + val a: Double = scale(i) + if(a != 0.0) { + axpy(a, innerAct, matDeriv.t(::, i)) + // so
d/dbias(i) = scale(i) + biasDeriv(i) += a + } + } + + // scale is f'(mat * inner(v) + bias) + // d/dv is mat.t * f'(mat * inner(v) + bias) + + innerLayer.tallyDerivative(deriv(index.componentOffset(1) to -1), weightst * scale, fv) + } + + def applyBatchNormalization(inputs: scala.collection.GenTraversable[FV]) = innerLayer.applyBatchNormalization(inputs) + + } + +} diff --git a/src/main/scala/epic/dense/AffineTransform.scala b/src/main/scala/epic/dense/AffineTransform.scala index 04231d53..5e6be860 100644 --- a/src/main/scala/epic/dense/AffineTransform.scala +++ b/src/main/scala/epic/dense/AffineTransform.scala @@ -6,7 +6,7 @@ import epic.features.SegmentedIndex import epic.framework.Feature import scala.runtime.ScalaRunTime - +import scala.util.Random case class AffineTransform[FV, Mid](numOutputs: Int, numInputs: Int, innerTransform: Transform[FV, Mid], includeBias: Boolean = true) (implicit mult: OpMulMatrix.Impl2[DenseMatrix[Double], Mid, DenseVector[Double]], @@ -15,23 +15,50 @@ case class AffineTransform[FV, Mid](numOutputs: Int, numInputs: Int, innerTransf val index = SegmentedIndex(new AffineTransform.Index(numOutputs, numInputs, includeBias), innerTransform.index) - - - def extractLayer(weights: DenseVector[Double]) = { + def extractLayer(weights: DenseVector[Double], forTrain: Boolean) = { + extractLayerAndPenultimateLayer(weights, forTrain)._1 + } + + def extractLayerAndPenultimateLayer(weights: DenseVector[Double], forTrain: Boolean) = { val mat = weights(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) val bias = if(includeBias) { weights(numOutputs * numInputs until index.componentOffset(1)) } else { DenseVector.zeros[Double](numOutputs) } - val inner = innerTransform.extractLayer(weights(index.componentOffset(1) to -1)) - new Layer(mat, bias, inner) + val inner = innerTransform.extractLayer(weights(index.componentOffset(1) to -1), forTrain) + new Layer(mat, bias, inner) -> inner + } + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = { +// if (spec == "") { +// DenseVector(Array.tabulate(index.indices(0).size)(i => if (outputLayer) 0.0 else rng.nextGaussian * initWeightsScale)), + val myWeights = if (outputLayer) { + DenseVector(Array.tabulate(index.indices(0).size)(i => 0.0)) + } else if (spec == "magic") { + AffineTransform.getMagicAffineWeights(index.indices(0).size, numInputs, numOutputs, initWeightsScale, rng) + } else { + AffineTransform.getGaussianAffineWeights(index.indices(0).size, initWeightsScale, rng) + } + DenseVector.vertcat(myWeights, innerTransform.initialWeightVector(initWeightsScale, rng, false, spec)) + } + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) { + if (!outputLayer) { + AffineTransform.clipHiddenWeightVectors(numOutputs, numInputs, weights, norm) + } + innerTransform.clipHiddenWeightVectors(weights(index.componentOffset(1) to -1), norm, false) + } + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] = { + (offset until offset + Math.min(10, index.indices(0).size)) ++ innerTransform.getInterestingWeightIndicesForGradientCheck(offset + index.indices(0).size) } - case class Layer(weights: DenseMatrix[Double], bias: DenseVector[Double], innerLayer: innerTransform.Layer) extends _Layer { + case class Layer(weights: DenseMatrix[Double], bias: DenseVector[Double], innerLayer: innerTransform.Layer) extends Transform.Layer[FV,DenseVector[Double]] { override val index = 
AffineTransform.this.index - val weightst = weights.t.copy + val weightst = weights.t +// val weightst = weights.t.copy def activations(fv: FV) = { @@ -40,6 +67,7 @@ case class AffineTransform[FV, Mid](numOutputs: Int, numInputs: Int, innerTransf } def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: FV) = { +// println("SCALE: " + _scale) val scale = _scale val matDeriv = deriv(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) val biasDeriv = if(includeBias) { @@ -65,9 +93,11 @@ case class AffineTransform[FV, Mid](numOutputs: Int, numInputs: Int, innerTransf // scale is f'(mat * inner(v) + bias) // d/dv is mat.t * f'(mat * inner(v) + bias) - +// println("Intermediate scale: " + weightst * scale) innerLayer.tallyDerivative(deriv(index.componentOffset(1) to -1), weightst * scale, fv) } + + def applyBatchNormalization(inputs: scala.collection.GenTraversable[FV]) = innerLayer.applyBatchNormalization(inputs) } @@ -78,6 +108,31 @@ object AffineTransform { canAxpy: scaleAdd.InPlaceImpl3[DenseVector[Double], Double, FV]) = new AffineTransform(numOutputs, numInputs, new IdentityTransform[FV], includeBias) def apply(numOutputs: Int, numInputs: Int, includeBias: Boolean):AffineTransform[DenseVector[Double], DenseVector[Double]] = apply(numOutputs, numInputs, new IdentityTransform[DenseVector[Double]], includeBias) def apply(numOutputs: Int, numInputs: Int):AffineTransform[DenseVector[Double], DenseVector[Double]] = apply(numOutputs, numInputs, true) + + + def getUniformAffineWeights(numWeights: Int, initWeightsScale: Double, rng: Random) = { + DenseVector(Array.tabulate(numWeights)(i => rng.nextGaussian * initWeightsScale)) + } + + def getGaussianAffineWeights(numWeights: Int, initWeightsScale: Double, rng: Random) = { + DenseVector(Array.tabulate(numWeights)(i => rng.nextGaussian * initWeightsScale)) + } + + // N.B. 
numWeights != inSize * outSize if there's a bias + def getMagicAffineWeights(numWeights: Int, inSize: Int, outSize: Int, initWeightsScale: Double, rng: Random) = { + val range = Math.sqrt(6.0/(inSize + outSize)) + DenseVector(Array.tabulate(numWeights)(i => rng.nextDouble * 2 * range - range)) + } + + def clipHiddenWeightVectors(numOutputs: Int, numInputs: Int, weights: DenseVector[Double], norm: Double) { + val mat = weights(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) + for (i <- 0 until mat.rows) { + val thisRowNorm = breeze.linalg.norm(mat(i, ::), 2) + val multFactor = norm/Math.sqrt(thisRowNorm) + mat(i, ::) *= multFactor + } + } + case class Index(numOutputs: Int, numInputs: Int, includeBias: Boolean = true) extends breeze.util.Index[Feature] { def apply(t: Feature): Int = t match { case NeuralFeature(output, input) if output < numOutputs && input < numInputs && output > 0 && input > 0 => diff --git a/src/main/scala/epic/dense/BatchNormalizationTransform.scala b/src/main/scala/epic/dense/BatchNormalizationTransform.scala new file mode 100644 index 00000000..04abb51f --- /dev/null +++ b/src/main/scala/epic/dense/BatchNormalizationTransform.scala @@ -0,0 +1,97 @@ +package epic.dense + +import breeze.linalg._ +import breeze.linalg.DenseVector +import epic.framework.Feature +import breeze.util.Index +import scala.util.Random +import breeze.numerics.sigmoid +import epic.features.SegmentedIndex + +/** + * Implements batch normalization from + * http://arxiv.org/pdf/1502.03167v3.pdf + * Basically, each unit is shifted and rescaled per minibatch so that its activations + * have mean 0 and variance 1. This has been demonstrated to help training deep networks, + * but doesn't seem to help here. + */ +case class BatchNormalizationTransform[FV](size: Int, useBias: Boolean, inner: Transform[FV, DenseVector[Double]]) extends Transform[FV, DenseVector[Double]] { + + val index = if (useBias) { + SegmentedIndex(new AffineTransform.Index(size, 0, true), inner.index) + } else { + inner.index + } + + def extractLayer(dv: DenseVector[Double], forTrain: Boolean) = { + if (useBias) { + new Layer(dv(0 until size), size, inner.extractLayer(dv(size to -1), forTrain)) + } else { + new Layer(DenseVector.zeros[Double](size), size, inner.extractLayer(dv, forTrain)) + } + } + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = { + if (useBias) { + DenseVector.vertcat(DenseVector.zeros[Double](size), + inner.initialWeightVector(initWeightsScale, rng, false, spec)) + } else { + inner.initialWeightVector(initWeightsScale, rng, false, spec) + } + } + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) = inner.clipHiddenWeightVectors(weights, norm, false) + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] = { + if (useBias) { + (offset until offset + Math.min(10, size)) ++ inner.getInterestingWeightIndicesForGradientCheck(offset + size) + } else { + inner.getInterestingWeightIndicesForGradientCheck(offset) + } + } + + case class Layer(bias: DenseVector[Double], size: Int, innerLayer: inner.Layer) extends Transform.Layer[FV,DenseVector[Double]] { + + var fcn = new NonlinearTransform.ShiftAndScaleEach(Array.tabulate(size)(i => 0.0), Array.tabulate(size)(i => 1.0)) + + val myIndex = Index[Feature] + + def index = myIndex; + + def activations(fv: FV): DenseVector[Double] = { + val act = innerLayer.activations(fv) + var i = 0; + while (i < act.size) { + 
act(i) = fcn.fcn(i, act(i)) + bias(i) + i += 1 + } + act + } + + def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: FV) = { + val biasDeriv = if (useBias) deriv(0 until size) else DenseVector[Double]() + val scale = _scale + var i = 0; + while (i < scale.size) { + if (useBias) { + biasDeriv(i) += scale(i) + } + scale(i) = scale(i) * fcn.deriv(i, 0) // we know it's linear so just evaluate the derivative at 0, saves computing activations + i += 1 + } + innerLayer.tallyDerivative(if (useBias) deriv(size to -1) else deriv, scale, fv) + } + + def applyBatchNormalization(inputs: scala.collection.GenTraversable[FV]) = { + val allActivations = inputs.map(activations(_)) + val mean = allActivations.reduce(_ + _) * (1.0/inputs.size) + val variances = allActivations.map(act => (act - mean) :* (act - mean)).reduce(_ + _) * (1.0/inputs.size) + val invStdDevs = variances.data.map(variance => 1.0/Math.sqrt(variance + 1e-6)) +// println(mean.data.toSeq) +// println(invStdDevs.toSeq) + fcn = new NonlinearTransform.ShiftAndScaleEach(mean.data, invStdDevs) + innerLayer.applyBatchNormalization(inputs) + } + } + +} \ No newline at end of file diff --git a/src/main/scala/epic/dense/CachingLookupAndAffineTransformDense.scala b/src/main/scala/epic/dense/CachingLookupAndAffineTransformDense.scala new file mode 100644 index 00000000..417627d8 --- /dev/null +++ b/src/main/scala/epic/dense/CachingLookupAndAffineTransformDense.scala @@ -0,0 +1,113 @@ +package epic.dense + +import scala.runtime.ScalaRunTime +import breeze.linalg._ +import epic.features.SegmentedIndex +import epic.framework.Feature +import scala.collection.mutable.HashMap +import scala.util.Random + +/** + * Used at the input layer to cache lookups and the result of applying + * the affine transform at the first layer of the network. This saves + * computation across repeated invocations of the neural network in + * the sentence. 
+ */ +case class CachingLookupAndAffineTransformDense[FV](numOutputs: Int, + numInputs: Int, + word2vecIndexed: Word2VecIndexed[String], + includeBias: Boolean = true) extends Transform[Array[Int], DenseVector[Double]] { + + + val index = new AffineTransform.Index(numOutputs, numInputs, includeBias) + + def extractLayer(weights: DenseVector[Double], forTrain: Boolean) = { + val mat = weights(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) + val bias = if(includeBias) { + weights(numOutputs * numInputs until index.size) + } else { + DenseVector.zeros[Double](numOutputs) + } + new Layer(mat, bias) + } + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = { + val myWeights = if (outputLayer) { + DenseVector.zeros[Double](index.size) + } else if (spec == "magic") { + AffineTransform.getMagicAffineWeights(index.size, numInputs, numOutputs, initWeightsScale, rng) + } else { + AffineTransform.getGaussianAffineWeights(index.size, initWeightsScale, rng) + } + myWeights + } + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) { + if (!outputLayer) { + AffineTransform.clipHiddenWeightVectors(numOutputs, numInputs, weights, norm) + } + } + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] = { + (offset until offset + Math.min(10, index.size)) + } + + case class Layer(weights: DenseMatrix[Double], bias: DenseVector[Double]) extends Transform.Layer[Array[Int],DenseVector[Double]] { + + override val index = CachingLookupAndAffineTransformDense.this.index + + val weightst = weights.t + + // Cache stores pairs of (word identity, position) mapped to the final results of + // these being multiplied by the parameter vector. Note that although the same + // word vector is used for each word identity, the parameter vector depends + // on the position. 
+ val caches = Array.tabulate(numInputs/word2vecIndexed.wordRepSize)(i => new HashMap[Int,DenseVector[Double]]) + + def activations(fv: Array[Int]) = { + val finalVector = DenseVector.zeros[Double](numOutputs) + for (i <- 0 until fv.size) { +// val wordPosn = fv(i) -> i + if (fv(i) != -1) { + caches(i).synchronized { + if (!caches(i).contains(fv(i))) { + val startIdx = i * word2vecIndexed.wordRepSize + caches(i).put(fv(i), weights(::, startIdx until startIdx + word2vecIndexed.wordRepSize) * DenseVector(word2vecIndexed.convertIndexToVector(fv(i)))) + } + finalVector += caches(i)(fv(i)) + } + } + } + finalVector + bias + } + + def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: Array[Int]) = { + val scale = _scale + val matDeriv = deriv(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) + val biasDeriv = if(includeBias) { + deriv(numOutputs * numInputs until index.size) + } else { + DenseVector.zeros[Double](numOutputs) + } + + // whole function is f(mat * inner(fv) + bias) + // scale(i) pushes in (f'(mat * inner(v) + bias))(i) + val innerAct = DenseVector(word2vecIndexed.convertToVector(fv)); + + // d/d(weights(::, i)) == scale(i) * innerAct + for (i <- 0 until weights.rows) { + val a: Double = scale(i) + if(a != 0.0) { + axpy(a, innerAct, matDeriv.t(::, i)) + // so d/dbias(i) = scale(i) + biasDeriv(i) += a + } + } + + // scale is f'(mat * inner(v) + bias) + // d/dv is mat.t * f'(mat * inner(v) + bias) + } + + def applyBatchNormalization(inputs: scala.collection.GenTraversable[Array[Int]]) = {} + } +} diff --git a/src/main/scala/epic/dense/CachingLookupTransform.scala b/src/main/scala/epic/dense/CachingLookupTransform.scala new file mode 100644 index 00000000..e934d2f8 --- /dev/null +++ b/src/main/scala/epic/dense/CachingLookupTransform.scala @@ -0,0 +1,43 @@ +package epic.dense + +import scala.runtime.ScalaRunTime +import breeze.linalg._ +import epic.features.SegmentedIndex +import epic.framework.Feature +import scala.collection.mutable.HashMap +import scala.util.Random +import breeze.util.Index + +/** + * Used at the input layer to cache lookups and + */ +case class CachingLookupTransform(word2vecIndexed: Word2VecIndexed[String]) extends Transform[Array[Int], DenseVector[Double]] { + + val index = Index[epic.framework.Feature]() + + def extractLayer(weights: DenseVector[Double], forTrain: Boolean) = new Layer() + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = DenseVector() + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) {} + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] = Seq[Int]() + + case class Layer() extends Transform.Layer[Array[Int],DenseVector[Double]] { + + override val index = Index[epic.framework.Feature]() + + def activations(fv: Array[Int]) = { + var finalVector = DenseVector.zeros[Double](0) + for (i <- 0 until fv.size) { + val vec: DenseVector[Double] = if (fv(i) != -1) DenseVector(word2vecIndexed.convertIndexToVector(fv(i))) else DenseVector(word2vecIndexed.zeroVector) + finalVector = DenseVector.vertcat(finalVector, vec) + } + finalVector + } + + def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: Array[Int]) = {} + + def applyBatchNormalization(inputs: scala.collection.GenTraversable[Array[Int]]) = {} + } +} \ No newline at end of file diff --git a/src/main/scala/epic/dense/EmbeddingsTransform.scala 
b/src/main/scala/epic/dense/EmbeddingsTransform.scala new file mode 100644 index 00000000..bd0fbad2 --- /dev/null +++ b/src/main/scala/epic/dense/EmbeddingsTransform.scala @@ -0,0 +1,126 @@ +package epic.dense + +import scala.runtime.ScalaRunTime +import breeze.linalg._ +import epic.features.SegmentedIndex +import epic.framework.Feature +import scala.collection.mutable.HashMap +import scala.util.Random + +/** + * Used at the input layer to cache lookups and + * backprop into embeddings + */ +case class EmbeddingsTransform[FV](numOutputs: Int, + numInputs: Int, + word2vecIndexed: Word2VecIndexed[String], + includeBias: Boolean = true) extends Transform[Array[Int], DenseVector[Double]] { + + + val index = SegmentedIndex(new AffineTransform.Index(numOutputs, numInputs, includeBias), + new AffineTransform.Index(word2vecIndexed.vocSize, word2vecIndexed.wordRepSize, false)) + println("Allocated " + index.indices.map(_.size) + " parameters for each index in the embedding layer (backpropagating into embeddings)") + + def extractLayer(weights: DenseVector[Double], forTrain: Boolean) = { + val mat = weights(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) + val bias = if(includeBias) { + weights(numOutputs * numInputs until index.indices(0).size) + } else { + DenseVector.zeros[Double](numOutputs) + } + val wordWeights = weights(index.indices(0).size until index.indices(0).size + index.indices(1).size).asDenseMatrix.reshape(word2vecIndexed.vocSize, word2vecIndexed.wordRepSize, view = View.Require) + new Layer(mat, bias, wordWeights) + } + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = { + val myWeights = if (outputLayer) { + DenseVector(Array.tabulate(index.indices(0).size)(i => 0.0)) + } else if (spec == "magic") { + AffineTransform.getMagicAffineWeights(index.indices(0).size, numInputs, numOutputs, initWeightsScale, rng) + } else { + AffineTransform.getGaussianAffineWeights(index.indices(0).size, initWeightsScale, rng) + } + // Only randomly initialize the weights in the matrix, not the word deltas + DenseVector.vertcat(myWeights, DenseVector.zeros[Double](index.size - index.indices(0).size)) +// DenseVector(Array.tabulate(index.size)(i => if (!outputLayer && i < index.indices(0).size) rng.nextGaussian * initWeightsScale else 0.0)) + } + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) { + if (!outputLayer) { + AffineTransform.clipHiddenWeightVectors(numOutputs, numInputs, weights, norm) + } + } + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] = { + (offset until offset + Math.min(10, index.indices(0).size)) ++ (offset + index.componentOffset(1) until offset + index.componentOffset(1) + Math.min(10, index.indices(1).size)) + } + + case class Layer(weights: DenseMatrix[Double], bias: DenseVector[Double], wordWeights: DenseMatrix[Double]) extends Transform.Layer[Array[Int],DenseVector[Double]] { + + override val index = EmbeddingsTransform.this.index + + val weightst = weights.t + + // Cache stores pairs of (word identity, position) mapped to the final results of + // these being multiplied by the parameter vector. Note that although the same + // word vector is used for each word identity, the parameter vector depends + // on the position. 
+ val caches = Array.tabulate(numInputs/word2vecIndexed.wordRepSize)(i => new HashMap[Int,DenseVector[Double]]) + + def activations(fv: Array[Int]) = { + val finalVector = DenseVector.zeros[Double](numOutputs) + for (i <- 0 until fv.size) { +// val wordPosn = fv(i) -> i + if (fv(i) != -1) { + caches(i).synchronized { + if (!caches(i).contains(fv(i))) { + val startIdx = i * word2vecIndexed.wordRepSize + val wordVec = DenseVector(word2vecIndexed.convertIndexToVector(fv(i))) + wordWeights(fv(i), ::).t + caches(i).put(fv(i), weights(::, startIdx until startIdx + word2vecIndexed.wordRepSize) * wordVec) + } + finalVector += caches(i)(fv(i)) + } + } + } + finalVector + bias + } + + def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: Array[Int]) = { + val scale = _scale + val matDeriv = deriv(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) + val biasDeriv = if(includeBias) { + deriv(numOutputs * numInputs until index.size) + } else { + DenseVector.zeros[Double](numOutputs) + } + + // whole function is f(mat * inner(fv) + bias) + // scale(i) pushes in (f'(mat * inner(v) + bias))(i) + val innerAct = DenseVector(word2vecIndexed.convertToVector(fv)) + Word2VecSurfaceFeaturizerIndexed.makeVectFromParams(fv, wordWeights); + + val wordsDeriv = deriv(index.indices(0).size until index.indices(0).size + index.indices(1).size).asDenseMatrix.reshape(word2vecIndexed.vocSize, word2vecIndexed.wordRepSize, view = View.Require) + val wordsDerivs = Array.tabulate(fv.size)(wordPosnIdx => wordsDeriv(fv(wordPosnIdx), ::).t) + // d/d(weights(::, i)) == scale(i) * innerAct + for (i <- 0 until weights.rows) { + val a: Double = scale(i) + if(a != 0.0) { + axpy(a, innerAct, matDeriv.t(::, i)) + var wordPosnIdx = 0; + while (wordPosnIdx < fv.size) { + val relevantWeights = weights(i, wordPosnIdx * word2vecIndexed.wordRepSize until (wordPosnIdx + 1) * word2vecIndexed.wordRepSize).t + axpy(a, relevantWeights, wordsDerivs(wordPosnIdx)) + wordPosnIdx += 1 + } + // so d/dbias(i) = scale(i) + biasDeriv(i) += a + } + } + + // scale is f'(mat * inner(v) + bias) + // d/dv is mat.t * f'(mat * inner(v) + bias) + } + + + def applyBatchNormalization(inputs: scala.collection.GenTraversable[Array[Int]]) = {} + } +} \ No newline at end of file diff --git a/src/main/scala/epic/dense/IdentityTransform.scala b/src/main/scala/epic/dense/IdentityTransform.scala index 3f42ad28..f7410f36 100644 --- a/src/main/scala/epic/dense/IdentityTransform.scala +++ b/src/main/scala/epic/dense/IdentityTransform.scala @@ -3,22 +3,31 @@ package epic.dense import breeze.linalg._ import breeze.util.Index import epic.framework.Feature - +import scala.util.Random class IdentityTransform[T] extends Transform[T, T] { val index = Index[Feature]() + def extractLayer(weights: DenseVector[Double], forTrain: Boolean) = new Layer() + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = DenseVector(Array[Double]()) + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) {} + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] = Seq[Int]() - def extractLayer(weights: DenseVector[Double]) = { - new Layer() - } - - class Layer extends _Layer { + class Layer extends Transform.Layer[T,T] { + + val myIndex = Index[Feature] + + def index = myIndex; def activations(fv: T) = fv def tallyDerivative(deriv: DenseVector[Double], scale: =>Vector[Double], t: T) = {} + + def 
applyBatchNormalization(inputs: scala.collection.GenTraversable[T]) = {} } } diff --git a/src/main/scala/epic/dense/LowRankQuadraticTransform.scala b/src/main/scala/epic/dense/LowRankQuadraticTransform.scala new file mode 100644 index 00000000..0dad562a --- /dev/null +++ b/src/main/scala/epic/dense/LowRankQuadraticTransform.scala @@ -0,0 +1,142 @@ +package epic.dense + +import breeze.linalg._ +import breeze.linalg.operators.OpMulMatrix +import epic.features.SegmentedIndex +import epic.framework.Feature +import breeze.util.Index + +import scala.runtime.ScalaRunTime +import scala.util.Random + +case class LowRankQuadraticTransform[FV](numOutputs: Int, numRanks: Int, numLeftInputs: Int, numRightInputs: Int, innerTransform: Transform[FV, DenseVector[Double]]) extends OutputTransform[FV, DenseVector[Double]] { + + val neurons = (0 until numOutputs).map(i => new LowRankQuadraticTransformNeuron(numRanks, numLeftInputs, numRightInputs)) + val neuronIndex = SegmentedIndex(neurons.map(_.index):_*) + val index = SegmentedIndex(neuronIndex, innerTransform.index) + + def extractLayerAndPenultimateLayer(weights: DenseVector[Double], forTrain: Boolean) = { + val subTransforms = (0 until neurons.size).map(i => neurons(i).extractLayer(weights(neuronIndex.componentOffset(i) until neuronIndex.componentOffset(i) + neuronIndex.indices(i).size))) + val innerLayer = innerTransform.extractLayer(weights(index.componentOffset(1) to -1), forTrain); + new OutputLayer(subTransforms, innerLayer) -> innerLayer + } + +// def extractLayer(weights: DenseVector[Double]) = { +// val subTransforms = (0 until neurons.size).map(i => neurons(i).extractLayer(weights(neuronIndex.componentOffset(i) until neuronIndex.componentOffset(i) + neuronIndex.indices(i).size))) +// new Layer(subTransforms, innerTransform.extractLayer(weights(index.componentOffset(1) to -1))) +// } + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = { + val subVects = DenseVector.vertcat(neurons.map(_.initialWeightVector(initWeightsScale, rng, outputLayer, spec)):_*) + DenseVector.vertcat(subVects, innerTransform.initialWeightVector(initWeightsScale, rng, outputLayer, spec)) + } + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) { + innerTransform.clipHiddenWeightVectors(weights(index.componentOffset(1) to -1), norm, outputLayer); + } + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] = { + (offset until offset + Math.min(10, index.indices(0).size)) ++ innerTransform.getInterestingWeightIndicesForGradientCheck(offset + index.indices(0).size) + } + + case class OutputLayer(sublayers: Seq[LRQTNLayer], innerLayer: innerTransform.Layer) extends OutputTransform.OutputLayer[FV,DenseVector[Double]] { + + override val index = LowRankQuadraticTransform.this.index + val neuronIndex = LowRankQuadraticTransform.this.neuronIndex + + def activations(fv: FV) = { + val innerActivations = innerLayer.activations(fv) + DenseVector(Array.tabulate(sublayers.size)(i => sublayers(i).activations(innerActivations)(0))) + } + + def activationsFromPenultimateDot(innerLayerActivations: DenseVector[Double], sparseIdx: Int): Double = { + sublayers(sparseIdx).activations(innerLayerActivations)(0) + } + + def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: FV) = { + val innerActivations = innerLayer.activations(fv) + for (i <- 0 until sublayers.size) { + sublayers(i).tallyDerivative(deriv(neuronIndex.componentOffset(i) until 
neuronIndex.componentOffset(i) + neuronIndex.indices(i).size), _scale(i), innerActivations) + } + } + + def applyBatchNormalization(inputs: scala.collection.GenTraversable[FV]) = innerLayer.applyBatchNormalization(inputs) + } + + +} + +/** + * Separate because I was having some issues... + */ +case class LowRankQuadraticTransformNeuron(numRanks: Int, numLeftInputs: Int, numRightInputs: Int) { + + val index = SegmentedIndex(new AffineTransform.Index(numRanks, numLeftInputs, false), new AffineTransform.Index(numRanks, numRightInputs, false)) + + def extractLayer(weights: DenseVector[Double]) = { + val lhsSize = numRanks * numLeftInputs + val rhsSize = numRanks * numRightInputs + val lhsMat = weights(0 until lhsSize).asDenseMatrix.reshape(numRanks, numLeftInputs, view = View.Require) + val rhsMat = weights(lhsSize until (lhsSize + rhsSize)).asDenseMatrix.reshape(numRanks, numRightInputs, view = View.Require) + new LRQTNLayer(lhsMat, rhsMat, index, numRanks, numLeftInputs, numRightInputs) + } + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = { + if (spec == "magic") { + DenseVector.vertcat(AffineTransform.getMagicAffineWeights(index.indices(0).size, numLeftInputs, numRanks, initWeightsScale, rng), + AffineTransform.getMagicAffineWeights(index.indices(1).size, numRightInputs, numRanks, initWeightsScale, rng)) + } else { + DenseVector.vertcat(AffineTransform.getGaussianAffineWeights(index.indices(0).size, initWeightsScale, rng), + AffineTransform.getGaussianAffineWeights(index.indices(1).size, initWeightsScale, rng)) + } + } + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) { + } +} + + +case class LRQTNLayer(lhsWeights: DenseMatrix[Double], rhsWeights: DenseMatrix[Double], index: Index[Feature], numRanks: Int, numLeftInputs: Int, numRightInputs: Int) { + val lhsWeightst = lhsWeights.t + val rhsWeightst = rhsWeights.t + + def activations(fv: DenseVector[Double]) = { + val lhsProj = lhsWeights * fv + val rhsProj = rhsWeights * fv + val dotProd = lhsProj.dot(rhsProj) +// println(dotProd + " " + lhsProj.data.toSeq + " " + rhsProj.data.toSeq) + DenseVector(dotProd) + } + + def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: DenseVector[Double]) = { +// println("SCALE: " + _scale) + val scale = _scale(0) + if (Math.abs(scale) > 1e-6) { + val lhsSize = numRanks * numLeftInputs + val rhsSize = numRanks * numRightInputs +// println(deriv.size + " " + lhsSize + " " + numRanks + " " + numLeftInputs + " " + rhsSize) + val lhsDeriv = deriv(0 until lhsSize).asDenseMatrix.reshape(numRanks, numLeftInputs, view = View.Require) + val rhsDeriv = deriv(lhsSize until lhsSize + rhsSize).asDenseMatrix.reshape(numRanks, numRightInputs, view = View.Require) + + val innerActs = fv + val lhsProj = lhsWeights * innerActs + val rhsProj = rhsWeights * innerActs + + // Smart way + lhsDeriv += rhsProj * innerActs.t * scale + rhsDeriv += lhsProj * innerActs.t * scale + // Dumb way +// for (r <- 0 until lhsWeights.rows) { +// for (i <- 0 until lhsWeights.cols) { +// lhsDeriv(r, i) += scale * innerActs(i) * rhsProj(r) +// } +// for (i <- 0 until rhsWeights.cols) { +// rhsDeriv(r, i) += scale * innerActs(i) * lhsProj(r) +// } +// } + require(deriv.size == lhsSize + rhsSize, "Backpropagating through LowRankQuadraticTransform is not currently supported") + } + } +} + +//} diff --git a/src/main/scala/epic/dense/NonlinearTransform.scala b/src/main/scala/epic/dense/NonlinearTransform.scala new file mode 
100644 index 00000000..31ff8cf1 --- /dev/null +++ b/src/main/scala/epic/dense/NonlinearTransform.scala @@ -0,0 +1,144 @@ +package epic.dense + +import breeze.linalg._ +import breeze.linalg.DenseVector +import epic.framework.Feature +import breeze.util.Index +import scala.util.Random +import breeze.numerics.sigmoid + +/** + * A bit of a misnomer since this has been generalized to support linear functions as + * well... + */ +case class NonlinearTransform[FV](nonLinType: String, size: Int, inner: Transform[FV, DenseVector[Double]], dropoutRate: Double = 0.5) extends Transform[FV, DenseVector[Double]] { + + val index: inner.index.type = inner.index + + def extractLayer(dv: DenseVector[Double], forTrain: Boolean) = { + if (nonLinType == "dropout") { + val keepFrac = 1.0 - dropoutRate + val fcn = if (forTrain) { + // Only have "true" when we want to keep things around + new NonlinearTransform.Mask(Array.fill(size)(NonlinearTransform.globalRng.nextDouble < keepFrac)) + } else { + new NonlinearTransform.Scale(keepFrac) + } + new Layer(fcn, inner.extractLayer(dv, forTrain)) + } else { + val nonlinearFcn = NonlinearTransform.getNonlinearFcn(nonLinType); + new Layer(nonlinearFcn, inner.extractLayer(dv, forTrain)) + } + } + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = inner.initialWeightVector(initWeightsScale, rng, false, spec) + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) = inner.clipHiddenWeightVectors(weights, norm, false) + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] = inner.getInterestingWeightIndicesForGradientCheck(offset) + + case class Layer(nonlinearFcn: NonlinearTransform.NonlinearFcn, innerLayer: inner.Layer) extends Transform.Layer[FV,DenseVector[Double]] { + + val myIndex = Index[Feature] + + def index = myIndex; + + def activations(fv: FV): DenseVector[Double] = { + val act = innerLayer.activations(fv) + var i = 0; + while (i < act.size) { + act(i) = nonlinearFcn.fcn(i, act(i)) + i += 1 + } + act + } + + def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: FV) = { + val scale = _scale + val act = innerLayer.activations(fv) + var i = 0; + while (i < act.size) { + act(i) = nonlinearFcn.deriv(i, act(i)) + i += 1 + } + act :*= scale + innerLayer.tallyDerivative(deriv, act, fv) + } + + def applyBatchNormalization(inputs: scala.collection.GenTraversable[FV]) = innerLayer.applyBatchNormalization(inputs) + + } + +} + +object NonlinearTransform { + + val globalRng = new scala.util.Random(0) + + def getNonlinearFcn(nonLinType: String) = { + if (nonLinType == "tanh") { + Tanh() + } else if (nonLinType == "relu") { + Relu() + } else if (nonLinType == "requ") { + Requ() + } else if (nonLinType == "cube") { + Cube() + } else if (nonLinType == "const") { + Constant() + } else { + throw new RuntimeException("Unrecognized nonlin type: " + nonLinType) + } + } + + trait NonlinearFcn { + // idx is the position of the unit; this basically only applies to dropout + // where we want to zero out particular units + def fcn(idx: Int, x: Double): Double; + def deriv(idx: Int, x: Double): Double; + } + + case class Constant() extends NonlinearFcn { + def fcn(idx: Int, x: Double) = 1 + def deriv(idx: Int, x: Double) = 0 + } + + case class Mask(val mask: Array[Boolean]) extends NonlinearFcn { + def fcn(idx: Int, x: Double) = if (mask(idx)) x else 0 + def deriv(idx: Int, x: Double) = if (mask(idx)) 1 else 0 + } + + case class ShiftAndScaleEach(val 
shifts: Array[Double], val factors: Array[Double]) extends NonlinearFcn { + def fcn(idx: Int, x: Double) = factors(idx) * (x - shifts(idx)) + def deriv(idx: Int, x: Double) = factors(idx) + } + + case class Scale(val factor: Double) extends NonlinearFcn { + def fcn(idx: Int, x: Double) = factor * x + def deriv(idx: Int, x: Double) = factor + } + + case class Tanh() extends NonlinearFcn { + def fcn(idx: Int, x: Double) = 2 * sigmoid(2 * x) - 1.0 + def deriv(idx: Int, x: Double) = { + val sig = sigmoid(2 * x) + -4 * sig * (sig - 1.0) + } + } + + case class Relu() extends NonlinearFcn { + def fcn(idx: Int, x: Double) = Math.max(x, 0) + def deriv(idx: Int, x: Double) = if (x > 0) 1.0 else 0.0 + } + + case class Requ() extends NonlinearFcn { + def fcn(idx: Int, x: Double) = if (x > 0) x * x else 0.0 + def deriv(idx: Int, x: Double) = if (x > 0) 2 * x else 0.0 + } + + case class Cube() extends NonlinearFcn { + def fcn(idx: Int, x: Double) = x * x * x + def deriv(idx: Int, x: Double) = 3 * x * x + } + +} \ No newline at end of file diff --git a/src/main/scala/epic/dense/OutputEmbeddingTransform.scala b/src/main/scala/epic/dense/OutputEmbeddingTransform.scala new file mode 100644 index 00000000..d9d494e8 --- /dev/null +++ b/src/main/scala/epic/dense/OutputEmbeddingTransform.scala @@ -0,0 +1,155 @@ +package epic.dense + +import breeze.linalg._ +import breeze.linalg.operators.OpMulMatrix +import epic.features.SegmentedIndex +import epic.framework.Feature + +import scala.runtime.ScalaRunTime +import scala.util.Random + +/** + * Output embedding technique described in section 6 of + * http://www.eecs.berkeley.edu/~gdurrett/papers/durrett-klein-acl2015.pdf + * Basically learns a dictionary for the output as well as an affine transformation + * in order to produce the vector that gets combined with the input in the final + * bilinear product. 
+ */ +case class OutputEmbeddingTransform[FV](numOutputs: Int, outputDim: Int, innerTransform: Transform[FV, DenseVector[Double]], coarsenerForInitialization: Option[Int => Int] = None) extends OutputTransform[FV, DenseVector[Double]] { + + + val index = SegmentedIndex(new AffineTransform.Index(numOutputs, outputDim, true), + innerTransform.index) + + def extractLayerAndPenultimateLayer(weights: DenseVector[Double], forTrain: Boolean) = { + val embeddings = weights(index.componentOffset(0) until index.componentOffset(0) + (numOutputs * outputDim)).asDenseMatrix.reshape(numOutputs, outputDim, view = View.Require) + val bias = weights(index.componentOffset(0) + numOutputs * outputDim until index.componentOffset(1)) + val inner = innerTransform.extractLayer(weights(index.componentOffset(1) to -1), forTrain) + new OutputLayer(embeddings, bias, inner) -> inner + } + + def clipEmbeddingNorms(weights: DenseVector[Double]) { + val embeddings = weights(index.componentOffset(1) until index.componentOffset(1) + (numOutputs * outputDim)).asDenseMatrix.reshape(numOutputs, outputDim, view = View.Require) + OutputEmbeddingTransform.clipEmbeddingNorms(embeddings); + } + + def displayEmbeddingNorms(weights: DenseVector[Double]) { + val embeddings = weights(index.componentOffset(1) until index.componentOffset(1) + (numOutputs * outputDim)).asDenseMatrix.reshape(numOutputs, outputDim, view = View.Require) + OutputEmbeddingTransform.displayEmbeddingNorms(embeddings); + } + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = { + require(outputLayer) + val embeddingsInitialization = if (coarsenerForInitialization.isDefined) { + OutputEmbeddingTransform.getCoarsenedInitialEmbeddingWeights(numOutputs, outputDim, coarsenerForInitialization.get) + } else if (spec == "magic") { + AffineTransform.getMagicAffineWeights(index.indices(0).size, numOutputs, outputDim, initWeightsScale, rng) + } else if (spec == "identity") { + OutputEmbeddingTransform.getIdentityEmbeddingWeights(numOutputs, outputDim, rng) + } else { + AffineTransform.getGaussianAffineWeights(index.indices(0).size, initWeightsScale, rng) + } + // N.B. 
"true" because the next layer effectively becomes the output layer from the purposes of + // initialization + DenseVector.vertcat(embeddingsInitialization, + innerTransform.initialWeightVector(initWeightsScale, rng, true, spec)) + } + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) { + innerTransform.clipHiddenWeightVectors(weights(index.componentOffset(1) to -1), norm, false) + } + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] = { + (offset until offset + Math.min(10, index.indices(0).size)) ++ innerTransform.getInterestingWeightIndicesForGradientCheck(offset + index.indices(0).size) + } + + case class OutputLayer(embeddings: DenseMatrix[Double], bias: DenseVector[Double], innerLayer: innerTransform.Layer) extends OutputTransform.OutputLayer[FV,DenseVector[Double]] { + override val index = OutputEmbeddingTransform.this.index + + def activations(fv: FV) = { + val innerActs = innerLayer.activations(fv) + DenseVector(Array.tabulate(numOutputs)(i => activationsFromPenultimateDot(innerActs, i))) + } + + def activationsFromPenultimateDot(innerLayerActivations: DenseVector[Double], sparseIdx: Int) = { + innerLayerActivations dot embeddings(sparseIdx, ::).t + bias(sparseIdx) + } + + def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: FV) = { + val scale = _scale + val embeddingsDeriv = deriv(0 until numOutputs * outputDim).asDenseMatrix.reshape(numOutputs, outputDim, view = View.Require) + val biasDeriv = deriv(numOutputs * outputDim until index.componentOffset(1)) + val innerAct = innerLayer.activations(fv) + val innerScale = DenseVector(Array.tabulate(outputDim)(i => 0.0)) + for (k <- 0 until scale.size) { + // Assuming there's something nontrivial to pass back + if (scale(k) != 0.0) { + // Bias update + biasDeriv(k) += scale(k) + embeddingsDeriv(k, ::).t += innerAct * scale(k) // Embeddings update + innerScale += embeddings(k, ::).t * scale(k) + } + } + innerLayer.tallyDerivative(deriv(index.componentOffset(1) to -1), innerScale, fv) + } + + def applyBatchNormalization(inputs: scala.collection.GenTraversable[FV]) = innerLayer.applyBatchNormalization(inputs) + } + +} + +object OutputEmbeddingTransform { + + def getIdentityEmbeddingWeights(numOutputs: Int, outputDim: Int, rng: Random) = { + require(outputDim <= numOutputs, outputDim + " " + numOutputs) + val mat = DenseMatrix.zeros[Double](numOutputs, outputDim) + for (i <- 0 until outputDim) { + mat(i, i) = 1.0 + } + for (i <- outputDim until numOutputs) { + mat(i, rng.nextInt(outputDim)) = 1.0 + } + val biasInitializer = DenseVector.zeros[Double](numOutputs) + val initWeights = DenseVector.vertcat(DenseVector(mat.data), biasInitializer) + initWeights + } + + def clipEmbeddingNorms(embeddings: DenseMatrix[Double]) { + for (i <- 0 until embeddings.rows) { + var norm = 0.0 + for (j <- 0 until embeddings.cols) { + norm += embeddings(i, j) * embeddings(i, j) + } + norm = Math.sqrt(norm) + for (j <- 0 until embeddings.cols) { + embeddings(i, j) /= norm + } + } + } + + def displayEmbeddingNorms(embeddings: DenseMatrix[Double]) { + var avgNorm = 0.0 + var maxNorm = 0.0 + for (i <- 0 until embeddings.rows) { + var norm = 0.0 + for (j <- 0 until embeddings.cols) { + norm += embeddings(i, j) * embeddings(i, j) + } + norm = Math.sqrt(norm) + avgNorm += norm + maxNorm = Math.max(maxNorm, norm) + } + println("Average norm: " + avgNorm/embeddings.rows + ", max norm: " + maxNorm) + } + + def getCoarsenedInitialEmbeddingWeights(numOutputs: Int, outputDim: Int, 
coarsenerForInitialization: Int => Int) = { + val mat = DenseMatrix.zeros[Double](numOutputs, outputDim) + for (i <- 0 until numOutputs) { + val j = ((coarsenerForInitialization(i) % outputDim) + outputDim) % outputDim + mat(i, j) = 1.0 + } + val biasInitializer = DenseVector.zeros[Double](numOutputs) + val initWeights = DenseVector.vertcat(DenseVector(mat.data), biasInitializer) + initWeights + } +} diff --git a/src/main/scala/epic/dense/OutputTransform.scala b/src/main/scala/epic/dense/OutputTransform.scala new file mode 100644 index 00000000..fe64cb4c --- /dev/null +++ b/src/main/scala/epic/dense/OutputTransform.scala @@ -0,0 +1,47 @@ +package epic.dense + +import breeze.linalg._ +import breeze.util.Index +import epic.framework.Feature +import scala.util.Random + +trait OutputTransform[In, +Out] extends Serializable { + val index: Index[Feature] + + def extractLayer(dv: DenseVector[Double], forTrain: Boolean):OutputLayer = extractLayerAndPenultimateLayer(dv, forTrain)._1 + + def extractLayerAndPenultimateLayer(dv: DenseVector[Double], forTrain: Boolean): (OutputLayer, Transform.Layer[In,Out]); + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String): DenseVector[Double] + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int]; + + type OutputLayer <: OutputTransform.OutputLayer[In,Out] +} + +object OutputTransform { + + trait OutputLayer[In, +Out] extends Transform.Layer[In,Out] { + + def index: Index[Feature]; + + def activations(fv: In):Out + + def activationsFromPenultimateDot(innerLayerActivations: DenseVector[Double], sparseIdx: Int): Double; + + def activationsFromPenultimateDot(innerLayerActivations: DenseVector[Double], sparseFeatures: Array[Int]): Double = { + var value = 0.0; + for (sparseFeature <- sparseFeatures) { + value += activationsFromPenultimateDot(innerLayerActivations, sparseFeature) + } + value + } + + def tallyDerivative(deriv: DenseVector[Double], scale: =>Vector[Double], fv: In) + + def applyBatchNormalization(inputs: scala.collection.GenTraversable[In]) + } + +} diff --git a/src/main/scala/epic/dense/SigmoidTransform.scala b/src/main/scala/epic/dense/SigmoidTransform.scala deleted file mode 100644 index a2e5c33a..00000000 --- a/src/main/scala/epic/dense/SigmoidTransform.scala +++ /dev/null @@ -1,49 +0,0 @@ -package epic.dense - -import epic.framework.Feature -import breeze.linalg._ -import breeze.linalg.operators.OpMulMatrix -import breeze.numerics._ -import breeze.linalg.support.{CanMapValues} - -/** - * - * - * @author dlwh - */ -case class NeuralFeature(output: Int, input: Int) extends Feature -case class NeuralBias(input: Int) extends Feature - -case class SigmoidTransform[FV](inner: Transform[FV, DenseVector[Double]]) extends Transform[FV, DenseVector[Double]] { - def this(numOutputs: Int, numInputs: Int, - includeBias: Boolean = true) - (implicit mult: OpMulMatrix.Impl2[DenseMatrix[Double], FV, DenseVector[Double]], - canaxpy: scaleAdd.InPlaceImpl3[DenseVector[Double], Double, FV]) = this(AffineTransform.typed(numOutputs, numInputs, includeBias)) - - val index: inner.index.type = inner.index - - - def extractLayer(dv: DenseVector[Double]) = new Layer(inner.extractLayer(dv)) - - case class Layer(innerLayer: inner.Layer) extends _Layer { - - def activations(fv: FV): DenseVector[Double] = sigmoid(innerLayer.activations(fv)) - - def tallyDerivative(deriv: DenseVector[Double], _scale: 
=>Vector[Double], fv: FV) = { - val scale = _scale - val act = activations(fv) - act :*= (act - 1.0) - act :*= -1.0 - // whole function is f(sigmoid(transform(features))) - // scale(i) pushes in (f'(sigmoid(transform(features)))(i) so just need to finish the chain rule. - // activations(...) computes sigmoid(transform(features)) - // act is currently sigmoid'(transform(...)) - act :*= scale - - innerLayer.tallyDerivative(deriv, act, fv) - - } - - } - -} \ No newline at end of file diff --git a/src/main/scala/epic/dense/TanhTransform.scala b/src/main/scala/epic/dense/TanhTransform.scala index e3d8bdf2..92a052d5 100644 --- a/src/main/scala/epic/dense/TanhTransform.scala +++ b/src/main/scala/epic/dense/TanhTransform.scala @@ -3,7 +3,9 @@ package epic.dense import breeze.linalg._ import breeze.linalg.operators.OpMulMatrix import breeze.numerics._ - +import epic.framework.Feature +import breeze.util.Index +import scala.util.Random case class TanhTransform[FV](inner: Transform[FV, DenseVector[Double]]) extends Transform[FV, DenseVector[Double]] { def this(numOutputs: Int, numInputs: Int, @@ -13,10 +15,19 @@ case class TanhTransform[FV](inner: Transform[FV, DenseVector[Double]]) extends val index: inner.index.type = inner.index + def extractLayer(dv: DenseVector[Double], forTrain: Boolean) = new Layer(inner.extractLayer(dv, forTrain)) + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = inner.initialWeightVector(initWeightsScale, rng, false, spec) + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) = inner.clipHiddenWeightVectors(weights, norm, false) + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] = inner.getInterestingWeightIndicesForGradientCheck(offset) - def extractLayer(dv: DenseVector[Double]) = new Layer(inner.extractLayer(dv)) - - case class Layer(innerLayer: inner.Layer) extends _Layer { + case class Layer(innerLayer: inner.Layer) extends Transform.Layer[FV,DenseVector[Double]] { + + val myIndex = Index[Feature] + + def index = myIndex; def activations(fv: FV): DenseVector[Double] = { val act = innerLayer.activations(fv) * 2.0 @@ -41,6 +52,8 @@ case class TanhTransform[FV](inner: Transform[FV, DenseVector[Double]]) extends innerLayer.tallyDerivative(deriv, act, fv) } + + def applyBatchNormalization(inputs: scala.collection.GenTraversable[FV]) = innerLayer.applyBatchNormalization(inputs) } diff --git a/src/main/scala/epic/dense/Transform.scala b/src/main/scala/epic/dense/Transform.scala index 16d03588..10fd55ee 100644 --- a/src/main/scala/epic/dense/Transform.scala +++ b/src/main/scala/epic/dense/Transform.scala @@ -3,28 +3,42 @@ package epic.dense import breeze.linalg._ import breeze.util.Index import epic.framework.Feature +import scala.util.Random /** * * * @author dlwh */ -trait Transform[In, +Out] { +trait Transform[In, +Out] extends Serializable { val index: Index[Feature] - def extractLayer(dv: DenseVector[Double]):Layer + def extractLayer(dv: DenseVector[Double], forTrain: Boolean):Layer + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String): DenseVector[Double] - type Layer <: _Layer + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int]; + + type Layer <: Transform.Layer[In,Out] +} - trait _Layer { +object Transform { + + trait Layer[In, +Out] { - def index = Transform.this.index + def index: 
Index[Feature]; def activations(fv: In):Out def tallyDerivative(deriv: DenseVector[Double], scale: =>Vector[Double], fv: In) + def applyBatchNormalization(inputs: scala.collection.GenTraversable[In]) } } + +case class NeuralFeature(output: Int, input: Int) extends Feature +case class NeuralBias(input: Int) extends Feature diff --git a/src/main/scala/epic/dense/Word2Vec.scala b/src/main/scala/epic/dense/Word2Vec.scala new file mode 100644 index 00000000..fe79023b --- /dev/null +++ b/src/main/scala/epic/dense/Word2Vec.scala @@ -0,0 +1,220 @@ +package epic.dense + +import java.io.BufferedInputStream +import java.io.DataInputStream +import java.io.FileInputStream +import java.util.regex.Pattern +import scala.collection.mutable.HashMap +import scala.util.Random +import breeze.linalg.Counter +import java.io.File + +object Word2Vec { + /** + * Loads vectors from one or more sources in word2vecPaths; these might be in + * word2vec format (should end in .bin) or C+W/Bansal format (should end + * in .txt). + * + * For each word, vectors are appended from each source. If at least one source + * is present, others are zeroes. Otherwise, it gets a random vector. + */ + def smartLoadVectorsForVocabulary(word2vecPaths: Seq[String], voc: Set[String], vocCounts: Counter[String,Double] = Counter[String,Double], maxVectorLen: Int = Int.MaxValue, inputVectorBias: Boolean, randomizeUnks: Boolean = true) = { + val vectorsEachSource = for (word2vecPath <- word2vecPaths) yield { + if (word2vecPath.endsWith("bin")) { + readWord2Vec(word2vecPath, voc, false) + } else if (word2vecPath.endsWith(".txt")) { + readBansalEmbeddings(word2vecPath, voc, false) + } else { + throw new RuntimeException("Unrecognized vectors: " + word2vecPath) + } + } + val dimsEachSource = vectorsEachSource.map(_.values.head.size) + val finalVectorDim = Math.min(maxVectorLen, dimsEachSource.reduce(_ + _) + (if (inputVectorBias) 1 else 0)) + val finalVectors = new HashMap[String,Array[Float]] + val rng = new Random(0) + val mostCommonMisses = Counter[String,Double] + var numRand = 0 + for (word <- voc) { + val containedInSome = vectorsEachSource.map(_.keySet.contains(word)).reduce(_ || _) + val vector = if (containedInSome) { + var finalVector = (0 until vectorsEachSource.size).map(i => vectorsEachSource(i).getOrElse(word, { Array.tabulate(dimsEachSource(i))(j => 0.0F) })).reduce(_ ++ _) + if (inputVectorBias) { + finalVector = finalVector ++ Array(1.0F) + } + finalVector + } else { + mostCommonMisses(word) = vocCounts(word) + numRand += 1 + if (randomizeUnks) { + Array.tabulate(finalVectorDim)(i => if (i == finalVectorDim - 1 && inputVectorBias) 1.0F else ((rng.nextDouble - 0.5) * 0.5).toFloat) + } else { + Array.tabulate(finalVectorDim)(i => if (i == finalVectorDim - 1 && inputVectorBias) 1.0F else 0.0F) + } + } + val vectorTrimmed = if (vector.size > finalVectorDim) vector.slice(0, finalVectorDim) else vector + require(vectorTrimmed.size == finalVectorDim, "Mismatched sizes, expected dimension " + finalVectorDim + " but got " + vector.size + " clipped to " + vectorTrimmed.size) + finalVectors.put(word, vectorTrimmed) + } + println("Read embeddings for " + voc.size + " words from " + word2vecPaths.size + " sources, " + + "total embedding size = " + finalVectorDim + ", " + numRand + " present in no source") + println("Fifty most common misses: " + mostCommonMisses.argtopk(50).map(word => word + ": " + mostCommonMisses(word))) + finalVectors + } + + def makeRandomVectorsForVocabulary(voc: Set[String], dim: Int, inputVectorBias: Boolean) = { + 
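+    // Each word in the vocabulary gets a random vector whose entries are drawn uniformly
+    // from [-0.25, 0.25); if inputVectorBias is set, the last component is instead fixed to
+    // 1.0F so that downstream affine layers see a constant bias feature. The RNG is seeded
+    // with 0 so repeated runs produce identical "random" embeddings.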
val finalVectors = new HashMap[String,Array[Float]] + val finalVectorDim = dim + (if (inputVectorBias) 1 else 0) + val rng = new Random(0) + var numRand = 0 + for (word <- voc) { + val vec = Array.tabulate(finalVectorDim)(i => if (i == finalVectorDim - 1 && inputVectorBias) 1.0F else ((rng.nextDouble - 0.5) * 0.5).toFloat) + finalVectors.put(word, vec) + } + finalVectors + } + + /** + * Loads vectors for a vocabulary from word2vec, with OOV words having random vectors + * generated for them. + */ + def loadVectorsForVocabulary(word2vecPath: String, voc: Set[String], inputVectorBias: Boolean) = { + val word2vecMap = readWord2Vec(word2vecPath, voc, inputVectorBias); + if (word2vecMap.isEmpty) { + throw new RuntimeException("No word2vec vectors loaded") + } + augmentVectorsToCompleteVocabulary(word2vecMap, voc, inputVectorBias) + } + + def loadBansalVectorsForVocabulary(word2vecPath: String, voc: Set[String], inputVectorBias: Boolean) = { + val word2vecMap = readBansalEmbeddings(word2vecPath, voc, inputVectorBias); + if (word2vecMap.isEmpty) { + throw new RuntimeException("No Bansal vectors loaded") + } + augmentVectorsToCompleteVocabulary(word2vecMap, voc, inputVectorBias) + } + + private def augmentVectorsToCompleteVocabulary(word2vecMap: HashMap[String,Array[Float]], voc: Set[String], inputVectorBias: Boolean) = { + val word2vecDim = word2vecMap.values.head.size + val rng = new Random(0) + for (unkWord <- voc -- word2vecMap.keySet) { + // Set to random noise except for the bias feature, if it's there + word2vecMap.put(unkWord, Array.tabulate(word2vecDim)(i => if (i == word2vecDim - 1 && inputVectorBias) 1.0F else ((rng.nextDouble - 0.5) * 0.5).toFloat)) + } + word2vecMap + } + + /** + * Reads the vectors in words from the given word2vec path and augments with a bias feature + * if necessary. The returned map does not include entries for words that are not in the w2v + * file. 
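+ *
+ * The .bin layout read here is: two whitespace-terminated header tokens (vocabulary size
+ * and vector dimension), then one record per word consisting of the word string followed
+ * by dim little-endian floats. Words outside the requested set are skipped unless the set
+ * is empty, in which case every word is kept.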
+ */ + def readWord2Vec(word2VecPath: String, words: Set[String], inputVectorBias: Boolean) = { + val bis = new BufferedInputStream(new FileInputStream(word2VecPath)); + val dis = new DataInputStream(bis); + val word2Vec = new HashMap[String,Array[Float]]; + // First two entries are vocabulary size and dimension of vectors + val vocSize = Word2VecUtils.readString(dis).toInt; + val dim = Word2VecUtils.readString(dis).toInt; + // Now read vectors, augmented with 1s for bias + for (i <- 0 until vocSize) { + if (i % 1000000 == 0) { + println("On line " + i) + } + val word = Word2VecUtils.readString(dis); + val vector = new Array[Float](if (inputVectorBias) dim + 1 else dim); + val len = 0; + var j = 0; + while (j < dim) { + vector(j) = Word2VecUtils.readFloat(dis); + j += 1; + } + if (inputVectorBias) { + vector(j) = 1.0F + } + if (words.isEmpty || words.contains(word)) { + word2Vec.put(word, vector); + } + } + println("Loaded " + word2Vec.size + " word2vec representations out of " + words.size + " attempted words"); + word2Vec; + } + + val hyphenPattern = Pattern.compile("(\\w+-)+(\\w+)"); + + def convertWord(str: String, lowercase: Boolean = false) = { + var strRep = str; + strRep = strRep.replace("-LRB-", "(") + strRep = strRep.replace("-RRB-", ")") + strRep = strRep.replace("-LSB-", "[") + strRep = strRep.replace("-RSB-", "]") + strRep = strRep.replace("-LCB-", "{") + strRep = strRep.replace("-RCB-", "}") + // Replace all numbers with 15 + strRep = strRep.replaceAll("^-?[0-9,.]{2,15}$", "fifteen") + // Replace hyphenated words with the last part + val m = hyphenPattern .matcher(str) + strRep = if (m.find()) { + m.group(2) + } else { + strRep + } + if (lowercase) { + strRep = strRep.toLowerCase() + } + strRep + } + + def readBansalEmbeddings(embeddingsPath: String, words: Set[String], inputVectorBias: Boolean) = { + val inFile = scala.io.Source.fromFile(new File(embeddingsPath)).getLines() + val word2Vec = new HashMap[String,Array[Float]]; + var firstLine = true + while (inFile.hasNext) { + val line = inFile.next; + if (firstLine) { + if (line.split("\\s+").size == 2) { + println("Skipping first line: " + line) + // Just an indicator of how many words there are and the vector dim, so + // skip over it by leaving firstLine set to true + } else { + println("Not skipping first line: " + line) + firstLine = false; + } + } + if (!firstLine) { + // If the line contains a tab, then that's the delimiter between the word and + // the vectors + if (line.contains("\t")) { + val word = line.substring(0, line.indexOf("\t")); + if (words.isEmpty || words.contains(word)) { + val entries = line.substring(line.indexOf("\t") + 1).split(" ") + val arr = Array.tabulate(if (inputVectorBias) entries.size + 1 else entries.size)(i => { + if (inputVectorBias && i == entries.size) { + 1.0F + } else { + entries(i).toFloat + } + }) + word2Vec.put(word, arr) + } + } else { + // Otherwise, a space is the first delimiter + val word = line.substring(0, line.indexOf(" ")); + if (words.isEmpty || words.contains(word)) { + val entries = line.substring(line.indexOf(" ") + 1).split(" "); + val arr = Array.tabulate(if (inputVectorBias) entries.size + 1 else entries.size)(i => { + if (inputVectorBias && i == entries.size) { + 1.0F + } else { + entries(i).toFloat + } + }) + word2Vec.put(word, arr) + } + } + } + firstLine = false; + } + println("Loaded " + word2Vec.size + " Bansal representations out of " + words.size + " attempted words"); + word2Vec; + } +} \ No newline at end of file diff --git 
a/src/main/scala/epic/dense/Word2VecSurfaceFeaturizer.scala b/src/main/scala/epic/dense/Word2VecSurfaceFeaturizer.scala new file mode 100644 index 00000000..eef2abaf --- /dev/null +++ b/src/main/scala/epic/dense/Word2VecSurfaceFeaturizer.scala @@ -0,0 +1,233 @@ +package epic.dense + +import breeze.linalg.DenseVector +import scala.collection.mutable.HashMap +import breeze.util.Index +import breeze.linalg.DenseMatrix +import breeze.linalg.Counter2 +import breeze.linalg.Counter +import epic.features.HackyLexicalProductionFeaturizer +import breeze.linalg.sum +import epic.features.RuleBasedHackyHeadFinder +import epic.features.HackyHeadFinder +import epic.parser.RuleTopology +import epic.trees.AnnotatedLabel + +/** + * converter is used to map words into the word2vec vocabulary, which might include things + * like lowercasing, replacing numbers, changing -LRB-, etc. See Word2Vec.convertWord + */ +class Word2VecIndexed[W](private val wordIndex: Index[W], + private val word2vec: Array[Array[Double]], + private val converter: W => W) extends Serializable { + + def wordRepSize = word2vec.head.size + def vocSize = wordIndex.size + + val zeroVector = Array.tabulate(wordRepSize)(i => 0.0) + + def containsWord(rawStr: W) = wordIndex.contains(converter(rawStr)) + + def indexWord(rawStr: W) = wordIndex(converter(rawStr)) + + def convertIndexToVector(idx: Int) = word2vec(idx) + + private def assemble(vectors: Seq[Array[Double]]) = vectors.reduce(_ ++ _) + + def convertToVector(indexedWords: Array[Int]): Array[Double] = { + assemble(indexedWords.map(wordIdx => if (wordIdx == -1) zeroVector else word2vec(wordIdx))) + } + + def augment(numSparseFeats: Int, featurizer: W => Array[Int]): Word2VecIndexed[W] = { + val newWord2Vec = Array.tabulate(word2vec.size)(i => { + val word = wordIndex.get(i) + val feats = featurizer(word) + word2vec(i) ++ Array.tabulate(numSparseFeats)(j => if (feats.contains(j)) 1.0 else 0.0) + }) + new Word2VecIndexed(wordIndex, newWord2Vec, converter) + } +} + +object Word2VecIndexed { + + def apply[W](word2vec: HashMap[W,Array[Double]], + converter: W => W) = { + val index = Index[W] + val arr = new Array[Array[Double]](word2vec.size) + for (word <- word2vec.keySet) { + arr(index.index(word)) = word2vec(word) + } + new Word2VecIndexed(index, arr, converter) + } +} + +trait WordVectorAnchoringIndexed[String] { + def reducedFeaturesForSpan(start: Int, end: Int): Array[Int]; + def featuresForSpan(start: Int, end: Int): Array[Int]; + def featuresForSplit(start: Int, split: Int, end: Int): Array[Int]; +} + +class Word2VecSurfaceFeaturizerIndexed[W](val word2vecIndexed: Word2VecIndexed[W], + val featureSpec: String) extends Serializable { + + def reducedInputSize = { + anchor(IndexedSeq[W]()).reducedFeaturesForSpan(0, 0).size * word2vecIndexed.wordRepSize + } + + def splitInputSize = { + anchor(IndexedSeq[W]()).featuresForSplit(0, 0, 0).size * word2vecIndexed.wordRepSize + } + + def anchor(words: IndexedSeq[W]): WordVectorAnchoringIndexed[W] = { + val indexedWords = words.map(word2vecIndexed.indexWord(_)) + new WordVectorAnchoringIndexed[W] { + + def reducedFeaturesForSpan(start: Int, end: Int) = { + if (featureSpec == "" || featureSpec == "moresplit" || featureSpec == "basic") { + Array(fetchWord(start - 1), fetchWord(start), fetchWord(end - 1), fetchWord(end)) + } else if (featureSpec == "morecontext") { + Array(fetchWord(start - 2), fetchWord(start - 1), fetchWord(start), fetchWord(end - 1), fetchWord(end), fetchWord(end + 1)) + } else if (featureSpec == "morefirstlast") { + 
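+          // "morefirstlast": the word before the span, the first two words in the span,
+          // the last two words in the span, and the word just past the end (spans are
+          // treated as half-open intervals [start, end)).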
Array(fetchWord(start - 1), fetchWord(start), fetchWord(start + 1), fetchWord(end - 2), fetchWord(end - 1), fetchWord(end)) + } else if (featureSpec == "mcmfl") { + Array(fetchWord(start - 2), fetchWord(start - 1), fetchWord(start), fetchWord(start + 1), fetchWord(end - 2), fetchWord(end - 1), fetchWord(end), fetchWord(end + 1)) + } else if (featureSpec == "most") { + Array(fetchWord(start - 2), fetchWord(start - 1), fetchWord(start), fetchWord(start + 1), fetchWord(end - 2), fetchWord(end - 1), fetchWord(end), fetchWord(end + 1)) + } else { + throw new RuntimeException("Unknown featureSpec: " + featureSpec) + } + } + + def featuresForSpan(start: Int, end: Int) = { + if (featureSpec == "" || featureSpec == "basic") { + Array(fetchWord(start - 1), fetchWord(start), -1, -1, fetchWord(end - 1), fetchWord(end)) + } else if (featureSpec == "morecontext") { + Array(fetchWord(start - 2), fetchWord(start - 1), fetchWord(start), -1, -1, fetchWord(end - 1), fetchWord(end), fetchWord(end + 1)) + } else if (featureSpec == "morefirstlast") { + Array(fetchWord(start - 1), fetchWord(start), fetchWord(start + 1), -1, -1, fetchWord(end - 2), fetchWord(end - 1), fetchWord(end)) + } else if (featureSpec == "moresplit") { + Array(fetchWord(start - 1), fetchWord(start), -1, -1, -1, -1, fetchWord(end - 1), fetchWord(end)) + } else if (featureSpec == "mcmfl") { + Array(fetchWord(start - 2), fetchWord(start - 1), fetchWord(start), fetchWord(start + 1), -1, -1, fetchWord(end - 2), fetchWord(end - 1), fetchWord(end), fetchWord(end + 1)) + } else if (featureSpec == "most") { + Array(fetchWord(start - 2), fetchWord(start - 1), fetchWord(start), fetchWord(start + 1), -1, -1, -1, -1, fetchWord(end - 2), fetchWord(end - 1), fetchWord(end), fetchWord(end + 1)) + } else { + throw new RuntimeException("Unknown featureSpec: " + featureSpec) + } + } + + def featuresForSplit(start: Int, split: Int, end: Int) = { + if (featureSpec == "" || featureSpec == "basic") { + Array(fetchWord(start - 1), fetchWord(start), fetchWord(split - 1), fetchWord(split), fetchWord(end - 1), fetchWord(end)) + } else if (featureSpec == "morecontext") { + Array(fetchWord(start - 2), fetchWord(start - 1), fetchWord(start), fetchWord(split - 1), fetchWord(split), fetchWord(end - 1), fetchWord(end), fetchWord(end + 1)) + } else if (featureSpec == "morefirstlast") { + Array(fetchWord(start - 1), fetchWord(start), fetchWord(start + 1), fetchWord(split - 1), fetchWord(split), fetchWord(end - 2), fetchWord(end - 1), fetchWord(end)) + } else if (featureSpec == "moresplit") { + Array(fetchWord(start - 1), fetchWord(start), fetchWord(split - 2), fetchWord(split - 1), fetchWord(split), fetchWord(split + 1), fetchWord(end - 1), fetchWord(end)) + } else if (featureSpec == "mcmfl") { + Array(fetchWord(start - 2), fetchWord(start - 1), fetchWord(start), fetchWord(start + 1), fetchWord(split - 1), fetchWord(split), fetchWord(end - 2), fetchWord(end - 1), fetchWord(end), fetchWord(end + 1)) + } else if (featureSpec == "most") { + Array(fetchWord(start - 2), fetchWord(start - 1), fetchWord(start), fetchWord(start + 1), fetchWord(split - 2), fetchWord(split - 1), fetchWord(split), fetchWord(split + 1), fetchWord(end - 2), fetchWord(end - 1), fetchWord(end), fetchWord(end + 1)) + } else { + throw new RuntimeException("Unknown featureSpec: " + featureSpec) + } + } + + private def fetchWord(idx: Int): Int = { + if (idx < 0 || idx >= words.size) -1 else indexedWords(idx) + } + } + } +} + +object Word2VecSurfaceFeaturizerIndexed { + + def makeVectFromParams(wordIndices: 
Array[Int], params: DenseMatrix[Double]): DenseVector[Double] = { + var currVect = DenseVector[Double](Array[Double]()) + for (wordIndex <- wordIndices) { + currVect = DenseVector.vertcat(currVect, params(wordIndex, ::).t) + } + currVect + } +} + + +trait WordVectorDepAnchoringIndexed[String] { + def getHeadDepPair(begin: Int, split: Int, end: Int, rule: Int): (Int, Int); + def featuresForHeadPair(head: Int, dep: Int): Array[Int]; +} + +class Word2VecDepFeaturizerIndexed[W](val word2VecIndexed: Word2VecIndexed[W], + val tagger: Tagger[W], + val topology: RuleTopology[AnnotatedLabel]) extends Serializable { + + val hackyHeadFinder: HackyHeadFinder[String,String] = new RuleBasedHackyHeadFinder + + def anchor(words: IndexedSeq[W]): WordVectorDepAnchoringIndexed[W] = { + val indexedWords = words.map(word2VecIndexed.indexWord(_)) + new WordVectorDepAnchoringIndexed[W] { + + val preterminals = new Array[String](words.size); + for (i <- 0 until words.size) { + preterminals(i) = tagger.tag(words(i)); + } + + def getHeadDepPair(begin: Int, split: Int, end: Int, rule: Int): (Int, Int) = { + val lc = topology.labelIndex.get(topology.leftChild(rule)).baseLabel; + val rc = topology.labelIndex.get(topology.rightChild(rule)).baseLabel; + val parent = topology.labelIndex.get(topology.parent(rule)).baseLabel; + + val lcHeadIdx = begin + hackyHeadFinder.findHead(lc, preterminals.slice(begin, split)); + val rcHeadIdx = split + hackyHeadFinder.findHead(rc, preterminals.slice(split, end)); + val overallHeadIdx = begin + hackyHeadFinder.findHead(parent, preterminals.slice(begin, end)) + if (overallHeadIdx == rcHeadIdx) { + (rcHeadIdx, lcHeadIdx) + } else { + (lcHeadIdx, rcHeadIdx) + } + } + + def featuresForHeadPair(head: Int, dep: Int) = { + Array(fetchWord(head - 1), fetchWord(head), fetchWord(head+1), fetchWord(dep-1), fetchWord(dep), fetchWord(dep+1)) + } + + private def fetchWord(idx: Int): Int = { + if (idx < 0 || idx >= words.size) -1 else indexedWords(idx) + } + } + } +} + +trait Tagger[W] { + def tag(word: W): String +} + +class FrequencyTagger[W](wordTagCounts: Counter2[String, W, Double]) extends Tagger[W] with Serializable { + + private val wordCounts = Counter[W,Double]; + private val wordToTagMap = new HashMap[W,String]; + for (word <- wordTagCounts.keysIterator.map(_._2).toSeq.distinct) { + wordCounts(word) = sum(wordTagCounts(::, word)); + if (!wordToTagMap.contains(word)) { + val tagCounts = wordTagCounts(::, word).iterator; + var bestTag = HackyLexicalProductionFeaturizer.UnkTag; + var bestTagCount = 0.0; + for ((tag, count) <- tagCounts) { + if (count > bestTagCount) { + bestTag = tag; + bestTagCount = count; + } + } + wordToTagMap.put(word, bestTag); + } + } + val tagTypesIdx = Index[String] + wordToTagMap.values.toSet[String].foreach(tagType => tagTypesIdx.index(tagType)) + tagTypesIdx.index(HackyLexicalProductionFeaturizer.UnkTag) + + def tag(word: W) = if (wordToTagMap.contains(word)) wordToTagMap(word) else HackyLexicalProductionFeaturizer.UnkTag; + + def convertToFeaturizer: W => Array[Int] = (word: W) => Array(tagTypesIdx.index(tag(word))) +} + diff --git a/src/main/scala/epic/dense/Word2VecUtils.java b/src/main/scala/epic/dense/Word2VecUtils.java new file mode 100644 index 00000000..687bf2f1 --- /dev/null +++ b/src/main/scala/epic/dense/Word2VecUtils.java @@ -0,0 +1,51 @@ +package epic.dense; + +import java.io.DataInputStream; +import java.io.IOException; +import java.io.InputStream; + +/** + * Utilities from + * https://gist.github.com/ansjsun/6304960 + * + * @author 
gdurrett + * + */ +public class Word2VecUtils { + + private static final int MAX_SIZE = 50; + + public static String readString(DataInputStream dis) throws IOException { + byte[] bytes = new byte[MAX_SIZE]; + byte b = dis.readByte(); + int i = -1; + StringBuilder sb = new StringBuilder(); + while (b != 32 && b != 10) { + i++; + bytes[i] = b; + b = dis.readByte(); + if (i == 49) { + sb.append(new String(bytes)); + i = -1; + bytes = new byte[MAX_SIZE]; + } + } + sb.append(new String(bytes, 0, i + 1)); + return sb.toString(); + } + + public static float readFloat(InputStream is) throws IOException { + byte[] bytes = new byte[4]; + is.read(bytes); + return getFloat(bytes); + } + + public static float getFloat(byte[] b) { + int accum = 0; + accum = accum | (b[0] & 0xff) << 0; + accum = accum | (b[1] & 0xff) << 8; + accum = accum | (b[2] & 0xff) << 16; + accum = accum | (b[3] & 0xff) << 24; + return Float.intBitsToFloat(accum); + } +} diff --git a/src/main/scala/epic/features/HackyHeadFinder.scala b/src/main/scala/epic/features/HackyHeadFinder.scala index 3d6f2dd6..3310ecd9 100644 --- a/src/main/scala/epic/features/HackyHeadFinder.scala +++ b/src/main/scala/epic/features/HackyHeadFinder.scala @@ -7,7 +7,7 @@ import scala.collection.mutable.HashMap * HackyHeadFinders find "heads" in a span using only preterminal labels. * It doesn't use the syntactic structure of the sentence. * - * @author gdurret + * @author gdurrett * @tparam L * @tparam T */ diff --git a/src/main/scala/epic/framework/ModelObjective.scala b/src/main/scala/epic/framework/ModelObjective.scala index 6ad23d81..a3f4cc68 100644 --- a/src/main/scala/epic/framework/ModelObjective.scala +++ b/src/main/scala/epic/framework/ModelObjective.scala @@ -9,6 +9,8 @@ import collection.parallel.ForkJoinTaskSupport import concurrent.forkjoin.ForkJoinPool import com.typesafe.scalalogging.slf4j.LazyLogging import epic.util.{SafeLogging, CacheBroker} +import epic.trees.AnnotatedLabel +import epic.trees.TreeInstance /** * The objective function for training a [[epic.framework.Model]]. 
Selects @@ -28,14 +30,20 @@ class ModelObjective[Datum](val model: Model[Datum], // Selects a set of data to use protected def select(batch: IndexedSeq[Int]):GenTraversable[Datum] = batchSelector(batch) - + def initialWeightVector(randomize: Boolean): DenseVector[Double] = { + initialWeightVector(randomize, 1E-3) + } + + def initialWeightVector(randomize: Boolean, scale: Double): DenseVector[Double] = { val v = model.readCachedFeatureWeights() match { case Some(vector) => vector case None => Encoder.fromIndex(featureIndex).tabulateDenseVector(f => model.initialValueForFeature(f)) } if(randomize) { - v += (DenseVector.rand(numFeatures) * 2E-3 - 1E-3) + // Control the seed of the RNG for the weights + val rng = new scala.util.Random(0) + v += DenseVector(Array.tabulate(numFeatures)(i => rng.nextDouble * 2.0 * scale - scale)) } v } @@ -55,7 +63,8 @@ class ModelObjective[Datum](val model: Model[Datum], val inference = inferenceFromWeights(x) val timeIn = System.currentTimeMillis() val success = new AtomicInteger(0) - val finalCounts = select(batch).aggregate(null:model.ExpectedCounts)({ ( _countsSoFar,datum) => + val minibatch = select(batch) + val finalCounts = minibatch.aggregate(null:model.ExpectedCounts)({ ( _countsSoFar,datum) => try { val countsSoFar:model.ExpectedCounts = if (_countsSoFar ne null) _countsSoFar else emptyCounts model.accumulateCounts(inference, datum, countsSoFar, 1.0) diff --git a/src/main/scala/epic/parser/models/NeuralParserTrainer.scala b/src/main/scala/epic/parser/models/NeuralParserTrainer.scala new file mode 100644 index 00000000..cbf78059 --- /dev/null +++ b/src/main/scala/epic/parser/models/NeuralParserTrainer.scala @@ -0,0 +1,234 @@ +package epic.parser.models + +import java.io.File + +import com.typesafe.scalalogging.slf4j.LazyLogging + +import breeze.config.Help +import breeze.linalg._ +import breeze.optimize._ +import breeze.optimize.FirstOrderMinimizer.OptParams +import breeze.util._ +import breeze.util.Implicits._ +import epic.constraints.CachedChartConstraintsFactory +import epic.constraints.ChartConstraints +import epic.dense.AdadeltaGradientDescentDVD +import epic.framework._ +import epic.parser._ +import epic.parser.ParseEval.Statistics +import epic.parser.ParserParams.XbarGrammar +import epic.parser.projections.OracleParser +import epic.parser.projections.ParserChartConstraintsFactory +import epic.trees.AnnotatedLabel +import epic.trees.TreeInstance +import epic.trees.annotations._ +import epic.util.CacheBroker + + +/** + * The main entry point for training discriminative parsers. + * Has a main method inherited from ParserPipeline. + * Use --help to see options, or just look at the Params class. + * + * + */ +object NeuralParserTrainer extends epic.parser.ParserPipeline with LazyLogging { + + case class ExtraPTParams(momentum: Double = 0.95, + computeTrainLL: Boolean = true) + + case class Params(@Help(text="Details about the parser to build") + modelFactory: PositionalNeuralModelFactory, + @Help(text="Name for the parser for saving and logging. will be inferrred if not provided.") + name: String = null, + implicit val cache: CacheBroker, + @Help(text="path for a baseline parser for computing constraints. 
will be built automatically if not provided.") + parser: File = null, + opt: OptParams, + @Help(text="How often to run on the dev set.") + iterationsPerEval: Int = 100, + @Help(text="How many iterations to run.") + maxIterations: Int = 1002, + @Help(text="How often to look at a small set of the dev set.") + iterPerValidate: Int = 30, + @Help(text="How many threads to use, default is to use whatever Scala thinks is best.") + threads: Int = -1, + @Help(text="Scale of random weight initialization") + initWeightsScale: Double = 1E-2, + @Help(text="String to specify fancier initialization types based on fan-in/fan-out") + initializerSpec: String = "", + @Help(text="True if we should determinimize training (remove randomness associated with random minibatches)") + determinizeTraining: Boolean = false, + @Help(text="True if we should train two models and ram them together") + ensemble: Boolean = false, + @Help(text="Use Adadelta for optimiziation instead of Adagrad") + useAdadelta: Boolean = true, + @Help(text="Should we enforce reachability? Can be useful if we're pruning the gold tree.") + enforceReachability: Boolean = true, + @Help(text="Whether or not we use constraints. Not using constraints is very slow.") + useConstraints: Boolean = true, + @Help(text="Should we check the gradient to make sure it's coded correctly?") + checkGradient: Boolean = false, + @Help(text="check specific indices, in addition to doing a full search.") + checkGradientsAt: String = null, + @Help(text="check specific indices, in addition to doing a full search.") + maxParseLength: Int = 70, + annotator: TreeAnnotator[AnnotatedLabel, String, AnnotatedLabel] = GenerativeParser.defaultAnnotator(), + extraPTParams: ExtraPTParams = ExtraPTParams()) + protected val paramManifest = manifest[Params] + + def trainParser( trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], + validate: (Parser[AnnotatedLabel, String]) => Statistics, params: Params) = { + import params._ + import extraPTParams._ + +// if(threads >= 1) +// collection.parallel.ForkJoinTasks.defaultForkJoinPool.setParallelism(params.threads) + + val initialParser = params.parser match { + case null => + val (grammar, lexicon) = XbarGrammar().xbarGrammar(trainTrees) + GenerativeParser.annotatedParser(grammar, lexicon, annotator, trainTrees) +// GenerativeParser.annotatedParser(grammar, lexicon, Xbarize(), trainTrees) + case f => + readObject[Parser[AnnotatedLabel, String]](f) + } + + val constraints = { + + val maxMarginalized = initialParser.copy(marginalFactory=initialParser.marginalFactory match { + case StandardChartFactory(ref, mm) => StandardChartFactory(ref, maxMarginal = true) + case x => x + }) + + val uncached = new ParserChartConstraintsFactory[AnnotatedLabel, String](maxMarginalized, {(_:AnnotatedLabel).isIntermediate}) + new CachedChartConstraintsFactory[AnnotatedLabel, String](uncached) + } + + var theTrees = trainTrees.toIndexedSeq.filterNot(sentTooLong(_, params.maxParseLength)) + + if(useConstraints && enforceReachability) { + val treebankGrammar = GenerativeParser.annotated(initialParser.topology, initialParser.lexicon, TreeAnnotator.identity, trainTrees) + val markovizedGrammar = GenerativeParser.annotated(initialParser.topology, initialParser.lexicon, annotator, trainTrees) + val proj = new OracleParser(treebankGrammar, markovizedGrammar) + theTrees = theTrees.par.map(ti => ti.copy(tree=proj.forTree(ti.tree, ti.words, constraints.constraints(ti.words)))).seq.toIndexedSeq + } + + val baseMeasure = if(useConstraints) { + constraints + } 
else { + ChartConstraints.Factory.noSparsity[AnnotatedLabel, String] + } + + println("Building model") + val model = modelFactory.make(theTrees, initialParser.topology, initialParser.lexicon, constraints) + val obj = new ModelObjective(model, theTrees, params.threads) + val cachedObj = new CachedBatchDiffFunction(obj) + println("Initializing weights custom for model " + model.getClass) + val init = model.initialWeightVector(initWeightsScale, initializerSpec) + if(checkGradient) { + val cachedObj2 = new CachedBatchDiffFunction(new ModelObjective(model, theTrees.take(opt.batchSize), params.threads)) + val defaultIndices = (0 until 10).map(i => if(i < 0) model.featureIndex.size + i else i) + val indices = if (model.transforms.size > 0) { + model.transforms(0).getInterestingWeightIndicesForGradientCheck(0) + } else { + defaultIndices + } + println("testIndices: " + indices) + GradientTester.testIndices(cachedObj2, init, indices, toString={(i: Int) => model.featureIndex.get(i).toString}, skipZeros = true) + println("test") + GradientTester.test(cachedObj2, init, toString={(i: Int) => model.featureIndex.get(i).toString}, skipZeros = false) + } + + type OptState = FirstOrderMinimizer[DenseVector[Double], BatchDiffFunction[DenseVector[Double]]]#State + def evalAndCache(pair: (OptState, Int)) { + val (state, iter) = pair + val weights = state.x + if (iter % iterPerValidate == 0) { + logger.info("Validating...") + val parser = model.extractParser(weights) + val stats = validate(parser) + logger.info("Overall statistics for validation: " + stats) + } + } + + + val name = Option(params.name).orElse(Option(model.getClass.getSimpleName).filter(_.nonEmpty)).getOrElse("DiscrimParser") + val itr: Iterator[FirstOrderMinimizer[DenseVector[Double], BatchDiffFunction[DenseVector[Double]]]#State] = if (determinizeTraining) { + val scanningBatchesObj = cachedObj.withScanningBatches(params.opt.batchSize) + if (useAdadelta) { + println("OPTIMIZATION: Adadelta") + new AdadeltaGradientDescentDVD(params.opt.maxIterations, momentum).iterations(scanningBatchesObj, init). + asInstanceOf[Iterator[FirstOrderMinimizer[DenseVector[Double], BatchDiffFunction[DenseVector[Double]]]#State]] + } else { + println("OPTIMIZATION: Adagrad") + params.opt.iterations(scanningBatchesObj, init).asInstanceOf[Iterator[FirstOrderMinimizer[DenseVector[Double], BatchDiffFunction[DenseVector[Double]]]#State]] + } + } else { + if (useAdadelta) { + println("OPTIMIZATION: Adadelta") + new AdadeltaGradientDescentDVD(params.opt.maxIterations, momentum).iterations(cachedObj.withRandomBatches(params.opt.batchSize), init). + asInstanceOf[Iterator[FirstOrderMinimizer[DenseVector[Double], BatchDiffFunction[DenseVector[Double]]]#State]] + } else { + println("OPTIMIZATION: Adagrad") + params.opt.iterations(cachedObj, init) + } + } + if (ensemble) { + val weights1 = itr.take(maxIterations).last.x + // Hard-wired to use Adadelta + val initParams2 = model.initialWeightVector(initWeightsScale, initializerSpec, trulyRandom = true) + val itr2 = new AdadeltaGradientDescentDVD(params.opt.maxIterations).iterations(cachedObj.withRandomBatches(params.opt.batchSize), initParams2). 
+ asInstanceOf[Iterator[FirstOrderMinimizer[DenseVector[Double], BatchDiffFunction[DenseVector[Double]]]#State]] + println("Optimizing second parser") + val weights2 = itr2.take(maxIterations).last.x + println("Optimized both parsers") + val clonedModel = model.cloneModelForEnsembling + val mergedWeights = model.mergeWeightsForEnsembling(weights1, weights2) + Seq(("ComboParser-Final", clonedModel.extractParser(mergedWeights))).iterator + } else { + // Normal execution + for ((state, iter) <- itr.take(maxIterations).zipWithIndex.tee(evalAndCache _) + if iter != 0 && iter % iterationsPerEval == 0 || evaluateNow) yield try { + // N.B. This may be wrong for batch normalization + val parser = model.extractParser(state.x) + if (iter + iterationsPerEval >= maxIterations && computeTrainLL) { + computeLL(trainTrees, model, state.x) + } + (s"$name-$iter", parser) + } catch { + case e: Exception => e.printStackTrace(); throw e + } + } + } + + def sentTooLong(p: TreeInstance[AnnotatedLabel, String], maxLength: Int): Boolean = { + p.words.count(x => x == "'s" || x(0).isLetterOrDigit) > maxLength + } + + def evaluateNow = { + val sentinel = new File("EVALUATE_NOW") + if(sentinel.exists()) { + sentinel.delete() + logger.info("Evaluating now!!!!") + true + } else { + false + } + } + + def computeLL(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], model: PositionalNeuralModel[AnnotatedLabel,AnnotatedLabel,String], weights: DenseVector[Double]) { + println("Computing final log likelihood on the whole training set...") + val inf = model.inferenceFromWeights(weights).forTesting + val ll = trainTrees.par.aggregate(0.0)((currLL, trainTree) => { + try { + val s = inf.scorer(trainTree) + currLL + inf.goldMarginal(s, trainTree).logPartition - inf.marginal(s, trainTree).logPartition + } catch { + case e: Exception => println("Couldn't parse") + currLL + } + }, _ + _) + println("Log likelihood on " + trainTrees.size + " examples: " + ll) + } +} diff --git a/src/main/scala/epic/parser/models/ParserTrainer.scala b/src/main/scala/epic/parser/models/ParserTrainer.scala index fcfbc4c2..931da01f 100644 --- a/src/main/scala/epic/parser/models/ParserTrainer.scala +++ b/src/main/scala/epic/parser/models/ParserTrainer.scala @@ -35,6 +35,7 @@ import epic.parser.ParseEval.Statistics import epic.features.LongestFrequentSuffixFeaturizer.LongestFrequentSuffix import epic.features.LongestFrequentSuffixFeaturizer import epic.util.Optional +import epic.dense.AdadeltaGradientDescentDVD /** * The main entry point for training discriminative parsers. @@ -53,6 +54,10 @@ object ParserTrainer extends epic.parser.ParserPipeline with LazyLogging { @Help(text="path for a baseline parser for computing constraints. 
will be built automatically if not provided.") parser: File = null, opt: OptParams, + @Help(text="Use Adadelta instead of Adagrad (hardcoded in here...)") + useAdadelta: Boolean = false, + @Help(text="Make training batches deterministic; useful for debugging / regression testing") + determinizeTraining: Boolean = false, @Help(text="How often to run on the dev set.") iterationsPerEval: Int = 100, @Help(text="How many iterations to run.") @@ -71,8 +76,10 @@ object ParserTrainer extends epic.parser.ParserPipeline with LazyLogging { checkGradient: Boolean = false, @Help(text="check specific indices, in addition to doing a full search.") checkGradientsAt: String = null, - @Help(text="check specific indices, in addition to doing a full search.") + @Help(text="Max parse length") maxParseLength: Int = 70, + @Help(text="Compute log likelihood on the training set") + computeTrainLL: Boolean = true, annotator: TreeAnnotator[AnnotatedLabel, String, AnnotatedLabel] = GenerativeParser.defaultAnnotator()) protected val paramManifest = manifest[Params] @@ -124,8 +131,10 @@ object ParserTrainer extends epic.parser.ParserPipeline with LazyLogging { val init = obj.initialWeightVector(randomize) if(checkGradient) { val cachedObj2 = new CachedBatchDiffFunction(new ModelObjective(model, theTrees.take(opt.batchSize), params.threads)) - val indices = (0 until 10).map(i => if(i < 0) model.featureIndex.size + i else i) + val indices = (0 until 10).map(i => if(i < 0) model.featureIndex.size + i else i) + println("testIndices: " + indices) GradientTester.testIndices(cachedObj2, obj.initialWeightVector(randomize = true), indices, toString={(i: Int) => model.featureIndex.get(i).toString}, skipZeros = true) + println("test") GradientTester.test(cachedObj2, obj.initialWeightVector(randomize = true), toString={(i: Int) => model.featureIndex.get(i).toString}, skipZeros = false) } @@ -143,9 +152,32 @@ object ParserTrainer extends epic.parser.ParserPipeline with LazyLogging { val name = Option(params.name).orElse(Option(model.getClass.getSimpleName).filter(_.nonEmpty)).getOrElse("DiscrimParser") - for ((state, iter) <- params.opt.iterations(cachedObj, init).take(maxIterations).zipWithIndex.tee(evalAndCache _) + val itr: Iterator[FirstOrderMinimizer[DenseVector[Double], BatchDiffFunction[DenseVector[Double]]]#State] = if (determinizeTraining) { + val scanningBatchesObj = cachedObj.withScanningBatches(params.opt.batchSize) + if (useAdadelta) { + println("OPTIMIZATION: Adadelta") + new AdadeltaGradientDescentDVD(params.opt.maxIterations).iterations(scanningBatchesObj, init). + asInstanceOf[Iterator[FirstOrderMinimizer[DenseVector[Double], BatchDiffFunction[DenseVector[Double]]]#State]] + } else { + println("OPTIMIZATION: Adagrad") + params.opt.iterations(scanningBatchesObj, init).asInstanceOf[Iterator[FirstOrderMinimizer[DenseVector[Double], BatchDiffFunction[DenseVector[Double]]]#State]] + } + } else { + if (useAdadelta) { + println("OPTIMIZATION: Adadelta") + new AdadeltaGradientDescentDVD(params.opt.maxIterations).iterations(cachedObj.withRandomBatches(params.opt.batchSize), init). 
+ asInstanceOf[Iterator[FirstOrderMinimizer[DenseVector[Double], BatchDiffFunction[DenseVector[Double]]]#State]] + } else { + println("OPTIMIZATION: Adagrad") + params.opt.iterations(cachedObj, init) + } + } + for ((state, iter) <- itr.take(maxIterations).zipWithIndex.tee(evalAndCache _) if iter != 0 && iter % iterationsPerEval == 0 || evaluateNow) yield try { val parser = model.extractParser(state.x) + if (iter + iterationsPerEval >= maxIterations && computeTrainLL) { + computeLL(trainTrees, model, state.x) + } (s"$name-$iter", parser) } catch { case e: Exception => e.printStackTrace(); throw e @@ -167,6 +199,21 @@ object ParserTrainer extends epic.parser.ParserPipeline with LazyLogging { } } + + def computeLL(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], model: Model[TreeInstance[AnnotatedLabel, String]], weights: DenseVector[Double]) { + println("Computing final log likelihood on the whole training set...") + val inf = model.inferenceFromWeights(weights).forTesting + val ll = trainTrees.par.aggregate(0.0)((currLL, trainTree) => { + try { + val s = inf.scorer(trainTree) + currLL + inf.goldMarginal(s, trainTree).logPartition - inf.marginal(s, trainTree).logPartition + } catch { + case e: Exception => println("Couldn't parse") + currLL + } + }, _ + _) + println("Log likelihood on " + trainTrees.size + " examples: " + ll) + } } diff --git a/src/main/scala/epic/parser/models/PositionalNeuralModel.scala b/src/main/scala/epic/parser/models/PositionalNeuralModel.scala new file mode 100644 index 00000000..dc474d8c --- /dev/null +++ b/src/main/scala/epic/parser/models/PositionalNeuralModel.scala @@ -0,0 +1,418 @@ +package epic.parser +package models + +import scala.collection.mutable.HashMap +import scala.util.Random +import scala.collection.GenTraversable +import breeze.features.FeatureVector +import breeze.linalg._ +import breeze.util.Index +import epic.constraints.ChartConstraints +import epic.dense.IdentityTransform +import epic.dense.OutputTransform +import epic.dense.Transform +import epic.dense.Word2VecDepFeaturizerIndexed +import epic.dense.Word2VecSurfaceFeaturizerIndexed +import epic.features._ +import epic.framework.Feature +import epic.framework.StandardExpectedCounts +import epic.lexicon.Lexicon +import epic.parser.projections.GrammarRefinements +import epic.trees._ +import scala.collection.mutable.ArrayBuffer + +/** + * Main neural CRF parser class. 
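+ *
+ * Scores anchored spans and rules by feeding word-embedding surface features through one or
+ * more feedforward output transforms, optionally adding a sparse indicator-feature score on
+ * top of the neural score, and supports ensembling two trained models by concatenating their
+ * dense weights.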
+ * + * @author gdurrett + **/ +@SerialVersionUID(1L) +class PositionalNeuralModel[L, L2, W](annotator: (BinarizedTree[L], IndexedSeq[W]) => BinarizedTree[IndexedSeq[L2]], + val constrainer: ChartConstraints.Factory[L, W], + val topology: RuleTopology[L], + val lexicon: Lexicon[L, W], + refinedTopology: RuleTopology[L2], + refinements: GrammarRefinements[L, L2], + labelFeaturizer: RefinedFeaturizer[L, W, Feature], + surfaceFeaturizer: Word2VecSurfaceFeaturizerIndexed[W], + depFeaturizer: Word2VecDepFeaturizerIndexed[W], + val transforms: IndexedSeq[OutputTransform[Array[Int],DenseVector[Double]]], + val maybeSparseSurfaceFeaturizer: Option[IndexedSpanFeaturizer[L, L2, W]], + val depTransforms: Seq[OutputTransform[Array[Int],DenseVector[Double]]], + val decoupledTransforms: Seq[OutputTransform[Array[Int],DenseVector[Double]]]) extends ParserModel[L, W] with Serializable { + + def mergeWeightsForEnsembling(x1: DenseVector[Double], x2: DenseVector[Double]) = { + require(decoupledTransforms.size == 0) + require(x1.size == x2.size) + // Stack up the dense parts, average the sparse parts + if (maybeSparseSurfaceFeaturizer.isDefined) { + val sparseFeatsStart = index.componentOffset(index.indices.size - 1) + val summedSparseFeatures = x1(sparseFeatsStart to -1) + x2(sparseFeatsStart to -1) + DenseVector.vertcat(x1(0 until sparseFeatsStart), x2(0 until sparseFeatsStart), summedSparseFeatures) + } else { + DenseVector.vertcat(x1, x2) + } + } + + def cloneModelForEnsembling = { + require(decoupledTransforms.size == 0) + // Note that duping the transforms is okay because they still produce distinct + // layers, so caching behavior is unaffected + val newTransforms = transforms ++ transforms; + val newDepTransforms = depTransforms ++ depTransforms; + new PositionalNeuralModel(annotator, constrainer, topology, lexicon, refinedTopology, refinements, labelFeaturizer, surfaceFeaturizer, depFeaturizer, + newTransforms, maybeSparseSurfaceFeaturizer, newDepTransforms, decoupledTransforms) + } + + override type Inference = PositionalNeuralModel.Inference[L, L2, W] + + override def accumulateCounts(inf: Inference, s: Scorer, d: TreeInstance[L, W], m: Marginal, accum: ExpectedCounts, scale: Double): Unit = { +// println("Extracting ecounts") + inf.grammar.extractEcounts(m, accum.counts, scale) + + if (maybeSparseSurfaceFeaturizer.isDefined) { + val f = maybeSparseSurfaceFeaturizer.get + val innerAccum = StandardExpectedCounts.zero(f.index) + m.expectedCounts(maybeSparseSurfaceFeaturizer.get, innerAccum, scale) + // val totalTransformSize = transform.index.size + val totalTransformSize = transforms.map(_.index.size).foldLeft(0)(_ + _) + depTransforms.map(_.index.size).foldLeft(0)(_ + _) + decoupledTransforms.map(_.index.size).foldLeft(0)(_ + _) + accum.counts += DenseVector.vertcat(DenseVector.zeros[Double](totalTransformSize), innerAccum.counts) + } +// println("Ecounts extracted") + accum.loss += scale * m.logPartition + } + + /** + * Models have features, and this defines the mapping from indices in the weight vector to features. 
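+   * The weight vector is laid out as a SegmentedIndex: one block per surface transform,
+   * then one block per dependency transform, then one block per decoupled transform, and
+   * finally (if sparse features are enabled) the block of sparse surface features.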
+ * @return + */ + val index = if (maybeSparseSurfaceFeaturizer.isDefined) { + SegmentedIndex((transforms.map(_.index) ++ depTransforms.map(_.index) ++ decoupledTransforms.map(_.index) ++ IndexedSeq(maybeSparseSurfaceFeaturizer.get.index)):_*) + } else { + SegmentedIndex((transforms.map(_.index) ++ depTransforms.map(_.index) ++ decoupledTransforms.map(_.index)):_*) + } + + def initialWeightVector(initWeightsScale: Double, initializerSpec: String, trulyRandom: Boolean = false): DenseVector[Double] = { + val rng = if (trulyRandom) new Random() else new Random(0) + val initTransformWeights = DenseVector.vertcat(transforms.map(_.initialWeightVector(initWeightsScale, rng, true, initializerSpec)):_*); + val initDepWeights = DenseVector.vertcat(depTransforms.map(_.initialWeightVector(initWeightsScale, rng, true, initializerSpec)):_*); + val initDecoupledWeights = DenseVector.vertcat(decoupledTransforms.map(_.initialWeightVector(initWeightsScale, rng, true, initializerSpec)):_*); + val newInitVector: DenseVector[Double] = if (maybeSparseSurfaceFeaturizer.isDefined) { + DenseVector.vertcat(initTransformWeights, initDepWeights, initDecoupledWeights, DenseVector.zeros(maybeSparseSurfaceFeaturizer.get.index.size)) + } else { + DenseVector.vertcat(initTransformWeights, initDepWeights, initDecoupledWeights) + } + require(newInitVector.size == index.size, newInitVector.size + " " + index.size) + newInitVector + } + + override def featureIndex: Index[Feature] = index + + override def inferenceFromWeights(weights: DenseVector[Double]): Inference = inferenceFromWeights(weights, true) + + def inferenceFromWeights(weights: DenseVector[Double], forTrain: Boolean): Inference = { + val layersAndInnerLayers = for (i <- 0 until transforms.size) yield { + transforms(i).extractLayerAndPenultimateLayer(weights(index.componentOffset(i) until index.componentOffset(i) + index.indices(i).size), forTrain) + } + val layers: IndexedSeq[OutputTransform[Array[Int],DenseVector[Double]]#OutputLayer] = layersAndInnerLayers.map(_._1) + val innerLayers: IndexedSeq[epic.dense.Transform.Layer[Array[Int],DenseVector[Double]]] = layersAndInnerLayers.map(_._2) + val depLayers: IndexedSeq[OutputTransform[Array[Int],DenseVector[Double]]#OutputLayer] = for (i <- 0 until depTransforms.size) yield { + val idxIdx = transforms.size + i + depTransforms(i).extractLayer(weights(index.componentOffset(idxIdx) until index.componentOffset(idxIdx) + index.indices(idxIdx).size), forTrain) + } + val decoupledLayersAndInner = for (i <- 0 until decoupledTransforms.size) yield { + val idxIdx = transforms.size + depTransforms.size + i + decoupledTransforms(i).extractLayerAndPenultimateLayer(weights(index.componentOffset(idxIdx) until index.componentOffset(idxIdx) + index.indices(idxIdx).size), forTrain) + } + val decoupledLayers = decoupledLayersAndInner.map(_._1) + val decoupledInnerLayers = decoupledLayersAndInner.map(_._2) + val grammar = new PositionalNeuralModel.PositionalNeuralGrammar[L, L2, W](topology, lexicon, refinedTopology, refinements, labelFeaturizer, + surfaceFeaturizer, depFeaturizer, layers, innerLayers, depLayers, maybeSparseSurfaceFeaturizer, decoupledLayers, decoupledInnerLayers, weights, this) + new Inference(annotator, constrainer, grammar, refinements) + } + + /** + * When doing batch normalization, we need to normalize the test network + */ + def extractParser(weights: DenseVector[Double], trainExs: Seq[TreeInstance[L,W]])(implicit deb: Debinarizer[L]) = { + val inf = inferenceFromWeights(weights).forTesting + 
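+    // Recompute any data-dependent normalization statistics (e.g. for batch normalization)
+    // on a subsample of at most 200 training sentences before building the test-time parser.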
inf.relativizeToData(trainExs.slice(0, Math.min(trainExs.size, 200)).asInstanceOf[Seq[TreeInstance[AnnotatedLabel,String]]]); + Parser(constrainer, inf.grammar, ChartDecoder[L, W]()) + } + + override def initialValueForFeature(f: Feature): Double = 0.0 +} + +object PositionalNeuralModel { + + case class Inference[L, L2, W](annotator: (BinarizedTree[L], IndexedSeq[W]) => BinarizedTree[IndexedSeq[L2]], + constrainer: ChartConstraints.Factory[L, W], + grammar: PositionalNeuralGrammar[L, L2, W], + refinements: GrammarRefinements[L, L2]) extends ParserInference[L, W] { + override def goldMarginal(scorer: Scorer, ti: TreeInstance[L, W], aug: UnrefinedGrammarAnchoring[L, W]): Marginal = { + + import ti._ + + val annotated = annotator(tree, words).map(_.map(refinements.labels.localize)) + + val product = grammar.anchor(words, constrainer.constraints(ti.words)) + LatentTreeMarginal(product, annotated) + } + + // This needs to be different for dropout, so that we can get the right layers + override def forTesting = grammar.origPTModel.inferenceFromWeights(grammar.weights, false) + + def relativizeToData(data: GenTraversable[TreeInstance[AnnotatedLabel,String]]) { + } + } + + @SerialVersionUID(4749637878577393596L) + class PositionalNeuralGrammar[L, L2, W](val topology: RuleTopology[L], + val lexicon: Lexicon[L, W], + val refinedTopology: RuleTopology[L2], + val refinements: GrammarRefinements[L, L2], + labelFeaturizer: RefinedFeaturizer[L, W, Feature], + val surfaceFeaturizer: Word2VecSurfaceFeaturizerIndexed[W], + depFeaturizer: Word2VecDepFeaturizerIndexed[W], + val layers: IndexedSeq[OutputTransform[Array[Int],DenseVector[Double]]#OutputLayer], + penultimateLayers: IndexedSeq[epic.dense.Transform.Layer[Array[Int],DenseVector[Double]]], + depLayers: IndexedSeq[OutputTransform[Array[Int],DenseVector[Double]]#OutputLayer], + val maybeSparseSurfaceFeaturizer: Option[IndexedSpanFeaturizer[L, L2, W]], + decoupledLayers: IndexedSeq[OutputTransform[Array[Int],DenseVector[Double]]#OutputLayer], + penultimateDecoupledLayers: IndexedSeq[epic.dense.Transform.Layer[Array[Int],DenseVector[Double]]], + val weights: DenseVector[Double], + val origPTModel: PositionalNeuralModel[L,L2,W]) extends Grammar[L, W] with Serializable { + + val SpanLayerIdx = 0 + val UnaryLayerIdx = 1 + val BinaryLayerIdx = 2 + val dcSpanFeatOffset = layers.map(_.index.size).foldLeft(0)(_ + _) + depLayers.map(_.index.size).foldLeft(0)(_ + _) + val dcUnaryFeatOffset = dcSpanFeatOffset + (if (decoupledLayers.size > 0) decoupledLayers(0).index.size else 0) + val dcBinaryFeatOffset = dcUnaryFeatOffset + (if (decoupledLayers.size > 0) decoupledLayers(1).index.size else 0) + + override def withPermissiveLexicon: Grammar[L, W] = { + new PositionalNeuralGrammar(topology, lexicon.morePermissive, refinedTopology, refinements, labelFeaturizer, surfaceFeaturizer, + depFeaturizer, layers, penultimateLayers, depLayers, maybeSparseSurfaceFeaturizer, decoupledLayers, penultimateDecoupledLayers, weights, origPTModel) + } + + + /** + * N.B. 
does not extracted expected counts for sparse features; this is done outside this loop + */ + def extractEcounts(m: ParseMarginal[L, W], deriv: DenseVector[Double], scale: Double): Unit = { + val w = m.words + val length = w.length + val sspec = surfaceFeaturizer.anchor(w) + val depSpec = depFeaturizer.anchor(w) + val lspec = labelFeaturizer.anchor(w) + +// val maxTetraLen = ((w.size + 2) * (w.size + 3) * (w.size + 4))/6 + ((w.size + 1) * (w.size + 2))/2 + w.size + 2 + + def tetra(begin: Int, split: Int, end: Int) = { + (end * (end + 1) * (end + 2))/6 + ((split + 1) * split / 2 + begin) + } + + // This representation appears to make things a bit faster? + val ruleCountsPerState = new HashMap[Int,SparseVector[Double]] + val unaryRuleCountsPerState = new HashMap[Int,SparseVector[Double]] + val binaryRuleCountsPerState = new HashMap[Int,SparseVector[Double]] + val spanCountsPerState = new HashMap[Int,SparseVector[Double]] +// val ruleCountsPerState = Array.fill(maxTetraLen)(SparseVector.zeros[Double](labelFeaturizer.index.size)) +// val countsPerHeadDepPair = Array.tabulate(w.size, w.size)((i, j) => 0.0) +// val statesUsed = Array.fill(maxTetraLen)(false) +// val untetra = Array.fill(maxTetraLen)((-1, -1, -1)) + val untetra = new HashMap[Int,(Int,Int,Int)] + + m visit new AnchoredVisitor[L] { + + override def visitUnaryRule(begin: Int, end: Int, rule: Int, ref: Int, score: Double): Unit = { + val tetraIdx = tetra(begin, end, length + 1) + untetra(tetraIdx) = (begin, end, length + 1) + val fv = new FeatureVector(lspec.featuresForUnaryRule(begin, end, rule, ref)) + if (!ruleCountsPerState.contains(tetraIdx)) ruleCountsPerState.put(tetraIdx, SparseVector.zeros[Double](labelFeaturizer.index.size)) + axpy(score, fv, ruleCountsPerState(tetraIdx)) + if (!decoupledLayers.isEmpty) { + if (!unaryRuleCountsPerState.contains(tetraIdx)) unaryRuleCountsPerState.put(tetraIdx, SparseVector.zeros[Double](labelFeaturizer.index.size)) + axpy(score, fv, unaryRuleCountsPerState(tetraIdx)) + } + } + + override def visitSpan(begin: Int, end: Int, tag: Int, ref: Int, score: Double): Unit = { + val tetraIdx = tetra(begin, end, length + 2) + untetra(tetraIdx) = (begin, end, length + 2) + val fv = new FeatureVector(lspec.featuresForSpan(begin, end, tag, ref)) + if (!ruleCountsPerState.contains(tetraIdx)) ruleCountsPerState.put(tetraIdx, SparseVector.zeros[Double](labelFeaturizer.index.size)) + axpy(score, fv, ruleCountsPerState(tetraIdx)) + if (!decoupledLayers.isEmpty) { + if (!spanCountsPerState.contains(tetraIdx)) spanCountsPerState.put(tetraIdx, SparseVector.zeros[Double](labelFeaturizer.index.size)) + axpy(score, fv, spanCountsPerState(tetraIdx)) + } + } + + override def visitBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int, score: Double): Unit = { + val tetraIdx = tetra(begin, split, end) + untetra(tetraIdx) = (begin, split, end) + val fv = new FeatureVector(lspec.featuresForBinaryRule(begin, split, end, rule, ref)) + if (!ruleCountsPerState.contains(tetraIdx)) ruleCountsPerState.put(tetraIdx, SparseVector.zeros[Double](labelFeaturizer.index.size)) + axpy(score, fv, ruleCountsPerState(tetraIdx)) + if (!decoupledLayers.isEmpty) { + if (!binaryRuleCountsPerState.contains(tetraIdx)) binaryRuleCountsPerState.put(tetraIdx, SparseVector.zeros[Double](labelFeaturizer.index.size)) + axpy(score, fv, binaryRuleCountsPerState(tetraIdx)) + } + } + } + + for (key <- ruleCountsPerState.keySet) { + val (begin, split, end) = untetra(key) + val ffeats = if (end > length) sspec.featuresForSpan(begin, split) else 
sspec.featuresForSplit(begin, split, end) + var layerSizeTally = 0 + for (j <- 0 until layers.size) { + layers(j).tallyDerivative(deriv(layerSizeTally until layerSizeTally + layers(j).index.size), { ruleCountsPerState(key) * scale }, ffeats) + layerSizeTally += layers(j).index.size; + } + } + if (!decoupledLayers.isEmpty) { + for (key <- spanCountsPerState.keySet) { + val (begin, end, _) = untetra(key) + val ffeats = sspec.reducedFeaturesForSpan(begin, end) + decoupledLayers(SpanLayerIdx).tallyDerivative(deriv(dcSpanFeatOffset until dcSpanFeatOffset + decoupledLayers(SpanLayerIdx).index.size), { spanCountsPerState(key) * scale }, ffeats) + } + for (key <- unaryRuleCountsPerState.keySet) { + val (begin, end, _) = untetra(key) + val ffeats = sspec.reducedFeaturesForSpan(begin, end) + decoupledLayers(UnaryLayerIdx).tallyDerivative(deriv(dcUnaryFeatOffset until dcUnaryFeatOffset + decoupledLayers(UnaryLayerIdx).index.size), { unaryRuleCountsPerState(key) * scale }, ffeats) + } + for (key <- binaryRuleCountsPerState.keySet) { + val (begin, split, end) = untetra(key) + val ffeats = sspec.featuresForSplit(begin, split, end) + decoupledLayers(BinaryLayerIdx).tallyDerivative(deriv(dcBinaryFeatOffset until dcBinaryFeatOffset + decoupledLayers(BinaryLayerIdx).index.size), { binaryRuleCountsPerState(key) * scale }, ffeats) + } + } + } + + def anchor(w: IndexedSeq[W], cons: ChartConstraints[L]):GrammarAnchoring[L, W] = new ProjectionsGrammarAnchoring[L, L2, W] { + + override def addConstraints(constraints: ChartConstraints[L]): GrammarAnchoring[L, W] = { + anchor(w, cons & constraints) + } + + override def sparsityPattern: ChartConstraints[L] = cons + + def refinements = PositionalNeuralGrammar.this.refinements + def refinedTopology: RuleTopology[L2] = PositionalNeuralGrammar.this.refinedTopology + + val topology = PositionalNeuralGrammar.this.topology + val lexicon = PositionalNeuralGrammar.this.lexicon + + def words = w + + val l = w.size + val maxTetraLen = ((l + 2) * (l + 3) * (l + 4))/6 + ((l + 1) * (l + 2))/2 + l + 2 + + // Doesn't make things faster to use HashMaps here + val cache = Array.tabulate(layers.size + decoupledLayers.size)(i => new Array[DenseVector[Double]](maxTetraLen)) + val finalCache = Array.tabulate(layers.size + decoupledLayers.size)(i => new Array[SparseVector[Double]](maxTetraLen)) + + def getOrElseUpdate(layerIdx: Int, tetraIdx: Int, fun: => DenseVector[Double]) = { + if (cache(layerIdx)(tetraIdx) == null) cache(layerIdx)(tetraIdx) = fun + cache(layerIdx)(tetraIdx) + } + + def getOrElseUpdateFinal(layerIdx: Int, tetraIdx: Int, rfeatIdx: Int, maxVectSize: Int, fun: => Double) = { + if (finalCache(layerIdx)(tetraIdx) == null) finalCache(layerIdx)(tetraIdx) = SparseVector.zeros(maxVectSize) + if (!finalCache(layerIdx)(tetraIdx).contains(rfeatIdx)) finalCache(layerIdx)(tetraIdx)(rfeatIdx) = fun + finalCache(layerIdx)(tetraIdx)(rfeatIdx) + } + + val sspec = surfaceFeaturizer.anchor(w) + val depSpec = depFeaturizer.anchor(w) + val lspec = labelFeaturizer.anchor(w) + val fspec = if (maybeSparseSurfaceFeaturizer.isDefined) maybeSparseSurfaceFeaturizer.get.anchor(w) else null + val sparseFeatsStart = if (maybeSparseSurfaceFeaturizer.isDefined) (layers.map(_.index.size).foldLeft(0)(_ + _) + depLayers.map(_.index.size).foldLeft(0)(_ + _) + decoupledLayers.map(_.index.size).foldLeft(0)(_ + _)) else -1 + + private def tetra(begin: Int, split: Int, end: Int) = { + (end * (end + 1) * (end + 2))/6 + ((split + 1) * split / 2 + begin) + } + + def scoreBinaryRule(begin: Int, split: Int, end: 
Int, rule: Int, ref: Int) = { + var total = 0.0; + val tetraIdx = tetra(begin, split, end) + val rfeats = lspec.featuresForBinaryRule(begin, split, end, rule, ref) + for (layerIdx <- 0 until layers.size) { + val fs = getOrElseUpdate(layerIdx, tetraIdx, { penultimateLayers(layerIdx).activations(sspec.featuresForSplit(begin, split, end)) }) + for (rfeat <- rfeats) { + total += getOrElseUpdateFinal(layerIdx, tetraIdx, rfeat, labelFeaturizer.index.size, { layers(layerIdx).activationsFromPenultimateDot(fs, rfeat) }) + } + } + if (!decoupledLayers.isEmpty) { + val layerIdx = layers.size + BinaryLayerIdx + val fs = getOrElseUpdate(layerIdx, tetraIdx, { penultimateDecoupledLayers(BinaryLayerIdx).activations(sspec.featuresForSplit(begin, split, end)) }) + for (rfeat <- rfeats) { + total += getOrElseUpdateFinal(layerIdx, tetraIdx, rfeat, labelFeaturizer.index.size, { decoupledLayers(BinaryLayerIdx).activationsFromPenultimateDot(fs, rfeat) }) + } + } + if (maybeSparseSurfaceFeaturizer.isDefined) { + total += dot(fspec.featuresForBinaryRule(begin, split, end, rule, ref), sparseFeatsStart) + } + total + } + + def scoreUnaryRule(begin: Int, end: Int, rule: Int, ref: Int) = { + var total = 0.0; + val tetraIdx = tetra(begin, end, length + 1) + val rfeats = lspec.featuresForUnaryRule(begin, end, rule, ref) + for (layerIdx <- 0 until layers.size) { + val fs = getOrElseUpdate(layerIdx, tetraIdx, { penultimateLayers(layerIdx).activations(sspec.featuresForSpan(begin, end)) }) + for (rfeat <- rfeats) { + total += getOrElseUpdateFinal(layerIdx, tetraIdx, rfeat, labelFeaturizer.index.size, { layers(layerIdx).activationsFromPenultimateDot(fs, rfeat) }) + } + } + if (!decoupledLayers.isEmpty) { + val layerIdx = layers.size + UnaryLayerIdx + val fs = getOrElseUpdate(layerIdx, tetraIdx, { penultimateDecoupledLayers(UnaryLayerIdx).activations(sspec.reducedFeaturesForSpan(begin, end)) }) + for (rfeat <- rfeats) { + total += getOrElseUpdateFinal(layerIdx, tetraIdx, rfeat, labelFeaturizer.index.size, { decoupledLayers(UnaryLayerIdx).activationsFromPenultimateDot(fs, rfeat) }) + } + } + if (maybeSparseSurfaceFeaturizer.isDefined) { + total += dot(fspec.featuresForUnaryRule(begin, end, rule, ref), sparseFeatsStart) + } + total + } + + def scoreSpan(begin: Int, end: Int, tag: Int, ref: Int) = { + var total = 0.0; + val tetraIdx = tetra(begin, end, length + 2) + val rfeats = lspec.featuresForSpan(begin, end, tag, ref) + for (layerIdx <- 0 until layers.size) { + val fs = getOrElseUpdate(layerIdx, tetraIdx, { penultimateLayers(layerIdx).activations(sspec.featuresForSpan(begin, end)) }) + for (rfeat <- rfeats) { + total += getOrElseUpdateFinal(layerIdx, tetraIdx, rfeat, labelFeaturizer.index.size, { layers(layerIdx).activationsFromPenultimateDot(fs, rfeat) }) + } + } + if (!decoupledLayers.isEmpty) { + val layerIdx = layers.size + SpanLayerIdx + val fs = getOrElseUpdate(layerIdx, tetraIdx, { penultimateDecoupledLayers(SpanLayerIdx).activations(sspec.reducedFeaturesForSpan(begin, end)) }) + for (rfeat <- rfeats) { + total += getOrElseUpdateFinal(layerIdx, tetraIdx, rfeat, labelFeaturizer.index.size, { decoupledLayers(SpanLayerIdx).activationsFromPenultimateDot(fs, rfeat) }) + } + } + if (maybeSparseSurfaceFeaturizer.isDefined) { + total += dot(fspec.featuresForSpan(begin, end, tag, ref), sparseFeatsStart) + } + total + } + + private def dot(features: Array[Int], sparseFeaturesOffset: Int) = { + var i = 0 + var score = 0.0 + val wdata = weights.data + while(i < features.length) { + score += wdata(features(i) + 
sparseFeaturesOffset) + i += 1 + } + score + } + } + } +} diff --git a/src/main/scala/epic/parser/models/PositionalNeuralModelFactory.scala b/src/main/scala/epic/parser/models/PositionalNeuralModelFactory.scala new file mode 100644 index 00000000..b370317e --- /dev/null +++ b/src/main/scala/epic/parser/models/PositionalNeuralModelFactory.scala @@ -0,0 +1,321 @@ +package epic.parser +package models + +import java.io.File +import breeze.config.Help +import breeze.features.FeatureVector +import breeze.linalg._ +import breeze.util.Index +import epic.constraints.ChartConstraints +import epic.dense.{IdentityTransform, AffineTransform, Transform} +import epic.features.SurfaceFeaturizer.SingleWordSpanFeaturizer +import epic.features._ +import epic.framework.Feature +import epic.lexicon.Lexicon +import epic.parser.projections.GrammarRefinements +import epic.trees._ +import epic.trees.annotations.TreeAnnotator +import epic.util.{LRUCache, Optional} +import epic.dense.Transform +import epic.dense.TanhTransform +import epic.dense.OutputTransform +import epic.dense.AffineOutputTransform +import epic.dense.OutputEmbeddingTransform +import epic.dense.Word2Vec +import scala.collection.mutable.HashMap +import epic.dense.Word2VecSurfaceFeaturizerIndexed +import epic.dense.Word2VecDepFeaturizerIndexed +import epic.dense.Word2VecIndexed +import epic.dense.FrequencyTagger +import epic.dense.CachingLookupTransform +import epic.dense.CachingLookupAndAffineTransformDense +import epic.dense.EmbeddingsTransform +import epic.dense.NonlinearTransform +import scala.io.Source +import scala.collection.mutable.HashSet +import epic.dense.BatchNormalizationTransform + +/** + * Entry point for instantiating a neural CRF parser. Parameters specify neural + * net parameters, word vectors, and sparse features to use. + * + * @author gdurrett + **/ + + +/** + * Less-used parameters + */ +case class ExtraPNMParams(@Help(text="Used for ablations with random word embeddings; don't change this. Options: normal, random, trivial, normalpos") + embeddingType: String = "normal", + @Help(text="Use longest frequent suffix (standard representation) for sparse feats") + useSparseLfsuf: Boolean = true, + @Help(text="Use sparse Brown cluster features") + useSparseBrown: Boolean = false, + @Help(text="Use expanded set of sparse surface features (doesn't help)") + useMostSparseIndicators: Boolean = false, + @Help(text="Scaling factor for all input vectors") + vectorRescaling: Double = 1.0, + @Help(text="Use the output embedding model (Figure 4b in the neural CRF paper)") + outputEmbedding: Boolean = false, + @Help(text="Dimension of the output embedding model") + outputEmbeddingDim: Int = 20, + @Help(text="When initializing the output embedding model, initialize based on root symbols") + coarsenByRoot: Boolean = false, + @Help(text="Use separate neural net parameters for span/unary/binary settings. Doesn't help.") + decoupleTransforms: Boolean = false, + @Help(text="Extract additional output features based on root label.") + useRootLabel: Boolean = false, + @Help(text="Set unknown word vectors to be random rather than 0") + randomizeUnks: Boolean = false) + +case class PositionalNeuralModelFactory(@Help(text= + """The kind of annotation to do on the refined grammar. Default uses just parent annotation. +You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Manning 2003. 
+ """) + annotator: TreeAnnotator[AnnotatedLabel, String, AnnotatedLabel] = GenerativeParser.defaultAnnotator(), + @Help(text="For features not seen in gold trees, we bin them into dummyFeats * numGoldFeatures bins using hashing. If negative, use absolute value as number of hash features.") + dummyFeats: Double = 0.5, + @Help(text="Sparse features only fire on suffixes seen at least this many times. Lower than 100 doesn't seem to do better.") + commonWordThreshold: Int = 100, + @Help(text="Combine the neural net features with sparse features. The NN does well on its own but sparse helps by >1 F1.") + useSparseFeatures: Boolean = true, + @Help(text="Nonlinearity to use. Options: tanh, relu, cube") + nonLinType: String = "relu", + @Help(text="Backpropagate into word embeddings (tune them during training). Doesn't help.") + backpropIntoEmbeddings: Boolean = false, + @Help(text="Dropout rate; 0.0 won't instantiate any dropout units, higher rates will, but it doesn't seem to help.") + dropoutRate: Double = 0.0, + @Help(text="Width of hidden layer to use.") + numHidden: Int = 200, + @Help(text="Number of hidden layers to use. More than 1 slows down dramatically and doesn't help.") + numHiddenLayers: Int = 1, + @Help(text="How much surface context should we use as input to the neural network? Default is +/-2 words around begin/end/split. See Word2VecSurfaceFeaturizer for options") + neuralSurfaceWordsToUse: String = "most", + @Help(text="Path to word vectors. Can either be .bin like Mikolov et al.'s or .txt like Bansal et al.'s") + word2vecPath: String = "", + @Help(text="Load additional word vectors into the model rather than just those in the training set. Doesn't help.") + vocFile: String = "", + @Help(text="Set to true if your word vectors are all lowercase. 
Otherwise true case is used.") + lowercasedVectors: Boolean = false, + extraPNMParams: ExtraPNMParams = ExtraPNMParams()) extends ParserModelFactory[AnnotatedLabel, String] { + + type MyModel = PositionalNeuralModel[AnnotatedLabel, AnnotatedLabel, String] + + + + override def make(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], + topology: RuleTopology[AnnotatedLabel], + lexicon: Lexicon[AnnotatedLabel, String], + constrainer: ChartConstraints.Factory[AnnotatedLabel, String]): MyModel = { + import extraPNMParams._ + val annTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]] = trainTrees.map(annotator(_)) + println("Here's what the annotation looks like on the first few trees") + annTrees.slice(0, Math.min(3, annTrees.size)).foreach(tree => println(tree.render(false))) + + val (annWords, annBinaries, annUnaries) = this.extractBasicCounts(annTrees) + val refGrammar = RuleTopology(AnnotatedLabel.TOP, annBinaries, annUnaries) + + val xbarGrammar = topology + val xbarLexicon = lexicon + + val indexedRefinements = GrammarRefinements(xbarGrammar, refGrammar, (_: AnnotatedLabel).baseAnnotatedLabel) + + def labelFeaturizer(l: AnnotatedLabel) = Set(l, l.baseAnnotatedLabel).toSeq + def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if (useRootLabel) { + Set(r, r.map(_.baseAnnotatedLabel), ParentFeature(r.parent)).toSeq + } else { + Set(r, r.map(_.baseAnnotatedLabel)).toSeq + } + + val prodFeaturizer = new ProductionFeaturizer[AnnotatedLabel, AnnotatedLabel, String](xbarGrammar, indexedRefinements, lGen=labelFeaturizer, rGen=ruleFeaturizer) + + + /////////////////////// + // READ IN WORD VECTORS + val tagCountsLexicon = TagSpanShapeGenerator.makeStandardLexicon(annTrees) + val freqTagger = new FrequencyTagger(tagCountsLexicon) + + val voc = new HashSet[String]() + // Add words in the training set + val summedWordCounts: Counter[String, Double] = sum(annWords, Axis._0) + voc ++= summedWordCounts.keySet.toSet[String].map(str => Word2Vec.convertWord(str, lowercasedVectors)) + // Read in a file of words in the treebank; this allows us to load words that are + // in the dev or test sets but not in train + voc ++= (if (vocFile != "") Source.fromFile(vocFile).getLines().map(str => Word2Vec.convertWord(str, lowercasedVectors)).toSet else Set[String]()) + val word2vec = if (embeddingType == "trivial") { + Word2Vec.makeRandomVectorsForVocabulary(voc.toSet, 0, true) + } else if (embeddingType == "random") { + Word2Vec.makeRandomVectorsForVocabulary(voc.toSet, 50, true) + } else { + Word2Vec.smartLoadVectorsForVocabulary(word2vecPath.split(":"), voc.toSet, summedWordCounts, if (embeddingType == "trivial") 1 else Int.MaxValue, true, randomizeUnks) + } + // Convert Array[Float] values to Array[Double] values and rescale them + val word2vecDoubleVect = word2vec.map(keyValue => (keyValue._1 -> keyValue._2.map(_.toDouble * vectorRescaling))) +// val word2vecDoubleVect = word2vec.map(keyValue => (keyValue._1 -> new DenseVector[Double](keyValue._2.map(_.toDouble)))) + val word2vecIndexed: Word2VecIndexed[String] = if (embeddingType == "normalpos") { + Word2VecIndexed(word2vecDoubleVect, (str: String) => Word2Vec.convertWord(str, lowercasedVectors)).augment(freqTagger.tagTypesIdx.size, freqTagger.convertToFeaturizer) + } else { + Word2VecIndexed(word2vecDoubleVect, (str: String) => Word2Vec.convertWord(str, lowercasedVectors)) + } + ////////////////////// + + val surfaceFeaturizer = new Word2VecSurfaceFeaturizerIndexed(word2vecIndexed, neuralSurfaceWordsToUse) + val depFeaturizer = new 
Word2VecDepFeaturizerIndexed(word2vecIndexed, freqTagger, topology) + + val transforms = if (decoupleTransforms) { + IndexedSeq[AffineOutputTransform[Array[Int]]]() + } else { + val inputSize = surfaceFeaturizer.splitInputSize + val transform = if (outputEmbedding) { + val coarsenerForInitialization = if (coarsenByRoot) { + Option(PositionalNeuralModelFactory.getRuleToParentMapping(prodFeaturizer.index)) + } else { + None + } + PositionalNeuralModelFactory.buildNetOutputEmbedding(word2vecIndexed, inputSize, numHidden, numHiddenLayers, prodFeaturizer.index.size, nonLinType, dropoutRate, backpropIntoEmbeddings, outputEmbeddingDim, coarsenerForInitialization) + } else { + // THIS IS THE STANDARD CODE PATH + println(inputSize + " x (" + numHidden + ")^" + numHiddenLayers + " x " + prodFeaturizer.index.size + " neural net") + PositionalNeuralModelFactory.buildNet(word2vecIndexed, inputSize, numHidden, numHiddenLayers, prodFeaturizer.index.size, nonLinType, dropoutRate, backpropIntoEmbeddings) + } + IndexedSeq(transform) + } + val depTransforms: IndexedSeq[AffineOutputTransform[Array[Int]]] = IndexedSeq() + val decoupledTransforms = if (decoupleTransforms) { + // Span and unary use the reduced input (no split point features), whereas surface uses the split point features + val inputSizes = Seq(surfaceFeaturizer.reducedInputSize, surfaceFeaturizer.reducedInputSize, surfaceFeaturizer.splitInputSize) + inputSizes.map(inputSize => PositionalNeuralModelFactory.buildNet(word2vecIndexed, inputSize, numHidden, numHiddenLayers, prodFeaturizer.index.size, nonLinType, dropoutRate, backpropIntoEmbeddings)) + } else { + IndexedSeq[AffineOutputTransform[Array[Int]]]() + } + + println(transforms.size + " transforms, " + transforms.map(_.index.size).toSeq + " parameters for each") + println(depTransforms.size + " dep transforms, " + depTransforms.map(_.index.size).toSeq + " parameters for each") + println(decoupledTransforms.size + " decoupled transforms, " + decoupledTransforms.map(_.index.size).toSeq + " parameters for each") + + val maybeSparseFeaturizer = if (useSparseFeatures) { + var wf = SpanModelFactory.defaultPOSFeaturizer(annWords, useBrown = useSparseBrown) + var span = SpanModelFactory.goodFeaturizer(annWords, commonWordThreshold, useShape = false, useLfsuf = useSparseLfsuf, useBrown = useSparseBrown, useMostSparseIndicators = useMostSparseIndicators) + span += new SingleWordSpanFeaturizer[String](wf) + val indexedWord = IndexedWordFeaturizer.fromData(wf, annTrees.map{_.words}, deduplicateFeatures = false) + val indexedSurface = IndexedSplitSpanFeaturizer.fromData(span, annTrees, bloomFilter = false) + + def sparseLabelFeaturizer(l: AnnotatedLabel) = Set(l, l.baseAnnotatedLabel).toSeq + def sparseRuleFeaturizer(r: Rule[AnnotatedLabel]) = Set(r, r.map(_.baseAnnotatedLabel)).toSeq + val sparseProdFeaturizer = new ProductionFeaturizer[AnnotatedLabel, AnnotatedLabel, String](xbarGrammar, indexedRefinements, lGen=sparseLabelFeaturizer, rGen=sparseRuleFeaturizer) + + val indexed = IndexedSpanFeaturizer.extract[AnnotatedLabel, AnnotatedLabel, String](indexedWord, + indexedSurface, + sparseProdFeaturizer, + new ZeroRuleAndSpansFeaturizer(), + annotator.latent, + indexedRefinements, + xbarGrammar, + if(dummyFeats < 0) HashFeature.Absolute(-dummyFeats.toInt) else HashFeature.Relative(dummyFeats), + filterUnseenFeatures = false, + minFeatCount = 1, + trainTrees) + Option(indexed) + } else { + None + } + + new PositionalNeuralModel(annotator.latent, + constrainer, + topology, lexicon, + refGrammar, + 
indexedRefinements, + prodFeaturizer, + surfaceFeaturizer, + depFeaturizer, + transforms, + maybeSparseFeaturizer, + depTransforms, + decoupledTransforms) + } +} + +object PositionalNeuralModelFactory { + + def buildNetInnerTransforms(word2vecIndexed: Word2VecIndexed[String], + inputSize: Int, + numHidden: Int, + numHiddenLayers: Int, + nonLinType: String, + dropoutRate: Double, + backpropIntoEmbeddings: Boolean): Transform[Array[Int],DenseVector[Double]] = { + if (numHiddenLayers == 0) { + new CachingLookupTransform(word2vecIndexed) + } else { + val baseTransformLayer = if (backpropIntoEmbeddings) { + new EmbeddingsTransform(numHidden, inputSize, word2vecIndexed) + } else { + new CachingLookupAndAffineTransformDense(numHidden, inputSize, word2vecIndexed) + } + var currLayer = addNonlinearity(nonLinType, numHidden, dropoutRate, baseTransformLayer) + for (i <- 1 until numHiddenLayers) { + currLayer = addNonlinearity(nonLinType, numHidden, dropoutRate, new AffineTransform(numHidden, numHidden, currLayer)) + } + currLayer + } + } + + def buildNet(word2vecIndexed: Word2VecIndexed[String], + inputSize: Int, + numHidden: Int, + numHiddenLayers: Int, + outputSize: Int, + nonLinType: String, + dropoutRate: Double, + backpropIntoEmbeddings: Boolean): AffineOutputTransform[Array[Int]] = { + val innerTransform = buildNetInnerTransforms(word2vecIndexed, inputSize, numHidden, numHiddenLayers, nonLinType, dropoutRate, backpropIntoEmbeddings) + new AffineOutputTransform(outputSize, if (numHiddenLayers >= 1) numHidden else inputSize, innerTransform) + } + + + def buildNetOutputEmbedding(word2vecIndexed: Word2VecIndexed[String], + inputSize: Int, + numHidden: Int, + numHiddenLayers: Int, + outputSize: Int, + nonLinType: String, + dropoutRate: Double, + backpropIntoEmbeddings: Boolean, + outputEmbeddingDim: Int, + coarsenerForInitialization: Option[Int => Int]): OutputTransform[Array[Int],DenseVector[Double]] = { + val innerTransform = buildNetInnerTransforms(word2vecIndexed, inputSize, numHidden, numHiddenLayers, nonLinType, dropoutRate, backpropIntoEmbeddings) + + val innerTransformLastLayer = new AffineTransform(outputEmbeddingDim, if (numHiddenLayers >= 1) numHidden else inputSize, innerTransform) + new OutputEmbeddingTransform(outputSize, outputEmbeddingDim, innerTransformLastLayer, coarsenerForInitialization) + } + + def addNonlinearity(nonLinType: String, numHidden: Int, dropoutRate: Double, currLayer: Transform[Array[Int],DenseVector[Double]]) = { + val useDropout = dropoutRate > 1e-8 + var tmpLayer = currLayer + tmpLayer = new NonlinearTransform(nonLinType, numHidden, tmpLayer) + if (useDropout) { + tmpLayer = new NonlinearTransform("dropout", numHidden, tmpLayer, dropoutRate) + } + tmpLayer + } + + def getRuleToParentMapping(index: Index[Feature]): Int => Int = { + (i: Int) => { + if (index.get(i).isInstanceOf[Rule[AnnotatedLabel]]) { + val parentIdx = index(index.get(i).asInstanceOf[Rule[AnnotatedLabel]].parent) + if (parentIdx == -1) { + 0 + } else { + parentIdx + } + } else { + i + } + } + } +} + +case class ParentFeature(f: Feature) extends Feature; +case class LeftChildFeature(f: Feature) extends Feature; +case class RightChildFeature(f: Feature) extends Feature; diff --git a/src/main/scala/epic/parser/models/SpanModel.scala b/src/main/scala/epic/parser/models/SpanModel.scala index 30b5ffbb..75e7cb1b 100644 --- a/src/main/scala/epic/parser/models/SpanModel.scala +++ b/src/main/scala/epic/parser/models/SpanModel.scala @@ -338,6 +338,7 @@ You can also epic.trees.annotations.KMAnnotator to get 
more or less Klein and Ma useNGrams:Boolean = false, maxNGramOrder:Int = 2, useGrammar: Boolean = true, + useChildFeats: Boolean = false, useFullShape: Boolean = false, useSplitShape: Boolean = false, posFeaturizer: Optional[WordFeaturizer[String]] = None, @@ -399,7 +400,23 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma def labelFeaturizer(l: AnnotatedLabel) = Set(l, l.baseAnnotatedLabel).toSeq - def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if(useGrammar) Set(r, r.map(_.baseAnnotatedLabel)).toSeq else if(r.isInstanceOf[UnaryRule[AnnotatedLabel]]) Set(r.parent, r.parent.baseAnnotatedLabel).toSeq else Seq.empty + +// def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if(useGrammar) Set(r, r.map(_.baseAnnotatedLabel)).toSeq else if(r.isInstanceOf[UnaryRule[AnnotatedLabel]]) Set(r.parent, r.parent.baseAnnotatedLabel).toSeq else Seq.empty + def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if(useGrammar) { + if (useChildFeats && r.isInstanceOf[BinaryRule[AnnotatedLabel]]) { + Set(r, + r.map(_.baseAnnotatedLabel), + new LeftChildFeature(r.asInstanceOf[BinaryRule[AnnotatedLabel]].left), + new RightChildFeature(r.asInstanceOf[BinaryRule[AnnotatedLabel]].right)).toSeq + } else { + Set(r, r.map(_.baseAnnotatedLabel)).toSeq + } + } else if(r.isInstanceOf[UnaryRule[AnnotatedLabel]]) { + Set(r.parent, r.parent.baseAnnotatedLabel).toSeq + } else { + Seq.empty + } + val featurizer = new ProductionFeaturizer[AnnotatedLabel, AnnotatedLabel, String](xbarGrammar, indexedRefinements, lGen=labelFeaturizer, @@ -584,15 +601,24 @@ case class LatentSpanModelFactory(inner: SpanModelFactory, object SpanModelFactory { def goodFeaturizer[L](wordCounts: Counter2[AnnotatedLabel, String, Double], commonWordThreshold: Int = 100, - useShape: Boolean = true) = { + useShape: Boolean = true, + useLfsuf: Boolean = true, + useBrown: Boolean = false, + useMostSparseIndicators: Boolean = false) = { val dsl = new WordFeaturizer.DSL(wordCounts, commonWordThreshold) with SurfaceFeaturizer.DSL with SplitSpanFeaturizer.DSL import dsl._ // class(split + 1) - val baseCat = lfsuf - + var baseCat: WordFeaturizer[String] = new ZeroFeaturizer[String]; + if (useLfsuf) { + baseCat += lfsuf + } + if (useBrown) { + baseCat += new BrownClusterFeaturizer(Array(4, 10)) + } + val leftOfSplit: SplitSpanFeaturizer[String] = ((baseCat)(-1)apply (split)) - + var featurizer: SplitSpanFeaturizer[String] = zeroSplit[String] featurizer += baseCat(begin) featurizer += baseCat(end-1) @@ -601,6 +627,14 @@ object SpanModelFactory { featurizer += leftOfSplit featurizer += baseCat(split) featurizer += length + if (useMostSparseIndicators) { + featurizer += baseCat(begin-2) + featurizer += baseCat(end-2) + featurizer += baseCat(begin+1) + featurizer += baseCat(end+1) + featurizer += ((baseCat)(-2)apply (split)) + featurizer += ((baseCat)(1)apply (split)) + } featurizer += distance[String](begin, split) featurizer += distance[String](split, end) @@ -609,11 +643,16 @@ object SpanModelFactory { featurizer } - def defaultPOSFeaturizer(annWords: Counter2[AnnotatedLabel, String, Double]): WordFeaturizer[String] = { + def defaultPOSFeaturizer(annWords: Counter2[AnnotatedLabel, String, Double], useBrown: Boolean = false): WordFeaturizer[String] = { { val dsl = new WordFeaturizer.DSL(annWords) import dsl._ - unigrams(word, 1) + suffixes() + prefixes() + if (useBrown) { + val brown = new BrownClusterFeaturizer(Array(4, 10)) + unigrams(brown, 1) + unigrams(word, 1) + suffixes() + prefixes() + } else { + unigrams(word, 1) + suffixes() + 
prefixes() + } } } @@ -636,6 +675,7 @@ object SpanModelFactory { new CachedChartConstraintsFactory[AnnotatedLabel, String](uncached) } + val mf = new SpanModelFactory(annotator = annotator, posFeaturizer = posFeaturizer, spanFeaturizer = spanFeaturizer).make(trees, topo, lexicon, constraints) val mobj = new ModelObjective(mf, trees) diff --git a/src/main/scala/epic/parser/models/TransformModel.scala b/src/main/scala/epic/parser/models/TransformModel.scala index 272d383e..c0c60de9 100644 --- a/src/main/scala/epic/parser/models/TransformModel.scala +++ b/src/main/scala/epic/parser/models/TransformModel.scala @@ -47,7 +47,7 @@ class TransformModel[L, L2, W](annotator: (BinarizedTree[L], IndexedSeq[W]) => B override def featureIndex: Index[Feature] = transform.index override def inferenceFromWeights(weights: DenseVector[Double]): Inference = { - val layer = transform.extractLayer(weights) + val layer = transform.extractLayer(weights, true) val grammar = new TransformModel.TransformGrammar[L, L2, W, transform.type](topology, lexicon, refinedTopology, refinements, labelFeaturizer, surfaceFeaturizer, layer) new Inference(annotator, constrainer, grammar, refinements) @@ -94,7 +94,10 @@ object TransformModel { val sspec = surfaceFeaturizer.anchor(w) val lspec = labelFeaturizer.anchor(w) - // cache: we remember the (begin/end) pair we saw with each + // For each split point, remember the (begin, end) pair that that split point was observed with. There'll + // only be one in the gold, but more in the prediction. Accumulate rule counts (output layer) until + // we need this split point for a different set of indices or we come to the end. Then, backpropagate + // the rule marginals through the network to get the derivative. val UNUSED = (-1, -1) val states = Array.fill(w.length + 2)(UNUSED) // 1 for each split, length for unaries, length +1 for spans val ruleCountsPerState = Array.fill(w.length + 2)(SparseVector.zeros[Double](labelFeaturizer.index.size)) @@ -187,7 +190,6 @@ object TransformModel { layer.activations(new FeatureVector(sfeats)) }) val rfeats = lspec.featuresForUnaryRule(begin, end, rule, ref) - new FeatureVector(rfeats) dot fs } @@ -197,7 +199,6 @@ object TransformModel { layer.activations(new FeatureVector(sfeats)) }) val rfeats = lspec.featuresForSpan(begin, end, tag, ref) - new FeatureVector(rfeats) dot fs } diff --git a/src/main/scala/epic/trees/AnnotatedLabel.scala b/src/main/scala/epic/trees/AnnotatedLabel.scala index 36a760d4..4ba2645c 100644 --- a/src/main/scala/epic/trees/AnnotatedLabel.scala +++ b/src/main/scala/epic/trees/AnnotatedLabel.scala @@ -101,8 +101,8 @@ object AnnotatedLabel { Array("PRT") } else if (label.startsWith("-") || label.isEmpty || label == "#") { Array(label) - } else if (label.contains("#")) { - val splits = label.split("#").filter(_.nonEmpty) + } else if (label.contains("##")) { // SPMRL uses two ## as the delimiter for this info + val splits = label.split("##").filter(_.nonEmpty) val nonmorphSplits = splits.head.split("[-=]") val morphSplits = splits.tail.flatMap(_.split("[|]")).filter("_" != _) nonmorphSplits ++ morphSplits @@ -142,4 +142,4 @@ object AnnotatedLabel { def get(t: AnnotatedLabel) = t.label def set(t: AnnotatedLabel, u: String) = t.copy(u) } -} \ No newline at end of file +} diff --git a/src/test/scala/epic/dense/AffineTransformTest.scala b/src/test/scala/epic/dense/AffineTransformTest.scala index 3bc4bf55..56b5ba32 100644 --- a/src/test/scala/epic/dense/AffineTransformTest.scala +++ b/src/test/scala/epic/dense/AffineTransformTest.scala @@ 
-16,7 +16,7 @@ class AffineTransformTest extends FunSuite { val dv = DenseVector.rand(10) val objective = new DiffFunction[DenseVector[Double]] { def calculate(x: DenseVector[Double]): (Double, DenseVector[Double]) = { - val layer = index.extractLayer(x) + val layer = index.extractLayer(x, true) val acts = layer.activations(dv) val obj = acts.sum val deriv = DenseVector.zeros[Double](x.length) @@ -36,7 +36,7 @@ class AffineTransformTest extends FunSuite { val target = DenseVector.rand(11) * 100.0 val objective = new DiffFunction[DenseVector[Double]] { def calculate(x: DenseVector[Double]): (Double, DenseVector[Double]) = { - val layer = index.extractLayer(x) + val layer = index.extractLayer(x, true) val acts = layer.activations(dv) val obj = math.pow(norm(target - acts, 2), 2) / 2 val initDeriv = acts - target @@ -48,7 +48,7 @@ class AffineTransformTest extends FunSuite { val weights: DenseVector[Double] = (DenseVector.rand[Double](index.index.size) - 0.5) * 4.0 val diffs = GradientTester.test[Int, DenseVector[Double]](objective, weights, randFraction = 1.0) - assert(max(diffs) < 4E-3, s"${diffs.max} was bigger than expected!!") + assert(max(diffs) < 2E-2, s"${diffs.max} was bigger than expected!!") } } diff --git a/src/test/scala/epic/dense/TanhTransformTest.scala b/src/test/scala/epic/dense/TanhTransformTest.scala index c0d1c00d..fcc46f6b 100644 --- a/src/test/scala/epic/dense/TanhTransformTest.scala +++ b/src/test/scala/epic/dense/TanhTransformTest.scala @@ -16,7 +16,7 @@ class TanhTransformTest extends FunSuite { val dv = DenseVector.rand(10) val objective = new DiffFunction[DenseVector[Double]] { def calculate(x: DenseVector[Double]): (Double, DenseVector[Double]) = { - val layer = index.extractLayer(x) + val layer = index.extractLayer(x, true) val acts = layer.activations(dv) val obj = acts.sum val deriv = DenseVector.zeros[Double](x.length) @@ -36,7 +36,7 @@ class TanhTransformTest extends FunSuite { val dv = DenseVector.rand(10) val objective = new DiffFunction[DenseVector[Double]] { def calculate(x: DenseVector[Double]): (Double, DenseVector[Double]) = { - val layer = index.extractLayer(x) + val layer = index.extractLayer(x, true) val acts = layer.activations(dv) val obj = acts.sum val deriv = DenseVector.zeros[Double](x.length)
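The anchoring code in ```PositionalNeuralGrammar``` above caches penultimate-layer activations by packing each (begin, split, end) triple into a unique "tetrahedral" array index, with unaries and spans stored under the sentinel splits ```length + 1``` and ```length + 2```. The sketch below restates that packing on its own with a brute-force uniqueness check; the ```tetra``` and ```maxTetraLen``` formulas are copied from the code above, while the enclosing ```TetraIndexDemo``` object, the test loops, and the sentence length of 10 are illustrative only.

    import scala.collection.mutable

    object TetraIndexDemo {
      // Copied from the anchoring code: maps (begin, split, end) with
      // begin < split < end to a unique index below maxTetraLen.
      def tetra(begin: Int, split: Int, end: Int): Int =
        (end * (end + 1) * (end + 2)) / 6 + ((split + 1) * split / 2 + begin)

      // Size of the per-sentence cache arrays for a sentence of length l.
      def maxTetraLen(l: Int): Int =
        ((l + 2) * (l + 3) * (l + 4)) / 6 + ((l + 1) * (l + 2)) / 2 + l + 2

      def main(args: Array[String]): Unit = {
        val l = 10 // illustrative sentence length
        val seen = mutable.HashSet[Int]()
        // Binary rules: every (begin, split, end) with begin < split < end <= l
        for (end <- 2 to l; split <- 1 until end; begin <- 0 until split)
          assert(seen.add(tetra(begin, split, end)), "collision")
        // Unaries and spans share the same arrays via the sentinel splits l + 1
        // and l + 2, exactly as scoreUnaryRule and scoreSpan do above.
        for (end <- 1 to l; begin <- 0 until end) {
          assert(seen.add(tetra(begin, end, l + 1)), "collision")
          assert(seen.add(tetra(begin, end, l + 2)), "collision")
        }
        assert(seen.forall(_ < maxTetraLen(l)), "index out of bounds")
        println(s"${seen.size} distinct cells in an array of size ${maxTetraLen(l)}")
      }
    }

Keying the caches with a flat array of size ```maxTetraLen``` keeps the inner scoring loops to plain array reads, which is consistent with the comment above that HashMaps were no faster here.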