diff --git a/.gitignore b/.gitignore index d99dd25e..e188c6ae 100644 --- a/.gitignore +++ b/.gitignore @@ -16,7 +16,7 @@ tmp/ .idea* .scratch/ java.hprof.txt - *.bbl *.blg *.aux +/bin/ diff --git a/README-NEURAL.md b/README-NEURAL.md new file mode 100644 index 00000000..36f14b16 --- /dev/null +++ b/README-NEURAL.md @@ -0,0 +1,114 @@ +The neural CRF parser is a high-performing constituency parser. + + + +##Preamble + +The neural CRF parser is described in: + +"Neural CRF Parsing" Greg Durrett and Dan Klein. ACL 2015. + +It is an extension of the span parser described in + +"Less Grammar, More Features" David Hall, Greg Durrett, and Dan Klein. ACL 2014. + +and is based on the Epic parsing framework. See https://github.com/dlwh/epic +for more documentation about the span parser and the Epic framework. +See http://www.eecs.berkeley.edu/~gdurrett/ for papers and BibTeX. + +Questions? Bugs? Email me at gdurrett@eecs.berkeley.edu + + + +##Setup + +You need three things to run the neural CRF parser: + +1) The compiled .jar; run ```sbt assembly``` to produce this + +2) A treebank: the Penn Treebank or one of the SPMRL treebanks + +3) Some sort of word vectors. These can either be in the .bin format +of Mikolov et al. (2013) or the .txt format of Bansal et al. (ACL 2014). For +English, the best performance comes from using Bansal et al.'s vectors: + +http://ttic.uchicago.edu/~mbansal/codedata/dependencyEmbeddings-skipdep.zip + +For other languages, you can train suitable vectors on monolingual data using +```word2vec``` with the following arguments: + + -cbow 0 -size 100 -window 1 -sample 1e-4 -threads 8 -binary 0 -iter 15 + +These are mildly tuned, and using a small window size is important, but other +settings are likely to work well too. + + + + +##Usage + +To run the parser on new text (tokenized, one-sentence-per-line), use the following command: + + java -Xmx4g -cp path/to/assembly.jar epic.parser.ParseText --model neuralcrf.parser \ + --tokenizer whitespace --sentences newline --nthreads 8 [files] + +You can download the ```neuralcrf.parser``` model from: + +http://nlp.cs.berkeley.edu/projects/neuralcrf.shtml + +Due to modifications to the code for the system release and randomness in the +training process, this model performs slightly worse than reported in the paper +(90.9 on WSJ Section 23). 
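+
+For example, given a file ```input.txt``` (the file name, and the exact tree below, are illustrative only) containing one tokenized sentence per line:
+
+    The cat sat on the mat .
+
+running
+
+    java -Xmx4g -cp path/to/assembly.jar epic.parser.ParseText --model neuralcrf.parser \
+        --tokenizer whitespace --sentences newline --nthreads 8 input.txt
+
+should print one PTB-style bracketed parse per input line, along the lines of:
+
+    (TOP (S (NP (DT The) (NN cat)) (VP (VBD sat) (PP (IN on) (NP (DT the) (NN mat)))) (. .)))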
+ +To train a new parser as described in the neural CRF paper, run the following command +(note that you need to fill in paths for -cp, --treebank.path, and --word2vecPath): + + java -Xmx47g -cp path/to/assembly.jar epic.parser.models.NeuralParserTrainer \ + --cache.path constraints.cache \ + --opt.useStochastic \ + --treebank.path path/to/wsj/ \ + --evalOnTest \ + --includeDevInTrain \ + --trainer.modelFactory.annotator epic.trees.annotations.PipelineAnnotator \ + --ann.0 epic.trees.annotations.FilterAnnotations \ + --ann.1 epic.trees.annotations.ForgetHeadTag \ + --ann.2 epic.trees.annotations.Markovize \ + --ann.2.horizontal 0 \ + --ann.2.vertical 0 \ + --modelFactory epic.parser.models.PositionalNeuralModelFactory \ + --opt.batchSize 200 \ + --word2vecPath path/to/skipdep_embeddings.txt \ + --threads 8 + +To run on SPMRL treebanks, modify the arguments to the command above as follows: + +1) Add the following arguments (replace ${LANG}$ as appropriate): + + --treebankType spmrl \ + --binarization head \ + --supervisedHeadFinderPtbPath path/to/gold/ptb/train/train.${LANG}.gold.ptb \ + --supervisedHeadFinderConllPath path/to/gold/conll/train/train.${LANG}.gold.conll \ + --ann.3 epic.trees.annotations.SplitPunct + +2) Modify --treebank.path to point to the X_SPMRL/gold/ptb directory. + +Options to configure the neural network and training are largely defined in +```epic.parser.models.PositionalNeuralModel``` + +###Miscellaneous Notes + +To run on the development set, simply remove ```evalOnTest``` and +```includeDevInTrain``` from the arguments. + +You should use the official version of ```evalb``` on the output files (gold +and guess) rather than relying on the native scorer in the Epic parser. For +SPMRL, you should use the version distributed with the shared task. + +Note that the X-bar grammar and coarse pruning masks (constraints) are cached +between runs in the same directory, which speeds up training and testing time +considerably as generating the masks is time-consuming. + +Finally, note that multiple parsers cannot be trained simultaneously in +the same directory, since certain files (such as pruning masks from the +coarse model) will collide. 
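+
+###AdaDelta update sketch
+
+Training uses a stochastic optimizer; this release adds an AdaDelta-style update
+(```epic.dense.AdadeltaGradientDescentDVD``` in the diff below). The following
+standalone Scala sketch shows the per-coordinate AdaDelta rule that class is built
+around; it is for exposition only (the object name and toy objective are made up)
+and does not use the Epic or Breeze classes:
+
+    object AdadeltaSketch {
+      def main(args: Array[String]): Unit = {
+        val rho = 0.95          // decay rate, matching AdadeltaGradientDescentDVD's default
+        val eps = 1e-6          // smoothing constant, as in that class
+        // Toy objective f(x) = (x - 3)^2 with gradient g = 2(x - 3).
+        var x = 0.0
+        var avgSqGrad = 0.0     // running average of squared gradients
+        var avgSqStep = 0.0     // running average of squared updates
+        for (_ <- 0 until 2000) {
+          val g = 2.0 * (x - 3.0)
+          avgSqGrad = rho * avgSqGrad + (1 - rho) * g * g
+          val step = -math.sqrt(avgSqStep + eps) / math.sqrt(avgSqGrad + eps) * g
+          avgSqStep = rho * avgSqStep + (1 - rho) * step * step
+          x += step
+        }
+        // x should have drifted toward the optimum at 3.0
+        println(s"x after 2000 AdaDelta steps: $x")
+      }
+    }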
+ diff --git a/build.sbt b/build.sbt index b50860ae..926c429d 100644 --- a/build.sbt +++ b/build.sbt @@ -102,6 +102,7 @@ mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => { case PathList("org", "w3c", "dom", _) => MergeStrategy.first case PathList("javax", "xml", "stream", _ *) => MergeStrategy.first + case PathList("scala", "xml", _ *) => MergeStrategy.first case PathList("org", "cyberneko", "html", _ *) => MergeStrategy.first case x => old(x) } diff --git a/src/main/scala/epic/dense/AdadeltaGradientDescentDVD.scala b/src/main/scala/epic/dense/AdadeltaGradientDescentDVD.scala new file mode 100644 index 00000000..b76098f0 --- /dev/null +++ b/src/main/scala/epic/dense/AdadeltaGradientDescentDVD.scala @@ -0,0 +1,57 @@ +package epic.dense + +import breeze.linalg._ +import breeze.numerics._ +import breeze.optimize.StochasticDiffFunction +import breeze.optimize.StochasticGradientDescent + + +class AdadeltaGradientDescentDVD(maxIter: Int, + rho: Double = 0.95, + tolerance: Double = 1E-5, + improvementTolerance: Double = 1E-4, + minImprovementWindow: Int = 50) + extends StochasticGradientDescent[DenseVector[Double]](1.0, maxIter, tolerance, improvementTolerance, minImprovementWindow) { + + val delta = 1E-4 + val epsilon = 1e-6 + import vspace._ + + case class History(squaredGradientsHistory: DenseVector[Double], squaredUpdatesHistory: DenseVector[Double]) + override def initialHistory(f: StochasticDiffFunction[DenseVector[Double]],init: DenseVector[Double]) = { + History(DenseVector(Array.tabulate(init.size)(i => 1e-6)), DenseVector(Array.tabulate(init.size)(i => 1e-6))) + } + + override def updateHistory(newX: DenseVector[Double], newGrad: DenseVector[Double], newValue: Double, f: StochasticDiffFunction[DenseVector[Double]], oldState: State) = { + val oldHistory = oldState.history + // The new gradient gets incorporated during the next round of takeStep, + // so this computation should lag by one + val newG = (oldState.grad :* oldState.grad) * (1 - rho) + axpy(rho, oldHistory.squaredGradientsHistory, newG) + val deltaX = newX - oldState.x + val newU = deltaX :* deltaX * (1 - rho); + axpy(rho, oldHistory.squaredUpdatesHistory, newU) + new History(newG, newU) + } + + override protected def takeStep(state: State, dir: DenseVector[Double], stepSize: Double) = { + import state._ + // Need to pre-emptively update the gradient since the history only has it through the + // last timestep + val rmsGt = sqrt((state.history.squaredGradientsHistory * rho) :+ ((state.grad :* state.grad) * (1-rho)) :+ epsilon) + val rmsDeltaXtm1 = sqrt(state.history.squaredUpdatesHistory :+ epsilon) + val step = dir :* rmsDeltaXtm1 :/ rmsGt + val newX = x + axpy(1.0, step, newX) + newX + } + + override def determineStepSize(state: State, f: StochasticDiffFunction[DenseVector[Double]], dir: DenseVector[Double]) = { + defaultStepSize // pegged to 1.0 for this method + } + + override protected def adjust(newX: DenseVector[Double], newGrad: DenseVector[Double], newVal: Double) = { + newVal -> newGrad + } + +} \ No newline at end of file diff --git a/src/main/scala/epic/dense/AffineOutputTransform.scala b/src/main/scala/epic/dense/AffineOutputTransform.scala new file mode 100644 index 00000000..167aa4df --- /dev/null +++ b/src/main/scala/epic/dense/AffineOutputTransform.scala @@ -0,0 +1,106 @@ +package epic.dense + +import breeze.linalg._ +import breeze.linalg.operators.OpMulMatrix +import epic.features.SegmentedIndex +import epic.framework.Feature + +import scala.runtime.ScalaRunTime +import scala.util.Random + 
+/** + * Used at the output layer when only some of the possible outputs will be needed; + * it exposes the penultimate layer, and the Layer lets you pass those activations + * back in (after caching them elsewhere) and compute only certain cells of the + * output layer (activationsFromPenultimateDot). + */ +case class AffineOutputTransform[FV](numOutputs: Int, numInputs: Int, innerTransform: Transform[FV, DenseVector[Double]], includeBias: Boolean = true) extends OutputTransform[FV, DenseVector[Double]] { + + + val index = SegmentedIndex(new AffineTransform.Index(numOutputs, numInputs, includeBias), innerTransform.index) + + def extractLayerAndPenultimateLayer(weights: DenseVector[Double], forTrain: Boolean) = { + val mat = weights(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) + val bias = if(includeBias) { + weights(numOutputs * numInputs until index.componentOffset(1)) + } else { + DenseVector.zeros[Double](numOutputs) + } + val inner = innerTransform.extractLayer(weights(index.componentOffset(1) to -1), forTrain) + new OutputLayer(mat, bias, inner) -> inner + } + + /** + * N.B. Initialized to zero because this should *only* be used at the output layer, where + * zero initialization is appropriate + */ + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = { + require(outputLayer) + DenseVector.vertcat(DenseVector.zeros(index.indices(0).size), innerTransform.initialWeightVector(initWeightsScale, rng, false, spec)) + } + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) { + innerTransform.clipHiddenWeightVectors(weights(index.componentOffset(1) to -1), norm, false) + } + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] = { + (offset until offset + Math.min(10, index.indices(0).size)) ++ innerTransform.getInterestingWeightIndicesForGradientCheck(offset + index.indices(0).size) + } + + case class OutputLayer(weights: DenseMatrix[Double], bias: DenseVector[Double], innerLayer: innerTransform.Layer) extends OutputTransform.OutputLayer[FV,DenseVector[Double]] { + override val index = AffineOutputTransform.this.index + + val weightst = weights.t +// val weightst = weights.t.copy + + + def activations(fv: FV) = { + val out = weights * innerLayer.activations(fv) += bias + out + } + + def activationsDot(fv: FV, sparseIdx: Int) = { + activationsFromPenultimateDot(innerLayer.activations(fv), sparseIdx) + } + + def activationsDot(fv: FV, sparseIndices: Array[Int]) = { + activationsFromPenultimateDot(innerLayer.activations(fv), sparseIndices) + } + + def activationsFromPenultimateDot(innerLayerActivations: DenseVector[Double], sparseIdx: Int) = { + weights(sparseIdx, ::) * innerLayerActivations + bias(sparseIdx) + } + + def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: FV) = { + val scale = _scale + val matDeriv = deriv(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) + val biasDeriv = if(includeBias) { + deriv(numOutputs * numInputs until index.componentOffset(1)) + } else { + DenseVector.zeros[Double](numOutputs) + } + + // whole function is f(mat * inner(fv) + bias) + // scale(i) pushes in (f'(mat * inner(v) + bias))(i) + val innerAct = innerLayer.activations(fv) + // d/d(weights(::, i)) == scale(i) * innerAct + for (i <- 0 until weights.rows) { + val a: Double = scale(i) + if(a != 0.0) { + axpy(a, innerAct, matDeriv.t(::, i)) + // so
d/dbias(i) = scale(i) + biasDeriv(i) += a + } + } + + // scale is f'(mat * inner(v) + bias) + // d/dv is mat.t * f'(mat * inner(v) + bias) + + innerLayer.tallyDerivative(deriv(index.componentOffset(1) to -1), weightst * scale, fv) + } + + def applyBatchNormalization(inputs: scala.collection.GenTraversable[FV]) = innerLayer.applyBatchNormalization(inputs) + + } + +} diff --git a/src/main/scala/epic/dense/AffineTransform.scala b/src/main/scala/epic/dense/AffineTransform.scala index 04231d53..5e6be860 100644 --- a/src/main/scala/epic/dense/AffineTransform.scala +++ b/src/main/scala/epic/dense/AffineTransform.scala @@ -6,7 +6,7 @@ import epic.features.SegmentedIndex import epic.framework.Feature import scala.runtime.ScalaRunTime - +import scala.util.Random case class AffineTransform[FV, Mid](numOutputs: Int, numInputs: Int, innerTransform: Transform[FV, Mid], includeBias: Boolean = true) (implicit mult: OpMulMatrix.Impl2[DenseMatrix[Double], Mid, DenseVector[Double]], @@ -15,23 +15,50 @@ case class AffineTransform[FV, Mid](numOutputs: Int, numInputs: Int, innerTransf val index = SegmentedIndex(new AffineTransform.Index(numOutputs, numInputs, includeBias), innerTransform.index) - - - def extractLayer(weights: DenseVector[Double]) = { + def extractLayer(weights: DenseVector[Double], forTrain: Boolean) = { + extractLayerAndPenultimateLayer(weights, forTrain)._1 + } + + def extractLayerAndPenultimateLayer(weights: DenseVector[Double], forTrain: Boolean) = { val mat = weights(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) val bias = if(includeBias) { weights(numOutputs * numInputs until index.componentOffset(1)) } else { DenseVector.zeros[Double](numOutputs) } - val inner = innerTransform.extractLayer(weights(index.componentOffset(1) to -1)) - new Layer(mat, bias, inner) + val inner = innerTransform.extractLayer(weights(index.componentOffset(1) to -1), forTrain) + new Layer(mat, bias, inner) -> inner + } + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = { +// if (spec == "") { +// DenseVector(Array.tabulate(index.indices(0).size)(i => if (outputLayer) 0.0 else rng.nextGaussian * initWeightsScale)), + val myWeights = if (outputLayer) { + DenseVector(Array.tabulate(index.indices(0).size)(i => 0.0)) + } else if (spec == "magic") { + AffineTransform.getMagicAffineWeights(index.indices(0).size, numInputs, numOutputs, initWeightsScale, rng) + } else { + AffineTransform.getGaussianAffineWeights(index.indices(0).size, initWeightsScale, rng) + } + DenseVector.vertcat(myWeights, innerTransform.initialWeightVector(initWeightsScale, rng, false, spec)) + } + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) { + if (!outputLayer) { + AffineTransform.clipHiddenWeightVectors(numOutputs, numInputs, weights, norm) + } + innerTransform.clipHiddenWeightVectors(weights(index.componentOffset(1) to -1), norm, false) + } + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] = { + (offset until offset + Math.min(10, index.indices(0).size)) ++ innerTransform.getInterestingWeightIndicesForGradientCheck(offset + index.indices(0).size) } - case class Layer(weights: DenseMatrix[Double], bias: DenseVector[Double], innerLayer: innerTransform.Layer) extends _Layer { + case class Layer(weights: DenseMatrix[Double], bias: DenseVector[Double], innerLayer: innerTransform.Layer) extends Transform.Layer[FV,DenseVector[Double]] { override val index = 
AffineTransform.this.index - val weightst = weights.t.copy + val weightst = weights.t +// val weightst = weights.t.copy def activations(fv: FV) = { @@ -40,6 +67,7 @@ case class AffineTransform[FV, Mid](numOutputs: Int, numInputs: Int, innerTransf } def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: FV) = { +// println("SCALE: " + _scale) val scale = _scale val matDeriv = deriv(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) val biasDeriv = if(includeBias) { @@ -65,9 +93,11 @@ case class AffineTransform[FV, Mid](numOutputs: Int, numInputs: Int, innerTransf // scale is f'(mat * inner(v) + bias) // d/dv is mat.t * f'(mat * inner(v) + bias) - +// println("Intermediate scale: " + weightst * scale) innerLayer.tallyDerivative(deriv(index.componentOffset(1) to -1), weightst * scale, fv) } + + def applyBatchNormalization(inputs: scala.collection.GenTraversable[FV]) = innerLayer.applyBatchNormalization(inputs) } @@ -78,6 +108,31 @@ object AffineTransform { canAxpy: scaleAdd.InPlaceImpl3[DenseVector[Double], Double, FV]) = new AffineTransform(numOutputs, numInputs, new IdentityTransform[FV], includeBias) def apply(numOutputs: Int, numInputs: Int, includeBias: Boolean):AffineTransform[DenseVector[Double], DenseVector[Double]] = apply(numOutputs, numInputs, new IdentityTransform[DenseVector[Double]], includeBias) def apply(numOutputs: Int, numInputs: Int):AffineTransform[DenseVector[Double], DenseVector[Double]] = apply(numOutputs, numInputs, true) + + + def getUniformAffineWeights(numWeights: Int, initWeightsScale: Double, rng: Random) = { + DenseVector(Array.tabulate(numWeights)(i => rng.nextGaussian * initWeightsScale)) + } + + def getGaussianAffineWeights(numWeights: Int, initWeightsScale: Double, rng: Random) = { + DenseVector(Array.tabulate(numWeights)(i => rng.nextGaussian * initWeightsScale)) + } + + // N.B. 
numWeights != inSize * outSize if there's a bias + def getMagicAffineWeights(numWeights: Int, inSize: Int, outSize: Int, initWeightsScale: Double, rng: Random) = { + val range = Math.sqrt(6.0/(inSize + outSize)) + DenseVector(Array.tabulate(numWeights)(i => rng.nextDouble * 2 * range - range)) + } + + def clipHiddenWeightVectors(numOutputs: Int, numInputs: Int, weights: DenseVector[Double], norm: Double) { + val mat = weights(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) + for (i <- 0 until mat.rows) { + val thisRowNorm = breeze.linalg.norm(mat(i, ::), 2) + val multFactor = norm/Math.sqrt(thisRowNorm) + mat(i, ::) *= multFactor + } + } + case class Index(numOutputs: Int, numInputs: Int, includeBias: Boolean = true) extends breeze.util.Index[Feature] { def apply(t: Feature): Int = t match { case NeuralFeature(output, input) if output < numOutputs && input < numInputs && output > 0 && input > 0 => diff --git a/src/main/scala/epic/dense/BatchNormalizationTransform.scala b/src/main/scala/epic/dense/BatchNormalizationTransform.scala new file mode 100644 index 00000000..04abb51f --- /dev/null +++ b/src/main/scala/epic/dense/BatchNormalizationTransform.scala @@ -0,0 +1,97 @@ +package epic.dense + +import breeze.linalg._ +import breeze.linalg.DenseVector +import epic.framework.Feature +import breeze.util.Index +import scala.util.Random +import breeze.numerics.sigmoid +import epic.features.SegmentedIndex + +/** + * Implements batch normalization from + * http://arxiv.org/pdf/1502.03167v3.pdf + * Basically, each unit is shifted and rescaled per minibatch so that its activations + * have mean 0 and variance 1. This has been demonstrated to help training deep networks, + * but doesn't seem to help here. + */ +case class BatchNormalizationTransform[FV](size: Int, useBias: Boolean, inner: Transform[FV, DenseVector[Double]]) extends Transform[FV, DenseVector[Double]] { + + val index = if (useBias) { + SegmentedIndex(new AffineTransform.Index(size, 0, true), inner.index) + } else { + inner.index + } + + def extractLayer(dv: DenseVector[Double], forTrain: Boolean) = { + if (useBias) { + new Layer(dv(0 until size), size, inner.extractLayer(dv(size to -1), forTrain)) + } else { + new Layer(DenseVector.zeros[Double](size), size, inner.extractLayer(dv, forTrain)) + } + } + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = { + if (useBias) { + DenseVector.vertcat(DenseVector.zeros[Double](size), + inner.initialWeightVector(initWeightsScale, rng, false, spec)) + } else { + inner.initialWeightVector(initWeightsScale, rng, false, spec) + } + } + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) = inner.clipHiddenWeightVectors(weights, norm, false) + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] = { + if (useBias) { + (offset until offset + Math.min(10, size)) ++ inner.getInterestingWeightIndicesForGradientCheck(offset + size) + } else { + inner.getInterestingWeightIndicesForGradientCheck(offset) + } + } + + case class Layer(bias: DenseVector[Double], size: Int, innerLayer: inner.Layer) extends Transform.Layer[FV,DenseVector[Double]] { + + var fcn = new NonlinearTransform.ShiftAndScaleEach(Array.tabulate(size)(i => 0.0), Array.tabulate(size)(i => 1.0)) + + val myIndex = Index[Feature] + + def index = myIndex; + + def activations(fv: FV): DenseVector[Double] = { + val act = innerLayer.activations(fv) + var i = 0; + while (i < act.size) { + 
act(i) = fcn.fcn(i, act(i)) + bias(i) + i += 1 + } + act + } + + def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: FV) = { + val biasDeriv = if (useBias) deriv(0 until size) else DenseVector[Double]() + val scale = _scale + var i = 0; + while (i < scale.size) { + if (useBias) { + biasDeriv(i) += scale(i) + } + scale(i) = scale(i) * fcn.deriv(i, 0) // we know it's linear so just evaluate the derivative at 0, saves computing activations + i += 1 + } + innerLayer.tallyDerivative(if (useBias) deriv(size to -1) else deriv, scale, fv) + } + + def applyBatchNormalization(inputs: scala.collection.GenTraversable[FV]) = { + val allActivations = inputs.map(activations(_)) + val mean = allActivations.reduce(_ + _) * (1.0/inputs.size) + val variances = allActivations.map(act => (act - mean) :* (act - mean)).reduce(_ + _) * (1.0/inputs.size) + val invStdDevs = variances.data.map(variance => 1.0/Math.sqrt(variance + 1e-6)) +// println(mean.data.toSeq) +// println(invStdDevs.toSeq) + fcn = new NonlinearTransform.ShiftAndScaleEach(mean.data, invStdDevs) + innerLayer.applyBatchNormalization(inputs) + } + } + +} \ No newline at end of file diff --git a/src/main/scala/epic/dense/CachingLookupAndAffineTransformDense.scala b/src/main/scala/epic/dense/CachingLookupAndAffineTransformDense.scala new file mode 100644 index 00000000..417627d8 --- /dev/null +++ b/src/main/scala/epic/dense/CachingLookupAndAffineTransformDense.scala @@ -0,0 +1,113 @@ +package epic.dense + +import scala.runtime.ScalaRunTime +import breeze.linalg._ +import epic.features.SegmentedIndex +import epic.framework.Feature +import scala.collection.mutable.HashMap +import scala.util.Random + +/** + * Used at the input layer to cache lookups and the result of applying + * the affine transform at the first layer of the network. This saves + * computation across repeated invocations of the neural network in + * the sentence. 
+ */ +case class CachingLookupAndAffineTransformDense[FV](numOutputs: Int, + numInputs: Int, + word2vecIndexed: Word2VecIndexed[String], + includeBias: Boolean = true) extends Transform[Array[Int], DenseVector[Double]] { + + + val index = new AffineTransform.Index(numOutputs, numInputs, includeBias) + + def extractLayer(weights: DenseVector[Double], forTrain: Boolean) = { + val mat = weights(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) + val bias = if(includeBias) { + weights(numOutputs * numInputs until index.size) + } else { + DenseVector.zeros[Double](numOutputs) + } + new Layer(mat, bias) + } + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = { + val myWeights = if (outputLayer) { + DenseVector.zeros[Double](index.size) + } else if (spec == "magic") { + AffineTransform.getMagicAffineWeights(index.size, numInputs, numOutputs, initWeightsScale, rng) + } else { + AffineTransform.getGaussianAffineWeights(index.size, initWeightsScale, rng) + } + myWeights + } + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) { + if (!outputLayer) { + AffineTransform.clipHiddenWeightVectors(numOutputs, numInputs, weights, norm) + } + } + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] = { + (offset until offset + Math.min(10, index.size)) + } + + case class Layer(weights: DenseMatrix[Double], bias: DenseVector[Double]) extends Transform.Layer[Array[Int],DenseVector[Double]] { + + override val index = CachingLookupAndAffineTransformDense.this.index + + val weightst = weights.t + + // Cache stores pairs of (word identity, position) mapped to the final results of + // these being multiplied by the parameter vector. Note that although the same + // word vector is used for each word identity, the parameter vector depends + // on the position. 
+ val caches = Array.tabulate(numInputs/word2vecIndexed.wordRepSize)(i => new HashMap[Int,DenseVector[Double]]) + + def activations(fv: Array[Int]) = { + val finalVector = DenseVector.zeros[Double](numOutputs) + for (i <- 0 until fv.size) { +// val wordPosn = fv(i) -> i + if (fv(i) != -1) { + caches(i).synchronized { + if (!caches(i).contains(fv(i))) { + val startIdx = i * word2vecIndexed.wordRepSize + caches(i).put(fv(i), weights(::, startIdx until startIdx + word2vecIndexed.wordRepSize) * DenseVector(word2vecIndexed.convertIndexToVector(fv(i)))) + } + finalVector += caches(i)(fv(i)) + } + } + } + finalVector + bias + } + + def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: Array[Int]) = { + val scale = _scale + val matDeriv = deriv(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) + val biasDeriv = if(includeBias) { + deriv(numOutputs * numInputs until index.size) + } else { + DenseVector.zeros[Double](numOutputs) + } + + // whole function is f(mat * inner(fv) + bias) + // scale(i) pushes in (f'(mat * inner(v) + bias))(i) + val innerAct = DenseVector(word2vecIndexed.convertToVector(fv)); + + // d/d(weights(::, i)) == scale(i) * innerAct + for (i <- 0 until weights.rows) { + val a: Double = scale(i) + if(a != 0.0) { + axpy(a, innerAct, matDeriv.t(::, i)) + // so d/dbias(i) = scale(i) + biasDeriv(i) += a + } + } + + // scale is f'(mat * inner(v) + bias) + // d/dv is mat.t * f'(mat * inner(v) + bias) + } + + def applyBatchNormalization(inputs: scala.collection.GenTraversable[Array[Int]]) = {} + } +} diff --git a/src/main/scala/epic/dense/CachingLookupTransform.scala b/src/main/scala/epic/dense/CachingLookupTransform.scala new file mode 100644 index 00000000..e934d2f8 --- /dev/null +++ b/src/main/scala/epic/dense/CachingLookupTransform.scala @@ -0,0 +1,43 @@ +package epic.dense + +import scala.runtime.ScalaRunTime +import breeze.linalg._ +import epic.features.SegmentedIndex +import epic.framework.Feature +import scala.collection.mutable.HashMap +import scala.util.Random +import breeze.util.Index + +/** + * Used at the input layer to cache lookups and + */ +case class CachingLookupTransform(word2vecIndexed: Word2VecIndexed[String]) extends Transform[Array[Int], DenseVector[Double]] { + + val index = Index[epic.framework.Feature]() + + def extractLayer(weights: DenseVector[Double], forTrain: Boolean) = new Layer() + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = DenseVector() + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) {} + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] = Seq[Int]() + + case class Layer() extends Transform.Layer[Array[Int],DenseVector[Double]] { + + override val index = Index[epic.framework.Feature]() + + def activations(fv: Array[Int]) = { + var finalVector = DenseVector.zeros[Double](0) + for (i <- 0 until fv.size) { + val vec: DenseVector[Double] = if (fv(i) != -1) DenseVector(word2vecIndexed.convertIndexToVector(fv(i))) else DenseVector(word2vecIndexed.zeroVector) + finalVector = DenseVector.vertcat(finalVector, vec) + } + finalVector + } + + def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: Array[Int]) = {} + + def applyBatchNormalization(inputs: scala.collection.GenTraversable[Array[Int]]) = {} + } +} \ No newline at end of file diff --git a/src/main/scala/epic/dense/EmbeddingsTransform.scala 
b/src/main/scala/epic/dense/EmbeddingsTransform.scala new file mode 100644 index 00000000..bd0fbad2 --- /dev/null +++ b/src/main/scala/epic/dense/EmbeddingsTransform.scala @@ -0,0 +1,126 @@ +package epic.dense + +import scala.runtime.ScalaRunTime +import breeze.linalg._ +import epic.features.SegmentedIndex +import epic.framework.Feature +import scala.collection.mutable.HashMap +import scala.util.Random + +/** + * Used at the input layer to cache lookups and + * backprop into embeddings + */ +case class EmbeddingsTransform[FV](numOutputs: Int, + numInputs: Int, + word2vecIndexed: Word2VecIndexed[String], + includeBias: Boolean = true) extends Transform[Array[Int], DenseVector[Double]] { + + + val index = SegmentedIndex(new AffineTransform.Index(numOutputs, numInputs, includeBias), + new AffineTransform.Index(word2vecIndexed.vocSize, word2vecIndexed.wordRepSize, false)) + println("Allocated " + index.indices.map(_.size) + " parameters for each index in the embedding layer (backpropagating into embeddings)") + + def extractLayer(weights: DenseVector[Double], forTrain: Boolean) = { + val mat = weights(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) + val bias = if(includeBias) { + weights(numOutputs * numInputs until index.indices(0).size) + } else { + DenseVector.zeros[Double](numOutputs) + } + val wordWeights = weights(index.indices(0).size until index.indices(0).size + index.indices(1).size).asDenseMatrix.reshape(word2vecIndexed.vocSize, word2vecIndexed.wordRepSize, view = View.Require) + new Layer(mat, bias, wordWeights) + } + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = { + val myWeights = if (outputLayer) { + DenseVector(Array.tabulate(index.indices(0).size)(i => 0.0)) + } else if (spec == "magic") { + AffineTransform.getMagicAffineWeights(index.indices(0).size, numInputs, numOutputs, initWeightsScale, rng) + } else { + AffineTransform.getGaussianAffineWeights(index.indices(0).size, initWeightsScale, rng) + } + // Only randomly initialize the weights in the matrix, not the word deltas + DenseVector.vertcat(myWeights, DenseVector.zeros[Double](index.size - index.indices(0).size)) +// DenseVector(Array.tabulate(index.size)(i => if (!outputLayer && i < index.indices(0).size) rng.nextGaussian * initWeightsScale else 0.0)) + } + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) { + if (!outputLayer) { + AffineTransform.clipHiddenWeightVectors(numOutputs, numInputs, weights, norm) + } + } + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] = { + (offset until offset + Math.min(10, index.indices(0).size)) ++ (offset + index.componentOffset(1) until offset + index.componentOffset(1) + Math.min(10, index.indices(1).size)) + } + + case class Layer(weights: DenseMatrix[Double], bias: DenseVector[Double], wordWeights: DenseMatrix[Double]) extends Transform.Layer[Array[Int],DenseVector[Double]] { + + override val index = EmbeddingsTransform.this.index + + val weightst = weights.t + + // Cache stores pairs of (word identity, position) mapped to the final results of + // these being multiplied by the parameter vector. Note that although the same + // word vector is used for each word identity, the parameter vector depends + // on the position. 
+ val caches = Array.tabulate(numInputs/word2vecIndexed.wordRepSize)(i => new HashMap[Int,DenseVector[Double]]) + + def activations(fv: Array[Int]) = { + val finalVector = DenseVector.zeros[Double](numOutputs) + for (i <- 0 until fv.size) { +// val wordPosn = fv(i) -> i + if (fv(i) != -1) { + caches(i).synchronized { + if (!caches(i).contains(fv(i))) { + val startIdx = i * word2vecIndexed.wordRepSize + val wordVec = DenseVector(word2vecIndexed.convertIndexToVector(fv(i))) + wordWeights(fv(i), ::).t + caches(i).put(fv(i), weights(::, startIdx until startIdx + word2vecIndexed.wordRepSize) * wordVec) + } + finalVector += caches(i)(fv(i)) + } + } + } + finalVector + bias + } + + def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: Array[Int]) = { + val scale = _scale + val matDeriv = deriv(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) + val biasDeriv = if(includeBias) { + deriv(numOutputs * numInputs until index.size) + } else { + DenseVector.zeros[Double](numOutputs) + } + + // whole function is f(mat * inner(fv) + bias) + // scale(i) pushes in (f'(mat * inner(v) + bias))(i) + val innerAct = DenseVector(word2vecIndexed.convertToVector(fv)) + Word2VecSurfaceFeaturizerIndexed.makeVectFromParams(fv, wordWeights); + + val wordsDeriv = deriv(index.indices(0).size until index.indices(0).size + index.indices(1).size).asDenseMatrix.reshape(word2vecIndexed.vocSize, word2vecIndexed.wordRepSize, view = View.Require) + val wordsDerivs = Array.tabulate(fv.size)(wordPosnIdx => wordsDeriv(fv(wordPosnIdx), ::).t) + // d/d(weights(::, i)) == scale(i) * innerAct + for (i <- 0 until weights.rows) { + val a: Double = scale(i) + if(a != 0.0) { + axpy(a, innerAct, matDeriv.t(::, i)) + var wordPosnIdx = 0; + while (wordPosnIdx < fv.size) { + val relevantWeights = weights(i, wordPosnIdx * word2vecIndexed.wordRepSize until (wordPosnIdx + 1) * word2vecIndexed.wordRepSize).t + axpy(a, relevantWeights, wordsDerivs(wordPosnIdx)) + wordPosnIdx += 1 + } + // so d/dbias(i) = scale(i) + biasDeriv(i) += a + } + } + + // scale is f'(mat * inner(v) + bias) + // d/dv is mat.t * f'(mat * inner(v) + bias) + } + + + def applyBatchNormalization(inputs: scala.collection.GenTraversable[Array[Int]]) = {} + } +} \ No newline at end of file diff --git a/src/main/scala/epic/dense/IdentityTransform.scala b/src/main/scala/epic/dense/IdentityTransform.scala index 3f42ad28..f7410f36 100644 --- a/src/main/scala/epic/dense/IdentityTransform.scala +++ b/src/main/scala/epic/dense/IdentityTransform.scala @@ -3,22 +3,31 @@ package epic.dense import breeze.linalg._ import breeze.util.Index import epic.framework.Feature - +import scala.util.Random class IdentityTransform[T] extends Transform[T, T] { val index = Index[Feature]() + def extractLayer(weights: DenseVector[Double], forTrain: Boolean) = new Layer() + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = DenseVector(Array[Double]()) + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) {} + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] = Seq[Int]() - def extractLayer(weights: DenseVector[Double]) = { - new Layer() - } - - class Layer extends _Layer { + class Layer extends Transform.Layer[T,T] { + + val myIndex = Index[Feature] + + def index = myIndex; def activations(fv: T) = fv def tallyDerivative(deriv: DenseVector[Double], scale: =>Vector[Double], t: T) = {} + + def 
applyBatchNormalization(inputs: scala.collection.GenTraversable[T]) = {} } } diff --git a/src/main/scala/epic/dense/LowRankQuadraticTransform.scala b/src/main/scala/epic/dense/LowRankQuadraticTransform.scala new file mode 100644 index 00000000..0dad562a --- /dev/null +++ b/src/main/scala/epic/dense/LowRankQuadraticTransform.scala @@ -0,0 +1,142 @@ +package epic.dense + +import breeze.linalg._ +import breeze.linalg.operators.OpMulMatrix +import epic.features.SegmentedIndex +import epic.framework.Feature +import breeze.util.Index + +import scala.runtime.ScalaRunTime +import scala.util.Random + +case class LowRankQuadraticTransform[FV](numOutputs: Int, numRanks: Int, numLeftInputs: Int, numRightInputs: Int, innerTransform: Transform[FV, DenseVector[Double]]) extends OutputTransform[FV, DenseVector[Double]] { + + val neurons = (0 until numOutputs).map(i => new LowRankQuadraticTransformNeuron(numRanks, numLeftInputs, numRightInputs)) + val neuronIndex = SegmentedIndex(neurons.map(_.index):_*) + val index = SegmentedIndex(neuronIndex, innerTransform.index) + + def extractLayerAndPenultimateLayer(weights: DenseVector[Double], forTrain: Boolean) = { + val subTransforms = (0 until neurons.size).map(i => neurons(i).extractLayer(weights(neuronIndex.componentOffset(i) until neuronIndex.componentOffset(i) + neuronIndex.indices(i).size))) + val innerLayer = innerTransform.extractLayer(weights(index.componentOffset(1) to -1), forTrain); + new OutputLayer(subTransforms, innerLayer) -> innerLayer + } + +// def extractLayer(weights: DenseVector[Double]) = { +// val subTransforms = (0 until neurons.size).map(i => neurons(i).extractLayer(weights(neuronIndex.componentOffset(i) until neuronIndex.componentOffset(i) + neuronIndex.indices(i).size))) +// new Layer(subTransforms, innerTransform.extractLayer(weights(index.componentOffset(1) to -1))) +// } + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = { + val subVects = DenseVector.vertcat(neurons.map(_.initialWeightVector(initWeightsScale, rng, outputLayer, spec)):_*) + DenseVector.vertcat(subVects, innerTransform.initialWeightVector(initWeightsScale, rng, outputLayer, spec)) + } + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) { + innerTransform.clipHiddenWeightVectors(weights(index.componentOffset(1) to -1), norm, outputLayer); + } + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] = { + (offset until offset + Math.min(10, index.indices(0).size)) ++ innerTransform.getInterestingWeightIndicesForGradientCheck(offset + index.indices(0).size) + } + + case class OutputLayer(sublayers: Seq[LRQTNLayer], innerLayer: innerTransform.Layer) extends OutputTransform.OutputLayer[FV,DenseVector[Double]] { + + override val index = LowRankQuadraticTransform.this.index + val neuronIndex = LowRankQuadraticTransform.this.neuronIndex + + def activations(fv: FV) = { + val innerActivations = innerLayer.activations(fv) + DenseVector(Array.tabulate(sublayers.size)(i => sublayers(i).activations(innerActivations)(0))) + } + + def activationsFromPenultimateDot(innerLayerActivations: DenseVector[Double], sparseIdx: Int): Double = { + sublayers(sparseIdx).activations(innerLayerActivations)(0) + } + + def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: FV) = { + val innerActivations = innerLayer.activations(fv) + for (i <- 0 until sublayers.size) { + sublayers(i).tallyDerivative(deriv(neuronIndex.componentOffset(i) until 
neuronIndex.componentOffset(i) + neuronIndex.indices(i).size), _scale(i), innerActivations) + } + } + + def applyBatchNormalization(inputs: scala.collection.GenTraversable[FV]) = innerLayer.applyBatchNormalization(inputs) + } + + +} + +/** + * Separate because I was having some issues... + */ +case class LowRankQuadraticTransformNeuron(numRanks: Int, numLeftInputs: Int, numRightInputs: Int) { + + val index = SegmentedIndex(new AffineTransform.Index(numRanks, numLeftInputs, false), new AffineTransform.Index(numRanks, numRightInputs, false)) + + def extractLayer(weights: DenseVector[Double]) = { + val lhsSize = numRanks * numLeftInputs + val rhsSize = numRanks * numRightInputs + val lhsMat = weights(0 until lhsSize).asDenseMatrix.reshape(numRanks, numLeftInputs, view = View.Require) + val rhsMat = weights(lhsSize until (lhsSize + rhsSize)).asDenseMatrix.reshape(numRanks, numRightInputs, view = View.Require) + new LRQTNLayer(lhsMat, rhsMat, index, numRanks, numLeftInputs, numRightInputs) + } + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = { + if (spec == "magic") { + DenseVector.vertcat(AffineTransform.getMagicAffineWeights(index.indices(0).size, numLeftInputs, numRanks, initWeightsScale, rng), + AffineTransform.getMagicAffineWeights(index.indices(1).size, numRightInputs, numRanks, initWeightsScale, rng)) + } else { + DenseVector.vertcat(AffineTransform.getGaussianAffineWeights(index.indices(0).size, initWeightsScale, rng), + AffineTransform.getGaussianAffineWeights(index.indices(1).size, initWeightsScale, rng)) + } + } + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) { + } +} + + +case class LRQTNLayer(lhsWeights: DenseMatrix[Double], rhsWeights: DenseMatrix[Double], index: Index[Feature], numRanks: Int, numLeftInputs: Int, numRightInputs: Int) { + val lhsWeightst = lhsWeights.t + val rhsWeightst = rhsWeights.t + + def activations(fv: DenseVector[Double]) = { + val lhsProj = lhsWeights * fv + val rhsProj = rhsWeights * fv + val dotProd = lhsProj.dot(rhsProj) +// println(dotProd + " " + lhsProj.data.toSeq + " " + rhsProj.data.toSeq) + DenseVector(dotProd) + } + + def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: DenseVector[Double]) = { +// println("SCALE: " + _scale) + val scale = _scale(0) + if (Math.abs(scale) > 1e-6) { + val lhsSize = numRanks * numLeftInputs + val rhsSize = numRanks * numRightInputs +// println(deriv.size + " " + lhsSize + " " + numRanks + " " + numLeftInputs + " " + rhsSize) + val lhsDeriv = deriv(0 until lhsSize).asDenseMatrix.reshape(numRanks, numLeftInputs, view = View.Require) + val rhsDeriv = deriv(lhsSize until lhsSize + rhsSize).asDenseMatrix.reshape(numRanks, numRightInputs, view = View.Require) + + val innerActs = fv + val lhsProj = lhsWeights * innerActs + val rhsProj = rhsWeights * innerActs + + // Smart way + lhsDeriv += rhsProj * innerActs.t * scale + rhsDeriv += lhsProj * innerActs.t * scale + // Dumb way +// for (r <- 0 until lhsWeights.rows) { +// for (i <- 0 until lhsWeights.cols) { +// lhsDeriv(r, i) += scale * innerActs(i) * rhsProj(r) +// } +// for (i <- 0 until rhsWeights.cols) { +// rhsDeriv(r, i) += scale * innerActs(i) * lhsProj(r) +// } +// } + require(deriv.size == lhsSize + rhsSize, "Backpropagating through LowRankQuadraticTransform is not currently supported") + } + } +} + +//} diff --git a/src/main/scala/epic/dense/NonlinearTransform.scala b/src/main/scala/epic/dense/NonlinearTransform.scala new file mode 
100644 index 00000000..31ff8cf1 --- /dev/null +++ b/src/main/scala/epic/dense/NonlinearTransform.scala @@ -0,0 +1,144 @@ +package epic.dense + +import breeze.linalg._ +import breeze.linalg.DenseVector +import epic.framework.Feature +import breeze.util.Index +import scala.util.Random +import breeze.numerics.sigmoid + +/** + * A bit of a misnomer since this has been generalized to support linear functions as + * well... + */ +case class NonlinearTransform[FV](nonLinType: String, size: Int, inner: Transform[FV, DenseVector[Double]], dropoutRate: Double = 0.5) extends Transform[FV, DenseVector[Double]] { + + val index: inner.index.type = inner.index + + def extractLayer(dv: DenseVector[Double], forTrain: Boolean) = { + if (nonLinType == "dropout") { + val keepFrac = 1.0 - dropoutRate + val fcn = if (forTrain) { + // Only have "true" when we want to keep things around + new NonlinearTransform.Mask(Array.fill(size)(NonlinearTransform.globalRng.nextDouble < keepFrac)) + } else { + new NonlinearTransform.Scale(keepFrac) + } + new Layer(fcn, inner.extractLayer(dv, forTrain)) + } else { + val nonlinearFcn = NonlinearTransform.getNonlinearFcn(nonLinType); + new Layer(nonlinearFcn, inner.extractLayer(dv, forTrain)) + } + } + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = inner.initialWeightVector(initWeightsScale, rng, false, spec) + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) = inner.clipHiddenWeightVectors(weights, norm, false) + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] = inner.getInterestingWeightIndicesForGradientCheck(offset) + + case class Layer(nonlinearFcn: NonlinearTransform.NonlinearFcn, innerLayer: inner.Layer) extends Transform.Layer[FV,DenseVector[Double]] { + + val myIndex = Index[Feature] + + def index = myIndex; + + def activations(fv: FV): DenseVector[Double] = { + val act = innerLayer.activations(fv) + var i = 0; + while (i < act.size) { + act(i) = nonlinearFcn.fcn(i, act(i)) + i += 1 + } + act + } + + def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: FV) = { + val scale = _scale + val act = innerLayer.activations(fv) + var i = 0; + while (i < act.size) { + act(i) = nonlinearFcn.deriv(i, act(i)) + i += 1 + } + act :*= scale + innerLayer.tallyDerivative(deriv, act, fv) + } + + def applyBatchNormalization(inputs: scala.collection.GenTraversable[FV]) = innerLayer.applyBatchNormalization(inputs) + + } + +} + +object NonlinearTransform { + + val globalRng = new scala.util.Random(0) + + def getNonlinearFcn(nonLinType: String) = { + if (nonLinType == "tanh") { + Tanh() + } else if (nonLinType == "relu") { + Relu() + } else if (nonLinType == "requ") { + Requ() + } else if (nonLinType == "cube") { + Cube() + } else if (nonLinType == "const") { + Constant() + } else { + throw new RuntimeException("Unrecognized nonlin type: " + nonLinType) + } + } + + trait NonlinearFcn { + // idx is the position of the unit; this basically only applies to dropout + // where we want to zero out particular units + def fcn(idx: Int, x: Double): Double; + def deriv(idx: Int, x: Double): Double; + } + + case class Constant() extends NonlinearFcn { + def fcn(idx: Int, x: Double) = 1 + def deriv(idx: Int, x: Double) = 0 + } + + case class Mask(val mask: Array[Boolean]) extends NonlinearFcn { + def fcn(idx: Int, x: Double) = if (mask(idx)) x else 0 + def deriv(idx: Int, x: Double) = if (mask(idx)) 1 else 0 + } + + case class ShiftAndScaleEach(val 
shifts: Array[Double], val factors: Array[Double]) extends NonlinearFcn { + def fcn(idx: Int, x: Double) = factors(idx) * (x - shifts(idx)) + def deriv(idx: Int, x: Double) = factors(idx) + } + + case class Scale(val factor: Double) extends NonlinearFcn { + def fcn(idx: Int, x: Double) = factor * x + def deriv(idx: Int, x: Double) = factor + } + + case class Tanh() extends NonlinearFcn { + def fcn(idx: Int, x: Double) = 2 * sigmoid(2 * x) - 1.0 + def deriv(idx: Int, x: Double) = { + val sig = sigmoid(2 * x) + -4 * sig * (sig - 1.0) + } + } + + case class Relu() extends NonlinearFcn { + def fcn(idx: Int, x: Double) = Math.max(x, 0) + def deriv(idx: Int, x: Double) = if (x > 0) 1.0 else 0.0 + } + + case class Requ() extends NonlinearFcn { + def fcn(idx: Int, x: Double) = if (x > 0) x * x else 0.0 + def deriv(idx: Int, x: Double) = if (x > 0) 2 * x else 0.0 + } + + case class Cube() extends NonlinearFcn { + def fcn(idx: Int, x: Double) = x * x * x + def deriv(idx: Int, x: Double) = 3 * x * x + } + +} \ No newline at end of file diff --git a/src/main/scala/epic/dense/OutputEmbeddingTransform.scala b/src/main/scala/epic/dense/OutputEmbeddingTransform.scala new file mode 100644 index 00000000..d9d494e8 --- /dev/null +++ b/src/main/scala/epic/dense/OutputEmbeddingTransform.scala @@ -0,0 +1,155 @@ +package epic.dense + +import breeze.linalg._ +import breeze.linalg.operators.OpMulMatrix +import epic.features.SegmentedIndex +import epic.framework.Feature + +import scala.runtime.ScalaRunTime +import scala.util.Random + +/** + * Output embedding technique described in section 6 of + * http://www.eecs.berkeley.edu/~gdurrett/papers/durrett-klein-acl2015.pdf + * Basically learns a dictionary for the output as well as an affine transformation + * in order to produce the vector that gets combined with the input in the final + * bilinear product. 
+ */ +case class OutputEmbeddingTransform[FV](numOutputs: Int, outputDim: Int, innerTransform: Transform[FV, DenseVector[Double]], coarsenerForInitialization: Option[Int => Int] = None) extends OutputTransform[FV, DenseVector[Double]] { + + + val index = SegmentedIndex(new AffineTransform.Index(numOutputs, outputDim, true), + innerTransform.index) + + def extractLayerAndPenultimateLayer(weights: DenseVector[Double], forTrain: Boolean) = { + val embeddings = weights(index.componentOffset(0) until index.componentOffset(0) + (numOutputs * outputDim)).asDenseMatrix.reshape(numOutputs, outputDim, view = View.Require) + val bias = weights(index.componentOffset(0) + numOutputs * outputDim until index.componentOffset(1)) + val inner = innerTransform.extractLayer(weights(index.componentOffset(1) to -1), forTrain) + new OutputLayer(embeddings, bias, inner) -> inner + } + + def clipEmbeddingNorms(weights: DenseVector[Double]) { + val embeddings = weights(index.componentOffset(1) until index.componentOffset(1) + (numOutputs * outputDim)).asDenseMatrix.reshape(numOutputs, outputDim, view = View.Require) + OutputEmbeddingTransform.clipEmbeddingNorms(embeddings); + } + + def displayEmbeddingNorms(weights: DenseVector[Double]) { + val embeddings = weights(index.componentOffset(1) until index.componentOffset(1) + (numOutputs * outputDim)).asDenseMatrix.reshape(numOutputs, outputDim, view = View.Require) + OutputEmbeddingTransform.displayEmbeddingNorms(embeddings); + } + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = { + require(outputLayer) + val embeddingsInitialization = if (coarsenerForInitialization.isDefined) { + OutputEmbeddingTransform.getCoarsenedInitialEmbeddingWeights(numOutputs, outputDim, coarsenerForInitialization.get) + } else if (spec == "magic") { + AffineTransform.getMagicAffineWeights(index.indices(0).size, numOutputs, outputDim, initWeightsScale, rng) + } else if (spec == "identity") { + OutputEmbeddingTransform.getIdentityEmbeddingWeights(numOutputs, outputDim, rng) + } else { + AffineTransform.getGaussianAffineWeights(index.indices(0).size, initWeightsScale, rng) + } + // N.B. 
"true" because the next layer effectively becomes the output layer from the purposes of + // initialization + DenseVector.vertcat(embeddingsInitialization, + innerTransform.initialWeightVector(initWeightsScale, rng, true, spec)) + } + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) { + innerTransform.clipHiddenWeightVectors(weights(index.componentOffset(1) to -1), norm, false) + } + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] = { + (offset until offset + Math.min(10, index.indices(0).size)) ++ innerTransform.getInterestingWeightIndicesForGradientCheck(offset + index.indices(0).size) + } + + case class OutputLayer(embeddings: DenseMatrix[Double], bias: DenseVector[Double], innerLayer: innerTransform.Layer) extends OutputTransform.OutputLayer[FV,DenseVector[Double]] { + override val index = OutputEmbeddingTransform.this.index + + def activations(fv: FV) = { + val innerActs = innerLayer.activations(fv) + DenseVector(Array.tabulate(numOutputs)(i => activationsFromPenultimateDot(innerActs, i))) + } + + def activationsFromPenultimateDot(innerLayerActivations: DenseVector[Double], sparseIdx: Int) = { + innerLayerActivations dot embeddings(sparseIdx, ::).t + bias(sparseIdx) + } + + def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: FV) = { + val scale = _scale + val embeddingsDeriv = deriv(0 until numOutputs * outputDim).asDenseMatrix.reshape(numOutputs, outputDim, view = View.Require) + val biasDeriv = deriv(numOutputs * outputDim until index.componentOffset(1)) + val innerAct = innerLayer.activations(fv) + val innerScale = DenseVector(Array.tabulate(outputDim)(i => 0.0)) + for (k <- 0 until scale.size) { + // Assuming there's something nontrivial to pass back + if (scale(k) != 0.0) { + // Bias update + biasDeriv(k) += scale(k) + embeddingsDeriv(k, ::).t += innerAct * scale(k) // Embeddings update + innerScale += embeddings(k, ::).t * scale(k) + } + } + innerLayer.tallyDerivative(deriv(index.componentOffset(1) to -1), innerScale, fv) + } + + def applyBatchNormalization(inputs: scala.collection.GenTraversable[FV]) = innerLayer.applyBatchNormalization(inputs) + } + +} + +object OutputEmbeddingTransform { + + def getIdentityEmbeddingWeights(numOutputs: Int, outputDim: Int, rng: Random) = { + require(outputDim <= numOutputs, outputDim + " " + numOutputs) + val mat = DenseMatrix.zeros[Double](numOutputs, outputDim) + for (i <- 0 until outputDim) { + mat(i, i) = 1.0 + } + for (i <- outputDim until numOutputs) { + mat(i, rng.nextInt(outputDim)) = 1.0 + } + val biasInitializer = DenseVector.zeros[Double](numOutputs) + val initWeights = DenseVector.vertcat(DenseVector(mat.data), biasInitializer) + initWeights + } + + def clipEmbeddingNorms(embeddings: DenseMatrix[Double]) { + for (i <- 0 until embeddings.rows) { + var norm = 0.0 + for (j <- 0 until embeddings.cols) { + norm += embeddings(i, j) * embeddings(i, j) + } + norm = Math.sqrt(norm) + for (j <- 0 until embeddings.cols) { + embeddings(i, j) /= norm + } + } + } + + def displayEmbeddingNorms(embeddings: DenseMatrix[Double]) { + var avgNorm = 0.0 + var maxNorm = 0.0 + for (i <- 0 until embeddings.rows) { + var norm = 0.0 + for (j <- 0 until embeddings.cols) { + norm += embeddings(i, j) * embeddings(i, j) + } + norm = Math.sqrt(norm) + avgNorm += norm + maxNorm = Math.max(maxNorm, norm) + } + println("Average norm: " + avgNorm/embeddings.rows + ", max norm: " + maxNorm) + } + + def getCoarsenedInitialEmbeddingWeights(numOutputs: Int, outputDim: Int, 
coarsenerForInitialization: Int => Int) = { + val mat = DenseMatrix.zeros[Double](numOutputs, outputDim) + for (i <- 0 until numOutputs) { + val j = ((coarsenerForInitialization(i) % outputDim) + outputDim) % outputDim + mat(i, j) = 1.0 + } + val biasInitializer = DenseVector.zeros[Double](numOutputs) + val initWeights = DenseVector.vertcat(DenseVector(mat.data), biasInitializer) + initWeights + } +} diff --git a/src/main/scala/epic/dense/OutputTransform.scala b/src/main/scala/epic/dense/OutputTransform.scala new file mode 100644 index 00000000..fe64cb4c --- /dev/null +++ b/src/main/scala/epic/dense/OutputTransform.scala @@ -0,0 +1,47 @@ +package epic.dense + +import breeze.linalg._ +import breeze.util.Index +import epic.framework.Feature +import scala.util.Random + +trait OutputTransform[In, +Out] extends Serializable { + val index: Index[Feature] + + def extractLayer(dv: DenseVector[Double], forTrain: Boolean):OutputLayer = extractLayerAndPenultimateLayer(dv, forTrain)._1 + + def extractLayerAndPenultimateLayer(dv: DenseVector[Double], forTrain: Boolean): (OutputLayer, Transform.Layer[In,Out]); + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String): DenseVector[Double] + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int]; + + type OutputLayer <: OutputTransform.OutputLayer[In,Out] +} + +object OutputTransform { + + trait OutputLayer[In, +Out] extends Transform.Layer[In,Out] { + + def index: Index[Feature]; + + def activations(fv: In):Out + + def activationsFromPenultimateDot(innerLayerActivations: DenseVector[Double], sparseIdx: Int): Double; + + def activationsFromPenultimateDot(innerLayerActivations: DenseVector[Double], sparseFeatures: Array[Int]): Double = { + var value = 0.0; + for (sparseFeature <- sparseFeatures) { + value += activationsFromPenultimateDot(innerLayerActivations, sparseFeature) + } + value + } + + def tallyDerivative(deriv: DenseVector[Double], scale: =>Vector[Double], fv: In) + + def applyBatchNormalization(inputs: scala.collection.GenTraversable[In]) + } + +} diff --git a/src/main/scala/epic/dense/SigmoidTransform.scala b/src/main/scala/epic/dense/SigmoidTransform.scala deleted file mode 100644 index a2e5c33a..00000000 --- a/src/main/scala/epic/dense/SigmoidTransform.scala +++ /dev/null @@ -1,49 +0,0 @@ -package epic.dense - -import epic.framework.Feature -import breeze.linalg._ -import breeze.linalg.operators.OpMulMatrix -import breeze.numerics._ -import breeze.linalg.support.{CanMapValues} - -/** - * - * - * @author dlwh - */ -case class NeuralFeature(output: Int, input: Int) extends Feature -case class NeuralBias(input: Int) extends Feature - -case class SigmoidTransform[FV](inner: Transform[FV, DenseVector[Double]]) extends Transform[FV, DenseVector[Double]] { - def this(numOutputs: Int, numInputs: Int, - includeBias: Boolean = true) - (implicit mult: OpMulMatrix.Impl2[DenseMatrix[Double], FV, DenseVector[Double]], - canaxpy: scaleAdd.InPlaceImpl3[DenseVector[Double], Double, FV]) = this(AffineTransform.typed(numOutputs, numInputs, includeBias)) - - val index: inner.index.type = inner.index - - - def extractLayer(dv: DenseVector[Double]) = new Layer(inner.extractLayer(dv)) - - case class Layer(innerLayer: inner.Layer) extends _Layer { - - def activations(fv: FV): DenseVector[Double] = sigmoid(innerLayer.activations(fv)) - - def tallyDerivative(deriv: DenseVector[Double], _scale: 
=>Vector[Double], fv: FV) = { - val scale = _scale - val act = activations(fv) - act :*= (act - 1.0) - act :*= -1.0 - // whole function is f(sigmoid(transform(features))) - // scale(i) pushes in (f'(sigmoid(transform(features)))(i) so just need to finish the chain rule. - // activations(...) computes sigmoid(transform(features)) - // act is currently sigmoid'(transform(...)) - act :*= scale - - innerLayer.tallyDerivative(deriv, act, fv) - - } - - } - -} \ No newline at end of file diff --git a/src/main/scala/epic/dense/TanhTransform.scala b/src/main/scala/epic/dense/TanhTransform.scala index e3d8bdf2..92a052d5 100644 --- a/src/main/scala/epic/dense/TanhTransform.scala +++ b/src/main/scala/epic/dense/TanhTransform.scala @@ -3,7 +3,9 @@ package epic.dense import breeze.linalg._ import breeze.linalg.operators.OpMulMatrix import breeze.numerics._ - +import epic.framework.Feature +import breeze.util.Index +import scala.util.Random case class TanhTransform[FV](inner: Transform[FV, DenseVector[Double]]) extends Transform[FV, DenseVector[Double]] { def this(numOutputs: Int, numInputs: Int, @@ -13,10 +15,19 @@ case class TanhTransform[FV](inner: Transform[FV, DenseVector[Double]]) extends val index: inner.index.type = inner.index + def extractLayer(dv: DenseVector[Double], forTrain: Boolean) = new Layer(inner.extractLayer(dv, forTrain)) + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = inner.initialWeightVector(initWeightsScale, rng, false, spec) + + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) = inner.clipHiddenWeightVectors(weights, norm, false) + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] = inner.getInterestingWeightIndicesForGradientCheck(offset) - def extractLayer(dv: DenseVector[Double]) = new Layer(inner.extractLayer(dv)) - - case class Layer(innerLayer: inner.Layer) extends _Layer { + case class Layer(innerLayer: inner.Layer) extends Transform.Layer[FV,DenseVector[Double]] { + + val myIndex = Index[Feature] + + def index = myIndex; def activations(fv: FV): DenseVector[Double] = { val act = innerLayer.activations(fv) * 2.0 @@ -41,6 +52,8 @@ case class TanhTransform[FV](inner: Transform[FV, DenseVector[Double]]) extends innerLayer.tallyDerivative(deriv, act, fv) } + + def applyBatchNormalization(inputs: scala.collection.GenTraversable[FV]) = innerLayer.applyBatchNormalization(inputs) } diff --git a/src/main/scala/epic/dense/Transform.scala b/src/main/scala/epic/dense/Transform.scala index 16d03588..10fd55ee 100644 --- a/src/main/scala/epic/dense/Transform.scala +++ b/src/main/scala/epic/dense/Transform.scala @@ -3,28 +3,42 @@ package epic.dense import breeze.linalg._ import breeze.util.Index import epic.framework.Feature +import scala.util.Random /** * * * @author dlwh */ -trait Transform[In, +Out] { +trait Transform[In, +Out] extends Serializable { val index: Index[Feature] - def extractLayer(dv: DenseVector[Double]):Layer + def extractLayer(dv: DenseVector[Double], forTrain: Boolean):Layer + + def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String): DenseVector[Double] - type Layer <: _Layer + def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) + + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int]; + + type Layer <: Transform.Layer[In,Out] +} - trait _Layer { +object Transform { + + trait Layer[In, +Out] { - def index = Transform.this.index + def index: 
Index[Feature]; def activations(fv: In):Out def tallyDerivative(deriv: DenseVector[Double], scale: =>Vector[Double], fv: In) + def applyBatchNormalization(inputs: scala.collection.GenTraversable[In]) } } + +case class NeuralFeature(output: Int, input: Int) extends Feature +case class NeuralBias(input: Int) extends Feature diff --git a/src/main/scala/epic/dense/Word2Vec.scala b/src/main/scala/epic/dense/Word2Vec.scala new file mode 100644 index 00000000..fe79023b --- /dev/null +++ b/src/main/scala/epic/dense/Word2Vec.scala @@ -0,0 +1,220 @@ +package epic.dense + +import java.io.BufferedInputStream +import java.io.DataInputStream +import java.io.FileInputStream +import java.util.regex.Pattern +import scala.collection.mutable.HashMap +import scala.util.Random +import breeze.linalg.Counter +import java.io.File + +object Word2Vec { + /** + * Loads vectors from one or more sources in word2vecPaths; these might be in + * word2vec format (should end in .bin) or C+W/Bansal format (should end + * in .txt). + * + * For each word, vectors are appended from each source. If at least one source + * is present, others are zeroes. Otherwise, it gets a random vector. + */ + def smartLoadVectorsForVocabulary(word2vecPaths: Seq[String], voc: Set[String], vocCounts: Counter[String,Double] = Counter[String,Double], maxVectorLen: Int = Int.MaxValue, inputVectorBias: Boolean, randomizeUnks: Boolean = true) = { + val vectorsEachSource = for (word2vecPath <- word2vecPaths) yield { + if (word2vecPath.endsWith("bin")) { + readWord2Vec(word2vecPath, voc, false) + } else if (word2vecPath.endsWith(".txt")) { + readBansalEmbeddings(word2vecPath, voc, false) + } else { + throw new RuntimeException("Unrecognized vectors: " + word2vecPath) + } + } + val dimsEachSource = vectorsEachSource.map(_.values.head.size) + val finalVectorDim = Math.min(maxVectorLen, dimsEachSource.reduce(_ + _) + (if (inputVectorBias) 1 else 0)) + val finalVectors = new HashMap[String,Array[Float]] + val rng = new Random(0) + val mostCommonMisses = Counter[String,Double] + var numRand = 0 + for (word <- voc) { + val containedInSome = vectorsEachSource.map(_.keySet.contains(word)).reduce(_ || _) + val vector = if (containedInSome) { + var finalVector = (0 until vectorsEachSource.size).map(i => vectorsEachSource(i).getOrElse(word, { Array.tabulate(dimsEachSource(i))(j => 0.0F) })).reduce(_ ++ _) + if (inputVectorBias) { + finalVector = finalVector ++ Array(1.0F) + } + finalVector + } else { + mostCommonMisses(word) = vocCounts(word) + numRand += 1 + if (randomizeUnks) { + Array.tabulate(finalVectorDim)(i => if (i == finalVectorDim - 1 && inputVectorBias) 1.0F else ((rng.nextDouble - 0.5) * 0.5).toFloat) + } else { + Array.tabulate(finalVectorDim)(i => if (i == finalVectorDim - 1 && inputVectorBias) 1.0F else 0.0F) + } + } + val vectorTrimmed = if (vector.size > finalVectorDim) vector.slice(0, finalVectorDim) else vector + require(vectorTrimmed.size == finalVectorDim, "Mismatched sizes, expected dimension " + finalVectorDim + " but got " + vector.size + " clipped to " + vectorTrimmed.size) + finalVectors.put(word, vectorTrimmed) + } + println("Read embeddings for " + voc.size + " words from " + word2vecPaths.size + " sources, " + + "total embedding size = " + finalVectorDim + ", " + numRand + " present in no source") + println("Fifty most common misses: " + mostCommonMisses.argtopk(50).map(word => word + ": " + mostCommonMisses(word))) + finalVectors + } + + def makeRandomVectorsForVocabulary(voc: Set[String], dim: Int, inputVectorBias: Boolean) = { + 
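+    // Each word in the vocabulary gets a random vector whose entries are drawn uniformly
+    // from [-0.25, 0.25); if inputVectorBias is set, the last component is instead fixed to
+    // 1.0F so that downstream affine layers see a constant bias feature. The RNG is seeded
+    // with 0 so repeated runs produce identical "random" embeddings.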
val finalVectors = new HashMap[String,Array[Float]] + val finalVectorDim = dim + (if (inputVectorBias) 1 else 0) + val rng = new Random(0) + var numRand = 0 + for (word <- voc) { + val vec = Array.tabulate(finalVectorDim)(i => if (i == finalVectorDim - 1 && inputVectorBias) 1.0F else ((rng.nextDouble - 0.5) * 0.5).toFloat) + finalVectors.put(word, vec) + } + finalVectors + } + + /** + * Loads vectors for a vocabulary from word2vec, with OOV words having random vectors + * generated for them. + */ + def loadVectorsForVocabulary(word2vecPath: String, voc: Set[String], inputVectorBias: Boolean) = { + val word2vecMap = readWord2Vec(word2vecPath, voc, inputVectorBias); + if (word2vecMap.isEmpty) { + throw new RuntimeException("No word2vec vectors loaded") + } + augmentVectorsToCompleteVocabulary(word2vecMap, voc, inputVectorBias) + } + + def loadBansalVectorsForVocabulary(word2vecPath: String, voc: Set[String], inputVectorBias: Boolean) = { + val word2vecMap = readBansalEmbeddings(word2vecPath, voc, inputVectorBias); + if (word2vecMap.isEmpty) { + throw new RuntimeException("No Bansal vectors loaded") + } + augmentVectorsToCompleteVocabulary(word2vecMap, voc, inputVectorBias) + } + + private def augmentVectorsToCompleteVocabulary(word2vecMap: HashMap[String,Array[Float]], voc: Set[String], inputVectorBias: Boolean) = { + val word2vecDim = word2vecMap.values.head.size + val rng = new Random(0) + for (unkWord <- voc -- word2vecMap.keySet) { + // Set to random noise except for the bias feature, if it's there + word2vecMap.put(unkWord, Array.tabulate(word2vecDim)(i => if (i == word2vecDim - 1 && inputVectorBias) 1.0F else ((rng.nextDouble - 0.5) * 0.5).toFloat)) + } + word2vecMap + } + + /** + * Reads the vectors in words from the given word2vec path and augments with a bias feature + * if necessary. The returned map does not include entries for words that are not in the w2v + * file. 
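+ *
+ * The .bin layout read here is: two whitespace-terminated header tokens (vocabulary size
+ * and vector dimension), then one record per word consisting of the word string followed
+ * by dim little-endian floats. Words outside the requested set are skipped unless the set
+ * is empty, in which case every word is kept.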
+ */ + def readWord2Vec(word2VecPath: String, words: Set[String], inputVectorBias: Boolean) = { + val bis = new BufferedInputStream(new FileInputStream(word2VecPath)); + val dis = new DataInputStream(bis); + val word2Vec = new HashMap[String,Array[Float]]; + // First two entries are vocabulary size and dimension of vectors + val vocSize = Word2VecUtils.readString(dis).toInt; + val dim = Word2VecUtils.readString(dis).toInt; + // Now read vectors, augmented with 1s for bias + for (i <- 0 until vocSize) { + if (i % 1000000 == 0) { + println("On line " + i) + } + val word = Word2VecUtils.readString(dis); + val vector = new Array[Float](if (inputVectorBias) dim + 1 else dim); + val len = 0; + var j = 0; + while (j < dim) { + vector(j) = Word2VecUtils.readFloat(dis); + j += 1; + } + if (inputVectorBias) { + vector(j) = 1.0F + } + if (words.isEmpty || words.contains(word)) { + word2Vec.put(word, vector); + } + } + println("Loaded " + word2Vec.size + " word2vec representations out of " + words.size + " attempted words"); + word2Vec; + } + + val hyphenPattern = Pattern.compile("(\\w+-)+(\\w+)"); + + def convertWord(str: String, lowercase: Boolean = false) = { + var strRep = str; + strRep = strRep.replace("-LRB-", "(") + strRep = strRep.replace("-RRB-", ")") + strRep = strRep.replace("-LSB-", "[") + strRep = strRep.replace("-RSB-", "]") + strRep = strRep.replace("-LCB-", "{") + strRep = strRep.replace("-RCB-", "}") + // Replace all numbers with 15 + strRep = strRep.replaceAll("^-?[0-9,.]{2,15}$", "fifteen") + // Replace hyphenated words with the last part + val m = hyphenPattern .matcher(str) + strRep = if (m.find()) { + m.group(2) + } else { + strRep + } + if (lowercase) { + strRep = strRep.toLowerCase() + } + strRep + } + + def readBansalEmbeddings(embeddingsPath: String, words: Set[String], inputVectorBias: Boolean) = { + val inFile = scala.io.Source.fromFile(new File(embeddingsPath)).getLines() + val word2Vec = new HashMap[String,Array[Float]]; + var firstLine = true + while (inFile.hasNext) { + val line = inFile.next; + if (firstLine) { + if (line.split("\\s+").size == 2) { + println("Skipping first line: " + line) + // Just an indicator of how many words there are and the vector dim, so + // skip over it by leaving firstLine set to true + } else { + println("Not skipping first line: " + line) + firstLine = false; + } + } + if (!firstLine) { + // If the line contains a tab, then that's the delimiter between the word and + // the vectors + if (line.contains("\t")) { + val word = line.substring(0, line.indexOf("\t")); + if (words.isEmpty || words.contains(word)) { + val entries = line.substring(line.indexOf("\t") + 1).split(" ") + val arr = Array.tabulate(if (inputVectorBias) entries.size + 1 else entries.size)(i => { + if (inputVectorBias && i == entries.size) { + 1.0F + } else { + entries(i).toFloat + } + }) + word2Vec.put(word, arr) + } + } else { + // Otherwise, a space is the first delimiter + val word = line.substring(0, line.indexOf(" ")); + if (words.isEmpty || words.contains(word)) { + val entries = line.substring(line.indexOf(" ") + 1).split(" "); + val arr = Array.tabulate(if (inputVectorBias) entries.size + 1 else entries.size)(i => { + if (inputVectorBias && i == entries.size) { + 1.0F + } else { + entries(i).toFloat + } + }) + word2Vec.put(word, arr) + } + } + } + firstLine = false; + } + println("Loaded " + word2Vec.size + " Bansal representations out of " + words.size + " attempted words"); + word2Vec; + } +} \ No newline at end of file diff --git 
a/src/main/scala/epic/dense/Word2VecSurfaceFeaturizer.scala b/src/main/scala/epic/dense/Word2VecSurfaceFeaturizer.scala new file mode 100644 index 00000000..eef2abaf --- /dev/null +++ b/src/main/scala/epic/dense/Word2VecSurfaceFeaturizer.scala @@ -0,0 +1,233 @@ +package epic.dense + +import breeze.linalg.DenseVector +import scala.collection.mutable.HashMap +import breeze.util.Index +import breeze.linalg.DenseMatrix +import breeze.linalg.Counter2 +import breeze.linalg.Counter +import epic.features.HackyLexicalProductionFeaturizer +import breeze.linalg.sum +import epic.features.RuleBasedHackyHeadFinder +import epic.features.HackyHeadFinder +import epic.parser.RuleTopology +import epic.trees.AnnotatedLabel + +/** + * converter is used to map words into the word2vec vocabulary, which might include things + * like lowercasing, replacing numbers, changing -LRB-, etc. See Word2Vec.convertWord + */ +class Word2VecIndexed[W](private val wordIndex: Index[W], + private val word2vec: Array[Array[Double]], + private val converter: W => W) extends Serializable { + + def wordRepSize = word2vec.head.size + def vocSize = wordIndex.size + + val zeroVector = Array.tabulate(wordRepSize)(i => 0.0) + + def containsWord(rawStr: W) = wordIndex.contains(converter(rawStr)) + + def indexWord(rawStr: W) = wordIndex(converter(rawStr)) + + def convertIndexToVector(idx: Int) = word2vec(idx) + + private def assemble(vectors: Seq[Array[Double]]) = vectors.reduce(_ ++ _) + + def convertToVector(indexedWords: Array[Int]): Array[Double] = { + assemble(indexedWords.map(wordIdx => if (wordIdx == -1) zeroVector else word2vec(wordIdx))) + } + + def augment(numSparseFeats: Int, featurizer: W => Array[Int]): Word2VecIndexed[W] = { + val newWord2Vec = Array.tabulate(word2vec.size)(i => { + val word = wordIndex.get(i) + val feats = featurizer(word) + word2vec(i) ++ Array.tabulate(numSparseFeats)(j => if (feats.contains(j)) 1.0 else 0.0) + }) + new Word2VecIndexed(wordIndex, newWord2Vec, converter) + } +} + +object Word2VecIndexed { + + def apply[W](word2vec: HashMap[W,Array[Double]], + converter: W => W) = { + val index = Index[W] + val arr = new Array[Array[Double]](word2vec.size) + for (word <- word2vec.keySet) { + arr(index.index(word)) = word2vec(word) + } + new Word2VecIndexed(index, arr, converter) + } +} + +trait WordVectorAnchoringIndexed[String] { + def reducedFeaturesForSpan(start: Int, end: Int): Array[Int]; + def featuresForSpan(start: Int, end: Int): Array[Int]; + def featuresForSplit(start: Int, split: Int, end: Int): Array[Int]; +} + +class Word2VecSurfaceFeaturizerIndexed[W](val word2vecIndexed: Word2VecIndexed[W], + val featureSpec: String) extends Serializable { + + def reducedInputSize = { + anchor(IndexedSeq[W]()).reducedFeaturesForSpan(0, 0).size * word2vecIndexed.wordRepSize + } + + def splitInputSize = { + anchor(IndexedSeq[W]()).featuresForSplit(0, 0, 0).size * word2vecIndexed.wordRepSize + } + + def anchor(words: IndexedSeq[W]): WordVectorAnchoringIndexed[W] = { + val indexedWords = words.map(word2vecIndexed.indexWord(_)) + new WordVectorAnchoringIndexed[W] { + + def reducedFeaturesForSpan(start: Int, end: Int) = { + if (featureSpec == "" || featureSpec == "moresplit" || featureSpec == "basic") { + Array(fetchWord(start - 1), fetchWord(start), fetchWord(end - 1), fetchWord(end)) + } else if (featureSpec == "morecontext") { + Array(fetchWord(start - 2), fetchWord(start - 1), fetchWord(start), fetchWord(end - 1), fetchWord(end), fetchWord(end + 1)) + } else if (featureSpec == "morefirstlast") { + 
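+          // "morefirstlast": the word before the span, the first two words in the span,
+          // the last two words in the span, and the word just past the end (spans are
+          // treated as half-open intervals [start, end)).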
Array(fetchWord(start - 1), fetchWord(start), fetchWord(start + 1), fetchWord(end - 2), fetchWord(end - 1), fetchWord(end)) + } else if (featureSpec == "mcmfl") { + Array(fetchWord(start - 2), fetchWord(start - 1), fetchWord(start), fetchWord(start + 1), fetchWord(end - 2), fetchWord(end - 1), fetchWord(end), fetchWord(end + 1)) + } else if (featureSpec == "most") { + Array(fetchWord(start - 2), fetchWord(start - 1), fetchWord(start), fetchWord(start + 1), fetchWord(end - 2), fetchWord(end - 1), fetchWord(end), fetchWord(end + 1)) + } else { + throw new RuntimeException("Unknown featureSpec: " + featureSpec) + } + } + + def featuresForSpan(start: Int, end: Int) = { + if (featureSpec == "" || featureSpec == "basic") { + Array(fetchWord(start - 1), fetchWord(start), -1, -1, fetchWord(end - 1), fetchWord(end)) + } else if (featureSpec == "morecontext") { + Array(fetchWord(start - 2), fetchWord(start - 1), fetchWord(start), -1, -1, fetchWord(end - 1), fetchWord(end), fetchWord(end + 1)) + } else if (featureSpec == "morefirstlast") { + Array(fetchWord(start - 1), fetchWord(start), fetchWord(start + 1), -1, -1, fetchWord(end - 2), fetchWord(end - 1), fetchWord(end)) + } else if (featureSpec == "moresplit") { + Array(fetchWord(start - 1), fetchWord(start), -1, -1, -1, -1, fetchWord(end - 1), fetchWord(end)) + } else if (featureSpec == "mcmfl") { + Array(fetchWord(start - 2), fetchWord(start - 1), fetchWord(start), fetchWord(start + 1), -1, -1, fetchWord(end - 2), fetchWord(end - 1), fetchWord(end), fetchWord(end + 1)) + } else if (featureSpec == "most") { + Array(fetchWord(start - 2), fetchWord(start - 1), fetchWord(start), fetchWord(start + 1), -1, -1, -1, -1, fetchWord(end - 2), fetchWord(end - 1), fetchWord(end), fetchWord(end + 1)) + } else { + throw new RuntimeException("Unknown featureSpec: " + featureSpec) + } + } + + def featuresForSplit(start: Int, split: Int, end: Int) = { + if (featureSpec == "" || featureSpec == "basic") { + Array(fetchWord(start - 1), fetchWord(start), fetchWord(split - 1), fetchWord(split), fetchWord(end - 1), fetchWord(end)) + } else if (featureSpec == "morecontext") { + Array(fetchWord(start - 2), fetchWord(start - 1), fetchWord(start), fetchWord(split - 1), fetchWord(split), fetchWord(end - 1), fetchWord(end), fetchWord(end + 1)) + } else if (featureSpec == "morefirstlast") { + Array(fetchWord(start - 1), fetchWord(start), fetchWord(start + 1), fetchWord(split - 1), fetchWord(split), fetchWord(end - 2), fetchWord(end - 1), fetchWord(end)) + } else if (featureSpec == "moresplit") { + Array(fetchWord(start - 1), fetchWord(start), fetchWord(split - 2), fetchWord(split - 1), fetchWord(split), fetchWord(split + 1), fetchWord(end - 1), fetchWord(end)) + } else if (featureSpec == "mcmfl") { + Array(fetchWord(start - 2), fetchWord(start - 1), fetchWord(start), fetchWord(start + 1), fetchWord(split - 1), fetchWord(split), fetchWord(end - 2), fetchWord(end - 1), fetchWord(end), fetchWord(end + 1)) + } else if (featureSpec == "most") { + Array(fetchWord(start - 2), fetchWord(start - 1), fetchWord(start), fetchWord(start + 1), fetchWord(split - 2), fetchWord(split - 1), fetchWord(split), fetchWord(split + 1), fetchWord(end - 2), fetchWord(end - 1), fetchWord(end), fetchWord(end + 1)) + } else { + throw new RuntimeException("Unknown featureSpec: " + featureSpec) + } + } + + private def fetchWord(idx: Int): Int = { + if (idx < 0 || idx >= words.size) -1 else indexedWords(idx) + } + } + } +} + +object Word2VecSurfaceFeaturizerIndexed { + + def makeVectFromParams(wordIndices: 
Array[Int], params: DenseMatrix[Double]): DenseVector[Double] = { + var currVect = DenseVector[Double](Array[Double]()) + for (wordIndex <- wordIndices) { + currVect = DenseVector.vertcat(currVect, params(wordIndex, ::).t) + } + currVect + } +} + + +trait WordVectorDepAnchoringIndexed[String] { + def getHeadDepPair(begin: Int, split: Int, end: Int, rule: Int): (Int, Int); + def featuresForHeadPair(head: Int, dep: Int): Array[Int]; +} + +class Word2VecDepFeaturizerIndexed[W](val word2VecIndexed: Word2VecIndexed[W], + val tagger: Tagger[W], + val topology: RuleTopology[AnnotatedLabel]) extends Serializable { + + val hackyHeadFinder: HackyHeadFinder[String,String] = new RuleBasedHackyHeadFinder + + def anchor(words: IndexedSeq[W]): WordVectorDepAnchoringIndexed[W] = { + val indexedWords = words.map(word2VecIndexed.indexWord(_)) + new WordVectorDepAnchoringIndexed[W] { + + val preterminals = new Array[String](words.size); + for (i <- 0 until words.size) { + preterminals(i) = tagger.tag(words(i)); + } + + def getHeadDepPair(begin: Int, split: Int, end: Int, rule: Int): (Int, Int) = { + val lc = topology.labelIndex.get(topology.leftChild(rule)).baseLabel; + val rc = topology.labelIndex.get(topology.rightChild(rule)).baseLabel; + val parent = topology.labelIndex.get(topology.parent(rule)).baseLabel; + + val lcHeadIdx = begin + hackyHeadFinder.findHead(lc, preterminals.slice(begin, split)); + val rcHeadIdx = split + hackyHeadFinder.findHead(rc, preterminals.slice(split, end)); + val overallHeadIdx = begin + hackyHeadFinder.findHead(parent, preterminals.slice(begin, end)) + if (overallHeadIdx == rcHeadIdx) { + (rcHeadIdx, lcHeadIdx) + } else { + (lcHeadIdx, rcHeadIdx) + } + } + + def featuresForHeadPair(head: Int, dep: Int) = { + Array(fetchWord(head - 1), fetchWord(head), fetchWord(head+1), fetchWord(dep-1), fetchWord(dep), fetchWord(dep+1)) + } + + private def fetchWord(idx: Int): Int = { + if (idx < 0 || idx >= words.size) -1 else indexedWords(idx) + } + } + } +} + +trait Tagger[W] { + def tag(word: W): String +} + +class FrequencyTagger[W](wordTagCounts: Counter2[String, W, Double]) extends Tagger[W] with Serializable { + + private val wordCounts = Counter[W,Double]; + private val wordToTagMap = new HashMap[W,String]; + for (word <- wordTagCounts.keysIterator.map(_._2).toSeq.distinct) { + wordCounts(word) = sum(wordTagCounts(::, word)); + if (!wordToTagMap.contains(word)) { + val tagCounts = wordTagCounts(::, word).iterator; + var bestTag = HackyLexicalProductionFeaturizer.UnkTag; + var bestTagCount = 0.0; + for ((tag, count) <- tagCounts) { + if (count > bestTagCount) { + bestTag = tag; + bestTagCount = count; + } + } + wordToTagMap.put(word, bestTag); + } + } + val tagTypesIdx = Index[String] + wordToTagMap.values.toSet[String].foreach(tagType => tagTypesIdx.index(tagType)) + tagTypesIdx.index(HackyLexicalProductionFeaturizer.UnkTag) + + def tag(word: W) = if (wordToTagMap.contains(word)) wordToTagMap(word) else HackyLexicalProductionFeaturizer.UnkTag; + + def convertToFeaturizer: W => Array[Int] = (word: W) => Array(tagTypesIdx.index(tag(word))) +} + diff --git a/src/main/scala/epic/dense/Word2VecUtils.java b/src/main/scala/epic/dense/Word2VecUtils.java new file mode 100644 index 00000000..687bf2f1 --- /dev/null +++ b/src/main/scala/epic/dense/Word2VecUtils.java @@ -0,0 +1,51 @@ +package epic.dense; + +import java.io.DataInputStream; +import java.io.IOException; +import java.io.InputStream; + +/** + * Utilities from + * https://gist.github.com/ansjsun/6304960 + * + * @author 
gdurrett + * + */ +public class Word2VecUtils { + + private static final int MAX_SIZE = 50; + + public static String readString(DataInputStream dis) throws IOException { + byte[] bytes = new byte[MAX_SIZE]; + byte b = dis.readByte(); + int i = -1; + StringBuilder sb = new StringBuilder(); + while (b != 32 && b != 10) { + i++; + bytes[i] = b; + b = dis.readByte(); + if (i == 49) { + sb.append(new String(bytes)); + i = -1; + bytes = new byte[MAX_SIZE]; + } + } + sb.append(new String(bytes, 0, i + 1)); + return sb.toString(); + } + + public static float readFloat(InputStream is) throws IOException { + byte[] bytes = new byte[4]; + is.read(bytes); + return getFloat(bytes); + } + + public static float getFloat(byte[] b) { + int accum = 0; + accum = accum | (b[0] & 0xff) << 0; + accum = accum | (b[1] & 0xff) << 8; + accum = accum | (b[2] & 0xff) << 16; + accum = accum | (b[3] & 0xff) << 24; + return Float.intBitsToFloat(accum); + } +} diff --git a/src/main/scala/epic/features/HackyHeadFinder.scala b/src/main/scala/epic/features/HackyHeadFinder.scala index 3d6f2dd6..3310ecd9 100644 --- a/src/main/scala/epic/features/HackyHeadFinder.scala +++ b/src/main/scala/epic/features/HackyHeadFinder.scala @@ -7,7 +7,7 @@ import scala.collection.mutable.HashMap * HackyHeadFinders find "heads" in a span using only preterminal labels. * It doesn't use the syntactic structure of the sentence. * - * @author gdurret + * @author gdurrett * @tparam L * @tparam T */ diff --git a/src/main/scala/epic/framework/ModelObjective.scala b/src/main/scala/epic/framework/ModelObjective.scala index 6ad23d81..a3f4cc68 100644 --- a/src/main/scala/epic/framework/ModelObjective.scala +++ b/src/main/scala/epic/framework/ModelObjective.scala @@ -9,6 +9,8 @@ import collection.parallel.ForkJoinTaskSupport import concurrent.forkjoin.ForkJoinPool import com.typesafe.scalalogging.slf4j.LazyLogging import epic.util.{SafeLogging, CacheBroker} +import epic.trees.AnnotatedLabel +import epic.trees.TreeInstance /** * The objective function for training a [[epic.framework.Model]]. 
Selects @@ -28,14 +30,20 @@ class ModelObjective[Datum](val model: Model[Datum], // Selects a set of data to use protected def select(batch: IndexedSeq[Int]):GenTraversable[Datum] = batchSelector(batch) - + def initialWeightVector(randomize: Boolean): DenseVector[Double] = { + initialWeightVector(randomize, 1E-3) + } + + def initialWeightVector(randomize: Boolean, scale: Double): DenseVector[Double] = { val v = model.readCachedFeatureWeights() match { case Some(vector) => vector case None => Encoder.fromIndex(featureIndex).tabulateDenseVector(f => model.initialValueForFeature(f)) } if(randomize) { - v += (DenseVector.rand(numFeatures) * 2E-3 - 1E-3) + // Control the seed of the RNG for the weights + val rng = new scala.util.Random(0) + v += DenseVector(Array.tabulate(numFeatures)(i => rng.nextDouble * 2.0 * scale - scale)) } v } @@ -55,7 +63,8 @@ class ModelObjective[Datum](val model: Model[Datum], val inference = inferenceFromWeights(x) val timeIn = System.currentTimeMillis() val success = new AtomicInteger(0) - val finalCounts = select(batch).aggregate(null:model.ExpectedCounts)({ ( _countsSoFar,datum) => + val minibatch = select(batch) + val finalCounts = minibatch.aggregate(null:model.ExpectedCounts)({ ( _countsSoFar,datum) => try { val countsSoFar:model.ExpectedCounts = if (_countsSoFar ne null) _countsSoFar else emptyCounts model.accumulateCounts(inference, datum, countsSoFar, 1.0) diff --git a/src/main/scala/epic/parser/models/NeuralParserTrainer.scala b/src/main/scala/epic/parser/models/NeuralParserTrainer.scala new file mode 100644 index 00000000..cbf78059 --- /dev/null +++ b/src/main/scala/epic/parser/models/NeuralParserTrainer.scala @@ -0,0 +1,234 @@ +package epic.parser.models + +import java.io.File + +import com.typesafe.scalalogging.slf4j.LazyLogging + +import breeze.config.Help +import breeze.linalg._ +import breeze.optimize._ +import breeze.optimize.FirstOrderMinimizer.OptParams +import breeze.util._ +import breeze.util.Implicits._ +import epic.constraints.CachedChartConstraintsFactory +import epic.constraints.ChartConstraints +import epic.dense.AdadeltaGradientDescentDVD +import epic.framework._ +import epic.parser._ +import epic.parser.ParseEval.Statistics +import epic.parser.ParserParams.XbarGrammar +import epic.parser.projections.OracleParser +import epic.parser.projections.ParserChartConstraintsFactory +import epic.trees.AnnotatedLabel +import epic.trees.TreeInstance +import epic.trees.annotations._ +import epic.util.CacheBroker + + +/** + * The main entry point for training discriminative parsers. + * Has a main method inherited from ParserPipeline. + * Use --help to see options, or just look at the Params class. + * + * + */ +object NeuralParserTrainer extends epic.parser.ParserPipeline with LazyLogging { + + case class ExtraPTParams(momentum: Double = 0.95, + computeTrainLL: Boolean = true) + + case class Params(@Help(text="Details about the parser to build") + modelFactory: PositionalNeuralModelFactory, + @Help(text="Name for the parser for saving and logging. will be inferrred if not provided.") + name: String = null, + implicit val cache: CacheBroker, + @Help(text="path for a baseline parser for computing constraints. 
will be built automatically if not provided.") + parser: File = null, + opt: OptParams, + @Help(text="How often to run on the dev set.") + iterationsPerEval: Int = 100, + @Help(text="How many iterations to run.") + maxIterations: Int = 1002, + @Help(text="How often to look at a small set of the dev set.") + iterPerValidate: Int = 30, + @Help(text="How many threads to use, default is to use whatever Scala thinks is best.") + threads: Int = -1, + @Help(text="Scale of random weight initialization") + initWeightsScale: Double = 1E-2, + @Help(text="String to specify fancier initialization types based on fan-in/fan-out") + initializerSpec: String = "", + @Help(text="True if we should determinimize training (remove randomness associated with random minibatches)") + determinizeTraining: Boolean = false, + @Help(text="True if we should train two models and ram them together") + ensemble: Boolean = false, + @Help(text="Use Adadelta for optimiziation instead of Adagrad") + useAdadelta: Boolean = true, + @Help(text="Should we enforce reachability? Can be useful if we're pruning the gold tree.") + enforceReachability: Boolean = true, + @Help(text="Whether or not we use constraints. Not using constraints is very slow.") + useConstraints: Boolean = true, + @Help(text="Should we check the gradient to make sure it's coded correctly?") + checkGradient: Boolean = false, + @Help(text="check specific indices, in addition to doing a full search.") + checkGradientsAt: String = null, + @Help(text="check specific indices, in addition to doing a full search.") + maxParseLength: Int = 70, + annotator: TreeAnnotator[AnnotatedLabel, String, AnnotatedLabel] = GenerativeParser.defaultAnnotator(), + extraPTParams: ExtraPTParams = ExtraPTParams()) + protected val paramManifest = manifest[Params] + + def trainParser( trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], + validate: (Parser[AnnotatedLabel, String]) => Statistics, params: Params) = { + import params._ + import extraPTParams._ + +// if(threads >= 1) +// collection.parallel.ForkJoinTasks.defaultForkJoinPool.setParallelism(params.threads) + + val initialParser = params.parser match { + case null => + val (grammar, lexicon) = XbarGrammar().xbarGrammar(trainTrees) + GenerativeParser.annotatedParser(grammar, lexicon, annotator, trainTrees) +// GenerativeParser.annotatedParser(grammar, lexicon, Xbarize(), trainTrees) + case f => + readObject[Parser[AnnotatedLabel, String]](f) + } + + val constraints = { + + val maxMarginalized = initialParser.copy(marginalFactory=initialParser.marginalFactory match { + case StandardChartFactory(ref, mm) => StandardChartFactory(ref, maxMarginal = true) + case x => x + }) + + val uncached = new ParserChartConstraintsFactory[AnnotatedLabel, String](maxMarginalized, {(_:AnnotatedLabel).isIntermediate}) + new CachedChartConstraintsFactory[AnnotatedLabel, String](uncached) + } + + var theTrees = trainTrees.toIndexedSeq.filterNot(sentTooLong(_, params.maxParseLength)) + + if(useConstraints && enforceReachability) { + val treebankGrammar = GenerativeParser.annotated(initialParser.topology, initialParser.lexicon, TreeAnnotator.identity, trainTrees) + val markovizedGrammar = GenerativeParser.annotated(initialParser.topology, initialParser.lexicon, annotator, trainTrees) + val proj = new OracleParser(treebankGrammar, markovizedGrammar) + theTrees = theTrees.par.map(ti => ti.copy(tree=proj.forTree(ti.tree, ti.words, constraints.constraints(ti.words)))).seq.toIndexedSeq + } + + val baseMeasure = if(useConstraints) { + constraints + } 
else { + ChartConstraints.Factory.noSparsity[AnnotatedLabel, String] + } + + println("Building model") + val model = modelFactory.make(theTrees, initialParser.topology, initialParser.lexicon, constraints) + val obj = new ModelObjective(model, theTrees, params.threads) + val cachedObj = new CachedBatchDiffFunction(obj) + println("Initializing weights custom for model " + model.getClass) + val init = model.initialWeightVector(initWeightsScale, initializerSpec) + if(checkGradient) { + val cachedObj2 = new CachedBatchDiffFunction(new ModelObjective(model, theTrees.take(opt.batchSize), params.threads)) + val defaultIndices = (0 until 10).map(i => if(i < 0) model.featureIndex.size + i else i) + val indices = if (model.transforms.size > 0) { + model.transforms(0).getInterestingWeightIndicesForGradientCheck(0) + } else { + defaultIndices + } + println("testIndices: " + indices) + GradientTester.testIndices(cachedObj2, init, indices, toString={(i: Int) => model.featureIndex.get(i).toString}, skipZeros = true) + println("test") + GradientTester.test(cachedObj2, init, toString={(i: Int) => model.featureIndex.get(i).toString}, skipZeros = false) + } + + type OptState = FirstOrderMinimizer[DenseVector[Double], BatchDiffFunction[DenseVector[Double]]]#State + def evalAndCache(pair: (OptState, Int)) { + val (state, iter) = pair + val weights = state.x + if (iter % iterPerValidate == 0) { + logger.info("Validating...") + val parser = model.extractParser(weights) + val stats = validate(parser) + logger.info("Overall statistics for validation: " + stats) + } + } + + + val name = Option(params.name).orElse(Option(model.getClass.getSimpleName).filter(_.nonEmpty)).getOrElse("DiscrimParser") + val itr: Iterator[FirstOrderMinimizer[DenseVector[Double], BatchDiffFunction[DenseVector[Double]]]#State] = if (determinizeTraining) { + val scanningBatchesObj = cachedObj.withScanningBatches(params.opt.batchSize) + if (useAdadelta) { + println("OPTIMIZATION: Adadelta") + new AdadeltaGradientDescentDVD(params.opt.maxIterations, momentum).iterations(scanningBatchesObj, init). + asInstanceOf[Iterator[FirstOrderMinimizer[DenseVector[Double], BatchDiffFunction[DenseVector[Double]]]#State]] + } else { + println("OPTIMIZATION: Adagrad") + params.opt.iterations(scanningBatchesObj, init).asInstanceOf[Iterator[FirstOrderMinimizer[DenseVector[Double], BatchDiffFunction[DenseVector[Double]]]#State]] + } + } else { + if (useAdadelta) { + println("OPTIMIZATION: Adadelta") + new AdadeltaGradientDescentDVD(params.opt.maxIterations, momentum).iterations(cachedObj.withRandomBatches(params.opt.batchSize), init). + asInstanceOf[Iterator[FirstOrderMinimizer[DenseVector[Double], BatchDiffFunction[DenseVector[Double]]]#State]] + } else { + println("OPTIMIZATION: Adagrad") + params.opt.iterations(cachedObj, init) + } + } + if (ensemble) { + val weights1 = itr.take(maxIterations).last.x + // Hard-wired to use Adadelta + val initParams2 = model.initialWeightVector(initWeightsScale, initializerSpec, trulyRandom = true) + val itr2 = new AdadeltaGradientDescentDVD(params.opt.maxIterations).iterations(cachedObj.withRandomBatches(params.opt.batchSize), initParams2). 
+ asInstanceOf[Iterator[FirstOrderMinimizer[DenseVector[Double], BatchDiffFunction[DenseVector[Double]]]#State]] + println("Optimizing second parser") + val weights2 = itr2.take(maxIterations).last.x + println("Optimized both parsers") + val clonedModel = model.cloneModelForEnsembling + val mergedWeights = model.mergeWeightsForEnsembling(weights1, weights2) + Seq(("ComboParser-Final", clonedModel.extractParser(mergedWeights))).iterator + } else { + // Normal execution + for ((state, iter) <- itr.take(maxIterations).zipWithIndex.tee(evalAndCache _) + if iter != 0 && iter % iterationsPerEval == 0 || evaluateNow) yield try { + // N.B. This may be wrong for batch normalization + val parser = model.extractParser(state.x) + if (iter + iterationsPerEval >= maxIterations && computeTrainLL) { + computeLL(trainTrees, model, state.x) + } + (s"$name-$iter", parser) + } catch { + case e: Exception => e.printStackTrace(); throw e + } + } + } + + def sentTooLong(p: TreeInstance[AnnotatedLabel, String], maxLength: Int): Boolean = { + p.words.count(x => x == "'s" || x(0).isLetterOrDigit) > maxLength + } + + def evaluateNow = { + val sentinel = new File("EVALUATE_NOW") + if(sentinel.exists()) { + sentinel.delete() + logger.info("Evaluating now!!!!") + true + } else { + false + } + } + + def computeLL(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], model: PositionalNeuralModel[AnnotatedLabel,AnnotatedLabel,String], weights: DenseVector[Double]) { + println("Computing final log likelihood on the whole training set...") + val inf = model.inferenceFromWeights(weights).forTesting + val ll = trainTrees.par.aggregate(0.0)((currLL, trainTree) => { + try { + val s = inf.scorer(trainTree) + currLL + inf.goldMarginal(s, trainTree).logPartition - inf.marginal(s, trainTree).logPartition + } catch { + case e: Exception => println("Couldn't parse") + currLL + } + }, _ + _) + println("Log likelihood on " + trainTrees.size + " examples: " + ll) + } +} diff --git a/src/main/scala/epic/parser/models/ParserTrainer.scala b/src/main/scala/epic/parser/models/ParserTrainer.scala index fcfbc4c2..931da01f 100644 --- a/src/main/scala/epic/parser/models/ParserTrainer.scala +++ b/src/main/scala/epic/parser/models/ParserTrainer.scala @@ -35,6 +35,7 @@ import epic.parser.ParseEval.Statistics import epic.features.LongestFrequentSuffixFeaturizer.LongestFrequentSuffix import epic.features.LongestFrequentSuffixFeaturizer import epic.util.Optional +import epic.dense.AdadeltaGradientDescentDVD /** * The main entry point for training discriminative parsers. @@ -53,6 +54,10 @@ object ParserTrainer extends epic.parser.ParserPipeline with LazyLogging { @Help(text="path for a baseline parser for computing constraints. 
will be built automatically if not provided.") parser: File = null, opt: OptParams, + @Help(text="Use Adadelta instead of Adagrad (hardcoded in here...)") + useAdadelta: Boolean = false, + @Help(text="Make training batches deterministic; useful for debugging / regression testing") + determinizeTraining: Boolean = false, @Help(text="How often to run on the dev set.") iterationsPerEval: Int = 100, @Help(text="How many iterations to run.") @@ -71,8 +76,10 @@ object ParserTrainer extends epic.parser.ParserPipeline with LazyLogging { checkGradient: Boolean = false, @Help(text="check specific indices, in addition to doing a full search.") checkGradientsAt: String = null, - @Help(text="check specific indices, in addition to doing a full search.") + @Help(text="Max parse length") maxParseLength: Int = 70, + @Help(text="Compute log likelihood on the training set") + computeTrainLL: Boolean = true, annotator: TreeAnnotator[AnnotatedLabel, String, AnnotatedLabel] = GenerativeParser.defaultAnnotator()) protected val paramManifest = manifest[Params] @@ -124,8 +131,10 @@ object ParserTrainer extends epic.parser.ParserPipeline with LazyLogging { val init = obj.initialWeightVector(randomize) if(checkGradient) { val cachedObj2 = new CachedBatchDiffFunction(new ModelObjective(model, theTrees.take(opt.batchSize), params.threads)) - val indices = (0 until 10).map(i => if(i < 0) model.featureIndex.size + i else i) + val indices = (0 until 10).map(i => if(i < 0) model.featureIndex.size + i else i) + println("testIndices: " + indices) GradientTester.testIndices(cachedObj2, obj.initialWeightVector(randomize = true), indices, toString={(i: Int) => model.featureIndex.get(i).toString}, skipZeros = true) + println("test") GradientTester.test(cachedObj2, obj.initialWeightVector(randomize = true), toString={(i: Int) => model.featureIndex.get(i).toString}, skipZeros = false) } @@ -143,9 +152,32 @@ object ParserTrainer extends epic.parser.ParserPipeline with LazyLogging { val name = Option(params.name).orElse(Option(model.getClass.getSimpleName).filter(_.nonEmpty)).getOrElse("DiscrimParser") - for ((state, iter) <- params.opt.iterations(cachedObj, init).take(maxIterations).zipWithIndex.tee(evalAndCache _) + val itr: Iterator[FirstOrderMinimizer[DenseVector[Double], BatchDiffFunction[DenseVector[Double]]]#State] = if (determinizeTraining) { + val scanningBatchesObj = cachedObj.withScanningBatches(params.opt.batchSize) + if (useAdadelta) { + println("OPTIMIZATION: Adadelta") + new AdadeltaGradientDescentDVD(params.opt.maxIterations).iterations(scanningBatchesObj, init). + asInstanceOf[Iterator[FirstOrderMinimizer[DenseVector[Double], BatchDiffFunction[DenseVector[Double]]]#State]] + } else { + println("OPTIMIZATION: Adagrad") + params.opt.iterations(scanningBatchesObj, init).asInstanceOf[Iterator[FirstOrderMinimizer[DenseVector[Double], BatchDiffFunction[DenseVector[Double]]]#State]] + } + } else { + if (useAdadelta) { + println("OPTIMIZATION: Adadelta") + new AdadeltaGradientDescentDVD(params.opt.maxIterations).iterations(cachedObj.withRandomBatches(params.opt.batchSize), init). 
+ asInstanceOf[Iterator[FirstOrderMinimizer[DenseVector[Double], BatchDiffFunction[DenseVector[Double]]]#State]] + } else { + println("OPTIMIZATION: Adagrad") + params.opt.iterations(cachedObj, init) + } + } + for ((state, iter) <- itr.take(maxIterations).zipWithIndex.tee(evalAndCache _) if iter != 0 && iter % iterationsPerEval == 0 || evaluateNow) yield try { val parser = model.extractParser(state.x) + if (iter + iterationsPerEval >= maxIterations && computeTrainLL) { + computeLL(trainTrees, model, state.x) + } (s"$name-$iter", parser) } catch { case e: Exception => e.printStackTrace(); throw e @@ -167,6 +199,21 @@ object ParserTrainer extends epic.parser.ParserPipeline with LazyLogging { } } + + def computeLL(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], model: Model[TreeInstance[AnnotatedLabel, String]], weights: DenseVector[Double]) { + println("Computing final log likelihood on the whole training set...") + val inf = model.inferenceFromWeights(weights).forTesting + val ll = trainTrees.par.aggregate(0.0)((currLL, trainTree) => { + try { + val s = inf.scorer(trainTree) + currLL + inf.goldMarginal(s, trainTree).logPartition - inf.marginal(s, trainTree).logPartition + } catch { + case e: Exception => println("Couldn't parse") + currLL + } + }, _ + _) + println("Log likelihood on " + trainTrees.size + " examples: " + ll) + } } diff --git a/src/main/scala/epic/parser/models/PositionalNeuralModel.scala b/src/main/scala/epic/parser/models/PositionalNeuralModel.scala new file mode 100644 index 00000000..dc474d8c --- /dev/null +++ b/src/main/scala/epic/parser/models/PositionalNeuralModel.scala @@ -0,0 +1,418 @@ +package epic.parser +package models + +import scala.collection.mutable.HashMap +import scala.util.Random +import scala.collection.GenTraversable +import breeze.features.FeatureVector +import breeze.linalg._ +import breeze.util.Index +import epic.constraints.ChartConstraints +import epic.dense.IdentityTransform +import epic.dense.OutputTransform +import epic.dense.Transform +import epic.dense.Word2VecDepFeaturizerIndexed +import epic.dense.Word2VecSurfaceFeaturizerIndexed +import epic.features._ +import epic.framework.Feature +import epic.framework.StandardExpectedCounts +import epic.lexicon.Lexicon +import epic.parser.projections.GrammarRefinements +import epic.trees._ +import scala.collection.mutable.ArrayBuffer + +/** + * Main neural CRF parser class. 
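+ *
+ * Scores anchored spans and rules by feeding word-embedding surface features through one or
+ * more feedforward output transforms, optionally adding a sparse indicator-feature score on
+ * top of the neural score, and supports ensembling two trained models by concatenating their
+ * dense weights.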
+ * + * @author gdurrett + **/ +@SerialVersionUID(1L) +class PositionalNeuralModel[L, L2, W](annotator: (BinarizedTree[L], IndexedSeq[W]) => BinarizedTree[IndexedSeq[L2]], + val constrainer: ChartConstraints.Factory[L, W], + val topology: RuleTopology[L], + val lexicon: Lexicon[L, W], + refinedTopology: RuleTopology[L2], + refinements: GrammarRefinements[L, L2], + labelFeaturizer: RefinedFeaturizer[L, W, Feature], + surfaceFeaturizer: Word2VecSurfaceFeaturizerIndexed[W], + depFeaturizer: Word2VecDepFeaturizerIndexed[W], + val transforms: IndexedSeq[OutputTransform[Array[Int],DenseVector[Double]]], + val maybeSparseSurfaceFeaturizer: Option[IndexedSpanFeaturizer[L, L2, W]], + val depTransforms: Seq[OutputTransform[Array[Int],DenseVector[Double]]], + val decoupledTransforms: Seq[OutputTransform[Array[Int],DenseVector[Double]]]) extends ParserModel[L, W] with Serializable { + + def mergeWeightsForEnsembling(x1: DenseVector[Double], x2: DenseVector[Double]) = { + require(decoupledTransforms.size == 0) + require(x1.size == x2.size) + // Stack up the dense parts, average the sparse parts + if (maybeSparseSurfaceFeaturizer.isDefined) { + val sparseFeatsStart = index.componentOffset(index.indices.size - 1) + val summedSparseFeatures = x1(sparseFeatsStart to -1) + x2(sparseFeatsStart to -1) + DenseVector.vertcat(x1(0 until sparseFeatsStart), x2(0 until sparseFeatsStart), summedSparseFeatures) + } else { + DenseVector.vertcat(x1, x2) + } + } + + def cloneModelForEnsembling = { + require(decoupledTransforms.size == 0) + // Note that duping the transforms is okay because they still produce distinct + // layers, so caching behavior is unaffected + val newTransforms = transforms ++ transforms; + val newDepTransforms = depTransforms ++ depTransforms; + new PositionalNeuralModel(annotator, constrainer, topology, lexicon, refinedTopology, refinements, labelFeaturizer, surfaceFeaturizer, depFeaturizer, + newTransforms, maybeSparseSurfaceFeaturizer, newDepTransforms, decoupledTransforms) + } + + override type Inference = PositionalNeuralModel.Inference[L, L2, W] + + override def accumulateCounts(inf: Inference, s: Scorer, d: TreeInstance[L, W], m: Marginal, accum: ExpectedCounts, scale: Double): Unit = { +// println("Extracting ecounts") + inf.grammar.extractEcounts(m, accum.counts, scale) + + if (maybeSparseSurfaceFeaturizer.isDefined) { + val f = maybeSparseSurfaceFeaturizer.get + val innerAccum = StandardExpectedCounts.zero(f.index) + m.expectedCounts(maybeSparseSurfaceFeaturizer.get, innerAccum, scale) + // val totalTransformSize = transform.index.size + val totalTransformSize = transforms.map(_.index.size).foldLeft(0)(_ + _) + depTransforms.map(_.index.size).foldLeft(0)(_ + _) + decoupledTransforms.map(_.index.size).foldLeft(0)(_ + _) + accum.counts += DenseVector.vertcat(DenseVector.zeros[Double](totalTransformSize), innerAccum.counts) + } +// println("Ecounts extracted") + accum.loss += scale * m.logPartition + } + + /** + * Models have features, and this defines the mapping from indices in the weight vector to features. 
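+   * The weight vector is laid out as a SegmentedIndex: one block per surface transform,
+   * then one block per dependency transform, then one block per decoupled transform, and
+   * finally (if sparse features are enabled) the block of sparse surface features.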
+ * @return + */ + val index = if (maybeSparseSurfaceFeaturizer.isDefined) { + SegmentedIndex((transforms.map(_.index) ++ depTransforms.map(_.index) ++ decoupledTransforms.map(_.index) ++ IndexedSeq(maybeSparseSurfaceFeaturizer.get.index)):_*) + } else { + SegmentedIndex((transforms.map(_.index) ++ depTransforms.map(_.index) ++ decoupledTransforms.map(_.index)):_*) + } + + def initialWeightVector(initWeightsScale: Double, initializerSpec: String, trulyRandom: Boolean = false): DenseVector[Double] = { + val rng = if (trulyRandom) new Random() else new Random(0) + val initTransformWeights = DenseVector.vertcat(transforms.map(_.initialWeightVector(initWeightsScale, rng, true, initializerSpec)):_*); + val initDepWeights = DenseVector.vertcat(depTransforms.map(_.initialWeightVector(initWeightsScale, rng, true, initializerSpec)):_*); + val initDecoupledWeights = DenseVector.vertcat(decoupledTransforms.map(_.initialWeightVector(initWeightsScale, rng, true, initializerSpec)):_*); + val newInitVector: DenseVector[Double] = if (maybeSparseSurfaceFeaturizer.isDefined) { + DenseVector.vertcat(initTransformWeights, initDepWeights, initDecoupledWeights, DenseVector.zeros(maybeSparseSurfaceFeaturizer.get.index.size)) + } else { + DenseVector.vertcat(initTransformWeights, initDepWeights, initDecoupledWeights) + } + require(newInitVector.size == index.size, newInitVector.size + " " + index.size) + newInitVector + } + + override def featureIndex: Index[Feature] = index + + override def inferenceFromWeights(weights: DenseVector[Double]): Inference = inferenceFromWeights(weights, true) + + def inferenceFromWeights(weights: DenseVector[Double], forTrain: Boolean): Inference = { + val layersAndInnerLayers = for (i <- 0 until transforms.size) yield { + transforms(i).extractLayerAndPenultimateLayer(weights(index.componentOffset(i) until index.componentOffset(i) + index.indices(i).size), forTrain) + } + val layers: IndexedSeq[OutputTransform[Array[Int],DenseVector[Double]]#OutputLayer] = layersAndInnerLayers.map(_._1) + val innerLayers: IndexedSeq[epic.dense.Transform.Layer[Array[Int],DenseVector[Double]]] = layersAndInnerLayers.map(_._2) + val depLayers: IndexedSeq[OutputTransform[Array[Int],DenseVector[Double]]#OutputLayer] = for (i <- 0 until depTransforms.size) yield { + val idxIdx = transforms.size + i + depTransforms(i).extractLayer(weights(index.componentOffset(idxIdx) until index.componentOffset(idxIdx) + index.indices(idxIdx).size), forTrain) + } + val decoupledLayersAndInner = for (i <- 0 until decoupledTransforms.size) yield { + val idxIdx = transforms.size + depTransforms.size + i + decoupledTransforms(i).extractLayerAndPenultimateLayer(weights(index.componentOffset(idxIdx) until index.componentOffset(idxIdx) + index.indices(idxIdx).size), forTrain) + } + val decoupledLayers = decoupledLayersAndInner.map(_._1) + val decoupledInnerLayers = decoupledLayersAndInner.map(_._2) + val grammar = new PositionalNeuralModel.PositionalNeuralGrammar[L, L2, W](topology, lexicon, refinedTopology, refinements, labelFeaturizer, + surfaceFeaturizer, depFeaturizer, layers, innerLayers, depLayers, maybeSparseSurfaceFeaturizer, decoupledLayers, decoupledInnerLayers, weights, this) + new Inference(annotator, constrainer, grammar, refinements) + } + + /** + * When doing batch normalization, we need to normalize the test network + */ + def extractParser(weights: DenseVector[Double], trainExs: Seq[TreeInstance[L,W]])(implicit deb: Debinarizer[L]) = { + val inf = inferenceFromWeights(weights).forTesting + 
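+    // Recompute any data-dependent normalization statistics (e.g. for batch normalization)
+    // on a subsample of at most 200 training sentences before building the test-time parser.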
inf.relativizeToData(trainExs.slice(0, Math.min(trainExs.size, 200)).asInstanceOf[Seq[TreeInstance[AnnotatedLabel,String]]]); + Parser(constrainer, inf.grammar, ChartDecoder[L, W]()) + } + + override def initialValueForFeature(f: Feature): Double = 0.0 +} + +object PositionalNeuralModel { + + case class Inference[L, L2, W](annotator: (BinarizedTree[L], IndexedSeq[W]) => BinarizedTree[IndexedSeq[L2]], + constrainer: ChartConstraints.Factory[L, W], + grammar: PositionalNeuralGrammar[L, L2, W], + refinements: GrammarRefinements[L, L2]) extends ParserInference[L, W] { + override def goldMarginal(scorer: Scorer, ti: TreeInstance[L, W], aug: UnrefinedGrammarAnchoring[L, W]): Marginal = { + + import ti._ + + val annotated = annotator(tree, words).map(_.map(refinements.labels.localize)) + + val product = grammar.anchor(words, constrainer.constraints(ti.words)) + LatentTreeMarginal(product, annotated) + } + + // This needs to be different for dropout, so that we can get the right layers + override def forTesting = grammar.origPTModel.inferenceFromWeights(grammar.weights, false) + + def relativizeToData(data: GenTraversable[TreeInstance[AnnotatedLabel,String]]) { + } + } + + @SerialVersionUID(4749637878577393596L) + class PositionalNeuralGrammar[L, L2, W](val topology: RuleTopology[L], + val lexicon: Lexicon[L, W], + val refinedTopology: RuleTopology[L2], + val refinements: GrammarRefinements[L, L2], + labelFeaturizer: RefinedFeaturizer[L, W, Feature], + val surfaceFeaturizer: Word2VecSurfaceFeaturizerIndexed[W], + depFeaturizer: Word2VecDepFeaturizerIndexed[W], + val layers: IndexedSeq[OutputTransform[Array[Int],DenseVector[Double]]#OutputLayer], + penultimateLayers: IndexedSeq[epic.dense.Transform.Layer[Array[Int],DenseVector[Double]]], + depLayers: IndexedSeq[OutputTransform[Array[Int],DenseVector[Double]]#OutputLayer], + val maybeSparseSurfaceFeaturizer: Option[IndexedSpanFeaturizer[L, L2, W]], + decoupledLayers: IndexedSeq[OutputTransform[Array[Int],DenseVector[Double]]#OutputLayer], + penultimateDecoupledLayers: IndexedSeq[epic.dense.Transform.Layer[Array[Int],DenseVector[Double]]], + val weights: DenseVector[Double], + val origPTModel: PositionalNeuralModel[L,L2,W]) extends Grammar[L, W] with Serializable { + + val SpanLayerIdx = 0 + val UnaryLayerIdx = 1 + val BinaryLayerIdx = 2 + val dcSpanFeatOffset = layers.map(_.index.size).foldLeft(0)(_ + _) + depLayers.map(_.index.size).foldLeft(0)(_ + _) + val dcUnaryFeatOffset = dcSpanFeatOffset + (if (decoupledLayers.size > 0) decoupledLayers(0).index.size else 0) + val dcBinaryFeatOffset = dcUnaryFeatOffset + (if (decoupledLayers.size > 0) decoupledLayers(1).index.size else 0) + + override def withPermissiveLexicon: Grammar[L, W] = { + new PositionalNeuralGrammar(topology, lexicon.morePermissive, refinedTopology, refinements, labelFeaturizer, surfaceFeaturizer, + depFeaturizer, layers, penultimateLayers, depLayers, maybeSparseSurfaceFeaturizer, decoupledLayers, penultimateDecoupledLayers, weights, origPTModel) + } + + + /** + * N.B. 
does not extracted expected counts for sparse features; this is done outside this loop + */ + def extractEcounts(m: ParseMarginal[L, W], deriv: DenseVector[Double], scale: Double): Unit = { + val w = m.words + val length = w.length + val sspec = surfaceFeaturizer.anchor(w) + val depSpec = depFeaturizer.anchor(w) + val lspec = labelFeaturizer.anchor(w) + +// val maxTetraLen = ((w.size + 2) * (w.size + 3) * (w.size + 4))/6 + ((w.size + 1) * (w.size + 2))/2 + w.size + 2 + + def tetra(begin: Int, split: Int, end: Int) = { + (end * (end + 1) * (end + 2))/6 + ((split + 1) * split / 2 + begin) + } + + // This representation appears to make things a bit faster? + val ruleCountsPerState = new HashMap[Int,SparseVector[Double]] + val unaryRuleCountsPerState = new HashMap[Int,SparseVector[Double]] + val binaryRuleCountsPerState = new HashMap[Int,SparseVector[Double]] + val spanCountsPerState = new HashMap[Int,SparseVector[Double]] +// val ruleCountsPerState = Array.fill(maxTetraLen)(SparseVector.zeros[Double](labelFeaturizer.index.size)) +// val countsPerHeadDepPair = Array.tabulate(w.size, w.size)((i, j) => 0.0) +// val statesUsed = Array.fill(maxTetraLen)(false) +// val untetra = Array.fill(maxTetraLen)((-1, -1, -1)) + val untetra = new HashMap[Int,(Int,Int,Int)] + + m visit new AnchoredVisitor[L] { + + override def visitUnaryRule(begin: Int, end: Int, rule: Int, ref: Int, score: Double): Unit = { + val tetraIdx = tetra(begin, end, length + 1) + untetra(tetraIdx) = (begin, end, length + 1) + val fv = new FeatureVector(lspec.featuresForUnaryRule(begin, end, rule, ref)) + if (!ruleCountsPerState.contains(tetraIdx)) ruleCountsPerState.put(tetraIdx, SparseVector.zeros[Double](labelFeaturizer.index.size)) + axpy(score, fv, ruleCountsPerState(tetraIdx)) + if (!decoupledLayers.isEmpty) { + if (!unaryRuleCountsPerState.contains(tetraIdx)) unaryRuleCountsPerState.put(tetraIdx, SparseVector.zeros[Double](labelFeaturizer.index.size)) + axpy(score, fv, unaryRuleCountsPerState(tetraIdx)) + } + } + + override def visitSpan(begin: Int, end: Int, tag: Int, ref: Int, score: Double): Unit = { + val tetraIdx = tetra(begin, end, length + 2) + untetra(tetraIdx) = (begin, end, length + 2) + val fv = new FeatureVector(lspec.featuresForSpan(begin, end, tag, ref)) + if (!ruleCountsPerState.contains(tetraIdx)) ruleCountsPerState.put(tetraIdx, SparseVector.zeros[Double](labelFeaturizer.index.size)) + axpy(score, fv, ruleCountsPerState(tetraIdx)) + if (!decoupledLayers.isEmpty) { + if (!spanCountsPerState.contains(tetraIdx)) spanCountsPerState.put(tetraIdx, SparseVector.zeros[Double](labelFeaturizer.index.size)) + axpy(score, fv, spanCountsPerState(tetraIdx)) + } + } + + override def visitBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int, score: Double): Unit = { + val tetraIdx = tetra(begin, split, end) + untetra(tetraIdx) = (begin, split, end) + val fv = new FeatureVector(lspec.featuresForBinaryRule(begin, split, end, rule, ref)) + if (!ruleCountsPerState.contains(tetraIdx)) ruleCountsPerState.put(tetraIdx, SparseVector.zeros[Double](labelFeaturizer.index.size)) + axpy(score, fv, ruleCountsPerState(tetraIdx)) + if (!decoupledLayers.isEmpty) { + if (!binaryRuleCountsPerState.contains(tetraIdx)) binaryRuleCountsPerState.put(tetraIdx, SparseVector.zeros[Double](labelFeaturizer.index.size)) + axpy(score, fv, binaryRuleCountsPerState(tetraIdx)) + } + } + } + + for (key <- ruleCountsPerState.keySet) { + val (begin, split, end) = untetra(key) + val ffeats = if (end > length) sspec.featuresForSpan(begin, split) else 
sspec.featuresForSplit(begin, split, end) + var layerSizeTally = 0 + for (j <- 0 until layers.size) { + layers(j).tallyDerivative(deriv(layerSizeTally until layerSizeTally + layers(j).index.size), { ruleCountsPerState(key) * scale }, ffeats) + layerSizeTally += layers(j).index.size; + } + } + if (!decoupledLayers.isEmpty) { + for (key <- spanCountsPerState.keySet) { + val (begin, end, _) = untetra(key) + val ffeats = sspec.reducedFeaturesForSpan(begin, end) + decoupledLayers(SpanLayerIdx).tallyDerivative(deriv(dcSpanFeatOffset until dcSpanFeatOffset + decoupledLayers(SpanLayerIdx).index.size), { spanCountsPerState(key) * scale }, ffeats) + } + for (key <- unaryRuleCountsPerState.keySet) { + val (begin, end, _) = untetra(key) + val ffeats = sspec.reducedFeaturesForSpan(begin, end) + decoupledLayers(UnaryLayerIdx).tallyDerivative(deriv(dcUnaryFeatOffset until dcUnaryFeatOffset + decoupledLayers(UnaryLayerIdx).index.size), { unaryRuleCountsPerState(key) * scale }, ffeats) + } + for (key <- binaryRuleCountsPerState.keySet) { + val (begin, split, end) = untetra(key) + val ffeats = sspec.featuresForSplit(begin, split, end) + decoupledLayers(BinaryLayerIdx).tallyDerivative(deriv(dcBinaryFeatOffset until dcBinaryFeatOffset + decoupledLayers(BinaryLayerIdx).index.size), { binaryRuleCountsPerState(key) * scale }, ffeats) + } + } + } + + def anchor(w: IndexedSeq[W], cons: ChartConstraints[L]):GrammarAnchoring[L, W] = new ProjectionsGrammarAnchoring[L, L2, W] { + + override def addConstraints(constraints: ChartConstraints[L]): GrammarAnchoring[L, W] = { + anchor(w, cons & constraints) + } + + override def sparsityPattern: ChartConstraints[L] = cons + + def refinements = PositionalNeuralGrammar.this.refinements + def refinedTopology: RuleTopology[L2] = PositionalNeuralGrammar.this.refinedTopology + + val topology = PositionalNeuralGrammar.this.topology + val lexicon = PositionalNeuralGrammar.this.lexicon + + def words = w + + val l = w.size + val maxTetraLen = ((l + 2) * (l + 3) * (l + 4))/6 + ((l + 1) * (l + 2))/2 + l + 2 + + // Doesn't make things faster to use HashMaps here + val cache = Array.tabulate(layers.size + decoupledLayers.size)(i => new Array[DenseVector[Double]](maxTetraLen)) + val finalCache = Array.tabulate(layers.size + decoupledLayers.size)(i => new Array[SparseVector[Double]](maxTetraLen)) + + def getOrElseUpdate(layerIdx: Int, tetraIdx: Int, fun: => DenseVector[Double]) = { + if (cache(layerIdx)(tetraIdx) == null) cache(layerIdx)(tetraIdx) = fun + cache(layerIdx)(tetraIdx) + } + + def getOrElseUpdateFinal(layerIdx: Int, tetraIdx: Int, rfeatIdx: Int, maxVectSize: Int, fun: => Double) = { + if (finalCache(layerIdx)(tetraIdx) == null) finalCache(layerIdx)(tetraIdx) = SparseVector.zeros(maxVectSize) + if (!finalCache(layerIdx)(tetraIdx).contains(rfeatIdx)) finalCache(layerIdx)(tetraIdx)(rfeatIdx) = fun + finalCache(layerIdx)(tetraIdx)(rfeatIdx) + } + + val sspec = surfaceFeaturizer.anchor(w) + val depSpec = depFeaturizer.anchor(w) + val lspec = labelFeaturizer.anchor(w) + val fspec = if (maybeSparseSurfaceFeaturizer.isDefined) maybeSparseSurfaceFeaturizer.get.anchor(w) else null + val sparseFeatsStart = if (maybeSparseSurfaceFeaturizer.isDefined) (layers.map(_.index.size).foldLeft(0)(_ + _) + depLayers.map(_.index.size).foldLeft(0)(_ + _) + decoupledLayers.map(_.index.size).foldLeft(0)(_ + _)) else -1 + + private def tetra(begin: Int, split: Int, end: Int) = { + (end * (end + 1) * (end + 2))/6 + ((split + 1) * split / 2 + begin) + } + + def scoreBinaryRule(begin: Int, split: Int, end: 
Int, rule: Int, ref: Int) = { + var total = 0.0; + val tetraIdx = tetra(begin, split, end) + val rfeats = lspec.featuresForBinaryRule(begin, split, end, rule, ref) + for (layerIdx <- 0 until layers.size) { + val fs = getOrElseUpdate(layerIdx, tetraIdx, { penultimateLayers(layerIdx).activations(sspec.featuresForSplit(begin, split, end)) }) + for (rfeat <- rfeats) { + total += getOrElseUpdateFinal(layerIdx, tetraIdx, rfeat, labelFeaturizer.index.size, { layers(layerIdx).activationsFromPenultimateDot(fs, rfeat) }) + } + } + if (!decoupledLayers.isEmpty) { + val layerIdx = layers.size + BinaryLayerIdx + val fs = getOrElseUpdate(layerIdx, tetraIdx, { penultimateDecoupledLayers(BinaryLayerIdx).activations(sspec.featuresForSplit(begin, split, end)) }) + for (rfeat <- rfeats) { + total += getOrElseUpdateFinal(layerIdx, tetraIdx, rfeat, labelFeaturizer.index.size, { decoupledLayers(BinaryLayerIdx).activationsFromPenultimateDot(fs, rfeat) }) + } + } + if (maybeSparseSurfaceFeaturizer.isDefined) { + total += dot(fspec.featuresForBinaryRule(begin, split, end, rule, ref), sparseFeatsStart) + } + total + } + + def scoreUnaryRule(begin: Int, end: Int, rule: Int, ref: Int) = { + var total = 0.0; + val tetraIdx = tetra(begin, end, length + 1) + val rfeats = lspec.featuresForUnaryRule(begin, end, rule, ref) + for (layerIdx <- 0 until layers.size) { + val fs = getOrElseUpdate(layerIdx, tetraIdx, { penultimateLayers(layerIdx).activations(sspec.featuresForSpan(begin, end)) }) + for (rfeat <- rfeats) { + total += getOrElseUpdateFinal(layerIdx, tetraIdx, rfeat, labelFeaturizer.index.size, { layers(layerIdx).activationsFromPenultimateDot(fs, rfeat) }) + } + } + if (!decoupledLayers.isEmpty) { + val layerIdx = layers.size + UnaryLayerIdx + val fs = getOrElseUpdate(layerIdx, tetraIdx, { penultimateDecoupledLayers(UnaryLayerIdx).activations(sspec.reducedFeaturesForSpan(begin, end)) }) + for (rfeat <- rfeats) { + total += getOrElseUpdateFinal(layerIdx, tetraIdx, rfeat, labelFeaturizer.index.size, { decoupledLayers(UnaryLayerIdx).activationsFromPenultimateDot(fs, rfeat) }) + } + } + if (maybeSparseSurfaceFeaturizer.isDefined) { + total += dot(fspec.featuresForUnaryRule(begin, end, rule, ref), sparseFeatsStart) + } + total + } + + def scoreSpan(begin: Int, end: Int, tag: Int, ref: Int) = { + var total = 0.0; + val tetraIdx = tetra(begin, end, length + 2) + val rfeats = lspec.featuresForSpan(begin, end, tag, ref) + for (layerIdx <- 0 until layers.size) { + val fs = getOrElseUpdate(layerIdx, tetraIdx, { penultimateLayers(layerIdx).activations(sspec.featuresForSpan(begin, end)) }) + for (rfeat <- rfeats) { + total += getOrElseUpdateFinal(layerIdx, tetraIdx, rfeat, labelFeaturizer.index.size, { layers(layerIdx).activationsFromPenultimateDot(fs, rfeat) }) + } + } + if (!decoupledLayers.isEmpty) { + val layerIdx = layers.size + SpanLayerIdx + val fs = getOrElseUpdate(layerIdx, tetraIdx, { penultimateDecoupledLayers(SpanLayerIdx).activations(sspec.reducedFeaturesForSpan(begin, end)) }) + for (rfeat <- rfeats) { + total += getOrElseUpdateFinal(layerIdx, tetraIdx, rfeat, labelFeaturizer.index.size, { decoupledLayers(SpanLayerIdx).activationsFromPenultimateDot(fs, rfeat) }) + } + } + if (maybeSparseSurfaceFeaturizer.isDefined) { + total += dot(fspec.featuresForSpan(begin, end, tag, ref), sparseFeatsStart) + } + total + } + + private def dot(features: Array[Int], sparseFeaturesOffset: Int) = { + var i = 0 + var score = 0.0 + val wdata = weights.data + while(i < features.length) { + score += wdata(features(i) + 
sparseFeaturesOffset) + i += 1 + } + score + } + } + } +} diff --git a/src/main/scala/epic/parser/models/PositionalNeuralModelFactory.scala b/src/main/scala/epic/parser/models/PositionalNeuralModelFactory.scala new file mode 100644 index 00000000..b370317e --- /dev/null +++ b/src/main/scala/epic/parser/models/PositionalNeuralModelFactory.scala @@ -0,0 +1,321 @@ +package epic.parser +package models + +import java.io.File +import breeze.config.Help +import breeze.features.FeatureVector +import breeze.linalg._ +import breeze.util.Index +import epic.constraints.ChartConstraints +import epic.dense.{IdentityTransform, AffineTransform, Transform} +import epic.features.SurfaceFeaturizer.SingleWordSpanFeaturizer +import epic.features._ +import epic.framework.Feature +import epic.lexicon.Lexicon +import epic.parser.projections.GrammarRefinements +import epic.trees._ +import epic.trees.annotations.TreeAnnotator +import epic.util.{LRUCache, Optional} +import epic.dense.Transform +import epic.dense.TanhTransform +import epic.dense.OutputTransform +import epic.dense.AffineOutputTransform +import epic.dense.OutputEmbeddingTransform +import epic.dense.Word2Vec +import scala.collection.mutable.HashMap +import epic.dense.Word2VecSurfaceFeaturizerIndexed +import epic.dense.Word2VecDepFeaturizerIndexed +import epic.dense.Word2VecIndexed +import epic.dense.FrequencyTagger +import epic.dense.CachingLookupTransform +import epic.dense.CachingLookupAndAffineTransformDense +import epic.dense.EmbeddingsTransform +import epic.dense.NonlinearTransform +import scala.io.Source +import scala.collection.mutable.HashSet +import epic.dense.BatchNormalizationTransform + +/** + * Entry point for instantiating a neural CRF parser. Parameters specify neural + * net parameters, word vectors, and sparse features to use. + * + * @author gdurrett + **/ + + +/** + * Less-used parameters + */ +case class ExtraPNMParams(@Help(text="Used for ablations with random word embeddings; don't change this. Options: normal, random, trivial, normalpos") + embeddingType: String = "normal", + @Help(text="Use longest frequent suffix (standard representation) for sparse feats") + useSparseLfsuf: Boolean = true, + @Help(text="Use sparse Brown cluster features") + useSparseBrown: Boolean = false, + @Help(text="Use expanded set of sparse surface features (doesn't help)") + useMostSparseIndicators: Boolean = false, + @Help(text="Scaling factor for all input vectors") + vectorRescaling: Double = 1.0, + @Help(text="Use the output embedding model (Figure 4b in the neural CRF paper)") + outputEmbedding: Boolean = false, + @Help(text="Dimension of the output embedding model") + outputEmbeddingDim: Int = 20, + @Help(text="When initializing the output embedding model, initialize based on root symbols") + coarsenByRoot: Boolean = false, + @Help(text="Use separate neural net parameters for span/unary/binary settings. Doesn't help.") + decoupleTransforms: Boolean = false, + @Help(text="Extract additional output features based on root label.") + useRootLabel: Boolean = false, + @Help(text="Set unknown word vectors to be random rather than 0") + randomizeUnks: Boolean = false) + +case class PositionalNeuralModelFactory(@Help(text= + """The kind of annotation to do on the refined grammar. Default uses just parent annotation. +You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Manning 2003. 
+ """) + annotator: TreeAnnotator[AnnotatedLabel, String, AnnotatedLabel] = GenerativeParser.defaultAnnotator(), + @Help(text="For features not seen in gold trees, we bin them into dummyFeats * numGoldFeatures bins using hashing. If negative, use absolute value as number of hash features.") + dummyFeats: Double = 0.5, + @Help(text="Sparse features only fire on suffixes seen at least this many times. Lower than 100 doesn't seem to do better.") + commonWordThreshold: Int = 100, + @Help(text="Combine the neural net features with sparse features. The NN does well on its own but sparse helps by >1 F1.") + useSparseFeatures: Boolean = true, + @Help(text="Nonlinearity to use. Options: tanh, relu, cube") + nonLinType: String = "relu", + @Help(text="Backpropagate into word embeddings (tune them during training). Doesn't help.") + backpropIntoEmbeddings: Boolean = false, + @Help(text="Dropout rate; 0.0 won't instantiate any dropout units, higher rates will, but it doesn't seem to help.") + dropoutRate: Double = 0.0, + @Help(text="Width of hidden layer to use.") + numHidden: Int = 200, + @Help(text="Number of hidden layers to use. More than 1 slows down dramatically and doesn't help.") + numHiddenLayers: Int = 1, + @Help(text="How much surface context should we use as input to the neural network? Default is +/-2 words around begin/end/split. See Word2VecSurfaceFeaturizer for options") + neuralSurfaceWordsToUse: String = "most", + @Help(text="Path to word vectors. Can either be .bin like Mikolov et al.'s or .txt like Bansal et al.'s") + word2vecPath: String = "", + @Help(text="Load additional word vectors into the model rather than just those in the training set. Doesn't help.") + vocFile: String = "", + @Help(text="Set to true if your word vectors are all lowercase. 
Otherwise true case is used.") + lowercasedVectors: Boolean = false, + extraPNMParams: ExtraPNMParams = ExtraPNMParams()) extends ParserModelFactory[AnnotatedLabel, String] { + + type MyModel = PositionalNeuralModel[AnnotatedLabel, AnnotatedLabel, String] + + + + override def make(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], + topology: RuleTopology[AnnotatedLabel], + lexicon: Lexicon[AnnotatedLabel, String], + constrainer: ChartConstraints.Factory[AnnotatedLabel, String]): MyModel = { + import extraPNMParams._ + val annTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]] = trainTrees.map(annotator(_)) + println("Here's what the annotation looks like on the first few trees") + annTrees.slice(0, Math.min(3, annTrees.size)).foreach(tree => println(tree.render(false))) + + val (annWords, annBinaries, annUnaries) = this.extractBasicCounts(annTrees) + val refGrammar = RuleTopology(AnnotatedLabel.TOP, annBinaries, annUnaries) + + val xbarGrammar = topology + val xbarLexicon = lexicon + + val indexedRefinements = GrammarRefinements(xbarGrammar, refGrammar, (_: AnnotatedLabel).baseAnnotatedLabel) + + def labelFeaturizer(l: AnnotatedLabel) = Set(l, l.baseAnnotatedLabel).toSeq + def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if (useRootLabel) { + Set(r, r.map(_.baseAnnotatedLabel), ParentFeature(r.parent)).toSeq + } else { + Set(r, r.map(_.baseAnnotatedLabel)).toSeq + } + + val prodFeaturizer = new ProductionFeaturizer[AnnotatedLabel, AnnotatedLabel, String](xbarGrammar, indexedRefinements, lGen=labelFeaturizer, rGen=ruleFeaturizer) + + + /////////////////////// + // READ IN WORD VECTORS + val tagCountsLexicon = TagSpanShapeGenerator.makeStandardLexicon(annTrees) + val freqTagger = new FrequencyTagger(tagCountsLexicon) + + val voc = new HashSet[String]() + // Add words in the training set + val summedWordCounts: Counter[String, Double] = sum(annWords, Axis._0) + voc ++= summedWordCounts.keySet.toSet[String].map(str => Word2Vec.convertWord(str, lowercasedVectors)) + // Read in a file of words in the treebank; this allows us to load words that are + // in the dev or test sets but not in train + voc ++= (if (vocFile != "") Source.fromFile(vocFile).getLines().map(str => Word2Vec.convertWord(str, lowercasedVectors)).toSet else Set[String]()) + val word2vec = if (embeddingType == "trivial") { + Word2Vec.makeRandomVectorsForVocabulary(voc.toSet, 0, true) + } else if (embeddingType == "random") { + Word2Vec.makeRandomVectorsForVocabulary(voc.toSet, 50, true) + } else { + Word2Vec.smartLoadVectorsForVocabulary(word2vecPath.split(":"), voc.toSet, summedWordCounts, if (embeddingType == "trivial") 1 else Int.MaxValue, true, randomizeUnks) + } + // Convert Array[Float] values to Array[Double] values and rescale them + val word2vecDoubleVect = word2vec.map(keyValue => (keyValue._1 -> keyValue._2.map(_.toDouble * vectorRescaling))) +// val word2vecDoubleVect = word2vec.map(keyValue => (keyValue._1 -> new DenseVector[Double](keyValue._2.map(_.toDouble)))) + val word2vecIndexed: Word2VecIndexed[String] = if (embeddingType == "normalpos") { + Word2VecIndexed(word2vecDoubleVect, (str: String) => Word2Vec.convertWord(str, lowercasedVectors)).augment(freqTagger.tagTypesIdx.size, freqTagger.convertToFeaturizer) + } else { + Word2VecIndexed(word2vecDoubleVect, (str: String) => Word2Vec.convertWord(str, lowercasedVectors)) + } + ////////////////////// + + val surfaceFeaturizer = new Word2VecSurfaceFeaturizerIndexed(word2vecIndexed, neuralSurfaceWordsToUse) + val depFeaturizer = new 
Word2VecDepFeaturizerIndexed(word2vecIndexed, freqTagger, topology) + + val transforms = if (decoupleTransforms) { + IndexedSeq[AffineOutputTransform[Array[Int]]]() + } else { + val inputSize = surfaceFeaturizer.splitInputSize + val transform = if (outputEmbedding) { + val coarsenerForInitialization = if (coarsenByRoot) { + Option(PositionalNeuralModelFactory.getRuleToParentMapping(prodFeaturizer.index)) + } else { + None + } + PositionalNeuralModelFactory.buildNetOutputEmbedding(word2vecIndexed, inputSize, numHidden, numHiddenLayers, prodFeaturizer.index.size, nonLinType, dropoutRate, backpropIntoEmbeddings, outputEmbeddingDim, coarsenerForInitialization) + } else { + // THIS IS THE STANDARD CODE PATH + println(inputSize + " x (" + numHidden + ")^" + numHiddenLayers + " x " + prodFeaturizer.index.size + " neural net") + PositionalNeuralModelFactory.buildNet(word2vecIndexed, inputSize, numHidden, numHiddenLayers, prodFeaturizer.index.size, nonLinType, dropoutRate, backpropIntoEmbeddings) + } + IndexedSeq(transform) + } + val depTransforms: IndexedSeq[AffineOutputTransform[Array[Int]]] = IndexedSeq() + val decoupledTransforms = if (decoupleTransforms) { + // Span and unary use the reduced input (no split point features), whereas surface uses the split point features + val inputSizes = Seq(surfaceFeaturizer.reducedInputSize, surfaceFeaturizer.reducedInputSize, surfaceFeaturizer.splitInputSize) + inputSizes.map(inputSize => PositionalNeuralModelFactory.buildNet(word2vecIndexed, inputSize, numHidden, numHiddenLayers, prodFeaturizer.index.size, nonLinType, dropoutRate, backpropIntoEmbeddings)) + } else { + IndexedSeq[AffineOutputTransform[Array[Int]]]() + } + + println(transforms.size + " transforms, " + transforms.map(_.index.size).toSeq + " parameters for each") + println(depTransforms.size + " dep transforms, " + depTransforms.map(_.index.size).toSeq + " parameters for each") + println(decoupledTransforms.size + " decoupled transforms, " + decoupledTransforms.map(_.index.size).toSeq + " parameters for each") + + val maybeSparseFeaturizer = if (useSparseFeatures) { + var wf = SpanModelFactory.defaultPOSFeaturizer(annWords, useBrown = useSparseBrown) + var span = SpanModelFactory.goodFeaturizer(annWords, commonWordThreshold, useShape = false, useLfsuf = useSparseLfsuf, useBrown = useSparseBrown, useMostSparseIndicators = useMostSparseIndicators) + span += new SingleWordSpanFeaturizer[String](wf) + val indexedWord = IndexedWordFeaturizer.fromData(wf, annTrees.map{_.words}, deduplicateFeatures = false) + val indexedSurface = IndexedSplitSpanFeaturizer.fromData(span, annTrees, bloomFilter = false) + + def sparseLabelFeaturizer(l: AnnotatedLabel) = Set(l, l.baseAnnotatedLabel).toSeq + def sparseRuleFeaturizer(r: Rule[AnnotatedLabel]) = Set(r, r.map(_.baseAnnotatedLabel)).toSeq + val sparseProdFeaturizer = new ProductionFeaturizer[AnnotatedLabel, AnnotatedLabel, String](xbarGrammar, indexedRefinements, lGen=sparseLabelFeaturizer, rGen=sparseRuleFeaturizer) + + val indexed = IndexedSpanFeaturizer.extract[AnnotatedLabel, AnnotatedLabel, String](indexedWord, + indexedSurface, + sparseProdFeaturizer, + new ZeroRuleAndSpansFeaturizer(), + annotator.latent, + indexedRefinements, + xbarGrammar, + if(dummyFeats < 0) HashFeature.Absolute(-dummyFeats.toInt) else HashFeature.Relative(dummyFeats), + filterUnseenFeatures = false, + minFeatCount = 1, + trainTrees) + Option(indexed) + } else { + None + } + + new PositionalNeuralModel(annotator.latent, + constrainer, + topology, lexicon, + refGrammar, + 
indexedRefinements, + prodFeaturizer, + surfaceFeaturizer, + depFeaturizer, + transforms, + maybeSparseFeaturizer, + depTransforms, + decoupledTransforms) + } +} + +object PositionalNeuralModelFactory { + + def buildNetInnerTransforms(word2vecIndexed: Word2VecIndexed[String], + inputSize: Int, + numHidden: Int, + numHiddenLayers: Int, + nonLinType: String, + dropoutRate: Double, + backpropIntoEmbeddings: Boolean): Transform[Array[Int],DenseVector[Double]] = { + if (numHiddenLayers == 0) { + new CachingLookupTransform(word2vecIndexed) + } else { + val baseTransformLayer = if (backpropIntoEmbeddings) { + new EmbeddingsTransform(numHidden, inputSize, word2vecIndexed) + } else { + new CachingLookupAndAffineTransformDense(numHidden, inputSize, word2vecIndexed) + } + var currLayer = addNonlinearity(nonLinType, numHidden, dropoutRate, baseTransformLayer) + for (i <- 1 until numHiddenLayers) { + currLayer = addNonlinearity(nonLinType, numHidden, dropoutRate, new AffineTransform(numHidden, numHidden, currLayer)) + } + currLayer + } + } + + def buildNet(word2vecIndexed: Word2VecIndexed[String], + inputSize: Int, + numHidden: Int, + numHiddenLayers: Int, + outputSize: Int, + nonLinType: String, + dropoutRate: Double, + backpropIntoEmbeddings: Boolean): AffineOutputTransform[Array[Int]] = { + val innerTransform = buildNetInnerTransforms(word2vecIndexed, inputSize, numHidden, numHiddenLayers, nonLinType, dropoutRate, backpropIntoEmbeddings) + new AffineOutputTransform(outputSize, if (numHiddenLayers >= 1) numHidden else inputSize, innerTransform) + } + + + def buildNetOutputEmbedding(word2vecIndexed: Word2VecIndexed[String], + inputSize: Int, + numHidden: Int, + numHiddenLayers: Int, + outputSize: Int, + nonLinType: String, + dropoutRate: Double, + backpropIntoEmbeddings: Boolean, + outputEmbeddingDim: Int, + coarsenerForInitialization: Option[Int => Int]): OutputTransform[Array[Int],DenseVector[Double]] = { + val innerTransform = buildNetInnerTransforms(word2vecIndexed, inputSize, numHidden, numHiddenLayers, nonLinType, dropoutRate, backpropIntoEmbeddings) + + val innerTransformLastLayer = new AffineTransform(outputEmbeddingDim, if (numHiddenLayers >= 1) numHidden else inputSize, innerTransform) + new OutputEmbeddingTransform(outputSize, outputEmbeddingDim, innerTransformLastLayer, coarsenerForInitialization) + } + + def addNonlinearity(nonLinType: String, numHidden: Int, dropoutRate: Double, currLayer: Transform[Array[Int],DenseVector[Double]]) = { + val useDropout = dropoutRate > 1e-8 + var tmpLayer = currLayer + tmpLayer = new NonlinearTransform(nonLinType, numHidden, tmpLayer) + if (useDropout) { + tmpLayer = new NonlinearTransform("dropout", numHidden, tmpLayer, dropoutRate) + } + tmpLayer + } + + def getRuleToParentMapping(index: Index[Feature]): Int => Int = { + (i: Int) => { + if (index.get(i).isInstanceOf[Rule[AnnotatedLabel]]) { + val parentIdx = index(index.get(i).asInstanceOf[Rule[AnnotatedLabel]].parent) + if (parentIdx == -1) { + 0 + } else { + parentIdx + } + } else { + i + } + } + } +} + +case class ParentFeature(f: Feature) extends Feature; +case class LeftChildFeature(f: Feature) extends Feature; +case class RightChildFeature(f: Feature) extends Feature; diff --git a/src/main/scala/epic/parser/models/SpanModel.scala b/src/main/scala/epic/parser/models/SpanModel.scala index 30b5ffbb..75e7cb1b 100644 --- a/src/main/scala/epic/parser/models/SpanModel.scala +++ b/src/main/scala/epic/parser/models/SpanModel.scala @@ -338,6 +338,7 @@ You can also epic.trees.annotations.KMAnnotator to get 
more or less Klein and Ma useNGrams:Boolean = false, maxNGramOrder:Int = 2, useGrammar: Boolean = true, + useChildFeats: Boolean = false, useFullShape: Boolean = false, useSplitShape: Boolean = false, posFeaturizer: Optional[WordFeaturizer[String]] = None, @@ -399,7 +400,23 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma def labelFeaturizer(l: AnnotatedLabel) = Set(l, l.baseAnnotatedLabel).toSeq - def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if(useGrammar) Set(r, r.map(_.baseAnnotatedLabel)).toSeq else if(r.isInstanceOf[UnaryRule[AnnotatedLabel]]) Set(r.parent, r.parent.baseAnnotatedLabel).toSeq else Seq.empty + +// def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if(useGrammar) Set(r, r.map(_.baseAnnotatedLabel)).toSeq else if(r.isInstanceOf[UnaryRule[AnnotatedLabel]]) Set(r.parent, r.parent.baseAnnotatedLabel).toSeq else Seq.empty + def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if(useGrammar) { + if (useChildFeats && r.isInstanceOf[BinaryRule[AnnotatedLabel]]) { + Set(r, + r.map(_.baseAnnotatedLabel), + new LeftChildFeature(r.asInstanceOf[BinaryRule[AnnotatedLabel]].left), + new RightChildFeature(r.asInstanceOf[BinaryRule[AnnotatedLabel]].right)).toSeq + } else { + Set(r, r.map(_.baseAnnotatedLabel)).toSeq + } + } else if(r.isInstanceOf[UnaryRule[AnnotatedLabel]]) { + Set(r.parent, r.parent.baseAnnotatedLabel).toSeq + } else { + Seq.empty + } + val featurizer = new ProductionFeaturizer[AnnotatedLabel, AnnotatedLabel, String](xbarGrammar, indexedRefinements, lGen=labelFeaturizer, @@ -584,15 +601,24 @@ case class LatentSpanModelFactory(inner: SpanModelFactory, object SpanModelFactory { def goodFeaturizer[L](wordCounts: Counter2[AnnotatedLabel, String, Double], commonWordThreshold: Int = 100, - useShape: Boolean = true) = { + useShape: Boolean = true, + useLfsuf: Boolean = true, + useBrown: Boolean = false, + useMostSparseIndicators: Boolean = false) = { val dsl = new WordFeaturizer.DSL(wordCounts, commonWordThreshold) with SurfaceFeaturizer.DSL with SplitSpanFeaturizer.DSL import dsl._ // class(split + 1) - val baseCat = lfsuf - + var baseCat: WordFeaturizer[String] = new ZeroFeaturizer[String]; + if (useLfsuf) { + baseCat += lfsuf + } + if (useBrown) { + baseCat += new BrownClusterFeaturizer(Array(4, 10)) + } + val leftOfSplit: SplitSpanFeaturizer[String] = ((baseCat)(-1)apply (split)) - + var featurizer: SplitSpanFeaturizer[String] = zeroSplit[String] featurizer += baseCat(begin) featurizer += baseCat(end-1) @@ -601,6 +627,14 @@ object SpanModelFactory { featurizer += leftOfSplit featurizer += baseCat(split) featurizer += length + if (useMostSparseIndicators) { + featurizer += baseCat(begin-2) + featurizer += baseCat(end-2) + featurizer += baseCat(begin+1) + featurizer += baseCat(end+1) + featurizer += ((baseCat)(-2)apply (split)) + featurizer += ((baseCat)(1)apply (split)) + } featurizer += distance[String](begin, split) featurizer += distance[String](split, end) @@ -609,11 +643,16 @@ object SpanModelFactory { featurizer } - def defaultPOSFeaturizer(annWords: Counter2[AnnotatedLabel, String, Double]): WordFeaturizer[String] = { + def defaultPOSFeaturizer(annWords: Counter2[AnnotatedLabel, String, Double], useBrown: Boolean = false): WordFeaturizer[String] = { { val dsl = new WordFeaturizer.DSL(annWords) import dsl._ - unigrams(word, 1) + suffixes() + prefixes() + if (useBrown) { + val brown = new BrownClusterFeaturizer(Array(4, 10)) + unigrams(brown, 1) + unigrams(word, 1) + suffixes() + prefixes() + } else { + unigrams(word, 1) + suffixes() + 
prefixes() + } } } @@ -636,6 +675,7 @@ object SpanModelFactory { new CachedChartConstraintsFactory[AnnotatedLabel, String](uncached) } + val mf = new SpanModelFactory(annotator = annotator, posFeaturizer = posFeaturizer, spanFeaturizer = spanFeaturizer).make(trees, topo, lexicon, constraints) val mobj = new ModelObjective(mf, trees) diff --git a/src/main/scala/epic/parser/models/TransformModel.scala b/src/main/scala/epic/parser/models/TransformModel.scala index 272d383e..c0c60de9 100644 --- a/src/main/scala/epic/parser/models/TransformModel.scala +++ b/src/main/scala/epic/parser/models/TransformModel.scala @@ -47,7 +47,7 @@ class TransformModel[L, L2, W](annotator: (BinarizedTree[L], IndexedSeq[W]) => B override def featureIndex: Index[Feature] = transform.index override def inferenceFromWeights(weights: DenseVector[Double]): Inference = { - val layer = transform.extractLayer(weights) + val layer = transform.extractLayer(weights, true) val grammar = new TransformModel.TransformGrammar[L, L2, W, transform.type](topology, lexicon, refinedTopology, refinements, labelFeaturizer, surfaceFeaturizer, layer) new Inference(annotator, constrainer, grammar, refinements) @@ -94,7 +94,10 @@ object TransformModel { val sspec = surfaceFeaturizer.anchor(w) val lspec = labelFeaturizer.anchor(w) - // cache: we remember the (begin/end) pair we saw with each + // For each split point, remember the (begin, end) pair that that split point was observed with. There'll + // only be one in the gold, but more in the prediction. Accumulate rule counts (output layer) until + // we need this split point for a different set of indices or we come to the end. Then, backpropagate + // the rule marginals through the network to get the derivative. val UNUSED = (-1, -1) val states = Array.fill(w.length + 2)(UNUSED) // 1 for each split, length for unaries, length +1 for spans val ruleCountsPerState = Array.fill(w.length + 2)(SparseVector.zeros[Double](labelFeaturizer.index.size)) @@ -187,7 +190,6 @@ object TransformModel { layer.activations(new FeatureVector(sfeats)) }) val rfeats = lspec.featuresForUnaryRule(begin, end, rule, ref) - new FeatureVector(rfeats) dot fs } @@ -197,7 +199,6 @@ object TransformModel { layer.activations(new FeatureVector(sfeats)) }) val rfeats = lspec.featuresForSpan(begin, end, tag, ref) - new FeatureVector(rfeats) dot fs } diff --git a/src/main/scala/epic/trees/AnnotatedLabel.scala b/src/main/scala/epic/trees/AnnotatedLabel.scala index 36a760d4..4ba2645c 100644 --- a/src/main/scala/epic/trees/AnnotatedLabel.scala +++ b/src/main/scala/epic/trees/AnnotatedLabel.scala @@ -101,8 +101,8 @@ object AnnotatedLabel { Array("PRT") } else if (label.startsWith("-") || label.isEmpty || label == "#") { Array(label) - } else if (label.contains("#")) { - val splits = label.split("#").filter(_.nonEmpty) + } else if (label.contains("##")) { // SPMRL uses two ## as the delimiter for this info + val splits = label.split("##").filter(_.nonEmpty) val nonmorphSplits = splits.head.split("[-=]") val morphSplits = splits.tail.flatMap(_.split("[|]")).filter("_" != _) nonmorphSplits ++ morphSplits @@ -142,4 +142,4 @@ object AnnotatedLabel { def get(t: AnnotatedLabel) = t.label def set(t: AnnotatedLabel, u: String) = t.copy(u) } -} \ No newline at end of file +} diff --git a/src/test/scala/epic/dense/AffineTransformTest.scala b/src/test/scala/epic/dense/AffineTransformTest.scala index 3bc4bf55..56b5ba32 100644 --- a/src/test/scala/epic/dense/AffineTransformTest.scala +++ b/src/test/scala/epic/dense/AffineTransformTest.scala @@ 
-16,7 +16,7 @@ class AffineTransformTest extends FunSuite { val dv = DenseVector.rand(10) val objective = new DiffFunction[DenseVector[Double]] { def calculate(x: DenseVector[Double]): (Double, DenseVector[Double]) = { - val layer = index.extractLayer(x) + val layer = index.extractLayer(x, true) val acts = layer.activations(dv) val obj = acts.sum val deriv = DenseVector.zeros[Double](x.length) @@ -36,7 +36,7 @@ class AffineTransformTest extends FunSuite { val target = DenseVector.rand(11) * 100.0 val objective = new DiffFunction[DenseVector[Double]] { def calculate(x: DenseVector[Double]): (Double, DenseVector[Double]) = { - val layer = index.extractLayer(x) + val layer = index.extractLayer(x, true) val acts = layer.activations(dv) val obj = math.pow(norm(target - acts, 2), 2) / 2 val initDeriv = acts - target @@ -48,7 +48,7 @@ class AffineTransformTest extends FunSuite { val weights: DenseVector[Double] = (DenseVector.rand[Double](index.index.size) - 0.5) * 4.0 val diffs = GradientTester.test[Int, DenseVector[Double]](objective, weights, randFraction = 1.0) - assert(max(diffs) < 4E-3, s"${diffs.max} was bigger than expected!!") + assert(max(diffs) < 2E-2, s"${diffs.max} was bigger than expected!!") } } diff --git a/src/test/scala/epic/dense/TanhTransformTest.scala b/src/test/scala/epic/dense/TanhTransformTest.scala index c0d1c00d..fcc46f6b 100644 --- a/src/test/scala/epic/dense/TanhTransformTest.scala +++ b/src/test/scala/epic/dense/TanhTransformTest.scala @@ -16,7 +16,7 @@ class TanhTransformTest extends FunSuite { val dv = DenseVector.rand(10) val objective = new DiffFunction[DenseVector[Double]] { def calculate(x: DenseVector[Double]): (Double, DenseVector[Double]) = { - val layer = index.extractLayer(x) + val layer = index.extractLayer(x, true) val acts = layer.activations(dv) val obj = acts.sum val deriv = DenseVector.zeros[Double](x.length) @@ -36,7 +36,7 @@ class TanhTransformTest extends FunSuite { val dv = DenseVector.rand(10) val objective = new DiffFunction[DenseVector[Double]] { def calculate(x: DenseVector[Double]): (Double, DenseVector[Double]) = { - val layer = index.extractLayer(x) + val layer = index.extractLayer(x, true) val acts = layer.activations(dv) val obj = acts.sum val deriv = DenseVector.zeros[Double](x.length)
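The anchoring code in ```PositionalNeuralGrammar``` above caches penultimate-layer activations by packing each (begin, split, end) triple into a unique "tetrahedral" array index, with unaries and spans stored under the sentinel splits ```length + 1``` and ```length + 2```. The sketch below restates that packing on its own with a brute-force uniqueness check; the ```tetra``` and ```maxTetraLen``` formulas are copied from the code above, while the enclosing ```TetraIndexDemo``` object, the test loops, and the sentence length of 10 are illustrative only.

    import scala.collection.mutable

    object TetraIndexDemo {
      // Copied from the anchoring code: maps (begin, split, end) with
      // begin < split < end to a unique index below maxTetraLen.
      def tetra(begin: Int, split: Int, end: Int): Int =
        (end * (end + 1) * (end + 2)) / 6 + ((split + 1) * split / 2 + begin)

      // Size of the per-sentence cache arrays for a sentence of length l.
      def maxTetraLen(l: Int): Int =
        ((l + 2) * (l + 3) * (l + 4)) / 6 + ((l + 1) * (l + 2)) / 2 + l + 2

      def main(args: Array[String]): Unit = {
        val l = 10 // illustrative sentence length
        val seen = mutable.HashSet[Int]()
        // Binary rules: every (begin, split, end) with begin < split < end <= l
        for (end <- 2 to l; split <- 1 until end; begin <- 0 until split)
          assert(seen.add(tetra(begin, split, end)), "collision")
        // Unaries and spans share the same arrays via the sentinel splits l + 1
        // and l + 2, exactly as scoreUnaryRule and scoreSpan do above.
        for (end <- 1 to l; begin <- 0 until end) {
          assert(seen.add(tetra(begin, end, l + 1)), "collision")
          assert(seen.add(tetra(begin, end, l + 2)), "collision")
        }
        assert(seen.forall(_ < maxTetraLen(l)), "index out of bounds")
        println(s"${seen.size} distinct cells in an array of size ${maxTetraLen(l)}")
      }
    }

Keying the caches with a flat array of size ```maxTetraLen``` keeps the inner scoring loops to plain array reads, which is consistent with the comment above that HashMaps were no faster here.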