@@ -69,6 +69,15 @@ case class ExtraPNMParams(@Help(text="Used for ablations with random word embedd
@Help(text="Set unknown word vectors to be random rather than 0")
randomizeUnks: Boolean = false)

case class ExtraPNMSparseParams(@Help(text="Use n-gram features in the sparse featurizer (good for sentiment)")
useNGrams: Boolean = false,
@Help(text="Max order of n-grams to use in these features")
maxNGramOrder:Int = 2,
@Help(text="Count threshold for firing n-gram features")
ngramCountThreshold: Int = 1,
@Help(text="Additional span shape features based on tags")
useTagSpanShape: Boolean = false)
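
For example, a sentiment run might configure these as (hypothetical values, purely to illustrate the fields above):

val sparseParams = ExtraPNMSparseParams(
  useNGrams = true,           // turn on n-gram span features
  maxNGramOrder = 3,          // fire unigrams through trigrams
  ngramCountThreshold = 5,    // drop n-grams seen fewer than 5 times
  useTagSpanShape = false)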

case class PositionalNeuralModelFactory(@Help(text=
"""The kind of annotation to do on the refined grammar. Default uses just parent annotation.
You can also use epic.trees.annotations.KMAnnotator to get more or less Klein and Manning 2003.
@@ -98,7 +107,8 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma
vocFile: String = "",
@Help(text="Set to true if your word vectors are all lowercase. Otherwise true case is used.")
lowercasedVectors: Boolean = false,
extraPNMParams: ExtraPNMParams = ExtraPNMParams()) extends ParserModelFactory[AnnotatedLabel, String] {
extraPNMParams: ExtraPNMParams = ExtraPNMParams(),
extraPNMSparseParams: ExtraPNMSparseParams = ExtraPNMSparseParams()) extends ParserModelFactory[AnnotatedLabel, String] {

type MyModel = PositionalNeuralModel[AnnotatedLabel, AnnotatedLabel, String]

@@ -109,6 +119,7 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma
lexicon: Lexicon[AnnotatedLabel, String],
constrainer: ChartConstraints.Factory[AnnotatedLabel, String]): MyModel = {
import extraPNMParams._
import extraPNMSparseParams._
val annTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]] = trainTrees.map(annotator(_))
println("Here's what the annotation looks like on the first few trees")
annTrees.slice(0, Math.min(3, annTrees.size)).foreach(tree => println(tree.render(false)))
@@ -198,6 +209,12 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma
var wf = SpanModelFactory.defaultPOSFeaturizer(annWords, useBrown = useSparseBrown)
var span = SpanModelFactory.goodFeaturizer(annWords, commonWordThreshold, useShape = false, useLfsuf = useSparseLfsuf, useBrown = useSparseBrown, useMostSparseIndicators = useMostSparseIndicators)
span += new SingleWordSpanFeaturizer[String](wf)
if (useNGrams) {
span += new NGramSpanFeaturizer(summedWordCounts, NGramSpanFeaturizer.countBigrams(annTrees), annTrees.map(_.words), ngramCountThreshold, maxNGramOrder, useNot = false)
}
if (useTagSpanShape) {
span += new TagSpanShapeFeaturizer(TagSpanShapeGenerator.makeBaseLexicon(trainTrees))
}
val indexedWord = IndexedWordFeaturizer.fromData(wf, annTrees.map{_.words}, deduplicateFeatures = false)
val indexedSurface = IndexedSplitSpanFeaturizer.fromData(span, annTrees, bloomFilter = false)
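
Conceptually, the NGramSpanFeaturizer added above fires an indicator feature for every sufficiently frequent n-gram inside a span. A toy stand-alone sketch of that idea (not epic's actual implementation), using the same max-order and count-threshold knobs:

def ngramFeatures(words: IndexedSeq[String], begin: Int, end: Int,
                  counts: Map[Seq[String], Int],
                  maxOrder: Int, threshold: Int): Seq[String] =
  for {
    n <- 1 to maxOrder
    i <- begin to end - n                        // every n-gram inside [begin, end)
    gram = words.slice(i, i + n)
    if counts.getOrElse(gram, 0) >= threshold    // cf. ngramCountThreshold
  } yield gram.mkString("NGram=", "_", "")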

11 changes: 6 additions & 5 deletions src/main/scala/epic/parser/models/SpanModel.scala
@@ -315,9 +315,10 @@ object IndexedSpanFeaturizer {


case class ExtraParams(useHackyLexicalFeatures:Boolean = false,
hackyLexicalFeatureDesc:String = "",
useMorph:Boolean = false,
pathsToMorph:String = "")
hackyLexicalFeatureDesc:String = "",
useMorph:Boolean = false,
useTagSpanShape:Boolean = false,
pathsToMorph:String = "")

case class SpanModelFactory(@Help(text=
"""The kind of annotation to do on the refined grammar. Default uses just parent annotation.
@@ -388,8 +389,8 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma
if(useNGrams)
span += ngramF

// if(useTagSpanShape)
// span += tagSpanShape
if(useTagSpanShape)
span += tagSpanShape

if(useFullShape)
span += fullShape
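
Each flag just bolts another feature set onto the running sum, since the featurizers compose additively. The same guarded += pattern in miniature (toy function types, not epic's featurizer API):

var feats: List[String => Seq[String]] = List(w => Seq("word=" + w))
val useSuffix = true                             // stands in for a flag like useTagSpanShape
if (useSuffix) feats ::= (w => Seq("suf3=" + w.takeRight(3)))
val allFeatures = (w: String) => feats.flatMap(_(w))
println(allFeatures("parsing"))                  // List(suf3=ing, word=parsing)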
2 changes: 0 additions & 2 deletions src/main/scala/epic/sentiment/SentimentEvaluator.scala
@@ -77,7 +77,6 @@ object SentimentEvaluator {
println("Accuracy: " + accuracy(mat, isCorrectNormal, isUsedAlways)); // agrees with the Stanford system's way of combining the matrix
println("Ternary: " + accuracy(mat, isCorrectTernary, isUsedAlways));
println("Binary: " + accuracy(mat, isCorrectBinary, isUsedBinaryCoarse));
println("Socher binary: " + accuracy(mat, isCorrectBinary, isUsedSocherCoarse)); // agrees with the Stanford system's way of combining the matrix
}

def accuracy(mat: Array[Array[Int]]) = {
@@ -102,7 +101,6 @@ object SentimentEvaluator {

def isUsedAlways(gold: Int, guess: Int) = true;
def isUsedBinaryCoarse(gold: Int, guess: Int) = gold != 2;
def isUsedSocherCoarse(gold: Int, guess: Int) = gold != 2 && guess != 2;

// def ternaryCoarseEval(mat: Array[Array[Int]]) = {
// val numer = mat(0)(0) + mat(0)(1) + mat(1)(0) + mat(1)(1) + mat(2)(2) + mat(3)(3) + mat(3)(4) + mat(4)(3) + mat(4)(4);
96 changes: 58 additions & 38 deletions src/main/scala/epic/sentiment/SentimentTreebankPipeline.scala
@@ -1,14 +1,16 @@
package epic.sentiment

import java.io.File
import breeze.config.Help
import breeze.config.CommandLineParser
import epic.trees._
import epic.parser.models.{ParserInference, ParserModel}
import epic.parser.models._
import epic.parser._
import breeze.linalg._
import epic.framework._
import epic.constraints.{LabeledSpanConstraints, SpanConstraints, ChartConstraints}
import breeze.optimize.CachedBatchDiffFunction
import breeze.optimize._
import breeze.optimize.FirstOrderMinimizer.OptParams
import com.typesafe.scalalogging.slf4j.LazyLogging
import epic.parser.models.SpanModelFactory
import epic.trees.ProcessedTreebank
@@ -18,21 +20,31 @@ import epic.trees.Span
import scala.collection.mutable.HashMap
import breeze.util._
import epic.parser.models.ParserExtractableModelFactory
import epic.dense.AdadeltaGradientDescentDVD

/**
*
*
* @author dlwh
*/
object SentimentTreebankPipeline extends LazyLogging {
case class Options(path: File,
case class Options(@Help(text="Treebank path")
path: File,
@Help(text="Name for the model")
name: String = "SentiParser",
opt: OptParams,
lossType: String = "",
iterPerEval: Int = 100,
iterationsPerEval: Int = 50,
@Help(text="How many iterations to run.")
maxIterations: Int = 202,
evalOnTest: Boolean = false,
@Help(text="Evaluate on test as well; this is so we can pick the best held-out score and evaluate that on test")
alsoEvalOnTest: Boolean = false,
includeDevInTrain: Boolean = false,
modelFactory: ParserExtractableModelFactory[AnnotatedLabel, String] = new SpanModelFactory,
rootLossScaling: Double = 1.0)
@Help(text="Details about the parser to build")
modelFactory: SpanModelFactory = new SpanModelFactory,
rootLossScaling: Double = 1.0,
computeTrainLL: Boolean = false)
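
These fields are populated from the command line (the main body below, elided from this diff, does the parsing via breeze-config). Assuming breeze-config's usual field-name flags, a run might look like:

// Hypothetical invocation; flag names mirror the Options fields above:
//   ... epic.sentiment.SentimentTreebankPipeline \
//       --path /data/sst --name MyRun --maxIterations 300 \
//       --iterationsPerEval 50 --computeTrainLL true
val params = CommandLineParser.readIn[Options](args)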


def main(args: Array[String]):Unit = {
@@ -82,25 +94,26 @@ object SentimentTreebankPipeline extends LazyLogging {
val cachedObj = new CachedBatchDiffFunction(obj)
val init = obj.initialWeightVector(true)

val name = "SentiParser"

for ((state, iter) <- params.opt.iterations(cachedObj, init).take(1000).zipWithIndex
if iter % params.iterPerEval == 0) try {
val itr = params.opt.iterations(cachedObj, init)

val name = params.name
for ((state, iter) <- itr.take(params.maxIterations).zipWithIndex
if iter % params.iterationsPerEval == 0) try {
val parser = model.extractParser(state.x).copy(decoder=new MaxConstituentDecoder[AnnotatedLabel, String])
// if(params.evalOnTest)
// println("Eval: " + evaluate(s"$name-$iter", parser, treebank.testTrees))
// else
// println("Eval: " + evaluate(s"$name-$iter", parser, treebank.devTrees))
if(params.evalOnTest) {
if (params.computeTrainLL) {
computeLL(trainTrees, model, state.x)
}
if (params.evalOnTest) {
println("NORMAL DECODE: Eval: " + evaluate(s"$name-$iter", parser, treebank.testTrees, DecodeType.Normal));
} else {
println("Span confusions");
println(renderArr(evaluateSpanConfusions(s"$name-$iter", parser, treebank.devTrees, DecodeType.Normal)));
println("Root confusions");
println(renderArr(evaluateRootConfusions(s"$name-$iter", parser, treebank.devTrees, DecodeType.Normal)));
println("NORMAL DECODE: Eval: " + evaluate(s"$name-$iter", parser, treebank.devTrees, DecodeType.Normal));
println("TERNARY DECODE: Eval: " + evaluate(s"$name-$iter", parser, treebank.devTrees, DecodeType.Ternary));
// println("BINARY DECODE: Eval: " + evaluateBetter(s"$name-$iter", parser, treebank.devTrees, DecodeType.Binary));
if (params.alsoEvalOnTest) {
println("TEST SET: Eval: " + evaluate(s"$name-$iter", parser, treebank.testTrees, DecodeType.Normal));
}
}
} catch {
case e: Exception => e.printStackTrace(); throw e
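
The rewritten loop draws states lazily from the optimizer, caps them at maxIterations, and evaluates only every iterationsPerEval steps. That take/zipWithIndex/filter idiom, self-contained with a toy loss stream standing in for opt.iterations:

val states = Iterator.iterate(100.0)(_ * 0.9)    // stand-in for params.opt.iterations(...)
for ((loss, iter) <- states.take(202).zipWithIndex
     if iter % 50 == 0)
  println(f"iter $iter%3d: loss = $loss%.3f")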
@@ -162,37 +175,26 @@ object SentimentTreebankPipeline extends LazyLogging {
spansRightTernary: Int, // denom is same as numSpans
spansRightBinary: Int,
numBinarySpans: Int,
numBinarySpansSocher: Int,
rootsRight: Int,
numRoots: Int,
rootsRightTernary: Int, // denom is same as numSpans
rootsRightBinary: Int,
numBinaryRoots: Int,
numBinaryRootsSocher: Int) {
numBinaryRoots: Int) {
def +(stats: Stats) = Stats(spansRight + stats.spansRight,
numSpans + stats.numSpans,
spansRightTernary + stats.spansRightTernary,
spansRightBinary + stats.spansRightBinary,
numBinarySpans + stats.numBinarySpans,
numBinarySpansSocher + stats.numBinarySpansSocher,
rootsRight + stats.rootsRight,
numRoots + stats.numRoots,
rootsRightTernary + stats.rootsRightTernary,
rootsRightBinary + stats.rootsRightBinary,
numBinaryRoots + stats.numBinaryRoots,
numBinaryRootsSocher + stats.numBinaryRootsSocher);
numBinaryRoots + stats.numBinaryRoots);


override def toString = {
val render: (Int, Int) => String = SentimentEvaluator.renderNumerDenom;
"Spans: " + render(spansRight, numSpans) + "\n" +
" Ternary: " + render(spansRightTernary, numSpans) + "\n" +
// " Binary: " + render(spansRightBinary, numBinarySpans) + "\n" +
// " Binary (Socher): " + render(spansRightBinary, numBinarySpansSocher) + "\n" +
"Roots: " + render(rootsRight, numRoots) + "\n" +
" Ternary: " + render(rootsRightTernary, numRoots) + "\n";
// " Binary: " + render(rootsRightBinary, numBinaryRoots) + "\n" +
// " Binary (Socher): " + render(rootsRightBinary, numBinaryRootsSocher);
"Spans: " + SentimentEvaluator.renderNumerDenom(spansRight, numSpans) + " (Ternary: " + SentimentEvaluator.renderNumerDenom(spansRightTernary, numSpans) +
"), Roots: " + SentimentEvaluator.renderNumerDenom(rootsRight, numRoots) + " (Ternary: " + SentimentEvaluator.renderNumerDenom(rootsRightTernary, numRoots) + ")";
}
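
Stats is a plain additive accumulator: per-sentence counts are built independently and merged with reduce(_ + _), as in the evaluation code below. The same pattern in miniature (toy counter, made-up numbers):

case class Counts(right: Int, total: Int) {
  def +(o: Counts) = Counts(right + o.right, total + o.total)
  override def toString = f"$right/$total (${100.0 * right / total}%.1f%%)"
}
val perSentence = Seq(Counts(3, 5), Counts(7, 10), Counts(2, 2))
println(perSentence.reduce(_ + _))               // 12/17 (70.6%)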


@@ -250,31 +252,33 @@ object SentimentTreebankPipeline extends LazyLogging {
var spansRightTernary = 0;
var spansRightBinary = 0;
var numBinarySpans = 0;
var numBinarySpansSocher = 0;
for ((gLabel, gSpan) <- gold) {
val pLabel = guessMap(gSpan);
spansRight += (if (SentimentEvaluator.isCorrectNormal(gLabel, pLabel)) 1 else 0);
numSpans += 1;
spansRightTernary += (if (SentimentEvaluator.isCorrectTernary(gLabel, pLabel)) 1 else 0);
spansRightBinary += (if (SentimentEvaluator.isUsedBinaryCoarse(gLabel, pLabel) && SentimentEvaluator.isCorrectBinary(gLabel, pLabel)) 1 else 0);
numBinarySpans += (if (SentimentEvaluator.isUsedBinaryCoarse(gLabel, pLabel)) 1 else 0);
numBinarySpansSocher += (if (SentimentEvaluator.isUsedSocherCoarse(gLabel, pLabel)) 1 else 0);
}
val rootsRight = (if (SentimentEvaluator.isCorrectNormal(goldRoot, guessRoot)) 1 else 0);
val numRoots = 1;
val rootsRightTernary = if (SentimentEvaluator.isCorrectTernary(goldRoot, guessRoot)) 1 else 0;
val rootsRightBinary = (if (SentimentEvaluator.isUsedBinaryCoarse(goldRoot, guessRoot) && SentimentEvaluator.isCorrectBinary(goldRoot, guessRoot)) 1 else 0);
val numBinaryRoots = (if (SentimentEvaluator.isUsedBinaryCoarse(goldRoot, guessRoot)) 1 else 0);
val numBinaryRootsSocher = (if (SentimentEvaluator.isUsedSocherCoarse(goldRoot, guessRoot)) 1 else 0);
Stats(spansRight, numSpans, spansRightTernary, spansRightBinary, numBinarySpans, numBinarySpansSocher,
rootsRight, numRoots, rootsRightTernary, rootsRightBinary, numBinaryRoots, numBinaryRootsSocher)
Stats(spansRight, numSpans, spansRightTernary, spansRightBinary, numBinarySpans,
rootsRight, numRoots, rootsRightTernary, rootsRightBinary, numBinaryRoots)
}.reduce(_+_);
}

def decode(tree: BinarizedTree[Unit], marginal: ParseMarginal[AnnotatedLabel, String], decodeType: DecodeType) = {
val (topMarg, botMarg) = marginal.labelMarginals
tree.extend { t =>
val summed = topMarg(t.begin, t.end)
val summed = if (t.begin == 0 && t.end == tree.end) {
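// At the root, use the bottom chart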
botMarg(t.begin, t.end)
} else {
// Elsewhere, use the top chart
topMarg(t.begin, t.end)
}
if(decodeType == Binary) {
val neg = (summed(AnnotatedLabel("0")) + summed(AnnotatedLabel("1")) )
val pos = (summed(AnnotatedLabel("3")) + summed(AnnotatedLabel("4")) )
@@ -300,5 +304,21 @@ object SentimentTreebankPipeline extends LazyLogging {
}
}
}


def computeLL(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], model: SpanModel[AnnotatedLabel,AnnotatedLabel,String], weights: DenseVector[Double]) {
println("Computing final log likelihood on the whole training set...")
val inf = model.inferenceFromWeights(weights)
val ll = trainTrees.par.aggregate(0.0)((currLL, trainTree) => {
try {
val s = inf.scorer(trainTree)
currLL + inf.goldMarginal(s, trainTree).logPartition - inf.marginal(s, trainTree).logPartition
} catch {
case e: Exception => println("Couldn't parse")
currLL
}
}, _ + _)
println("Log likelihood on " + trainTrees.size + " examples: " + ll)
}
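
Each summand in computeLL is one sentence's conditional log-likelihood: the gold-constrained log partition minus the full log partition. The printed total is therefore non-positive, and values nearer zero indicate a better fit. The same parallel aggregate-with-fallback structure on toy numbers:

val ll = (1 to 8).par.aggregate(0.0)(
  (acc, i) =>
    try acc + math.log(i.toDouble / (i + 1))     // each term <= 0, like logZ_gold - logZ
    catch { case _: Exception => acc },          // cf. the "Couldn't parse" fallback
  _ + _)
println(s"Toy log likelihood over 8 items: $ll")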

}