@@ -69,6 +69,15 @@ case class ExtraPNMParams(@Help(text="Used for ablations with random word embedd
@Help(text="Set unknown word vectors to be random rather than 0")
randomizeUnks: Boolean = false)

case class ExtraPNMSparseParams(@Help(text="Use n-gram features in the sparse featurizer (good for sentiment)")
useNGrams: Boolean = false,
@Help(text="Max order of n-grams to use in these features")
maxNGramOrder:Int = 2,
@Help(text="Count threshold for firing n-gram features")
ngramCountThreshold: Int = 1,
@Help(text="Additional span shape features based on tags")
useTagSpanShape: Boolean = false)
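
For example, a sentiment run might configure these as (hypothetical values, purely to illustrate the fields above):

val sparseParams = ExtraPNMSparseParams(
  useNGrams = true,           // turn on n-gram span features
  maxNGramOrder = 3,          // fire unigrams through trigrams
  ngramCountThreshold = 5,    // drop n-grams seen fewer than 5 times
  useTagSpanShape = false)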

case class PositionalNeuralModelFactory(@Help(text=
"""The kind of annotation to do on the refined grammar. Default uses just parent annotation.
You can also use epic.trees.annotations.KMAnnotator to get more or less Klein and Manning 2003.
@@ -98,7 +107,8 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma
vocFile: String = "",
@Help(text="Set to true if your word vectors are all lowercase. Otherwise true case is used.")
lowercasedVectors: Boolean = false,
extraPNMParams: ExtraPNMParams = ExtraPNMParams()) extends ParserModelFactory[AnnotatedLabel, String] {
extraPNMParams: ExtraPNMParams = ExtraPNMParams(),
extraPNMSparseParams: ExtraPNMSparseParams = ExtraPNMSparseParams()) extends ParserModelFactory[AnnotatedLabel, String] {

type MyModel = PositionalNeuralModel[AnnotatedLabel, AnnotatedLabel, String]

@@ -109,6 +119,7 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma
lexicon: Lexicon[AnnotatedLabel, String],
constrainer: ChartConstraints.Factory[AnnotatedLabel, String]): MyModel = {
import extraPNMParams._
import extraPNMSparseParams._
val annTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]] = trainTrees.map(annotator(_))
println("Here's what the annotation looks like on the first few trees")
annTrees.slice(0, Math.min(3, annTrees.size)).foreach(tree => println(tree.render(false)))
@@ -198,6 +209,12 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma
var wf = SpanModelFactory.defaultPOSFeaturizer(annWords, useBrown = useSparseBrown)
var span = SpanModelFactory.goodFeaturizer(annWords, commonWordThreshold, useShape = false, useLfsuf = useSparseLfsuf, useBrown = useSparseBrown, useMostSparseIndicators = useMostSparseIndicators)
span += new SingleWordSpanFeaturizer[String](wf)
if (useNGrams) {
span += new NGramSpanFeaturizer(summedWordCounts, NGramSpanFeaturizer.countBigrams(annTrees), annTrees.map(_.words), ngramCountThreshold, maxNGramOrder, useNot = false)
}
if (useTagSpanShape) {
span += new TagSpanShapeFeaturizer(TagSpanShapeGenerator.makeBaseLexicon(trainTrees))
}
val indexedWord = IndexedWordFeaturizer.fromData(wf, annTrees.map{_.words}, deduplicateFeatures = false)
val indexedSurface = IndexedSplitSpanFeaturizer.fromData(span, annTrees, bloomFilter = false)
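
Conceptually, the NGramSpanFeaturizer added above fires an indicator feature for every sufficiently frequent n-gram inside a span. A toy stand-alone sketch of that idea (not epic's actual implementation), using the same max-order and count-threshold knobs:

def ngramFeatures(words: IndexedSeq[String], begin: Int, end: Int,
                  counts: Map[Seq[String], Int],
                  maxOrder: Int, threshold: Int): Seq[String] =
  for {
    n <- 1 to maxOrder
    i <- begin to end - n                        // every n-gram inside [begin, end)
    gram = words.slice(i, i + n)
    if counts.getOrElse(gram, 0) >= threshold    // cf. ngramCountThreshold
  } yield gram.mkString("NGram=", "_", "")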

11 changes: 6 additions & 5 deletions src/main/scala/epic/parser/models/SpanModel.scala
@@ -315,9 +315,10 @@ object IndexedSpanFeaturizer {


case class ExtraParams(useHackyLexicalFeatures:Boolean = false,
hackyLexicalFeatureDesc:String = "",
useMorph:Boolean = false,
pathsToMorph:String = "")
hackyLexicalFeatureDesc:String = "",
useMorph:Boolean = false,
useTagSpanShape:Boolean = false,
pathsToMorph:String = "")

case class SpanModelFactory(@Help(text=
"""The kind of annotation to do on the refined grammar. Default uses just parent annotation.
@@ -388,8 +389,8 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma
if(useNGrams)
span += ngramF

// if(useTagSpanShape)
// span += tagSpanShape
if(useTagSpanShape)
span += tagSpanShape

if(useFullShape)
span += fullShape
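
Each flag just bolts another feature set onto the running sum, since the featurizers compose additively. The same guarded += pattern in miniature (toy function types, not epic's featurizer API):

var feats: List[String => Seq[String]] = List(w => Seq("word=" + w))
val useSuffix = true                             // stands in for a flag like useTagSpanShape
if (useSuffix) feats ::= (w => Seq("suf3=" + w.takeRight(3)))
val allFeatures = (w: String) => feats.flatMap(_(w))
println(allFeatures("parsing"))                  // List(suf3=ing, word=parsing)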
2 changes: 0 additions & 2 deletions src/main/scala/epic/sentiment/SentimentEvaluator.scala
@@ -77,7 +77,6 @@ object SentimentEvaluator {
println("Accuracy: " + accuracy(mat, isCorrectNormal, isUsedAlways)); // agrees with the Stanford system's way of combining the matrix
println("Ternary: " + accuracy(mat, isCorrectTernary, isUsedAlways));
println("Binary: " + accuracy(mat, isCorrectBinary, isUsedBinaryCoarse));
println("Socher binary: " + accuracy(mat, isCorrectBinary, isUsedSocherCoarse)); // agrees with the Stanford system's way of combining the matrix
}

def accuracy(mat: Array[Array[Int]]) = {
@@ -102,7 +101,6 @@ object SentimentEvaluator {

def isUsedAlways(gold: Int, guess: Int) = true;
def isUsedBinaryCoarse(gold: Int, guess: Int) = gold != 2;
def isUsedSocherCoarse(gold: Int, guess: Int) = gold != 2 && guess != 2;

// def ternaryCoarseEval(mat: Array[Array[Int]]) = {
// val numer = mat(0)(0) + mat(0)(1) + mat(1)(0) + mat(1)(1) + mat(2)(2) + mat(3)(3) + mat(3)(4) + mat(4)(3) + mat(4)(4);
96 changes: 58 additions & 38 deletions src/main/scala/epic/sentiment/SentimentTreebankPipeline.scala
@@ -1,14 +1,16 @@
package epic.sentiment

import java.io.File
import breeze.config.Help
import breeze.config.CommandLineParser
import epic.trees._
import epic.parser.models.{ParserInference, ParserModel}
import epic.parser.models._
import epic.parser._
import breeze.linalg._
import epic.framework._
import epic.constraints.{LabeledSpanConstraints, SpanConstraints, ChartConstraints}
import breeze.optimize.CachedBatchDiffFunction
import breeze.optimize._
import breeze.optimize.FirstOrderMinimizer.OptParams
import com.typesafe.scalalogging.slf4j.LazyLogging
import epic.parser.models.SpanModelFactory
import epic.trees.ProcessedTreebank
@@ -18,21 +20,31 @@ import epic.trees.Span
import scala.collection.mutable.HashMap
import breeze.util._
import epic.parser.models.ParserExtractableModelFactory
import epic.dense.AdadeltaGradientDescentDVD

/**
*
*
* @author dlwh
*/
object SentimentTreebankPipeline extends LazyLogging {
case class Options(path: File,
case class Options(@Help(text="Treebank path")
path: File,
@Help(text="Name for the model")
name: String = "SentiParser",
opt: OptParams,
lossType: String = "",
iterPerEval: Int = 100,
iterationsPerEval: Int = 50,
@Help(text="How many iterations to run.")
maxIterations: Int = 202,
evalOnTest: Boolean = false,
@Help(text="Evaluate on test as well; this is so we can pick the best held-out score and evaluate that on test")
alsoEvalOnTest: Boolean = false,
includeDevInTrain: Boolean = false,
modelFactory: ParserExtractableModelFactory[AnnotatedLabel, String] = new SpanModelFactory,
rootLossScaling: Double = 1.0)
@Help(text="Details about the parser to build")
modelFactory: SpanModelFactory = new SpanModelFactory,
rootLossScaling: Double = 1.0,
computeTrainLL: Boolean = false)
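
These fields are populated from the command line (the main body below, elided from this diff, does the parsing via breeze-config). Assuming breeze-config's usual field-name flags, a run might look like:

// Hypothetical invocation; flag names mirror the Options fields above:
//   ... epic.sentiment.SentimentTreebankPipeline \
//       --path /data/sst --name MyRun --maxIterations 300 \
//       --iterationsPerEval 50 --computeTrainLL true
val params = CommandLineParser.readIn[Options](args)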


def main(args: Array[String]):Unit = {
@@ -82,25 +94,26 @@ object SentimentTreebankPipeline extends LazyLogging {
val cachedObj = new CachedBatchDiffFunction(obj)
val init = obj.initialWeightVector(true)

val name = "SentiParser"

for ((state, iter) <- params.opt.iterations(cachedObj, init).take(1000).zipWithIndex
if iter % params.iterPerEval == 0) try {
val itr = params.opt.iterations(cachedObj, init)

val name = params.name
for ((state, iter) <- itr.take(params.maxIterations).zipWithIndex
if iter % params.iterationsPerEval == 0) try {
val parser = model.extractParser(state.x).copy(decoder=new MaxConstituentDecoder[AnnotatedLabel, String])
// if(params.evalOnTest)
// println("Eval: " + evaluate(s"$name-$iter", parser, treebank.testTrees))
// else
// println("Eval: " + evaluate(s"$name-$iter", parser, treebank.devTrees))
if(params.evalOnTest) {
if (params.computeTrainLL) {
computeLL(trainTrees, model, state.x)
}
if (params.evalOnTest) {
println("NORMAL DECODE: Eval: " + evaluate(s"$name-$iter", parser, treebank.testTrees, DecodeType.Normal));
} else {
println("Span confusions");
println(renderArr(evaluateSpanConfusions(s"$name-$iter", parser, treebank.devTrees, DecodeType.Normal)));
println("Root confusions");
println(renderArr(evaluateRootConfusions(s"$name-$iter", parser, treebank.devTrees, DecodeType.Normal)));
println("NORMAL DECODE: Eval: " + evaluate(s"$name-$iter", parser, treebank.devTrees, DecodeType.Normal));
println("TERNARY DECODE: Eval: " + evaluate(s"$name-$iter", parser, treebank.devTrees, DecodeType.Ternary));
// println("BINARY DECODE: Eval: " + evaluateBetter(s"$name-$iter", parser, treebank.devTrees, DecodeType.Binary));
if (params.alsoEvalOnTest) {
println("TEST SET: Eval: " + evaluate(s"$name-$iter", parser, treebank.testTrees, DecodeType.Normal));
}
}
} catch {
case e: Exception => e.printStackTrace(); throw e
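
The rewritten loop draws states lazily from the optimizer, caps them at maxIterations, and evaluates only every iterationsPerEval steps. That take/zipWithIndex/filter idiom, self-contained with a toy loss stream standing in for opt.iterations:

val states = Iterator.iterate(100.0)(_ * 0.9)    // stand-in for params.opt.iterations(...)
for ((loss, iter) <- states.take(202).zipWithIndex
     if iter % 50 == 0)
  println(f"iter $iter%3d: loss = $loss%.3f")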
@@ -162,37 +175,26 @@ object SentimentTreebankPipeline extends LazyLogging {
spansRightTernary: Int, // denom is same as numSpans
spansRightBinary: Int,
numBinarySpans: Int,
numBinarySpansSocher: Int,
rootsRight: Int,
numRoots: Int,
rootsRightTernary: Int, // denom is same as numSpans
rootsRightBinary: Int,
numBinaryRoots: Int,
numBinaryRootsSocher: Int) {
numBinaryRoots: Int) {
def +(stats: Stats) = Stats(spansRight + stats.spansRight,
numSpans + stats.numSpans,
spansRightTernary + stats.spansRightTernary,
spansRightBinary + stats.spansRightBinary,
numBinarySpans + stats.numBinarySpans,
numBinarySpansSocher + stats.numBinarySpansSocher,
rootsRight + stats.rootsRight,
numRoots + stats.numRoots,
rootsRightTernary + stats.rootsRightTernary,
rootsRightBinary + stats.rootsRightBinary,
numBinaryRoots + stats.numBinaryRoots,
numBinaryRootsSocher + stats.numBinaryRootsSocher);
numBinaryRoots + stats.numBinaryRoots);


override def toString = {
val render: (Int, Int) => String = SentimentEvaluator.renderNumerDenom;
"Spans: " + render(spansRight, numSpans) + "\n" +
" Ternary: " + render(spansRightTernary, numSpans) + "\n" +
// " Binary: " + render(spansRightBinary, numBinarySpans) + "\n" +
// " Binary (Socher): " + render(spansRightBinary, numBinarySpansSocher) + "\n" +
"Roots: " + render(rootsRight, numRoots) + "\n" +
" Ternary: " + render(rootsRightTernary, numRoots) + "\n";
// " Binary: " + render(rootsRightBinary, numBinaryRoots) + "\n" +
// " Binary (Socher): " + render(rootsRightBinary, numBinaryRootsSocher);
"Spans: " + SentimentEvaluator.renderNumerDenom(spansRight, numSpans) + " (Ternary: " + SentimentEvaluator.renderNumerDenom(spansRightTernary, numSpans) +
"), Roots: " + SentimentEvaluator.renderNumerDenom(rootsRight, numRoots) + " (Ternary: " + SentimentEvaluator.renderNumerDenom(rootsRightTernary, numRoots) + ")";
}
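
Stats is a plain additive accumulator: per-sentence counts are built independently and merged with reduce(_ + _), as in the evaluation code below. The same pattern in miniature (toy counter, made-up numbers):

case class Counts(right: Int, total: Int) {
  def +(o: Counts) = Counts(right + o.right, total + o.total)
  override def toString = f"$right/$total (${100.0 * right / total}%.1f%%)"
}
val perSentence = Seq(Counts(3, 5), Counts(7, 10), Counts(2, 2))
println(perSentence.reduce(_ + _))               // 12/17 (70.6%)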


@@ -250,31 +252,33 @@ object SentimentTreebankPipeline extends LazyLogging {
var spansRightTernary = 0;
var spansRightBinary = 0;
var numBinarySpans = 0;
var numBinarySpansSocher = 0;
for ((gLabel, gSpan) <- gold) {
val pLabel = guessMap(gSpan);
spansRight += (if (SentimentEvaluator.isCorrectNormal(gLabel, pLabel)) 1 else 0);
numSpans += 1;
spansRightTernary += (if (SentimentEvaluator.isCorrectTernary(gLabel, pLabel)) 1 else 0);
spansRightBinary += (if (SentimentEvaluator.isUsedBinaryCoarse(gLabel, pLabel) && SentimentEvaluator.isCorrectBinary(gLabel, pLabel)) 1 else 0);
numBinarySpans += (if (SentimentEvaluator.isUsedBinaryCoarse(gLabel, pLabel)) 1 else 0);
numBinarySpansSocher += (if (SentimentEvaluator.isUsedSocherCoarse(gLabel, pLabel)) 1 else 0);
}
val rootsRight = (if (SentimentEvaluator.isCorrectNormal(goldRoot, guessRoot)) 1 else 0);
val numRoots = 1;
val rootsRightTernary = if (SentimentEvaluator.isCorrectTernary(goldRoot, guessRoot)) 1 else 0;
val rootsRightBinary = (if (SentimentEvaluator.isUsedBinaryCoarse(goldRoot, guessRoot) && SentimentEvaluator.isCorrectBinary(goldRoot, guessRoot)) 1 else 0);
val numBinaryRoots = (if (SentimentEvaluator.isUsedBinaryCoarse(goldRoot, guessRoot)) 1 else 0);
val numBinaryRootsSocher = (if (SentimentEvaluator.isUsedSocherCoarse(goldRoot, guessRoot)) 1 else 0);
Stats(spansRight, numSpans, spansRightTernary, spansRightBinary, numBinarySpans, numBinarySpansSocher,
rootsRight, numRoots, rootsRightTernary, rootsRightBinary, numBinaryRoots, numBinaryRootsSocher)
Stats(spansRight, numSpans, spansRightTernary, spansRightBinary, numBinarySpans,
rootsRight, numRoots, rootsRightTernary, rootsRightBinary, numBinaryRoots)
}.reduce(_+_);
}

def decode(tree: BinarizedTree[Unit], marginal: ParseMarginal[AnnotatedLabel, String], decodeType: DecodeType) = {
val (topMarg, botMarg) = marginal.labelMarginals
tree.extend { t =>
val summed = topMarg(t.begin, t.end)
val summed = if (t.begin == 0 && t.end == tree.end) {
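// At the root, use the bottom chart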
botMarg(t.begin, t.end)
} else {
// Elsewhere, use the top chart
topMarg(t.begin, t.end)
}
if(decodeType == Binary) {
val neg = (summed(AnnotatedLabel("0")) + summed(AnnotatedLabel("1")) )
val pos = (summed(AnnotatedLabel("3")) + summed(AnnotatedLabel("4")) )
@@ -300,5 +304,21 @@ object SentimentTreebankPipeline extends LazyLogging {
}
}
}


def computeLL(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], model: SpanModel[AnnotatedLabel,AnnotatedLabel,String], weights: DenseVector[Double]) {
println("Computing final log likelihood on the whole training set...")
val inf = model.inferenceFromWeights(weights)
val ll = trainTrees.par.aggregate(0.0)((currLL, trainTree) => {
try {
val s = inf.scorer(trainTree)
currLL + inf.goldMarginal(s, trainTree).logPartition - inf.marginal(s, trainTree).logPartition
} catch {
case e: Exception => println("Couldn't parse")
currLL
}
}, _ + _)
println("Log likelihood on " + trainTrees.size + " examples: " + ll)
}
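
Each summand in computeLL is one sentence's conditional log-likelihood: the gold-constrained log partition minus the full log partition. The printed total is therefore non-positive, and values nearer zero indicate a better fit. The same parallel aggregate-with-fallback structure on toy numbers:

val ll = (1 to 8).par.aggregate(0.0)(
  (acc, i) =>
    try acc + math.log(i.toDouble / (i + 1))     // each term <= 0, like logZ_gold - logZ
    catch { case _: Exception => acc },          // cf. the "Couldn't parse" fallback
  _ + _)
println(s"Toy log likelihood over 8 items: $ll")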

}