diff --git a/src/main/java/epic/util/Arrays.scala b/src/main/java/epic/util/Arrays.scala index f1890a31..7fba7390 100644 --- a/src/main/java/epic/util/Arrays.scala +++ b/src/main/java/epic/util/Arrays.scala @@ -48,9 +48,9 @@ object Arrays { val ret = new Array[C](arr1.length * arr2.length) var off = 0 var i = 0 - while(i < arr1.length) { + while (i < arr1.length) { var j = 0 - while(j < arr2.length) { + while (j < arr2.length) { ret(off) = f(arr1(i), arr2(j)) off += 1 j += 1 @@ -65,9 +65,9 @@ object Arrays { val ret = new Array[Int](arr1.length * arr2.length) var off = 0 var i = 0 - while(i < arr1.length) { + while (i < arr1.length) { var j = 0 - while(j < arr2.length) { + while (j < arr2.length) { ret(off) = arr1(i) + arr2(j) * secondScale off += 1 j += 1 diff --git a/src/main/scala/epic/constraints/CachedLabeledSpanConstraintsFactory.scala b/src/main/scala/epic/constraints/CachedLabeledSpanConstraintsFactory.scala index 195b4e61..b9d11b9a 100644 --- a/src/main/scala/epic/constraints/CachedLabeledSpanConstraintsFactory.scala +++ b/src/main/scala/epic/constraints/CachedLabeledSpanConstraintsFactory.scala @@ -2,7 +2,6 @@ package epic.constraints import epic.util.CacheBroker - /** * A cached version of [[epic.constraints.LabeledSpanConstraints.Factory]]. * Uses the [[epic.util.CacheBroker]] infrastructure diff --git a/src/main/scala/epic/constraints/ChartConstraints.scala b/src/main/scala/epic/constraints/ChartConstraints.scala index fcc68cbb..19a6990e 100644 --- a/src/main/scala/epic/constraints/ChartConstraints.scala +++ b/src/main/scala/epic/constraints/ChartConstraints.scala @@ -19,29 +19,26 @@ import java.io.{DataOutput, DataInput} case class ChartConstraints[L](top: LabeledSpanConstraints[L], bot: LabeledSpanConstraints[L]) extends SpanConstraints with Serializable { - - def isAllowedSpan(begin: Int, end: Int):Boolean = top.isAllowedSpan(begin, end) || bot.isAllowedSpan(begin, end) + def isAllowedSpan(begin: Int, end: Int): Boolean = top.isAllowedSpan(begin, end) || bot.isAllowedSpan(begin, end) /** TODO */ // TODO - def hasMaximalLabel(begin: Int, end: Int):Boolean = ??? - + def hasMaximalLabel(begin: Int, end: Int): Boolean = ??? def maxSpanLengthStartingAt(begin: Int): Int = top.maxSpanLengthStartingAt(begin) max bot.maxSpanLengthStartingAt(begin) def flatten = top | bot - def &(other: ChartConstraints[L]) = if(this eq other) this else ChartConstraints(top & other.top, bot & other.bot) + def &(other: ChartConstraints[L]) = if (this eq other) this else ChartConstraints(top & other.top, bot & other.bot) def |(other: ChartConstraints[L]) = ChartConstraints(top | other.top, bot | other.bot) - } object ChartConstraints { + def noSparsity[L]: ChartConstraints[L] = ChartConstraints[L](LabeledSpanConstraints.noConstraints[L], LabeledSpanConstraints.noConstraints[L]) def apply[L](top: TriangularArray[_ <: BitSet], bot: TriangularArray[_ <: BitSet]): ChartConstraints[L] = ChartConstraints(LabeledSpanConstraints(top), LabeledSpanConstraints(bot)) trait Factory[L, W] extends SpanConstraints.Factory[W] { def constraints(w: IndexedSeq[W]): ChartConstraints[L] - def |(cf: Factory[L, W]) = new OrFactory(this, cf) } @@ -82,11 +79,9 @@ object ChartConstraints { case _ => bot(t.begin,t.end) = BitSet(labelIndex(t.label)) } - ChartConstraints(LabeledSpanConstraints(top), LabeledSpanConstraints(bot)) } - implicit def serializerChartConstraints[L]:Serializer[ChartConstraints[L]] = new Serializer[ChartConstraints[L]] with Serializable { def serialize(out: DataOutput, value: ChartConstraints[L]) { implicitly[Serializer[LabeledSpanConstraints[L]]].serialize(out, value.top) @@ -98,6 +93,6 @@ object ChartConstraints { val bot = implicitly[Serializer[LabeledSpanConstraints[L]]].deserialize(in, available) ChartConstraints(top, bot) } - } + } diff --git a/src/main/scala/epic/constraints/LabeledSpanConstraints.scala b/src/main/scala/epic/constraints/LabeledSpanConstraints.scala index 6c46563f..b74473e1 100644 --- a/src/main/scala/epic/constraints/LabeledSpanConstraints.scala +++ b/src/main/scala/epic/constraints/LabeledSpanConstraints.scala @@ -26,9 +26,9 @@ sealed trait LabeledSpanConstraints[-L] extends SpanConstraints { def isAllowedLabeledSpan(begin: Int, end: Int, label: Int): Boolean def isAllowedSpan(begin: Int, end: Int): Boolean /** How long can a span be if it starts at begin*/ - def maxSpanLengthStartingAt(begin: Int):Int + def maxSpanLengthStartingAt(begin: Int): Int /** How long can a span be if it has label label in this sentence? */ - def maxSpanLengthForLabel(label: Int):Int + def maxSpanLengthForLabel(label: Int): Int /** * Computes the intersection of the constraints @@ -36,7 +36,7 @@ sealed trait LabeledSpanConstraints[-L] extends SpanConstraints { * @return */ def &(other: LabeledSpanConstraints[L @uncheckedVariance ]): LabeledSpanConstraints[L] = { - if(this eq other) this + if (this eq other) this else this match { case NoConstraints => other case PromotedSpanConstraints(inner) => other match { @@ -44,7 +44,7 @@ sealed trait LabeledSpanConstraints[-L] extends SpanConstraints { case PromotedSpanConstraints(otherinner) => PromotedSpanConstraints(inner & otherinner) case SimpleConstraints(maxPosX, maxLx, x) => SimpleConstraints(maxPosX, maxLx, TriangularArray.tabulate(x.dimension){(b, e) => - if(x(b,e) == null || !inner.isAllowedSpan(b,e)) null + if (x(b,e) == null || !inner.isAllowedSpan(b,e)) null else x(b,e) }) } @@ -55,14 +55,14 @@ sealed trait LabeledSpanConstraints[-L] extends SpanConstraints { require(x.dimension == y.dimension, "Dimensions of constrained spans must match!") SimpleConstraints( elementwiseMin(maxPosX, maxPosY), elementwiseMin(maxLx, maxLy), TriangularArray.tabulate(x.dimension) { (b,e) => - if(x(b,e) == null || y(b,e) == null) null + if (x(b,e) == null || y(b,e) == null) null else x(b,e) & y(b,e) }) } } } - def containsAll(other: LabeledSpanConstraints[L @uncheckedVariance]):Boolean = this match { + def containsAll(other: LabeledSpanConstraints[L @uncheckedVariance]): Boolean = this match { case NoConstraints => true case SimpleConstraints(maxPosX, maxLx, x) => other match { case NoConstraints => throw new UnsupportedOperationException("Can't check Simple.containsAll(noconstraints)") @@ -75,7 +75,6 @@ sealed trait LabeledSpanConstraints[-L] extends SpanConstraints { yield (y(i,j) eq null) || ((x(i,j) ne null) && (y(i,j) &~ x(i,j )).isEmpty) }.forall(identity)) } - } /** @@ -89,7 +88,6 @@ sealed trait LabeledSpanConstraints[-L] extends SpanConstraints { case NoConstraints => this case PromotedSpanConstraints(otherinner) => PromotedSpanConstraints(inner | otherinner) case SimpleConstraints(maxPosX, maxLx, x) => ??? - } case SimpleConstraints(maxPosX, maxLx, x) => other match { case NoConstraints => this @@ -98,15 +96,14 @@ sealed trait LabeledSpanConstraints[-L] extends SpanConstraints { require(x.dimension == y.dimension, "Dimensions of constrained spans must match!") SimpleConstraints( elementwiseMax(maxPosX, maxPosY), elementwiseMax(maxLx, maxLy), TriangularArray.tabulate(x.dimension) { (b,e) => - if(x(b,e) == null) y(b,e) + if (x(b,e) == null) y(b,e) else if (y(b,e) == null) x(b, e) else x(b,e) | y(b,e) }) } } - def decode(labelIndex: Index[L@uncheckedVariance ]):String - + def decode(labelIndex: Index[L@uncheckedVariance ]): String } @@ -122,7 +119,7 @@ object LabeledSpanConstraints { out.writeBoolean(true) val length: Int = maxLengthsForPosition.length out.writeInt(length) - if(length < Byte.MaxValue) { + if (length < Byte.MaxValue) { maxLengthsForPosition.foreach(maxLengthForPosition => out.writeByte((maxLengthForPosition min length).toByte) ) @@ -133,9 +130,9 @@ object LabeledSpanConstraints { maxLengthsForLabel.foreach(out.writeInt) for(i <- 0 until length; j <- (i+1) to length if value.isAllowedSpan(i, j)) { val cardinality: Int = spans(i, j).cardinality - if(cardinality != 0) { + if (cardinality != 0) { out.writeInt(TriangularArray.index(i, j)) - if(cardinality == 1) { + if (cardinality == 1) { // have to deal with 0 length mask out.writeInt(~(spans(i, j).nextSetBit(0))) } else { @@ -163,13 +160,13 @@ object LabeledSpanConstraints { val maxLengthsForLabel = Array.fill(labelLen)(in.readInt()) val spans = new TriangularArray[util.BitSet](length+1) var ok = true - while(ok) { + while (ok) { ok = false val ti = readInt() - if(ti >= 0) { + if (ti >= 0) { ok = true val bitmaskSize = readInt() - if(bitmaskSize < 0) { + if (bitmaskSize < 0) { val index = ~bitmaskSize spans.data(ti) = new util.BitSet() spans.data(ti).set(index) @@ -182,7 +179,6 @@ object LabeledSpanConstraints { } new SimpleConstraints[L](maxLengthsForPosition, maxLengthsForLabel, spans) } - } } @@ -200,9 +196,9 @@ object LabeledSpanConstraints { } val maxLengthLabel = ArrayBuffer[Int]() for(begin <- 0 until spans.dimension; end <- (begin+1) until spans.dimension) { - if(spans(begin, end) ne null) { + if (spans(begin, end) ne null) { for(l <- spans(begin, end)) { - if(l >= maxLengthLabel.length) { + if (l >= maxLengthLabel.length) { maxLengthLabel ++= new Array[Int](l - maxLengthLabel.length + 1) } maxLengthLabel(l) = maxLengthLabel(l) max (end-begin) @@ -210,17 +206,16 @@ object LabeledSpanConstraints { } } - apply(maxLengthPos, maxLengthLabel.toArray, spans) } def apply[L](maxLengthPos: Array[Int], maxLengthLabel: Array[Int], spans: TriangularArray[_ <: BitSet]):LabeledSpanConstraints[L] = { - SimpleConstraints(maxLengthPos, maxLengthLabel, spans.map(bs => if(bs eq null) null else java.util.BitSet.valueOf(bs.toBitMask))) + SimpleConstraints(maxLengthPos, maxLengthLabel, spans.map(bs => if (bs eq null) null else java.util.BitSet.valueOf(bs.toBitMask))) } def fromTagConstraints[L](constraints: TagConstraints[L]): LabeledSpanConstraints[L] = { val arr = TriangularArray.tabulate(constraints.length+1) { (b,e) => - if(b +1 == e) { + if (b +1 == e) { ensureBitSet(constraints.allowedTags(b)) } else { null @@ -229,7 +224,6 @@ object LabeledSpanConstraints { apply(arr) } - private def ensureBitSet[L](tags: Set[Int]): BitSet = { tags match { case x: BitSet => x @@ -247,21 +241,19 @@ object LabeledSpanConstraints { val arr = new TriangularArray[BitSet](localization.length + 1) val maxMaxLength = maxLengthForLabel.max min localization.length for(i <- 0 until localization.length) { - arr(i, i+1) = ensureBitSet(localization.allowedTags(i)) + arr(i, i+1) = ensureBitSet(localization.allowedTags(i)) } - val maxLengthPos = Array.fill(localization.length)(1) val maxLengthLabel = maxLengthForLabel.clone() - var acceptableTags = BitSet.empty ++ maxLengthForLabel.indices for(length <- 2 to maxMaxLength if acceptableTags.nonEmpty) { acceptableTags = acceptableTags.filter(i => maxLengthForLabel(i) >= length) - if(acceptableTags.nonEmpty) + if (acceptableTags.nonEmpty) for (begin <- 0 to (localization.length - length) ) { val end = begin + length - if(arr(begin,begin+1) != null && arr(begin+1,end) != null) { + if (arr(begin,begin+1) != null && arr(begin+1,end) != null) { arr(begin, end) = (arr(begin, begin+1) & arr(begin+1, end)) & acceptableTags - if(arr(begin,end).isEmpty) { + if (arr(begin,end).isEmpty) { arr(begin, end) = null } else { maxLengthPos(begin) = length @@ -274,50 +266,34 @@ object LabeledSpanConstraints { apply(maxLengthPos, maxLengthLabel, arr) } - @SerialVersionUID(1L) object NoConstraints extends LabeledSpanConstraints[Any] with Serializable { - def maxSpanLengthStartingAt(begin: Int): Int = Int.MaxValue/2 // /2 because i get worried about wrap around. - def isAllowedSpan(begin: Int, end: Int): Boolean = true def isAllowedLabeledSpan(begin: Int, end: Int, label: Int): Boolean = true - - def maxSpanLengthForLabel(label: Int):Int = Int.MaxValue / 2 - - - def decode(labelIndex: Index[Any]):String = toString + def maxSpanLengthForLabel(label: Int): Int = Int.MaxValue / 2 + def decode(labelIndex: Index[Any]): String = toString } - @SerialVersionUID(1L) case class PromotedSpanConstraints(inner: SpanConstraints) extends LabeledSpanConstraints[Any] with Serializable { - def maxSpanLengthStartingAt(begin: Int): Int = Int.MaxValue/2 // /2 because i get worried about wrap around. - def isAllowedSpan(begin: Int, end: Int): Boolean = inner.isAllowedSpan(begin, end) def isAllowedLabeledSpan(begin: Int, end: Int, label: Int): Boolean = isAllowedSpan(begin, end) - - def maxSpanLengthForLabel(label: Int):Int = Int.MaxValue / 2 - - def decode(labelIndex: Index[Any]):String = inner.toString + def maxSpanLengthForLabel(label: Int): Int = Int.MaxValue / 2 + def decode(labelIndex: Index[Any]): String = inner.toString } - // private vars for serialization. @SerialVersionUID(2L) case class SimpleConstraints[L](private var maxLengthsForPosition: Array[Int], // maximum length for position private var maxLengthsForLabel: Array[Int], private var spans: TriangularArray[java.util.BitSet]) extends LabeledSpanConstraints[L] with Serializable { def isAllowedSpan(begin: Int, end: Int): Boolean = (spans(begin,end) ne null) && spans(begin,end).cardinality() > 0 - def isAllowedLabeledSpan(begin: Int, end: Int, label: Int): Boolean = (spans(begin,end) ne null) && spans(begin, end).get(label) - def maxSpanLengthStartingAt(begin: Int): Int = maxLengthsForPosition(begin) - - def maxSpanLengthForLabel(label: Int) = if(maxLengthsForLabel.length <= label) 0 else maxLengthsForLabel(label) - - def decode(labelIndex: Index[L]):String = { + def maxSpanLengthForLabel(label: Int) = if (maxLengthsForLabel.length <= label) 0 else maxLengthsForLabel(label) + def decode(labelIndex: Index[L]): String = { val ret = new StringBuilder() val enc = Encoder.fromIndex(labelIndex) ret ++= "SimpleConstraints(positionMaxLengths=" @@ -327,7 +303,7 @@ object LabeledSpanConstraints { ret ++= ")\n" for(i <- 0 until maxLengthsForPosition.length; j <- (i+1) to maxLengthsForPosition.length) { val s = spans(i, j) - if(s ne null) { + if (s ne null) { ret ++= s" ($i,$j) " + enc.decode(Array.tabulate(labelIndex.size)(x => spans(i, j).get(x))).toString + "\n" } } @@ -352,15 +328,15 @@ object LabeledSpanConstraints { private def elementwiseMax(a: Array[Int], b: Array[Int]):Array[Int] = { // could avoid the allocation, but whatever. - if(a.length < b.length) elementwiseMax(util.Arrays.copyOf(a, b.length), b) - else if(b.length < a.length) elementwiseMax(a, util.Arrays.copyOf(b, a.length)) + if (a.length < b.length) elementwiseMax(util.Arrays.copyOf(a, b.length), b) + else if (b.length < a.length) elementwiseMax(a, util.Arrays.copyOf(b, a.length)) else Array.fillWith[Int](a.length)(i => math.max(a(i), b(i))) } private def elementwiseMin(a: Array[Int], b: Array[Int]):Array[Int] = { // could avoid the allocation, but whatever. - if(a.length < b.length) elementwiseMin(util.Arrays.copyOf(a, b.length), b) - else if(b.length < a.length) elementwiseMin(a, util.Arrays.copyOf(b, a.length)) + if (a.length < b.length) elementwiseMin(util.Arrays.copyOf(a, b.length), b) + else if (b.length < a.length) elementwiseMin(a, util.Arrays.copyOf(b, a.length)) else Array.fillWith[Int](a.length)(i => math.min(a(i), b(i))) } diff --git a/src/main/scala/epic/constraints/LongSpanConstraints.scala b/src/main/scala/epic/constraints/LongSpanConstraints.scala index eb1ad7e5..ccdda7e5 100644 --- a/src/main/scala/epic/constraints/LongSpanConstraints.scala +++ b/src/main/scala/epic/constraints/LongSpanConstraints.scala @@ -14,16 +14,9 @@ object LongSpanConstraints { val spans = new SpanConstraints { val oks = w.map(ww => okWords(ww) || !ww.head.isLetterOrDigit) def maxSpanLengthStartingAt(begin: Int): Int = w.length - begin - - def maxSpanLengthForLabel(label: Int): Int = w.length - - def decode(labelIndex: Index[L]): String = "..." - - def isAllowedLabeledSpan(begin: Int, end: Int, label: Int): Boolean = isAllowedSpan(begin, end) - def isAllowedSpan(begin: Int, end: Int): Boolean = ( end - begin <= maxSimpleLength || end == w.length @@ -36,6 +29,5 @@ object LongSpanConstraints { } new ChartConstraints(PromotedSpanConstraints(spans), PromotedSpanConstraints(spans)) } - } } diff --git a/src/main/scala/epic/constraints/SpanConstraints.scala b/src/main/scala/epic/constraints/SpanConstraints.scala index 51f9d89e..1dc4abfa 100644 --- a/src/main/scala/epic/constraints/SpanConstraints.scala +++ b/src/main/scala/epic/constraints/SpanConstraints.scala @@ -12,19 +12,19 @@ import epic.constraints.LabeledSpanConstraints.PromotedSpanConstraints * @author dlwh */ trait SpanConstraints { outer => - def apply(begin: Int, end: Int):Boolean = isAllowedSpan(begin, end) + def apply(begin: Int, end: Int): Boolean = isAllowedSpan(begin, end) def isAllowedSpan(begin: Int, end: Int): Boolean - def maxSpanLengthStartingAt(begin: Int):Int + def maxSpanLengthStartingAt(begin: Int): Int def |(other: SpanConstraints):SpanConstraints = new SpanConstraints { def isAllowedSpan(begin: Int, end: Int): Boolean = outer.isAllowedSpan(begin, end) || other.isAllowedSpan(begin, end) - def maxSpanLengthStartingAt(begin: Int):Int = outer.maxSpanLengthStartingAt(begin) max other.maxSpanLengthStartingAt(begin) + def maxSpanLengthStartingAt(begin: Int): Int = outer.maxSpanLengthStartingAt(begin) max other.maxSpanLengthStartingAt(begin) } def &(other: SpanConstraints):SpanConstraints = new SpanConstraints { def isAllowedSpan(begin: Int, end: Int): Boolean = outer.isAllowedSpan(begin, end) && other.isAllowedSpan(begin, end) - def maxSpanLengthStartingAt(begin: Int):Int = outer.maxSpanLengthStartingAt(begin) min other.maxSpanLengthStartingAt(begin) + def maxSpanLengthStartingAt(begin: Int): Int = outer.maxSpanLengthStartingAt(begin) min other.maxSpanLengthStartingAt(begin) } } diff --git a/src/main/scala/epic/corpora/CONLLSequenceReader.scala b/src/main/scala/epic/corpora/CONLLSequenceReader.scala index 6e7e7d79..d4e54a58 100644 --- a/src/main/scala/epic/corpora/CONLLSequenceReader.scala +++ b/src/main/scala/epic/corpora/CONLLSequenceReader.scala @@ -21,10 +21,9 @@ object CONLLSequenceReader { val outputs = new ArrayBuffer[String] import scala.util.control.Breaks._ breakable { - while(source.hasNext) { + while (source.hasNext) { val line = source.next() - if(line.trim().isEmpty) break - + if (line.trim().isEmpty) break val split = line.split(splitToken) inputs += split.take(split.length -1).toIndexedSeq outputs += split.last @@ -53,9 +52,9 @@ object CONLLSequenceReader { val inputs = new ArrayBuffer[IndexedSeq[String]]() import scala.util.control.Breaks._ breakable { - while(source.hasNext) { + while (source.hasNext) { val line = source.next() - if(line.trim().isEmpty) break + if (line.trim().isEmpty) break val split = line.split(splitToken) inputs += split diff --git a/src/main/scala/epic/corpora/MascUtil.scala b/src/main/scala/epic/corpora/MascUtil.scala index 868464e3..06378275 100644 --- a/src/main/scala/epic/corpora/MascUtil.scala +++ b/src/main/scala/epic/corpora/MascUtil.scala @@ -10,7 +10,6 @@ import epic.trees.Span import MascTransform._ - /** * Convert native MASC xml into CONLL format for named entity recognition. * @@ -18,13 +17,11 @@ import MascTransform._ */ object MascTransform { - case class MNode(id: String, targets: Seq[String]) case class MAnnotation(id: String, label: String, ref: String, features: Map[String,String]) case class MEdge(id: String, from: String, to: String) case class MRegion(id: String, start: Int, end: Int) extends Ordered[MRegion] { def span = Span(start, end) - def compare(that: MRegion) = this.start - that.start } @@ -32,9 +29,7 @@ object MascTransform { val mascDir = args(0) val outputDir = new File(if (args.length > 1) args(1) else "/tmp") outputDir.mkdirs - val targets = collectTargets(new File(mascDir)) - // Get 3/5 for train, 1/5 for dev, and 1/5 for test val targetsAndIndices = targets.zipWithIndex val trainSet = targetsAndIndices.filter(_._2 % 5 < 3).unzip._1 @@ -57,11 +52,9 @@ object MascTransform { System.err.println("Creating " + outputName) val outputDir = new File(parentDir, outputName) outputDir.mkdirs - val outputSentences = new FileWriter(new File(outputDir,outputName+"-sent.txt")) val outputTokens = new FileWriter(new File(outputDir,outputName+"-tok.txt")) val outputNer = new FileWriter(new File(outputDir,outputName+"-ner.txt")) - for (mfile <- MascFile(targets)) { for (sentence <- mfile.sentences) { val tokenizedSentence = new StringBuffer @@ -102,7 +95,6 @@ case class MascSentence ( bioLabels: Seq[String], orderedRegions: Seq[MRegion] ) { - lazy val numTokens = orderedTokens.length } @@ -112,9 +104,7 @@ class MascFile ( val rawtext: String, val sentences: Seq[MascSentence] ) { - lazy val numSentences = sentences.length - } object MascFile { @@ -152,7 +142,6 @@ object MascFile { val sentenceXml = loadXML(dirFile(prefix+"-s.xml")) val sentenceRegions = getRegions(sentenceXml).sorted - // Basic segment information val segmentXml = loadXML(dirFile(prefix+"-seg.xml")) val segmentRegions = getRegions(segmentXml).map(r => r.id -> r).toMap @@ -202,7 +191,6 @@ object MascFile { } } - // Insert the "missing" sentences. (Content not marked as a sentence, // but containing tokens.) @@ -223,7 +211,7 @@ object MascFile { // Pull out the sequence of token, pos, and NE for each sentence. val allOrderedTokRegions = tokenRegions.values.toIndexedSeq.sorted var index = 0 - val allDataBySentence = paddedSentenceRegions.flatMap { region => { + val allDataBySentence = paddedSentenceRegions.flatMap { region => //val startIndex = math.max(index, region.start) val startIndex = math.max(index, allOrderedTokRegions.indexWhere(t=>t.start>=region.start,index)) //val startIndex = index @@ -236,7 +224,7 @@ object MascFile { index = endIndex orderedTokPosNer(sentence) } - }} + } new MascFile(dir, prefix, rawtext, allDataBySentence) } @@ -260,7 +248,6 @@ object MascUtil { "date" -> "MISC" ).withDefault(x=>"O") - def getRegions(doc: Elem) = (doc \\ "region").toSeq.map { rxml => val Array(start, end) = (rxml \ "@anchors").toString.split(" ") MRegion(xmlId(rxml), start.toInt, end.toInt) diff --git a/src/main/scala/epic/dense/AffineOutputTransform.scala b/src/main/scala/epic/dense/AffineOutputTransform.scala index 167aa4df..39bc1707 100644 --- a/src/main/scala/epic/dense/AffineOutputTransform.scala +++ b/src/main/scala/epic/dense/AffineOutputTransform.scala @@ -16,12 +16,11 @@ import scala.util.Random */ case class AffineOutputTransform[FV](numOutputs: Int, numInputs: Int, innerTransform: Transform[FV, DenseVector[Double]], includeBias: Boolean = true) extends OutputTransform[FV, DenseVector[Double]] { - val index = SegmentedIndex(new AffineTransform.Index(numOutputs, numInputs, includeBias), innerTransform.index) def extractLayerAndPenultimateLayer(weights: DenseVector[Double], forTrain: Boolean) = { val mat = weights(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) - val bias = if(includeBias) { + val bias = if (includeBias) { weights(numOutputs * numInputs until index.componentOffset(1)) } else { DenseVector.zeros[Double](numOutputs) @@ -51,8 +50,7 @@ case class AffineOutputTransform[FV](numOutputs: Int, numInputs: Int, innerTrans override val index = AffineOutputTransform.this.index val weightst = weights.t -// val weightst = weights.t.copy - + //val weightst = weights.t.copy def activations(fv: FV) = { val out = weights * innerLayer.activations(fv) += bias @@ -74,7 +72,7 @@ case class AffineOutputTransform[FV](numOutputs: Int, numInputs: Int, innerTrans def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: FV) = { val scale = _scale val matDeriv = deriv(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) - val biasDeriv = if(includeBias) { + val biasDeriv = if (includeBias) { deriv(numOutputs * numInputs until index.componentOffset(1)) } else { DenseVector.zeros[Double](numOutputs) @@ -86,7 +84,7 @@ case class AffineOutputTransform[FV](numOutputs: Int, numInputs: Int, innerTrans // d/d(weights(::, i)) == scale(i) * innerAct for (i <- 0 until weights.rows) { val a: Double = scale(i) - if(a != 0.0) { + if (a != 0.0) { axpy(a, innerAct, matDeriv.t(::, i)) // so d/dbias(i) = scale(i) biasDeriv(i) += a diff --git a/src/main/scala/epic/dense/AffineTransform.scala b/src/main/scala/epic/dense/AffineTransform.scala index 5e6be860..1abeb952 100644 --- a/src/main/scala/epic/dense/AffineTransform.scala +++ b/src/main/scala/epic/dense/AffineTransform.scala @@ -12,7 +12,6 @@ case class AffineTransform[FV, Mid](numOutputs: Int, numInputs: Int, innerTransf (implicit mult: OpMulMatrix.Impl2[DenseMatrix[Double], Mid, DenseVector[Double]], canaxpy: scaleAdd.InPlaceImpl3[DenseVector[Double], Double, Mid]) extends Transform[FV, DenseVector[Double]] { - val index = SegmentedIndex(new AffineTransform.Index(numOutputs, numInputs, includeBias), innerTransform.index) def extractLayer(weights: DenseVector[Double], forTrain: Boolean) = { @@ -21,7 +20,7 @@ case class AffineTransform[FV, Mid](numOutputs: Int, numInputs: Int, innerTransf def extractLayerAndPenultimateLayer(weights: DenseVector[Double], forTrain: Boolean) = { val mat = weights(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) - val bias = if(includeBias) { + val bias = if (includeBias) { weights(numOutputs * numInputs until index.componentOffset(1)) } else { DenseVector.zeros[Double](numOutputs) @@ -58,8 +57,7 @@ case class AffineTransform[FV, Mid](numOutputs: Int, numInputs: Int, innerTransf override val index = AffineTransform.this.index val weightst = weights.t -// val weightst = weights.t.copy - + // val weightst = weights.t.copy def activations(fv: FV) = { val out = weights * innerLayer.activations(fv) += bias @@ -67,10 +65,10 @@ case class AffineTransform[FV, Mid](numOutputs: Int, numInputs: Int, innerTransf } def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: FV) = { -// println("SCALE: " + _scale) + // println("SCALE: " + _scale) val scale = _scale val matDeriv = deriv(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) - val biasDeriv = if(includeBias) { + val biasDeriv = if (includeBias) { deriv(numOutputs * numInputs until index.componentOffset(1)) } else { DenseVector.zeros[Double](numOutputs) @@ -82,18 +80,18 @@ case class AffineTransform[FV, Mid](numOutputs: Int, numInputs: Int, innerTransf // d/d(weights(::, i)) == scale(i) * innerAct for (i <- 0 until weights.rows) { val a: Double = scale(i) - if(a != 0.0) { + if (a != 0.0) { axpy(a, innerAct, matDeriv.t(::, i)) // so d/dbias(i) = scale(i) biasDeriv(i) += a } } -// biasDeriv += scale + // biasDeriv += scale // scale is f'(mat * inner(v) + bias) // d/dv is mat.t * f'(mat * inner(v) + bias) -// println("Intermediate scale: " + weightst * scale) + // println("Intermediate scale: " + weightst * scale) innerLayer.tallyDerivative(deriv(index.componentOffset(1) to -1), weightst * scale, fv) } @@ -108,8 +106,7 @@ object AffineTransform { canAxpy: scaleAdd.InPlaceImpl3[DenseVector[Double], Double, FV]) = new AffineTransform(numOutputs, numInputs, new IdentityTransform[FV], includeBias) def apply(numOutputs: Int, numInputs: Int, includeBias: Boolean):AffineTransform[DenseVector[Double], DenseVector[Double]] = apply(numOutputs, numInputs, new IdentityTransform[DenseVector[Double]], includeBias) def apply(numOutputs: Int, numInputs: Int):AffineTransform[DenseVector[Double], DenseVector[Double]] = apply(numOutputs, numInputs, true) - - + def getUniformAffineWeights(numWeights: Int, initWeightsScale: Double, rng: Random) = { DenseVector(Array.tabulate(numWeights)(i => rng.nextGaussian * initWeightsScale)) } @@ -160,7 +157,7 @@ object AffineTransform { def iterator: Iterator[Feature] = Iterator.range(0, size) map unapply map (_.get) - override val size: Int = if(includeBias) numOutputs * numInputs + numOutputs else numOutputs * numInputs + override val size: Int = if (includeBias) numOutputs * numInputs + numOutputs else numOutputs * numInputs override def toString() = ScalaRunTime._toString(this) } diff --git a/src/main/scala/epic/dense/BatchNormalizationTransform.scala b/src/main/scala/epic/dense/BatchNormalizationTransform.scala index 7c274300..e51aa283 100644 --- a/src/main/scala/epic/dense/BatchNormalizationTransform.scala +++ b/src/main/scala/epic/dense/BatchNormalizationTransform.scala @@ -87,8 +87,8 @@ case class BatchNormalizationTransform[FV](size: Int, useBias: Boolean, inner: T val mean = allActivations.reduce(_ + _) * (1.0/inputs.size) val variances = allActivations.map(act => (act - mean) :* (act - mean)).reduce(_ + _) * (1.0/inputs.size) val invStdDevs = variances.data.map(variance => 1.0/Math.sqrt(variance + 1e-6)) -// println(mean.data.toSeq) -// println(invStdDevs.toSeq) + // println(mean.data.toSeq) + // println(invStdDevs.toSeq) fcn = new NonlinearTransform.ShiftAndScaleEach(mean.data, invStdDevs) innerLayer.applyBatchNormalization(inputs) } diff --git a/src/main/scala/epic/dense/CachingLookupAndAffineTransformDense.scala b/src/main/scala/epic/dense/CachingLookupAndAffineTransformDense.scala index 2d8aa915..5e1ee59f 100644 --- a/src/main/scala/epic/dense/CachingLookupAndAffineTransformDense.scala +++ b/src/main/scala/epic/dense/CachingLookupAndAffineTransformDense.scala @@ -18,12 +18,11 @@ case class CachingLookupAndAffineTransformDense[FV](numOutputs: Int, word2vecIndexed: Word2VecIndexed[String], includeBias: Boolean = true) extends Transform[Array[Int], DenseVector[Double]] { - val index = new AffineTransform.Index(numOutputs, numInputs, includeBias) def extractLayer(weights: DenseVector[Double], forTrain: Boolean) = { val mat = weights(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) - val bias = if(includeBias) { + val bias = if (includeBias) { weights(numOutputs * numInputs until index.size) } else { DenseVector.zeros[Double](numOutputs) @@ -67,7 +66,7 @@ case class CachingLookupAndAffineTransformDense[FV](numOutputs: Int, def activations(fv: Array[Int]) = { val finalVector = DenseVector.zeros[Double](numOutputs) fv.indices.foreach { i => -// val wordPosn = fv(i) -> i + // val wordPosn = fv(i) -> i if (fv(i) != -1) { caches(i).synchronized { if (!caches(i).contains(fv(i))) { @@ -84,7 +83,7 @@ case class CachingLookupAndAffineTransformDense[FV](numOutputs: Int, def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: Array[Int]) = { val scale = _scale val matDeriv = deriv(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) - val biasDeriv = if(includeBias) { + val biasDeriv = if (includeBias) { deriv(numOutputs * numInputs until index.size) } else { DenseVector.zeros[Double](numOutputs) @@ -97,7 +96,7 @@ case class CachingLookupAndAffineTransformDense[FV](numOutputs: Int, // d/d(weights(::, i)) == scale(i) * innerAct for (i <- 0 until weights.rows) { val a: Double = scale(i) - if(a != 0.0) { + if (a != 0.0) { axpy(a, innerAct, matDeriv.t(::, i)) // so d/dbias(i) = scale(i) biasDeriv(i) += a diff --git a/src/main/scala/epic/dense/EmbeddingsTransform.scala b/src/main/scala/epic/dense/EmbeddingsTransform.scala index 271ee661..0fc82710 100644 --- a/src/main/scala/epic/dense/EmbeddingsTransform.scala +++ b/src/main/scala/epic/dense/EmbeddingsTransform.scala @@ -23,7 +23,7 @@ case class EmbeddingsTransform[FV](numOutputs: Int, def extractLayer(weights: DenseVector[Double], forTrain: Boolean) = { val mat = weights(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) - val bias = if(includeBias) { + val bias = if (includeBias) { weights(numOutputs * numInputs until index.indices(0).size) } else { DenseVector.zeros[Double](numOutputs) @@ -70,7 +70,7 @@ case class EmbeddingsTransform[FV](numOutputs: Int, def activations(fv: Array[Int]) = { val finalVector = DenseVector.zeros[Double](numOutputs) fv.indices.foreach { i => -// val wordPosn = fv(i) -> i + // val wordPosn = fv(i) -> i if (fv(i) != -1) { caches(i).synchronized { if (!caches(i).contains(fv(i))) { @@ -88,22 +88,20 @@ case class EmbeddingsTransform[FV](numOutputs: Int, def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: Array[Int]) = { val scale = _scale val matDeriv = deriv(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) - val biasDeriv = if(includeBias) { + val biasDeriv = if (includeBias) { deriv(numOutputs * numInputs until index.size) } else { DenseVector.zeros[Double](numOutputs) } - // whole function is f(mat * inner(fv) + bias) // scale(i) pushes in (f'(mat * inner(v) + bias))(i) val innerAct = DenseVector(word2vecIndexed.convertToVector(fv)) + Word2VecSurfaceFeaturizerIndexed.makeVectFromParams(fv, wordWeights) - val wordsDeriv = deriv(index.indices(0).size until index.indices(0).size + index.indices(1).size).asDenseMatrix.reshape(word2vecIndexed.vocSize, word2vecIndexed.wordRepSize, view = View.Require) val wordsDerivs = Array.tabulate(fv.length)(wordPosnIdx => wordsDeriv(fv(wordPosnIdx), ::).t) // d/d(weights(::, i)) == scale(i) * innerAct for (i <- 0 until weights.rows) { val a: Double = scale(i) - if(a != 0.0) { + if (a != 0.0) { axpy(a, innerAct, matDeriv.t(::, i)) var wordPosnIdx = 0 while (wordPosnIdx < fv.length) { @@ -119,8 +117,7 @@ case class EmbeddingsTransform[FV](numOutputs: Int, // scale is f'(mat * inner(v) + bias) // d/dv is mat.t * f'(mat * inner(v) + bias) } - - + def applyBatchNormalization(inputs: scala.collection.GenTraversable[Array[Int]]) = {} } } \ No newline at end of file diff --git a/src/main/scala/epic/dense/LowRankQuadraticTransform.scala b/src/main/scala/epic/dense/LowRankQuadraticTransform.scala index 269b8599..e6622e6e 100644 --- a/src/main/scala/epic/dense/LowRankQuadraticTransform.scala +++ b/src/main/scala/epic/dense/LowRankQuadraticTransform.scala @@ -21,10 +21,10 @@ case class LowRankQuadraticTransform[FV](numOutputs: Int, numRanks: Int, numLeft new OutputLayer(subTransforms, innerLayer) -> innerLayer } -// def extractLayer(weights: DenseVector[Double]) = { -// val subTransforms = (0 until neurons.size).map(i => neurons(i).extractLayer(weights(neuronIndex.componentOffset(i) until neuronIndex.componentOffset(i) + neuronIndex.indices(i).size))) -// new Layer(subTransforms, innerTransform.extractLayer(weights(index.componentOffset(1) to -1))) -// } + // def extractLayer(weights: DenseVector[Double]) = { + // val subTransforms = (0 until neurons.size).map(i => neurons(i).extractLayer(weights(neuronIndex.componentOffset(i) until neuronIndex.componentOffset(i) + neuronIndex.indices(i).size))) + // new Layer(subTransforms, innerTransform.extractLayer(weights(index.componentOffset(1) to -1))) + // } def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = { val subVects = DenseVector.vertcat(neurons.map(_.initialWeightVector(initWeightsScale, rng, outputLayer, spec)):_*) diff --git a/src/main/scala/epic/dense/OutputEmbeddingTransform.scala b/src/main/scala/epic/dense/OutputEmbeddingTransform.scala index ea7604d7..a5f7e01d 100644 --- a/src/main/scala/epic/dense/OutputEmbeddingTransform.scala +++ b/src/main/scala/epic/dense/OutputEmbeddingTransform.scala @@ -17,7 +17,6 @@ import scala.util.Random */ case class OutputEmbeddingTransform[FV](numOutputs: Int, outputDim: Int, innerTransform: Transform[FV, DenseVector[Double]], coarsenerForInitialization: Option[Int => Int] = None) extends OutputTransform[FV, DenseVector[Double]] { - val index = SegmentedIndex(new AffineTransform.Index(numOutputs, outputDim, true), innerTransform.index) diff --git a/src/main/scala/epic/dense/OutputTransform.scala b/src/main/scala/epic/dense/OutputTransform.scala index fbee5a5e..e910020a 100644 --- a/src/main/scala/epic/dense/OutputTransform.scala +++ b/src/main/scala/epic/dense/OutputTransform.scala @@ -7,30 +7,20 @@ import scala.util.Random trait OutputTransform[In, +Out] extends Serializable { val index: Index[Feature] - def extractLayer(dv: DenseVector[Double], forTrain: Boolean):OutputLayer = extractLayerAndPenultimateLayer(dv, forTrain)._1 - def extractLayerAndPenultimateLayer(dv: DenseVector[Double], forTrain: Boolean): (OutputLayer, Transform.Layer[In,Out]) - def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String): DenseVector[Double] - def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) - def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] - type OutputLayer <: OutputTransform.OutputLayer[In,Out] } object OutputTransform { trait OutputLayer[In, +Out] extends Transform.Layer[In,Out] { - def index: Index[Feature] - def activations(fv: In):Out - def activationsFromPenultimateDot(innerLayerActivations: DenseVector[Double], sparseIdx: Int): Double - def activationsFromPenultimateDot(innerLayerActivations: DenseVector[Double], sparseFeatures: Array[Int]): Double = { var value = 0.0 for (sparseFeature <- sparseFeatures) { @@ -38,9 +28,7 @@ object OutputTransform { } value } - def tallyDerivative(deriv: DenseVector[Double], scale: =>Vector[Double], fv: In) - def applyBatchNormalization(inputs: scala.collection.GenTraversable[In]) } diff --git a/src/main/scala/epic/dense/Transform.scala b/src/main/scala/epic/dense/Transform.scala index 72bc1af8..885ccc62 100644 --- a/src/main/scala/epic/dense/Transform.scala +++ b/src/main/scala/epic/dense/Transform.scala @@ -12,29 +12,19 @@ import scala.util.Random */ trait Transform[In, +Out] extends Serializable { val index: Index[Feature] - - def extractLayer(dv: DenseVector[Double], forTrain: Boolean):Layer - def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String): DenseVector[Double] - def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) - def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] - type Layer <: Transform.Layer[In,Out] } object Transform { trait Layer[In, +Out] { - def index: Index[Feature] - def activations(fv: In):Out - def tallyDerivative(deriv: DenseVector[Double], scale: =>Vector[Double], fv: In) - def applyBatchNormalization(inputs: scala.collection.GenTraversable[In]) } diff --git a/src/main/scala/epic/features/BilexicalFeaturizer.scala b/src/main/scala/epic/features/BilexicalFeaturizer.scala index a51b722f..95225a44 100644 --- a/src/main/scala/epic/features/BilexicalFeaturizer.scala +++ b/src/main/scala/epic/features/BilexicalFeaturizer.scala @@ -63,12 +63,11 @@ object BilexicalFeaturizer { } } - case class AdaptedSurfaceFeaturizer[W](base: SurfaceFeaturizer[W]) extends BilexicalFeaturizer[W] { def anchor(w: IndexedSeq[W]): BilexicalFeatureAnchoring[W] = new BilexicalFeatureAnchoring[W] { val ba = base.anchor(w) def featuresForAttachment(head: Int, dep: Int): Array[Feature] = { - if(head < dep) ba.featuresForSpan(head, dep) + if (head < dep) ba.featuresForSpan(head, dep) else ba.featuresForSpan(dep, head) } } @@ -91,7 +90,7 @@ object BilexicalFeaturizer { case class BinomialFeaturizer[W](headBase: BilexicalFeaturizer[W], depBase: BilexicalFeaturizer[W]) extends BilexicalFeaturizer[W] { def anchor(w: IndexedSeq[W]): BilexicalFeatureAnchoring[W] = new BilexicalFeatureAnchoring[W] { val hb = headBase.anchor(w) - val db = if(headBase eq depBase) hb else depBase.anchor(w) + val db = if (headBase eq depBase) hb else depBase.anchor(w) def featuresForAttachment(head: Int, dep: Int): Array[Feature] = { val hf = hb.featuresForAttachment(head, dep) val df = db.featuresForAttachment(head, dep) @@ -104,7 +103,7 @@ object BilexicalFeaturizer { case class HeadDepFeaturizer[W](headBase: WordFeaturizer[W], depBase: WordFeaturizer[W]) extends BilexicalFeaturizer[W] { def anchor(w: IndexedSeq[W]): BilexicalFeatureAnchoring[W] = new BilexicalFeatureAnchoring[W] { val hb = headBase.anchor(w) - val db = if(headBase eq depBase) hb else depBase.anchor(w) + val db = if (headBase eq depBase) hb else depBase.anchor(w) def featuresForAttachment(head: Int, dep: Int): Array[Feature] = { Arrays.crossProduct(hb.featuresForWord(head), db.featuresForWord(dep))((a, b) => HeadDepFeature(a,b):Feature) } @@ -133,7 +132,6 @@ trait BilexicalFeatureAnchoring[W] { def featuresForAttachment(head: Int, dep: Int):Array[Feature] } - @SerialVersionUID(1L) class ProductIndexedBilexicalFeaturizer[W](headFeaturizer: IndexedWordFeaturizer[W], depFeaturizer: IndexedWordFeaturizer[W], @@ -153,7 +151,6 @@ class ProductIndexedBilexicalFeaturizer[W](headFeaturizer: IndexedWordFeaturizer ret = f1 cache(head)(dep) = f1 } - ret } } @@ -208,8 +205,8 @@ object IndexedBilexicalFeaturizer { for( (head, dep) <- tree.arcs if head < tree.words.length) { builder.add(hanch.featuresForWord(head), danch.featuresForWord(dep)) - // builder.add(danch.featuresForWord(head), - // hanch.featuresForWord(dep)) + // builder.add(danch.featuresForWord(head), + // hanch.featuresForWord(dep)) } } diff --git a/src/main/scala/epic/features/BrownClusters.scala b/src/main/scala/epic/features/BrownClusters.scala index c39796a2..2c383559 100644 --- a/src/main/scala/epic/features/BrownClusters.scala +++ b/src/main/scala/epic/features/BrownClusters.scala @@ -22,17 +22,14 @@ object BrownClusters { } yield { word -> cluster.intern } - val map = pairs.toMap in.close() - map } lazy val clusterIds = theClusters.values.toSet - def clusterFor(w: String, default:String = "00"):String = theClusters.getOrElse(w, default) - + def clusterFor(w: String, default:String = "00"): String = theClusters.getOrElse(w, default) trait DSL { // Tkachenko and Simanovsky liked these values @@ -41,14 +38,13 @@ object BrownClusters { } } - case class BrownClusterFeature(f: String) extends Feature case class BrownClusterFeaturizer(lengths: Array[Int]) extends WordFeaturizer[String] with Serializable { def anchor(w: IndexedSeq[String]): WordFeatureAnchoring[String] = new WordFeatureAnchoring[String] { def featuresForWord(pos: Int): Array[Feature] = { - if(pos < 0 || pos >= words.length) + if (pos < 0 || pos >= words.length) Array(BoundaryFeature) else features(pos) @@ -68,7 +64,7 @@ case class BrownClusterFeaturizer(lengths: Array[Int]) extends WordFeaturizer[St private val clusterFeatures = { BrownClusters.clusterIds .iterator - .map(k => k -> lengths.map(l => if(l > k.length) BrownClusterFeature(k) else BrownClusterFeature(k.substring(0, l))).toSet[Feature].toArray[Feature]) + .map(k => k -> lengths.map(l => if (l > k.length) BrownClusterFeature(k) else BrownClusterFeature(k.substring(0, l))).toSet[Feature].toArray[Feature]) .toMap } } diff --git a/src/main/scala/epic/features/ContextFeaturizer.scala b/src/main/scala/epic/features/ContextFeaturizer.scala index b67600e9..1302ead1 100644 --- a/src/main/scala/epic/features/ContextFeaturizer.scala +++ b/src/main/scala/epic/features/ContextFeaturizer.scala @@ -22,7 +22,7 @@ case class ContextFeaturizer[W](featurizer: WordFeaturizer[W], window: Int) exte } def featuresForWord(pos: Int): Array[Feature] = { - if(pos < 0 || pos >= words.length) + if (pos < 0 || pos >= words.length) Array.empty else settedFeatures(pos) diff --git a/src/main/scala/epic/features/ContextWordFeaturizer.scala b/src/main/scala/epic/features/ContextWordFeaturizer.scala index 6737fc71..8c716f92 100644 --- a/src/main/scala/epic/features/ContextWordFeaturizer.scala +++ b/src/main/scala/epic/features/ContextWordFeaturizer.scala @@ -4,7 +4,6 @@ import epic.framework.Feature import scala.collection.mutable.ArrayBuffer import epic.util.Arrays - /** * * @author dlwh @@ -19,7 +18,6 @@ class ContextWordFeaturizer[W](offsetFeaturizer: WordFeaturizer[W], wordOffsetOr for(off <- -wordOffsetOrder to wordOffsetOrder if off != 0) { result ++= offsetAnchoring.featuresForWord(pos + off).map(f => OffsetFeature(off, f):Feature) } - /* val myFeats = offsetAnchoring.featuresForWord(pos) result ++= Arrays.crossProduct(Array(myFeats.head), offsetAnchoring.featuresForWord(pos+1)){BigramFeature(0, _, _)} @@ -30,7 +28,6 @@ class ContextWordFeaturizer[W](offsetFeaturizer: WordFeaturizer[W], wordOffsetOr def words: IndexedSeq[W] = w - } } diff --git a/src/main/scala/epic/features/CrossProductIndex.scala b/src/main/scala/epic/features/CrossProductIndex.scala index 510ec2f9..9d659f89 100644 --- a/src/main/scala/epic/features/CrossProductIndex.scala +++ b/src/main/scala/epic/features/CrossProductIndex.scala @@ -12,7 +12,7 @@ import scala.util.hashing.MurmurHash3 @SerialVersionUID(1743448091752596096L) case class CrossProductFeature[A, B](labelPart: A, surfacePart: B, id: String = "") extends Feature { - override def toString = s"${if(id.nonEmpty) id else "CrossProduct"}Feature($labelPart, $surfacePart)" + override def toString = s"${if (id.nonEmpty) id else "CrossProduct"}Feature($labelPart, $surfacePart)" } /** @@ -32,7 +32,6 @@ class CrossProductIndex[A, B] private (val firstIndex: Index[A], def surfacePart(i: Int) = surfacePartOfFeature(i - labelOnlySize) def labelPart(i: Int) = labelPartOfFeature(i - labelOnlySize) - def lock = { val lockedFirst: Index[A] = firstIndex match { case x: HashExtendingIndex[A] => x.lock @@ -58,39 +57,36 @@ class CrossProductIndex[A, B] private (val firstIndex: Index[A], case _ => -1 } - def mapped(labelFeature: Int, surfaceFeature: Int):Int = { - if(labelFeature < 0 || surfaceFeature < 0) { + def mapped(labelFeature: Int, surfaceFeature: Int): Int = { + if (labelFeature < 0 || surfaceFeature < 0) { -1 } else { val arr = mapping(labelFeature) - val f = if(arr ne null) { + val f = if (arr ne null) { arr(surfaceFeature) } else { -1 } - - if(f != -1 || numHashFeatures == 0) { + if (f != -1 || numHashFeatures == 0) { f } else if (f < -1) { // really not present -1 } else { val hf = MurmurHash3.mixLast(MurmurHash3.mix(10891, labelFeature.##), surfaceFeature.##).abs - if(!seenSet.addOrSeen(hf)) { + if (!seenSet.addOrSeen(hf)) { -1 } else { (hf % numHashFeatures) + trueSize } } } - } - - private val labelOnlySize: Int = if(includePlainLabelFeatures) firstIndex.size else 0 + private val labelOnlySize: Int = if (includePlainLabelFeatures) firstIndex.size else 0 private val trueSize = labelOnlySize + labelPartOfFeature.length override def size: Int = trueSize + numHashFeatures - def unapply(i: Int): Option[Feature] = if(i >= size || i < 0) None else Some(get(i)) + def unapply(i: Int): Option[Feature] = if (i >= size || i < 0) None else Some(get(i)) override def get(i: Int): Feature = { if (i >= size || i < 0) { @@ -110,15 +106,15 @@ class CrossProductIndex[A, B] private (val firstIndex: Index[A], def crossProduct(lFeatures: Array[Int], sFeatures: Array[Int], offset: Int = 0, usePlainLabelFeatures: Boolean = true):Array[Int] = { val builder = new mutable.ArrayBuilder.ofInt - builder.sizeHint(lFeatures.length * (sFeatures.length + {if(includePlainLabelFeatures) 1 else 0})) + builder.sizeHint(lFeatures.length * (sFeatures.length + {if (includePlainLabelFeatures) 1 else 0})) var i = 0 - while(i < lFeatures.length) { - if(usePlainLabelFeatures && includePlainLabelFeatures && lFeatures(i) >= 0) + while (i < lFeatures.length) { + if (usePlainLabelFeatures && includePlainLabelFeatures && lFeatures(i) >= 0) builder += (lFeatures(i) + offset) var j = 0 - while(j < sFeatures.length) { + while (j < sFeatures.length) { val m = mapped(lFeatures(i),sFeatures(j)) + offset - if(m != -1) + if (m != -1) builder += m j += 1 } @@ -133,32 +129,31 @@ class CrossProductIndex[A, B] private (val firstIndex: Index[A], val builder = new CSCMatrix.Builder[Double](firstIndex.size, secondIndex.size) val vbuilder = new VectorBuilder[Double](firstIndex.size) - if(includePlainLabelFeatures) { + if (includePlainLabelFeatures) { for(i <- 0 until firstIndex.size) { val w = weights(i) - if(w != 0.0) + if (w != 0.0) vbuilder.add(i, w) } } - if(numHashFeatures == 0) { + if (numHashFeatures == 0) { // if no hash features, we can just iterate over the enumerated part of the index for(((l, s), i) <- (labelPartOfFeature zip surfacePartOfFeature).zipWithIndex) { val w = weights(i + labelOnlySize) - if(w != 0.0) + if (w != 0.0) builder.add(l, s, w) } } else { // otherwise, check everything for(l <- 0 until firstIndex.size; s <- 0 until secondIndex.size) { val i = mapped(l, s) - if(i >= 0 && weights(i) != 0) { + if (i >= 0 && weights(i) != 0) { builder.add(l, s, weights(i)) } } } - (builder.result(), vbuilder.toSparseVector(true, true)) } @@ -175,7 +170,7 @@ class CrossProductIndex[A, B] private (val firstIndex: Index[A], */ def prune(shouldPrune: Int=>Boolean, rebuildSurfaceIndex: Boolean = true):CrossProductIndex[A, B] = { val newSecondIndex = Index[B]() - def newIndexOf(b: Int) = if(rebuildSurfaceIndex) newSecondIndex.index(secondIndex.get(b)) else b + def newIndexOf(b: Int) = if (rebuildSurfaceIndex) newSecondIndex.index(secondIndex.get(b)) else b def alreadyInNewIndex(b: Int) = !rebuildSurfaceIndex || newSecondIndex.contains(secondIndex.get(b)) val mapping = Array.fill(firstIndex.size)(new OpenAddressHashArray[Int](secondIndex.size max 1, -1, 4)) val newLabelPart, newSurfacePart = new ArrayBuffer[Int]() @@ -211,7 +206,7 @@ class CrossProductIndex[A, B] private (val firstIndex: Index[A], } new CrossProductIndex(firstIndex, - if(rebuildSurfaceIndex) newSecondIndex else secondIndex, + if (rebuildSurfaceIndex) newSecondIndex else secondIndex, mapping, newLabelPart.toArray, newSurfacePart.toArray, id, includePlainLabelFeatures, @@ -240,12 +235,12 @@ object CrossProductIndex { val includeLabelOnlyFeatures: Boolean = true, minCount: Int = 1, seenSet: LockableSeenSet[Long] = LockableSeenSet.always) extends SafeLogging { - def add(a: A, b: B):Int = add(firstIndex(a), secondIndex(b)) + def add(a: A, b: B): Int = add(firstIndex(a), secondIndex(b)) private val counts = Array.fill(firstIndex.size)(new OpenAddressHashArray[Int](secondIndex.size max 1, 0, 4)) private val mapping = Array.fill(firstIndex.size)(new OpenAddressHashArray[Int](secondIndex.size max 1, -1, 4)) private val labelPart, surfacePart = new ArrayBuffer[Int]() - private val labelOnlySize: Int = if(includeLabelOnlyFeatures) firstIndex.size else 0 + private val labelOnlySize: Int = if (includeLabelOnlyFeatures) firstIndex.size else 0 def size = labelPart.size + labelOnlySize @@ -257,14 +252,14 @@ object CrossProductIndex { secondArray.map(add(first, _)) } - def add(first: Int, second: Int):Int = { - if(first < 0 || second < 0) { + def add(first: Int, second: Int): Int = { + if (first < 0 || second < 0) { -1 } else { val currentIndex: Int = mapping(first)(second) - if(currentIndex == -1) { + if (currentIndex == -1) { val currentCount = counts(first)(second) - if(minCount <= 1 || currentCount + 1 >= minCount) { + if (minCount <= 1 || currentCount + 1 >= minCount) { val x = size mapping(first)(second) = x labelPart += first diff --git a/src/main/scala/epic/features/DistanceBinner.scala b/src/main/scala/epic/features/DistanceBinner.scala index b4cfb0ef..4a29f4e7 100644 --- a/src/main/scala/epic/features/DistanceBinner.scala +++ b/src/main/scala/epic/features/DistanceBinner.scala @@ -27,13 +27,13 @@ class DistanceBinner private (val binThresholds: Array[Int], preserveDirection: else bin + 1 } - def distanceBin(a: Int, b: Int):Int = { + def distanceBin(a: Int, b: Int): Int = { val dist: Int = b - a distanceBin(dist) } def distanceBin(dist: Int): Int = { - val array = if(dist < 0) negativeBins else bins + val array = if (dist < 0) negativeBins else bins val adist = math.min(math.abs(dist), array.length - 1) array(adist) } @@ -44,12 +44,12 @@ class DistanceBinner private (val binThresholds: Array[Int], preserveDirection: def binnedDistance(dist: Int): Int = { val bin = distanceBin(dist) - if(dist == 0) 0 - else if(bin < 0) { - if(-bin-1 >= binThresholds.length) + if (dist == 0) 0 + else if (bin < 0) { + if (-bin-1 >= binThresholds.length) -(binThresholds.last + 1) else -binThresholds(-bin-1) - } else if(bin >= binThresholds.length) { + } else if (bin >= binThresholds.length) { binThresholds.last + 1 } else binThresholds(bin-1) } @@ -58,10 +58,9 @@ class DistanceBinner private (val binThresholds: Array[Int], preserveDirection: } - object DistanceBinner { def mkBinArray(numBins: Int, numExactBins: Int): Array[Int] = { - if(numBins <= 1) Array(1) + if (numBins <= 1) Array(1) else { val exact = Array.range(1, numExactBins+1) exact ++ Array.iterate(exact.last, (numBins - numExactBins) max 1)(exact => exact * 2).drop(1) diff --git a/src/main/scala/epic/features/EnglishWordClassGenerator.scala b/src/main/scala/epic/features/EnglishWordClassGenerator.scala index 3a1a100d..c48400ca 100644 --- a/src/main/scala/epic/features/EnglishWordClassGenerator.scala +++ b/src/main/scala/epic/features/EnglishWordClassGenerator.scala @@ -1,6 +1,5 @@ package epic.features - /** * Converts a string into another string with properties of that string * Useful for rare or 0 count words @@ -30,7 +29,6 @@ object EnglishWordClassGenerator extends (String=>String) with Serializable { } else if (hasLower) { sb.append("-LC") } - if (hasDigit) { sb.append("-NUM") } diff --git a/src/main/scala/epic/features/HackyHeadFinder.scala b/src/main/scala/epic/features/HackyHeadFinder.scala index 99ed93a0..5e6f8e99 100644 --- a/src/main/scala/epic/features/HackyHeadFinder.scala +++ b/src/main/scala/epic/features/HackyHeadFinder.scala @@ -2,7 +2,6 @@ package epic.features import scala.collection.mutable.HashMap - /** * HackyHeadFinders find "heads" in a span using only preterminal labels. * It doesn't use the syntactic structure of the sentence. @@ -52,7 +51,7 @@ object RuleBasedHackyHeadFinder { headRules.put("PRN", (preterminals) => if (preterminals.size > 1) 1 else 0) headRules.put("S", (preterminals) => searchFindFirst(preterminals, L2R, Set("TO", "VBD", "VBN", "MD", "VBZ", "VB", "VBG", "VBP"))) headRules.put("VP", (preterminals) => searchFindFirst(preterminals, L2R, Set("TO", "VBD", "VBN", "MD", "VBZ", "VB", "VBG", "VBP"))) -// headRules.put("SBAR", (preterminals) => searchFindFirst(preterminals, L2R, Set("WP", "WP$", "WDT", "WRB", "IN", "PRP", "PRP$"))); + // headRules.put("SBAR", (preterminals) => searchFindFirst(preterminals, L2R, Set("WP", "WP$", "WDT", "WRB", "IN", "PRP", "PRP$"))); def searchFindFirst(preterminals: Seq[String], leftToRight: Boolean, goodOnes: Set[String]): Int = { val start = if (leftToRight) 0 else preterminals.size - 1 @@ -88,7 +87,7 @@ object RuleBasedHackyHeadFinder { } } if (headIdx == -1) { -// headIdx = if (leftToRight) preterminals.size - 1 else 0; + // headIdx = if (leftToRight) preterminals.size - 1 else 0; headIdx = if (leftToRight) Math.max(0, i - 1) else Math.min(i+1, preterminals.size) } if (headIdx < 0 || headIdx >= preterminals.size) { diff --git a/src/main/scala/epic/features/HackyHeadFinderTest.scala b/src/main/scala/epic/features/HackyHeadFinderTest.scala index 09f0c01f..2d1d967e 100644 --- a/src/main/scala/epic/features/HackyHeadFinderTest.scala +++ b/src/main/scala/epic/features/HackyHeadFinderTest.scala @@ -13,7 +13,7 @@ import scala.collection.mutable.HashMap object HackyHeadFinderTest { def main(args: Array[String]) { -// val treebank = new SimpleTreebank(new File(ptbPath), new File(ptbPath), new File(ptbPath)) + // val treebank = new SimpleTreebank(new File(ptbPath), new File(ptbPath), new File(ptbPath)) val treebank = Treebank.fromPennTreebankDir(new File("data/wsj")) val process = PartialTreeProcessor() @@ -46,8 +46,7 @@ object HackyHeadFinderTest { wordToTagMap.put(word, bestTag) } println("Done training lexicon") - - + val hf = HeadFinder.collins val hackyHeadFinder = new RuleBasedHackyHeadFinder @@ -80,10 +79,9 @@ object HackyHeadFinderTest { val tree = devTreesWords(i)._1 val words = devTreesWords(i)._2 rec(tree, words) - -// println(tree.render(devTreesWords(i)._2, false)) -// println(processedTrees(i).render(treesWords(i)._2, false)) -// println(processedTreesWithIndices(i).render(treesWords(i)._2, false)) + // println(tree.render(devTreesWords(i)._2, false)) + // println(processedTrees(i).render(treesWords(i)._2, false)) + // println(processedTreesWithIndices(i).render(treesWords(i)._2, false)) } var totalAcc = 0 var totalCount = 0 diff --git a/src/main/scala/epic/features/HackyLexicalProductionFeaturizer.scala b/src/main/scala/epic/features/HackyLexicalProductionFeaturizer.scala index c4028f42..10404238 100644 --- a/src/main/scala/epic/features/HackyLexicalProductionFeaturizer.scala +++ b/src/main/scala/epic/features/HackyLexicalProductionFeaturizer.scala @@ -86,13 +86,11 @@ class HackyLexicalProductionFeaturizer(wordTagCounts: Counter2[String, String, D RightTagDistanceRuleFeature(rule, rcHeadTag, distance), RightHeadDistanceRuleFeature(rule, if (wordCounts(rcHeadWord) >= wordThreshold) rcHeadWord else HackyLexicalProductionFeaturizer.RareToken, distance))) } - - + def featuresForUnaryRule(begin: Int, end: Int, rule: Int, ref: Int):Array[Feature] = emptyArray def featuresForSpan(begin: Int, end: Int, tag: Int, ref: Int):Array[Feature] = emptyArray } - } case class LeftTagDistanceRuleFeature(rule: Int, ltag: String, distance: Int) extends Feature @@ -101,7 +99,6 @@ case class RightTagDistanceRuleFeature(rule: Int, rtag: String, distance: Int) e case class RightHeadDistanceRuleFeature(rule: Int, rsuff: String, distance: Int) extends Feature case class HeadPairDistanceRuleFeature(rule: Int, lsuff: String, rsuff: String, distance: Int) extends Feature - object HackyLexicalProductionFeaturizer { val UnkTag = "NN" val RareToken = "" diff --git a/src/main/scala/epic/features/HackyLexicalSplitFeaturizer.scala b/src/main/scala/epic/features/HackyLexicalSplitFeaturizer.scala index 92989f74..0bc0e980 100644 --- a/src/main/scala/epic/features/HackyLexicalSplitFeaturizer.scala +++ b/src/main/scala/epic/features/HackyLexicalSplitFeaturizer.scala @@ -9,7 +9,7 @@ class HackyLexicalSplitFeaturizer[W]() extends SplitSpanFeaturizer[W] { private val theSplitNeedingAnchoring = new SplitSpanFeatureAnchoring[W] with Serializable { def featuresForSplit(begin: Int, split: Int, end: Int): Array[Feature] = { emptyArray -// Array(DistanceFeature(db.binnedDistance((end-split) - (split-begin)), label)) + // Array(DistanceFeature(db.binnedDistance((end-split) - (split-begin)), label)) } def featuresForSpan(begin: Int, end: Int): Array[Feature] = emptyArray diff --git a/src/main/scala/epic/features/HashExtendingIndex.scala b/src/main/scala/epic/features/HashExtendingIndex.scala index 4231b260..89ec2788 100644 --- a/src/main/scala/epic/features/HashExtendingIndex.scala +++ b/src/main/scala/epic/features/HashExtendingIndex.scala @@ -23,7 +23,7 @@ class HashExtendingIndex[T](val baseIndex: Index[T], def apply(t: T): Int = baseIndex(t) match { case -1 => val code = t.##.abs - if(!cache.addOrSeen(code)) + if (!cache.addOrSeen(code)) -1 else t.##.abs % numHashFeatures + baseIndex.size @@ -31,8 +31,8 @@ class HashExtendingIndex[T](val baseIndex: Index[T], } def unapply(i: Int): Option[T] = { - if(i < baseIndex.size) baseIndex.unapply(i) - else if(i < size) Some(hashWrapper(i - baseIndex.size)) + if (i < baseIndex.size) baseIndex.unapply(i) + else if (i < size) Some(hashWrapper(i - baseIndex.size)) else None } diff --git a/src/main/scala/epic/features/HashFeature.scala b/src/main/scala/epic/features/HashFeature.scala index 0bef6c78..cc159763 100644 --- a/src/main/scala/epic/features/HashFeature.scala +++ b/src/main/scala/epic/features/HashFeature.scala @@ -10,7 +10,7 @@ case class HashFeature(hashBucket: Int) extends Feature object HashFeature { sealed trait Scale { - def numFeatures(nonHashFeatures: Int):Int + def numFeatures(nonHashFeatures: Int): Int } case class Absolute(numHashFeatures: Int) extends Scale { diff --git a/src/main/scala/epic/features/IdentityWordFeaturizer.scala b/src/main/scala/epic/features/IdentityWordFeaturizer.scala index 53da3916..0d964023 100644 --- a/src/main/scala/epic/features/IdentityWordFeaturizer.scala +++ b/src/main/scala/epic/features/IdentityWordFeaturizer.scala @@ -19,7 +19,7 @@ class IdentityWordFeaturizer[W](wordCounts: Counter[W, Double], unknownWordThres def words = w def featuresForWord(pos: Int): Array[Feature] = { - if(pos < 0 || pos >= words.length) { + if (pos < 0 || pos >= words.length) { boundaryFeatures } else { _minimalFeatures(pos) @@ -28,7 +28,7 @@ class IdentityWordFeaturizer[W](wordCounts: Counter[W, Double], unknownWordThres private val _minimalFeatures: immutable.IndexedSeq[Array[Feature]] = words.indices.map { i => val index = indices(i) - if(index >= 0) { + if (index >= 0) { IdentityWordFeaturizer.this.minimalFeatures(index) } else { Array[Feature](Unk) @@ -42,14 +42,13 @@ class IdentityWordFeaturizer[W](wordCounts: Counter[W, Double], unknownWordThres private val wordIndex = Index(wordCounts.keySet) private val Unk = WordFeature("#UNK#", 'LowCount) private val boundaryFeatures = Array[Feature](BoundaryFeature) - - private val wordFeatures = Encoder.fromIndex(wordIndex).tabulateArray(s => if(wordCounts(s) > unknownWordThreshold) IndicatorFeature(s) else Unk) + private val wordFeatures = Encoder.fromIndex(wordIndex).tabulateArray(s => if (wordCounts(s) > unknownWordThreshold) IndicatorFeature(s) else Unk) // caches private val minimalFeatures = Array.tabulate[Array[Feature]](wordIndex.size){ i => val wc = wordCounts(wordIndex.get(i)) val w = wordFeatures(i) - if(wc > unknownWordThreshold) { + if (wc > unknownWordThreshold) { Array(w) } else { Array(Unk) diff --git a/src/main/scala/epic/features/IndexedSurfaceFeaturizer.scala b/src/main/scala/epic/features/IndexedSurfaceFeaturizer.scala index 2ef9be01..66171402 100644 --- a/src/main/scala/epic/features/IndexedSurfaceFeaturizer.scala +++ b/src/main/scala/epic/features/IndexedSurfaceFeaturizer.scala @@ -29,7 +29,7 @@ object IndexedSurfaceFeaturizer { constraintFactory: SpanConstraints.Factory[W], deduplicateFeatures: Boolean = false) : IndexedSurfaceFeaturizer[W] = { - val index = if(deduplicateFeatures) new NonRedundantIndexBuilder[Feature] else new NormalIndexBuilder[Feature]() + val index = if (deduplicateFeatures) new NonRedundantIndexBuilder[Feature] else new NormalIndexBuilder[Feature]() for(words <- data) { val cons = constraintFactory.get(words) @@ -41,16 +41,13 @@ object IndexedSurfaceFeaturizer { } } - new MySurfaceFeaturizer[W](feat, constraintFactory, index.result()) } @SerialVersionUID(1L) class CachedFeaturizer[W](val base: IndexedSurfaceFeaturizer[W], cache: collection.mutable.Map[IndexedSeq[W], IndexedSurfaceAnchoring[W]]) extends IndexedSurfaceFeaturizer[W] with Serializable { def featurizer: SurfaceFeaturizer[W] = base.featurizer - def featureIndex: Index[Feature] = base.featureIndex - def anchor(datum: IndexedSeq[W]): IndexedSurfaceAnchoring[W] = cache.getOrElseUpdate(datum, base.anchor(datum)) } @@ -62,7 +59,7 @@ object IndexedSurfaceFeaturizer { val cons = constraintsFactory.constraints(words) val anch = featurizer.anchor(words) val spanFeatures = TriangularArray.tabulate(words.length+1){ (i, j) => - if(cons(i,j) && i < j) { + if (cons(i,j) && i < j) { stripEncode(featureIndex, anch.featuresForSpan(i, j)) } else { null @@ -78,9 +75,9 @@ object IndexedSurfaceFeaturizer { val result = mutable.ArrayBuilder.make[Int]() result.sizeHint(features) var i = 0 - while(i < features.length) { + while (i < features.length) { val fi = ind(features(i)) - if(fi >= 0) + if (fi >= 0) result += fi i += 1 } diff --git a/src/main/scala/epic/features/IndexedWordFeaturizer.scala b/src/main/scala/epic/features/IndexedWordFeaturizer.scala index 73eeb531..08800475 100644 --- a/src/main/scala/epic/features/IndexedWordFeaturizer.scala +++ b/src/main/scala/epic/features/IndexedWordFeaturizer.scala @@ -20,7 +20,7 @@ object IndexedWordFeaturizer { data: IndexedSeq[IndexedSeq[W]], wordHashFeatures: Int = 0, deduplicateFeatures: Boolean = true): IndexedWordFeaturizer[W] = { - val wordIndex = if(deduplicateFeatures) new NonRedundantIndexBuilder[Feature] else new NormalIndexBuilder[Feature]() + val wordIndex = if (deduplicateFeatures) new NonRedundantIndexBuilder[Feature] else new NormalIndexBuilder[Feature]() for(words <- data) { val anch = feat.anchor(words) words.indices.foreach { i => @@ -28,7 +28,6 @@ object IndexedWordFeaturizer { } } - new MyWordFeaturizer[W](feat, wordIndex.result()) } @@ -47,9 +46,9 @@ object IndexedWordFeaturizer { val result = mutable.ArrayBuilder.make[Int]() result.sizeHint(features) var i = 0 - while(i < features.length) { + while (i < features.length) { val fi = ind(features(i)) - if(fi >= 0) + if (fi >= 0) result += fi i += 1 } @@ -59,7 +58,6 @@ object IndexedWordFeaturizer { } } - @SerialVersionUID(1L) class TabulatedIndexedWordAnchoring[W](val words: IndexedSeq[W], spanFeatures: Array[Array[Int]]) extends IndexedWordAnchoring[W] with Serializable { diff --git a/src/main/scala/epic/features/LongestFrequentSuffixFeaturizer.scala b/src/main/scala/epic/features/LongestFrequentSuffixFeaturizer.scala index 8f09405f..82918161 100644 --- a/src/main/scala/epic/features/LongestFrequentSuffixFeaturizer.scala +++ b/src/main/scala/epic/features/LongestFrequentSuffixFeaturizer.scala @@ -15,13 +15,11 @@ class LongestFrequentSuffixFeaturizer private (fixedMap: Map[String, Feature], def anchor(w: IndexedSeq[String]): WordFeatureAnchoring[String] = new WordFeatureAnchoring[String] { val feats = words.map(w => Array(fixedMap.getOrElse(w, LongestFrequentSuffix(lookup(w))))) - def featuresForWord(pos: Int): Array[Feature] = if(pos < 0 || pos >= w.length) Array(BeginSentFeature) else feats(pos) + def featuresForWord(pos: Int): Array[Feature] = if (pos < 0 || pos >= w.length) Array(BeginSentFeature) else feats(pos) def words: IndexedSeq[String] = w } - - def lookupSentence(sent: IndexedSeq[String]) = { sent.map(w => fixedMap.getOrElse(w, LongestFrequentSuffix(lookup(w))) match { case LongestFrequentSuffix(s) => "-" + s @@ -29,7 +27,7 @@ class LongestFrequentSuffixFeaturizer private (fixedMap: Map[String, Feature], }) } - private def lookup(x: String):String = { + private def lookup(x: String): String = { x.tails.find(suffixCounts(_) >= commonWordThreshold).getOrElse("-UNK-") } } @@ -43,12 +41,12 @@ object LongestFrequentSuffixFeaturizer { suffixCounts = suffixCounts.mapValues(v => v * I(v >= commonWordThreshold)) - def lookup(x: String):String = { + def lookup(x: String): String = { x.tails.find(suffixCounts(_) >= commonWordThreshold).getOrElse("-UNK-") } val map = Map.empty ++ (for( (w,v) <- counts.iterator) yield { - if(v > commonWordThreshold) + if (v > commonWordThreshold) w -> IndicatorFeature(w) else w -> LongestFrequentSuffix(lookup(w)) diff --git a/src/main/scala/epic/features/MinimalWordFeaturizer.scala b/src/main/scala/epic/features/MinimalWordFeaturizer.scala index 0fe2eabd..1f0ed3a3 100644 --- a/src/main/scala/epic/features/MinimalWordFeaturizer.scala +++ b/src/main/scala/epic/features/MinimalWordFeaturizer.scala @@ -21,7 +21,7 @@ class MinimalWordFeaturizer(wordCounts: Counter[String, Double], includeWordShap def words = w def featuresForWord(pos: Int): Array[Feature] = { - if(pos < 0 || pos >= words.length) { + if (pos < 0 || pos >= words.length) { boundaryFeatures } else { _minimalFeatures(pos) @@ -30,13 +30,13 @@ class MinimalWordFeaturizer(wordCounts: Counter[String, Double], includeWordShap private val _minimalFeatures: immutable.IndexedSeq[Array[Feature]] = words.indices.map { i => val index = indices(i) - if(index >= 0) { + if (index >= 0) { MinimalWordFeaturizer.this.minimalFeatures(index) } else { val ww = words(i) val classe = interner(WordFeature(EnglishWordClassGenerator(ww), 'Class)) val shape = interner(WordFeature(WordShapeGenerator(ww), 'Shape)) - if(includeWordShapeFeatures) { + if (includeWordShapeFeatures) { Array(shape, classe, Unk) } else{ Array(classe, Unk) @@ -54,21 +54,21 @@ class MinimalWordFeaturizer(wordCounts: Counter[String, Double], includeWordShap private val Unk = WordFeature("#UNK#", 'LowCount) private val boundaryFeatures = Array[Feature](BoundaryFeature) - private val wordFeatures = Encoder.fromIndex(wordIndex).tabulateArray(s => if(wordCounts(s) > unknownWordThreshold) interner(IndicatorFeature(s)) else Unk) - private val classes = Encoder.fromIndex(wordIndex).tabulateArray(w => if(wordCounts(w) > functionWordThreshold) wordFeatures(wordIndex(w)) else interner(WordFeature(EnglishWordClassGenerator(w), 'Class))) - private val shapes = if(includeWordShapeFeatures) Encoder.fromIndex(wordIndex).tabulateArray(w => if(wordCounts(w) > functionWordThreshold) wordFeatures(wordIndex(w)) else interner(WordFeature(WordShapeGenerator(w), 'Shape))) else null + private val wordFeatures = Encoder.fromIndex(wordIndex).tabulateArray(s => if (wordCounts(s) > unknownWordThreshold) interner(IndicatorFeature(s)) else Unk) + private val classes = Encoder.fromIndex(wordIndex).tabulateArray(w => if (wordCounts(w) > functionWordThreshold) wordFeatures(wordIndex(w)) else interner(WordFeature(EnglishWordClassGenerator(w), 'Class))) + private val shapes = if (includeWordShapeFeatures) Encoder.fromIndex(wordIndex).tabulateArray(w => if (wordCounts(w) > functionWordThreshold) wordFeatures(wordIndex(w)) else interner(WordFeature(WordShapeGenerator(w), 'Shape))) else null // caches private val minimalFeatures = Array.tabulate(wordIndex.size){ i => val wc = wordCounts(wordIndex.get(i)) val w = wordFeatures(i) val classe = classes(i) - if(wc > functionWordThreshold) Array(w) + if (wc > functionWordThreshold) Array(w) else if (includeWordShapeFeatures) { val shape = shapes(i) - if(wc > unknownWordThreshold) Array(w, shape, classe) + if (wc > unknownWordThreshold) Array(w, shape, classe) else Array(shape, classe, Unk) - } else if(wc > unknownWordThreshold) { + } else if (wc > unknownWordThreshold) { Array(w, classe) } else { Array(classe, Unk) diff --git a/src/main/scala/epic/features/MorphFeaturizer.scala b/src/main/scala/epic/features/MorphFeaturizer.scala index f4b449cb..d86f5462 100644 --- a/src/main/scala/epic/features/MorphFeaturizer.scala +++ b/src/main/scala/epic/features/MorphFeaturizer.scala @@ -21,10 +21,10 @@ class MorphFeaturizer private (morphLookupTable: MorphFeaturizer.MorphLookupTabl morphLookupTable(w) } val feats = w.indices.map(i => morphFeats(i).filter(feat => feat.label == "lem").map(feat => IndicatorFeature(feat): Feature)) -// logger.info("Feats for sentence: " + w) -// (0 until w.size).foreach(i => logger.info(w(i) + ": " + feats(i).toSeq)) + // logger.info("Feats for sentence: " + w) + // (0 until w.size).foreach(i => logger.info(w(i) + ": " + feats(i).toSeq)) - def featuresForWord(pos: Int): Array[Feature] = if(pos < 0 || pos >= w.length) Array(BeginSentFeature) else feats(pos) + def featuresForWord(pos: Int): Array[Feature] = if (pos < 0 || pos >= w.length) Array(BeginSentFeature) else feats(pos) def words: IndexedSeq[String] = w } diff --git a/src/main/scala/epic/features/MultiSurfaceFeaturizer.scala b/src/main/scala/epic/features/MultiSurfaceFeaturizer.scala index f17ffc88..82d3e965 100644 --- a/src/main/scala/epic/features/MultiSurfaceFeaturizer.scala +++ b/src/main/scala/epic/features/MultiSurfaceFeaturizer.scala @@ -13,7 +13,6 @@ case class MultiSurfaceFeaturizer[W](feats: IndexedSeq[SurfaceFeaturizer[W]]) ex def anchor(w: IndexedSeq[W]): SurfaceFeatureAnchoring[W] = new SurfaceFeatureAnchoring[W] { val anchs = feats.map(_.anchor(w)).toArray def words: IndexedSeq[W] = w - def featuresForSpan(beg: Int, end: Int): Array[Feature] = anchs.flatMap(_.featuresForSpan(beg, end)) } } diff --git a/src/main/scala/epic/features/MultiWordFeaturizer.scala b/src/main/scala/epic/features/MultiWordFeaturizer.scala index 04ddd76d..f3908bb8 100644 --- a/src/main/scala/epic/features/MultiWordFeaturizer.scala +++ b/src/main/scala/epic/features/MultiWordFeaturizer.scala @@ -13,7 +13,6 @@ case class MultiWordFeaturizer[W](featurizers: IndexedSeq[WordFeaturizer[W]]) ex def anchor(w: IndexedSeq[W]): WordFeatureAnchoring[W] = new WordFeatureAnchoring[W] { val anchs = featurizers.map(_.anchor(w)).toArray def words: IndexedSeq[W] = w - def featuresForWord(pos: Int): Array[Feature] = anchs.flatMap(_.featuresForWord(pos)) } } diff --git a/src/main/scala/epic/features/NGramSpanFeaturizer.scala b/src/main/scala/epic/features/NGramSpanFeaturizer.scala index cabefec8..0a412a4f 100644 --- a/src/main/scala/epic/features/NGramSpanFeaturizer.scala +++ b/src/main/scala/epic/features/NGramSpanFeaturizer.scala @@ -29,14 +29,14 @@ class NGramSpanFeaturizer(wordCounts: Counter[String,Double], def anchor(words: IndexedSeq[String]): SurfaceFeatureAnchoring[String] = { new SurfaceFeatureAnchoring[String] { def featuresForSpan(begin: Int, end: Int): Array[Feature] = { -// println("Span: " + words.slice(begin, end)) + // println("Span: " + words.slice(begin, end)) val unigramFeats = for (i <- begin until end) yield { -// println(words(i) + ": " + wordCounts(words(i))) + // println(words(i) + ": " + wordCounts(words(i))) NGramUnigramFeature(if (wordCounts(words(i)) < ngramCountThreshold) -1 else wordIndex(words(i))) } val bigramFeats = for (i <- begin until end - 1) yield { val pair = (words(i), words(i+1)) -// println(pair + ": " + bigramCounts(pair)) + // println(pair + ": " + bigramCounts(pair)) NGramBigramFeature(if (bigramCounts(pair) < ngramCountThreshold) -1 else bigramIndex(pair)) } val notFeats = if (useNot) { @@ -48,7 +48,7 @@ class NGramSpanFeaturizer(wordCounts: Counter[String,Double], } else if (NGramSpanFeaturizer.NotEndingPunc.contains(words(i))) { inNotSpan = false } else if (inNotSpan) { -// println(words.slice(begin, end) + " (not span): " + words(i)) + // println(words.slice(begin, end) + " (not span): " + words(i)) notFeats += NotFeature(if (wordCounts(words(i)) < ngramCountThreshold) -1 else wordIndex(words(i))) } } @@ -60,7 +60,7 @@ class NGramSpanFeaturizer(wordCounts: Counter[String,Double], val ngramFeats = (3 to maxOrder).flatMap(n => { for (i <- begin until end - n + 1) yield { val slice = words.slice(i, i+n) -// println(slice + ": " + higherOrderCounts(n-3)(slice)) + // println(slice + ": " + higherOrderCounts(n-3)(slice)) NGramFeature(n, if (higherOrderCounts(n-3)(slice) < ngramCountThreshold) -1 else higherOrderIndices(n-3)(slice)) } }) diff --git a/src/main/scala/epic/features/NGramSurfaceFeaturizer.scala b/src/main/scala/epic/features/NGramSurfaceFeaturizer.scala index f0f61a39..17736120 100644 --- a/src/main/scala/epic/features/NGramSurfaceFeaturizer.scala +++ b/src/main/scala/epic/features/NGramSurfaceFeaturizer.scala @@ -6,7 +6,7 @@ import breeze.util.CachedHashCode import scala.runtime.ScalaRunTime case class OrientedNGramFeature(offset: Int, features: IndexedSeq[Feature]) extends Feature with CachedHashCode { - override def equals(other: Any):Boolean = other match { + override def equals(other: Any): Boolean = other match { case x: OrientedNGramFeature => x.hashCode == hashCode && ScalaRunTime._equals(this, x) case _ => false } @@ -22,7 +22,6 @@ class NGramWordFeaturizer[W](base: WordFeaturizer[W], wordNgramOrder: Int = 2) e val baseAnch = base.anchor(w) def words: IndexedSeq[W] = w - def featuresForWord(pos: Int): Array[Feature] = { val result = ArrayBuffer[Feature]() ++= baseAnch.featuresForWord(pos) for(order <- 2 to wordNgramOrder) @@ -39,12 +38,10 @@ class NGramWordFeaturizer[W](base: WordFeaturizer[W], wordNgramOrder: Int = 2) e } } - } private def allConfigurations(seqOfSeqs: TraversableOnce[Array[Feature]]): IndexedSeq[IndexedSeq[Feature]] = { seqOfSeqs.foldLeft(IndexedSeq(IndexedSeq.empty[Feature]))((acc,currentFeatures) => {for(a <- acc; b <- currentFeatures) yield a :+ b}) } - } diff --git a/src/main/scala/epic/features/NonRedundantIndexBuilder.scala b/src/main/scala/epic/features/NonRedundantIndexBuilder.scala index 2414b073..1ed89327 100644 --- a/src/main/scala/epic/features/NonRedundantIndexBuilder.scala +++ b/src/main/scala/epic/features/NonRedundantIndexBuilder.scala @@ -17,7 +17,7 @@ class NonRedundantIndexBuilder[F] extends IndexBuilder[F] { for(f <- 0 until allSeenFeatures.size) { val c = contexts(f) - if(!c.exists(seenContexts)) { + if (!c.exists(seenContexts)) { c.foreach(seenContexts += _) result.index(allSeenFeatures.get(f)) } @@ -37,7 +37,7 @@ class NonRedundantIndexBuilder[F] extends IndexBuilder[F] { for(x <- featuresForContext) { val next = allSeenFeatures.index(x) - if(contexts.length <= next) { + if (contexts.length <= next) { contexts += Some(mutable.Set[Int](nextContext)) } else { contexts(next).foreach(_ += nextContext) diff --git a/src/main/scala/epic/features/NormalIndexBuilder.scala b/src/main/scala/epic/features/NormalIndexBuilder.scala index 0871a8d4..61266424 100644 --- a/src/main/scala/epic/features/NormalIndexBuilder.scala +++ b/src/main/scala/epic/features/NormalIndexBuilder.scala @@ -15,7 +15,6 @@ class NormalIndexBuilder[F] extends IndexBuilder[F] { def add(fs: TraversableOnce[F]):Unit = { fs.foreach(_result.index) - } } @@ -23,5 +22,4 @@ class NormalIndexBuilder[F] extends IndexBuilder[F] { trait IndexBuilder[F] { def result():Index[F] def add(fs: TraversableOnce[F]):Unit - } diff --git a/src/main/scala/epic/features/OffsetWordFeaturizer.scala b/src/main/scala/epic/features/OffsetWordFeaturizer.scala index 02c968a3..85d718f7 100644 --- a/src/main/scala/epic/features/OffsetWordFeaturizer.scala +++ b/src/main/scala/epic/features/OffsetWordFeaturizer.scala @@ -13,14 +13,9 @@ case class OffsetFeature(offset: Int, feature: Feature) extends Feature class OffsetWordFeaturizer[W](offsetFeaturizer: WordFeaturizer[W], offset:Int) extends WordFeaturizer[W] with Serializable { def anchor(w: IndexedSeq[W]): WordFeatureAnchoring[W] = new WordFeatureAnchoring[W] { val offsetAnchoring = offsetFeaturizer.anchor(w) - def featuresForWord(pos: Int): Array[Feature] = { offsetAnchoring.featuresForWord(pos + offset).map(OffsetFeature(offset, _)) } - def words: IndexedSeq[W] = w - - } - } diff --git a/src/main/scala/epic/features/PorterStemmer.scala b/src/main/scala/epic/features/PorterStemmer.scala index f198bf3c..a0b59aeb 100644 --- a/src/main/scala/epic/features/PorterStemmer.scala +++ b/src/main/scala/epic/features/PorterStemmer.scala @@ -24,7 +24,6 @@ package epic.features */ class PorterStemmer() extends (String=>String) { import PorterStemmer._ - def apply(w: String) = { if (w.length < 3) w.toLowerCase else { @@ -46,8 +45,6 @@ object PorterStemmer extends PorterStemmer { def apply() = this - - private def step1(w: String) = step1c(step1b(step1a(w))) // get rid of s's @@ -210,7 +207,6 @@ object PorterStemmer extends PorterStemmer { step5b(step5a(w)) } - private def step5a(w: String) = { if (w.length < 3) w else diff --git a/src/main/scala/epic/features/ProductSurfaceFeaturizer.scala b/src/main/scala/epic/features/ProductSurfaceFeaturizer.scala index 01badbee..3d88910f 100644 --- a/src/main/scala/epic/features/ProductSurfaceFeaturizer.scala +++ b/src/main/scala/epic/features/ProductSurfaceFeaturizer.scala @@ -19,8 +19,6 @@ class ProductSurfaceFeaturizer[W](f1: SurfaceFeaturizer[W], f2: SurfaceFeaturize } def words: IndexedSeq[W] = w - - } } diff --git a/src/main/scala/epic/features/ProductWordFeaturizer.scala b/src/main/scala/epic/features/ProductWordFeaturizer.scala index 09ec309e..ae58cca4 100644 --- a/src/main/scala/epic/features/ProductWordFeaturizer.scala +++ b/src/main/scala/epic/features/ProductWordFeaturizer.scala @@ -20,8 +20,6 @@ class ProductWordFeaturizer[W](f1: WordFeaturizer[W], f2: WordFeaturizer[W]) ext } def words: IndexedSeq[W] = w - - } } diff --git a/src/main/scala/epic/features/RuleAndSpansFeaturizer.scala b/src/main/scala/epic/features/RuleAndSpansFeaturizer.scala index d366d8ce..7433344c 100644 --- a/src/main/scala/epic/features/RuleAndSpansFeaturizer.scala +++ b/src/main/scala/epic/features/RuleAndSpansFeaturizer.scala @@ -18,7 +18,6 @@ trait RuleAndSpansFeaturizer[W] extends Serializable { } } - class ZeroRuleAndSpansFeaturizer[W]() extends RuleAndSpansFeaturizer[W] { val emptyArray = Array[Feature]() diff --git a/src/main/scala/epic/features/SegmentedIndex.scala b/src/main/scala/epic/features/SegmentedIndex.scala index 2e570b46..0de702ad 100644 --- a/src/main/scala/epic/features/SegmentedIndex.scala +++ b/src/main/scala/epic/features/SegmentedIndex.scala @@ -22,11 +22,11 @@ class SegmentedIndex[T,IndexType](val indices: IndexedSeq[IndexType])(implicit v override def size = offsets.last def unapply(i: Int): Option[Feature] = { - if(i < 0 || i >= size) { + if (i < 0 || i >= size) { None } else { var component = util.Arrays.binarySearch(offsets, i) - if(component < 0) component = ~component - 1 + if (component < 0) component = ~component - 1 indices(component).unapply(i - offsets(component)).map(ComponentFeature(component, _)) } } diff --git a/src/main/scala/epic/features/SentencePropertiesFeaturizer.scala b/src/main/scala/epic/features/SentencePropertiesFeaturizer.scala index ef26883b..40ad7798 100644 --- a/src/main/scala/epic/features/SentencePropertiesFeaturizer.scala +++ b/src/main/scala/epic/features/SentencePropertiesFeaturizer.scala @@ -18,30 +18,25 @@ class SentencePropertiesFeaturizer(db: DistanceBinner = new DistanceBinner()) ex def words: IndexedSeq[String] = w - def featuresForWord(pos: Int): Array[Feature] = featuresForSpan(pos, pos+1) def featuresForSpan(begin: Int, end: Int): Array[Feature] = { val feats = new ArrayBuffer[Feature]() feats += sentenceLengthFeature - if(wholeSentenceIsUpperCase) + if (wholeSentenceIsUpperCase) feats += WholeSentenceIsUpperCaseFeature - - -// if (begin == 0) -// feats += BeginSentFeature -// if(end == words.length) -// feats += EndSentFeature + // if (begin == 0) + // feats += BeginSentFeature + // if (end == words.length) + // feats += EndSentFeature if (begin == 0 && end == words.length) feats += WholeSentFeature - feats.toArray } } } } - case object BeginSentFeature extends Feature case object EndSentFeature extends Feature case object WholeSentFeature extends Feature diff --git a/src/main/scala/epic/features/SpanShapeGenerator.scala b/src/main/scala/epic/features/SpanShapeGenerator.scala index 8ffb7aba..c0c15155 100644 --- a/src/main/scala/epic/features/SpanShapeGenerator.scala +++ b/src/main/scala/epic/features/SpanShapeGenerator.scala @@ -22,9 +22,9 @@ class SpanShapeFeaturizerBetter(numContextWords: Int, useRichContext: Boolean) e new SurfaceFeatureAnchoring[String] { def featuresForSpan(begin: Int, end: Int): Array[Feature] = { val sig = SpanShapeGenerator.signatureAndContextFor(words, begin, end, numContextWords, useRichContext) -// println("Features for span " + words.slice(begin, end) + ": " + sig); -// val sig2 = SpanShapeGenerator.signatureFor(words, begin, end, includeContext = false) -// Array(SpanShapeFeature(sig), SpanShapeFeature(sig2)) + // println("Features for span " + words.slice(begin, end) + ": " + sig); + // val sig2 = SpanShapeGenerator.signatureFor(words, begin, end, includeContext = false) + // Array(SpanShapeFeature(sig), SpanShapeFeature(sig2)) Array(SpanShapeFeature(sig)) } } @@ -36,9 +36,9 @@ class FullWordSpanShapeFeaturizer(commonWords: Set[String], numContextWords: Int new SurfaceFeatureAnchoring[String] { def featuresForSpan(begin: Int, end: Int): Array[Feature] = { val sig = SpanShapeGenerator.signatureAndContextFor(words, begin, end, numContextWords, true, commonWords) - // println("Features for span " + words.slice(begin, end) + ": " + sig); - // val sig2 = SpanShapeGenerator.signatureFor(words, begin, end, includeContext = false) - // Array(SpanShapeFeature(sig), SpanShapeFeature(sig2)) + // println("Features for span " + words.slice(begin, end) + ": " + sig); + // val sig2 = SpanShapeGenerator.signatureFor(words, begin, end, includeContext = false) + // Array(SpanShapeFeature(sig), SpanShapeFeature(sig2)) Array(SpanShapeFeature(sig)) } } @@ -53,7 +53,6 @@ object SpanShapeGenerator extends Serializable { val MAX_LEN = 6 - def apply(v1: IndexedSeq[String], begin: Int, end: Int): String = signatureFor(v1,begin, end) def signatureFor(words: IndexedSeq[String], begin: Int, end: Int, includeContext: Boolean = true) = { @@ -71,7 +70,7 @@ object SpanShapeGenerator extends Serializable { appendWordShape(i, words, result) i += 1 } - if(i < end) { + if (i < end) { //val remainingLength = distanceBinner.binnedDistance(begin, end - MAX_LEN) //result ++= "~" * remainingLength result += '~' @@ -100,7 +99,7 @@ object SpanShapeGenerator extends Serializable { if (i < 0) { result += '#' } else { - if(commonWords(words(i))) { + if (commonWords(words(i))) { result ++= words(i) } else if (richContext) { appendWordShape(i, words, result) @@ -115,7 +114,7 @@ object SpanShapeGenerator extends Serializable { appendWordShape(i, words, result) i += 1 } - if(i < end) { + if (i < end) { //val remainingLength = distanceBinner.binnedDistance(begin, end - MAX_LEN) //result ++= "~" * remainingLength result += '~' @@ -130,7 +129,7 @@ object SpanShapeGenerator extends Serializable { if (i >= words.length) { result += '#' } else { - if(commonWords(words(i))) { + if (commonWords(words(i))) { result ++= words(i) } else if (richContext) { appendWordShape(i, words, result) @@ -143,7 +142,6 @@ object SpanShapeGenerator extends Serializable { result.toString } - def appendWordShape(i: Int, words: IndexedSeq[String], result: StringBuilder) { val w = if (i < 0 || i >= words.length) "#" else words(i) if (w.isEmpty) { @@ -151,7 +149,7 @@ object SpanShapeGenerator extends Serializable { result += 'ε' } else { var c = w(0) - if(c == '-') { + if (c == '-') { c = w match { case "-LRB-" => '(' case "-RRB-" => ')' @@ -188,7 +186,7 @@ object SpanShapeGenerator extends Serializable { if (i < 0) { result += '#' } else { - if(commonWords(words(i))) { + if (commonWords(words(i))) { result ++= words(i) } else if (richContext) { appendWordShape(i, words, result) @@ -204,8 +202,8 @@ object SpanShapeGenerator extends Serializable { i += 1 } - if(i <= split) { - if(i < split) { + if (i <= split) { + if (i < split) { result += '~' } appendWordShape(split, words, result) @@ -213,7 +211,7 @@ object SpanShapeGenerator extends Serializable { i = split + 2 } - if(i < end) { + if (i < end) { //val remainingLength = distanceBinner.binnedDistance(begin, end - MAX_LEN) //result ++= "~" * remainingLength result += '~' @@ -228,7 +226,7 @@ object SpanShapeGenerator extends Serializable { if (i >= words.length) { result += '#' } else { - if(commonWords(words(i))) { + if (commonWords(words(i))) { result ++= words(i) } else if (richContext) { appendWordShape(i, words, result) diff --git a/src/main/scala/epic/features/SplitSpanFeaturizer.scala b/src/main/scala/epic/features/SplitSpanFeaturizer.scala index 380e49e3..45232578 100644 --- a/src/main/scala/epic/features/SplitSpanFeaturizer.scala +++ b/src/main/scala/epic/features/SplitSpanFeaturizer.scala @@ -133,7 +133,7 @@ object SplitSpanFeaturizer { } def anchor(w: IndexedSeq[W]): SplitSpanFeatureAnchoring[W] = { - if(a.isInstanceOf[SplitPointMarker] || b.isInstanceOf[SplitPointMarker]) + if (a.isInstanceOf[SplitPointMarker] || b.isInstanceOf[SplitPointMarker]) theSplitNeedingAnchoring else theNotSplitNeedingAnchoring @@ -214,11 +214,11 @@ object SplitSpanFeaturizer { val afeats: Array[Feature] = aa.featuresForSpan(begin, end) val bfeats: Array[Feature] = ba.featuresForSpan(begin, end) val cross:Array[Feature] = Arrays.crossProduct(afeats, bfeats)(CrossProductFeature(_, _)) - if(keepJustA && keepJustB) { + if (keepJustA && keepJustB) { Arrays.concatenate[Feature](cross, afeats, bfeats) } else if (keepJustA) { Arrays.concatenate[Feature](cross, afeats) - } else if(keepJustB) { + } else if (keepJustB) { Arrays.concatenate[Feature](cross, bfeats) } else { cross @@ -237,10 +237,10 @@ object SplitSpanFeaturizer { Arrays.crossProduct(aSpan, bSplit)(CrossProductFeature(_, _, "Split")) ) - if(keepJustA) { + if (keepJustA) { results += aSplit } - if(keepJustB) { + if (keepJustB) { results += bSplit } @@ -256,7 +256,6 @@ trait SplitSpanFeatureAnchoring[W] extends SurfaceFeatureAnchoring[W] { def featuresForSplit(begin: Int, split: Int, end: Int):Array[Feature] } - trait IndexedSplitSpanFeaturizer[W] { def anchor(w: IndexedSeq[W]):IndexedSplitSpanFeatureAnchoring[W] def featureIndex: Index[Feature] @@ -272,8 +271,8 @@ object IndexedSplitSpanFeaturizer { hashFeatures: HashFeature.Scale = HashFeature.Relative(1.0), bloomFilter: Boolean = false, deduplicateFeatures: Boolean = false):IndexedSplitSpanFeaturizer[W] = { - def seenSet = if(bloomFilter) new ThreadLocalBloomFilter[Long](8 * 1024 * 1024 * 50, 3) else AlwaysSeenSet - val builder = if(deduplicateFeatures) new NonRedundantIndexBuilder[Feature] else new NormalIndexBuilder[Feature] + def seenSet = if (bloomFilter) new ThreadLocalBloomFilter[Long](8 * 1024 * 1024 * 50, 3) else AlwaysSeenSet + val builder = if (deduplicateFeatures) new NonRedundantIndexBuilder[Feature] else new NormalIndexBuilder[Feature] for (ti <- trees) { val wspec = f.anchor(ti.words) ti.tree.allChildren.foreach { @@ -286,7 +285,7 @@ object IndexedSplitSpanFeaturizer { val index = builder.result() - new BasicIndexedSplitSpanFeaturizer(f, if(hashFeatures.numFeatures(index.size) != 0) new HashExtendingIndex(index, HashFeature(_), hashFeatures, seenSet) else index) + new BasicIndexedSplitSpanFeaturizer(f, if (hashFeatures.numFeatures(index.size) != 0) new HashExtendingIndex(index, HashFeature(_), hashFeatures, seenSet) else index) } class BasicIndexedSplitSpanFeaturizer[W](f: SplitSpanFeaturizer[W], val featureIndex: Index[Feature]) extends IndexedSplitSpanFeaturizer[W] with Serializable { diff --git a/src/main/scala/epic/features/StandardSurfaceFeaturizer.scala b/src/main/scala/epic/features/StandardSurfaceFeaturizer.scala index 1d706160..41205a74 100644 --- a/src/main/scala/epic/features/StandardSurfaceFeaturizer.scala +++ b/src/main/scala/epic/features/StandardSurfaceFeaturizer.scala @@ -6,8 +6,6 @@ import scala.Array import scala.collection.mutable.ArrayBuffer import StandardSpanFeatures._ - - case class FirstWordCapsAnd(f: Feature) extends Feature case class NthWordCapsAnd(f: Feature) extends Feature case class SentenceLengthFeature(length: Int) extends Feature diff --git a/src/main/scala/epic/features/SurfaceFeaturizer.scala b/src/main/scala/epic/features/SurfaceFeaturizer.scala index 1e17fe98..bde68ae8 100644 --- a/src/main/scala/epic/features/SurfaceFeaturizer.scala +++ b/src/main/scala/epic/features/SurfaceFeaturizer.scala @@ -26,13 +26,11 @@ object SurfaceFeaturizer { def apply[W](f: (IndexedSeq[W], Span)=>Array[Feature]):SurfaceFeaturizer[W] = new TabulatedSurfaceFeaturizer[W](f) - /** begin of span */ object begin extends MarkerPos(0) /** end of span */ object end extends MarkerPos(0, false) - trait DSL { def whenLength[W](filt: Int=>Boolean)(f: SurfaceFeaturizer[W])= new LengthFilteredSurfaceFeaturizer(f, filt) @@ -70,7 +68,7 @@ object SurfaceFeaturizer { case class SpanEdgesFeaturizer[W](f1: MarkedWordFeaturizer[W], f2: MarkedWordFeaturizer[W]) extends SurfaceFeaturizer[W] { def anchor(w: IndexedSeq[W]): SurfaceFeatureAnchoring[W] = { val loc1 = f1.wf.anchor(w) - val loc2 = if(f1.wf eq f2.wf) loc1 else f2.wf.anchor(w) + val loc2 = if (f1.wf eq f2.wf) loc1 else f2.wf.anchor(w) new SurfaceFeatureAnchoring[W] { def featuresForSpan(begin: Int, end: Int): Array[Feature] = { val ffs1 = loc1.featuresForWord(f1.mp.toPos(begin, end)) @@ -91,7 +89,6 @@ object SurfaceFeaturizer { } } - case class SingleWordSpanFeaturizer[W](feat: WordFeaturizer[W]) extends SurfaceFeaturizer[W] with Serializable { override def anchor(words: IndexedSeq[W]): SurfaceFeatureAnchoring[W] = new SurfaceFeatureAnchoring[W] { val anch = feat.anchor(words) @@ -126,9 +123,9 @@ object SurfaceFeaturizer { def +(i: Int) = apply(i) def -(i: Int) = apply(-i) - def toPos(begin: Int, end: Int) = if(relativeToBegin) begin + offset else end + offset + def toPos(begin: Int, end: Int) = if (relativeToBegin) begin + offset else end + offset - override def toString = s"(${if(relativeToBegin) "b" else "e"}${if(offset == 0) "" else if(offset > 0) "+" + offset else offset})" + override def toString = s"(${if (relativeToBegin) "b" else "e"}${if (offset == 0) "" else if (offset > 0) "+" + offset else offset})" } class TabulatedSurfaceFeaturizer[W](f: (IndexedSeq[W], Span)=>Array[Feature]) extends SurfaceFeaturizer[W] { diff --git a/src/main/scala/epic/features/TagDictionaryFeaturizer.scala b/src/main/scala/epic/features/TagDictionaryFeaturizer.scala index 5284c4a5..060bb94f 100644 --- a/src/main/scala/epic/features/TagDictionaryFeaturizer.scala +++ b/src/main/scala/epic/features/TagDictionaryFeaturizer.scala @@ -18,7 +18,7 @@ class TagDictionaryFeaturizer[L](counts: Counter2[L, String, Double], commonWord private val emptyArray = Array.empty[Feature] private val argmaxes = Encoder.fromIndex(wordIndex).tabulateArray{w => val totalCount = sum(counts(::, w)) - if(totalCount >= commonWordThreshold) { + if (totalCount >= commonWordThreshold) { emptyArray } else if (totalCount <= 2) { emptyArray @@ -29,16 +29,16 @@ class TagDictionaryFeaturizer[L](counts: Counter2[L, String, Double], commonWord } private val variants = Encoder.fromIndex(wordIndex).tabulateArray{w => val totalCount = sum(counts(::, w)) - if(totalCount < commonWordThreshold) { + if (totalCount < commonWordThreshold) { variantFeatures(w) } else emptyArray } private def variantFeatures(w: String) = { val arr = mutable.ArrayBuilder.make[Feature] - if(w(0).isUpper) { + if (w(0).isUpper) { val lowerCount = sum(counts(::, w.toLowerCase)) - if(lowerCount != 0.0) { + if (lowerCount != 0.0) { arr += HasKnownLowerCaseVariant(counts(::, w.toLowerCase).argmax) } } @@ -47,24 +47,23 @@ class TagDictionaryFeaturizer[L](counts: Counter2[L, String, Double], commonWord if (dashIndex >= 0) { val afterDash = w.substring(dashIndex) val undashedCount = sum(counts(::, afterDash)) - if(undashedCount != 0.0) { + if (undashedCount != 0.0) { arr += HasKnownAfterDashSuffix(counts(::, afterDash).argmax) } } arr.result() } - def anchor(w: IndexedSeq[String]): WordFeatureAnchoring[String] = new WordFeatureAnchoring[String] { val indices = w.map(wordIndex) val myArgmaxes = indices.map{i => - if(i < 0) { + if (i < 0) { emptyArray } else argmaxes(i) } val variants: IndexedSeq[Array[Feature]] = indices.zipWithIndex.map{ case(i, pos) => - if(i < 0) { + if (i < 0) { variantFeatures(w(pos)) } else { TagDictionaryFeaturizer.this.variants(i) @@ -72,11 +71,11 @@ class TagDictionaryFeaturizer[L](counts: Counter2[L, String, Double], commonWord } def featuresForWord(pos: Int): Array[Feature] = { - if(pos < 0 || pos >= w.length) { + if (pos < 0 || pos >= w.length) { Array(IndicatorWSFeature('OutOfBounds)) } else { val am = myArgmaxes(pos) - if(variants(pos).length != 0) { + if (variants(pos).length != 0) { am ++ variants(pos) } else { am diff --git a/src/main/scala/epic/features/TransformedWordFeaturizer.scala b/src/main/scala/epic/features/TransformedWordFeaturizer.scala index 006b3ec1..17e1993a 100644 --- a/src/main/scala/epic/features/TransformedWordFeaturizer.scala +++ b/src/main/scala/epic/features/TransformedWordFeaturizer.scala @@ -31,7 +31,7 @@ class TransformedWordFeaturizer[W](initCounts: Counter[W, Double], def words = w def featuresForWord(pos: Int): Array[Feature] = { - if(pos < 0 || pos >= words.length) { + if (pos < 0 || pos >= words.length) { boundaryFeatures } else { _minimalFeatures(pos) @@ -40,7 +40,7 @@ class TransformedWordFeaturizer[W](initCounts: Counter[W, Double], private val _minimalFeatures: immutable.IndexedSeq[Array[Feature]] = words.indices.map { i => val index = indices(i) - if(index >= 0) { + if (index >= 0) { minimalFeatures(index) } else { Array[Feature](Unk) @@ -55,13 +55,13 @@ class TransformedWordFeaturizer[W](initCounts: Counter[W, Double], private val Unk = WordFeature("#UNK#", 'LowCount) private val boundaryFeatures = Array[Feature](BoundaryFeature) - private val wordFeatures = Encoder.fromIndex(wordIndex).tabulateArray(s => if(wordCounts(s) > unknownWordThreshold) TransformedFeature(transform(s)) else Unk) + private val wordFeatures = Encoder.fromIndex(wordIndex).tabulateArray(s => if (wordCounts(s) > unknownWordThreshold) TransformedFeature(transform(s)) else Unk) // caches private val minimalFeatures = Array.tabulate[Array[Feature]](wordIndex.size){ i => val wc = wordCounts(transform(wordIndex.get(i))) val w = wordFeatures(i) - if(wc > unknownWordThreshold) { + if (wc > unknownWordThreshold) { Array(w) } else { Array(Unk) diff --git a/src/main/scala/epic/features/WordClassFeaturizer.scala b/src/main/scala/epic/features/WordClassFeaturizer.scala index 482a667f..69a4603b 100644 --- a/src/main/scala/epic/features/WordClassFeaturizer.scala +++ b/src/main/scala/epic/features/WordClassFeaturizer.scala @@ -20,7 +20,7 @@ class WordClassFeaturizer(wordCounts: Counter[String, Double], def words = w def featuresForWord(pos: Int): Array[Feature] = { - if(pos < 0 || pos >= words.length) { + if (pos < 0 || pos >= words.length) { boundaryFeatures } else { _minimalFeatures(pos) @@ -29,7 +29,7 @@ class WordClassFeaturizer(wordCounts: Counter[String, Double], private val _minimalFeatures: immutable.IndexedSeq[Array[Feature]] = words.indices.map { i => val index = indices(i) - if(index >= 0) { + if (index >= 0) { WordClassFeaturizer.this.minimalFeatures(index) } else { val ww = words(i) @@ -47,7 +47,7 @@ class WordClassFeaturizer(wordCounts: Counter[String, Double], private val boundaryFeatures = Array[Feature](BoundaryFeature) - private val classes = Encoder.fromIndex(wordIndex).tabulateArray(w => if(wordCounts(w) > functionWordThreshold) interner(IndicatorFeature(w)) else interner(WordFeature(EnglishWordClassGenerator(w), 'Class))) + private val classes = Encoder.fromIndex(wordIndex).tabulateArray(w => if (wordCounts(w) > functionWordThreshold) interner(IndicatorFeature(w)) else interner(WordFeature(EnglishWordClassGenerator(w), 'Class))) // caches private val minimalFeatures = Array.tabulate(wordIndex.size){ i => diff --git a/src/main/scala/epic/features/WordFeaturizer.scala b/src/main/scala/epic/features/WordFeaturizer.scala index 3c744cfc..f138a575 100644 --- a/src/main/scala/epic/features/WordFeaturizer.scala +++ b/src/main/scala/epic/features/WordFeaturizer.scala @@ -63,9 +63,6 @@ object WordFeaturizer { val props = new WordPropertyFeaturizer(summedCounts) val lfsuf = LongestFrequentSuffixFeaturizer(summedCounts, commonWordThreshold) - - - def suffixes(order: Int = 5) = new WordSuffixFeaturizer(summedCounts, suffixOrder = order, commonWordThreshold = commonWordThreshold) def prefixes(order: Int = 5) = new WordPrefixFeaturizer(summedCounts, prefixOrder = order, commonWordThreshold = commonWordThreshold) @@ -82,7 +79,7 @@ object WordFeaturizer { def unigrams(f: WordFeaturizer[String], offsetOrder:Int = 1) = new MultiWordFeaturizer[String]({ for(i <- -offsetOrder to offsetOrder) yield { - if(i == 0) f else f(i) + if (i == 0) f else f(i) } }) @@ -114,7 +111,7 @@ object WordFeaturizer { val feats = words.map(f) - override def featuresForWord(pos: Int): Array[Feature] = if(pos < 0 || pos >= words.length) Array() else feats(pos) + override def featuresForWord(pos: Int): Array[Feature] = if (pos < 0 || pos >= words.length) Array() else feats(pos) } } } @@ -133,10 +130,8 @@ class ZeroFeaturizer[W] extends WordFeaturizer[W] with SurfaceFeaturizer[W] with } } - - class NextActualWordFeaturizer(f: WordFeaturizer[String], lookRight: Boolean, isPunct: (String=>Boolean) = _.forall(!_.isLetterOrDigit)) extends WordFeaturizer[String] with Serializable { - val dir = if(lookRight) 'Right else 'Left + val dir = if (lookRight) 'Right else 'Left def anchor(words: IndexedSeq[String]): WordFeatureAnchoring[String] = { val w = words new WordFeatureAnchoring[String] { @@ -145,13 +140,13 @@ class NextActualWordFeaturizer(f: WordFeaturizer[String], lookRight: Boolean, is val features: immutable.IndexedSeq[Array[Feature]] = w.indices.map { _pos => var pos = _pos - val delta = if(lookRight) 1 else -1 + val delta = if (lookRight) 1 else -1 val feats = new ArrayBuffer[Feature]() var done = false - while(!done && pos >= 0 && pos < w.length) { - if(isPunct(w(pos))) { + while (!done && pos >= 0 && pos < w.length) { + if (isPunct(w(pos))) { feats ++= base.featuresForWord(pos).map(PunctuationFeature(_, dir)) } else { feats ++= base.featuresForWord(pos).map(ActualWordFeature(_, dir)) @@ -160,20 +155,19 @@ class NextActualWordFeaturizer(f: WordFeaturizer[String], lookRight: Boolean, is pos += delta } - if(pos < 0 || pos >= w.length) feats ++= base.featuresForWord(pos) + if (pos < 0 || pos >= w.length) feats ++= base.featuresForWord(pos) feats.toArray } def words: IndexedSeq[String] = w def featuresForWord(pos: Int): Array[Feature] = { - if(pos < 0 || pos >= w.length) base.featuresForWord(pos) + if (pos < 0 || pos >= w.length) base.featuresForWord(pos) else features(pos) } } } - } case class PunctuationFeature(f: Feature, dir: Symbol) extends Feature diff --git a/src/main/scala/epic/features/WordPrefixFeaturizer.scala b/src/main/scala/epic/features/WordPrefixFeaturizer.scala index a0dfb90c..c33c494e 100644 --- a/src/main/scala/epic/features/WordPrefixFeaturizer.scala +++ b/src/main/scala/epic/features/WordPrefixFeaturizer.scala @@ -20,7 +20,6 @@ import breeze.linalg._ import collection.mutable.ArrayBuffer import breeze.util.{Encoder, Index} - class WordPrefixFeaturizer(wordCounts: Counter[String, Double], prefixOrder: Int = 5, commonWordThreshold: Int = 100) extends WordFeaturizer[String] with Serializable { private val wordIndex = Index(wordCounts.keysIterator) @@ -33,17 +32,16 @@ class WordPrefixFeaturizer(wordCounts: Counter[String, Double], prefixOrder: Int def featuresForWord(pos: Int): Array[Feature] = { myFeatures(pos) } - } def featuresFor(w: String): Array[Feature] = { val wc = wordCounts(w) - if(wc > commonWordThreshold) { + if (wc > commonWordThreshold) { Array.empty } else { val features = new ArrayBuffer[Feature] val wlen = w.length - if(wlen >= 4) { + if (wlen >= 4) { for(i <- 1 to ((wlen - 1) min prefixOrder)) { features += PrefixFeature(w.substring(0,i)) } @@ -55,7 +53,6 @@ class WordPrefixFeaturizer(wordCounts: Counter[String, Double], prefixOrder: Int def apply(w: String) = featuresFor(w) - } diff --git a/src/main/scala/epic/features/WordPropertyFeaturizer.scala b/src/main/scala/epic/features/WordPropertyFeaturizer.scala index adecb1a5..bb2c1fbd 100644 --- a/src/main/scala/epic/features/WordPropertyFeaturizer.scala +++ b/src/main/scala/epic/features/WordPropertyFeaturizer.scala @@ -30,8 +30,6 @@ final case class SeenWithTagFeature(str: Any) extends Feature final case class LeftWordFeature(str: Any) extends Feature final case class RightWordFeature(str: Any) extends Feature - - class WordPropertyFeaturizer(wordCounts: Counter[String, Double], commonWordThreshold: Int = 20) extends WordFeaturizer[String] with Serializable { import epic.features.WordPropertyFeaturizer._ @@ -44,14 +42,14 @@ class WordPropertyFeaturizer(wordCounts: Counter[String, Double], val indices = words.map(wordIndex) val myFeatures = words.indices.map(i => if (indices(i) < 0) featuresFor(words(i)).toArray else knownWordFeatures(indices(i))) def featuresForWord(pos: Int): Array[Feature] = { - if(pos < 0) Array(BeginSentFeature) - else if(pos >= words.length) Array(EndSentFeature) + if (pos < 0) Array(BeginSentFeature) + else if (pos >= words.length) Array(EndSentFeature) else { val base = myFeatures(pos) // initial words nee special treatment - if( (words(pos).charAt(0).isUpper || words(pos).charAt(0).isTitleCase) && base.length > 1) { + if ( (words(pos).charAt(0).isUpper || words(pos).charAt(0).isTitleCase) && base.length > 1) { val isInitialWord = pos == 0 || words(pos -1) == "``" - if(isInitialWord) { + if (isInitialWord) { base ++ base.map(FirstWordCapsAnd) } else { base ++ base.map(NthWordCapsAnd) @@ -61,14 +59,13 @@ class WordPropertyFeaturizer(wordCounts: Counter[String, Double], } } } - } // val signatureGenerator = EnglishWordClassGenerator def featuresFor(w: String): IndexedSeq[Feature] = { val wc = wordCounts(w) val features = ArrayBuffer[Feature]() - if(wc <= commonWordThreshold) { + if (wc <= commonWordThreshold) { val wlen = w.length val numCaps = (w:Seq[Char]).count{_.isUpper} val hasLetter = w.exists(_.isLetter) @@ -80,16 +77,16 @@ class WordPropertyFeaturizer(wordCounts: Counter[String, Double], val numPeriods = w.count('.' ==) val hasPeriod = numPeriods > 0 - if(numCaps > 0) features += hasCapFeature - if(numCaps > 1) features += hasManyCapFeature + if (numCaps > 0) features += hasCapFeature + if (numCaps > 1) features += hasManyCapFeature val isAllCaps = numCaps > 1 && !hasLower && !hasNotLetter - if(isAllCaps) features += isAllCapsFeature + if (isAllCaps) features += isAllCapsFeature - if(w.length == 2 && w(0).isLetter && w(0).isUpper && w(1) == '.') { + if (w.length == 2 && w(0).isLetter && w(0).isUpper && w(1) == '.') { features += isAnInitialFeature } - if(w.length > 1 && w.last == '.') { + if (w.length > 1 && w.last == '.') { features += endsWithPeriodFeature } @@ -98,9 +95,9 @@ class WordPropertyFeaturizer(wordCounts: Counter[String, Double], var hasTitleCaseVariant = false val hasInitialUpper: Boolean = w(0).isUpper || w(0).isTitleCase - if(hasInitialUpper) { + if (hasInitialUpper) { features += hasInitCapFeature - if(wordCounts(w.toLowerCase) > 0) { + if (wordCounts(w.toLowerCase) > 0) { features += hasKnownLCFeature knownLowerCase = true } else { @@ -111,16 +108,14 @@ class WordPropertyFeaturizer(wordCounts: Counter[String, Double], } } - - - if(!hasLower && hasLetter) features += hasNoLower - if(hasDash) features += hasDashFeature - if(hasDigit) { + if (!hasLower && hasLetter) features += hasNoLower + if (hasDash) features += hasDashFeature + if (hasDigit) { features += hasDigitFeature features += DigitNormalizedFeature(w.replaceAll("\\d", "0")) } - if(!hasLetter) features += hasNoLetterFeature - if(hasNotLetter) features += hasNotLetterFeature + if (!hasLetter) features += hasNoLetterFeature + if (hasNotLetter) features += hasNotLetterFeature // acronyms are all upper case with maybe some periods interspersed val hasAcronymShape = ( @@ -128,48 +123,45 @@ class WordPropertyFeaturizer(wordCounts: Counter[String, Double], || wlen >= 2 && hasPeriod && !hasLower && numCaps > 0 && !hasDigit && w.forall(c => c.isLetter || c == '.') ) // make sure it doesn't have a lwoer case or title case variant, common for titles and place names... - if(hasAcronymShape && !knownLowerCase && !hasTitleCaseVariant) { + if (hasAcronymShape && !knownLowerCase && !hasTitleCaseVariant) { features += isProbablyAcronymFeature } // year! - if(wlen == 4 && !hasNonDigit) { + if (wlen == 4 && !hasNonDigit) { val year = try{w.toInt} catch {case e: NumberFormatException => 0} - if(year >= 1400 && year < 2300) { + if (year >= 1400 && year < 2300) { features += isProbablyYearFeature } } - if(hasDigit && !hasLetter) { + if (hasDigit && !hasLetter) { try { val n = w.replaceAll(",","").toDouble - if(!hasPeriod) + if (!hasPeriod) features += integerFeature else features += floatFeature } catch {case e: NumberFormatException =>} } - if(wlen > 3 && w.endsWith("s") && !w.endsWith("ss") && !w.endsWith("us") && !w.endsWith("is")) { + if (wlen > 3 && w.endsWith("s") && !w.endsWith("ss") && !w.endsWith("us") && !w.endsWith("is")) { features += endsWithSFeature - if(hasInitialUpper) + if (hasInitialUpper) features += hasInitialCapsAndEndsWithSFeature // we mess up NNP and NNPS } - if(wlen > 10) { + if (wlen > 10) { features += longWordFeature - } else if(wlen < 5) { + } else if (wlen < 5) { features += shortWordFeature } } features } - - def apply(w: String) = featuresFor(w) - } object WordPropertyFeaturizer { diff --git a/src/main/scala/epic/features/WordShapeFeaturizer.scala b/src/main/scala/epic/features/WordShapeFeaturizer.scala index 8b569ebc..a12b5654 100644 --- a/src/main/scala/epic/features/WordShapeFeaturizer.scala +++ b/src/main/scala/epic/features/WordShapeFeaturizer.scala @@ -19,7 +19,7 @@ class WordShapeFeaturizer(wordCounts: Counter[String, Double], def words = w def featuresForWord(pos: Int): Array[Feature] = { - if(pos < 0 || pos >= words.length) { + if (pos < 0 || pos >= words.length) { boundaryFeatures } else { _minimalFeatures(pos) @@ -28,7 +28,7 @@ class WordShapeFeaturizer(wordCounts: Counter[String, Double], private val _minimalFeatures: IndexedSeq[Array[Feature]] = words.indices.map { i => val index = indices(i) - if(index >= 0) { + if (index >= 0) { WordShapeFeaturizer.this.minimalFeatures(index) } else { val ww = words(i) @@ -46,7 +46,7 @@ class WordShapeFeaturizer(wordCounts: Counter[String, Double], private val boundaryFeatures = Array[Feature](BoundaryFeature) - private val shapes = Encoder.fromIndex(wordIndex).tabulateArray(w => if(wordCounts(w) > functionWordThreshold) interner(IndicatorFeature(w)) else interner(WordFeature(WordShapeGenerator(w), 'Shape))) + private val shapes = Encoder.fromIndex(wordIndex).tabulateArray(w => if (wordCounts(w) > functionWordThreshold) interner(IndicatorFeature(w)) else interner(WordFeature(WordShapeGenerator(w), 'Shape))) // caches private val minimalFeatures = Array.tabulate(wordIndex.size){ i => diff --git a/src/main/scala/epic/features/WordSuffixFeaturizer.scala b/src/main/scala/epic/features/WordSuffixFeaturizer.scala index 5fea6011..39523897 100644 --- a/src/main/scala/epic/features/WordSuffixFeaturizer.scala +++ b/src/main/scala/epic/features/WordSuffixFeaturizer.scala @@ -20,7 +20,6 @@ import breeze.linalg._ import collection.mutable.ArrayBuffer import breeze.util.{Encoder, Index} - class WordSuffixFeaturizer(wordCounts: Counter[String, Double], suffixOrder: Int = 5, commonWordThreshold: Int = 100) extends WordFeaturizer[String] with Serializable { import WordPropertyFeaturizer._ @@ -34,33 +33,29 @@ class WordSuffixFeaturizer(wordCounts: Counter[String, Double], suffixOrder: Int def featuresForWord(pos: Int): Array[Feature] = { myFeatures(pos) } - } def featuresFor(w: String): Array[Feature] = { val wc = wordCounts(w) - if(wc > commonWordThreshold) { + if (wc > commonWordThreshold) { Array.empty } else { val features = new ArrayBuffer[Feature] val wlen = w.length - if(wlen >= 5) { + if (wlen >= 5) { for(i <- 1 to ((wlen-1) min suffixOrder)) { features += SuffixFeature(w.substring(wlen - i)) } - - // for(i <- 1 to ((wlen - 1) min prefixOrder)) { - // features += PrefixFeature(w.substring(0,i)) - // } + // for(i <- 1 to ((wlen - 1) min prefixOrder)) { + // features += PrefixFeature(w.substring(0,i)) + // } } - features.toArray } } def apply(w: String) = featuresFor(w) - } diff --git a/src/main/scala/epic/features/package.scala b/src/main/scala/epic/features/package.scala index 3bb12a53..713d49a5 100644 --- a/src/main/scala/epic/features/package.scala +++ b/src/main/scala/epic/features/package.scala @@ -14,7 +14,6 @@ package object features { for(x <- it) { builder.add(gen(x)) } - builder.result() } diff --git a/src/main/scala/epic/framework/EPInference.scala b/src/main/scala/epic/framework/EPInference.scala index 8491bfcf..48804423 100644 --- a/src/main/scala/epic/framework/EPInference.scala +++ b/src/main/scala/epic/framework/EPInference.scala @@ -36,16 +36,14 @@ class EPInference[Datum, Augment <: AnyRef](val inferences: IndexedSeq[Projectab def scorer(v: Datum): Scorer = EPScorer(inferences.map(_.scorer(v))) - override def forTesting = new EPInference(inferences.map(_.forTesting), maxEPIter, epInGold) - // ugh code duplication... def goldMarginal(scorer: Scorer, datum: Datum, augment: Augment): Marginal = { - if(!epInGold) { + if (!epInGold) { val marginals = inferences.indices.map { i => val inf = inferences(i) - if(inf eq null) + if (inf eq null) null.asInstanceOf[ProjectableInference[Datum, Augment]#Marginal] else inf.goldMarginal(scorer.scorers(i).asInstanceOf[inf.Scorer], datum) @@ -61,14 +59,10 @@ class EPInference[Datum, Augment <: AnyRef](val inferences: IndexedSeq[Projectab EPInference.doInference(datum, augment, inferences, scorer, (inf:ProjectableInference[Datum, Augment], scorer: ProjectableInference[Datum, Augment]#Scorer, q: Augment) => inf.marginal(scorer.asInstanceOf[inf.Scorer], datum, q), maxEPIter) } - - } - case class EPMarginal[Augment, Marginal](logPartition: Double, q: Augment, marginals: IndexedSeq[Marginal]) extends epic.framework.Marginal - object EPInference extends SafeLogging { val iters, calls = new AtomicLong(0) @@ -103,7 +97,7 @@ object EPInference extends SafeLogging { } val newAugment = inf.project(datum, iScorer.asInstanceOf[inf.Scorer], marg.asInstanceOf[inf.Marginal], q) marginals(i) = marg -// println("Leaving " + i) + // println("Leaving " + i) newAugment -> contributionToLikelihood } val ep = new ExpectationPropagation(project _, convergenceThreshold) @@ -117,7 +111,7 @@ object EPInference extends SafeLogging { state = s } EPInference.iters.addAndGet(iter) - if(EPInference.calls.incrementAndGet % 1000 == 0) { + if (EPInference.calls.incrementAndGet % 1000 == 0) { val calls = EPInference.calls.get() val iters = EPInference.iters.get() logger.info(s"EP Stats $iters $calls ${iters * 1.0 / calls} $maxEPIter") diff --git a/src/main/scala/epic/framework/EPModel.scala b/src/main/scala/epic/framework/EPModel.scala index bd0c6f2b..0ee62a60 100644 --- a/src/main/scala/epic/framework/EPModel.scala +++ b/src/main/scala/epic/framework/EPModel.scala @@ -37,22 +37,21 @@ class EPModel[Datum, Augment <: AnyRef](maxEPIter: Int, initFeatureValue: Featur private val offsets = models.map(_.numFeatures).unfold(0)(_ + _) for(i <- 0 until models.length) { println(models(i) + " " + models(i).featureIndex.size)} - def emptyCounts = { val counts = for (m <- models) yield m.emptyCounts EPExpectedCounts(0.0, counts.toIndexedSeq) } - def accumulateCounts(inf: Inference, s: Scorer, datum: Datum, marg: Marginal, accum: ExpectedCounts, scale: Double):Unit = { import marg._ for ( (model, i) <- models.zipWithIndex) { val marg = marginals(i) - if(marg != null) + if (marg != null) model.accumulateCounts(inf.inferences(i).asInstanceOf[model.Inference], s.scorers(i).asInstanceOf[model.Scorer], datum, marg.asInstanceOf[model.Marginal], accum.counts(i).asInstanceOf[model.ExpectedCounts], scale) } accum.loss += scale * marg.logPartition } + def numModels = models.length val featureIndex: Index[Feature] = { @@ -68,7 +67,6 @@ class EPModel[Datum, Augment <: AnyRef](maxEPIter: Int, initFeatureValue: Featur } } - /** * just saves feature weights to disk as a serialized counter. The file is prefix.ser.gz */ @@ -81,12 +79,12 @@ class EPModel[Datum, Augment <: AnyRef](maxEPIter: Int, initFeatureValue: Featur } for(i <- 0 until numModels) { val mySlice = initWeights.slice(offsets(i), offsets(i+1)) - if(mySlice.valuesIterator.exists(_ == 0)) { + if (mySlice.valuesIterator.exists(_ == 0)) { for(cw <- models(i).readCachedFeatureWeights(suffix+"-"+i)) { any = true var j = 0 - while(j < cw.length) { - if(mySlice(j) == 0.0) { + while (j < cw.length) { + if (mySlice(j) == 0.0) { mySlice(j) = cw(j) } j += 1 @@ -94,14 +92,13 @@ class EPModel[Datum, Augment <: AnyRef](maxEPIter: Int, initFeatureValue: Featur } } } - if(any) + if (any) Some(initWeights) else None } - /** * Caches the weights using the cache broker. */ @@ -122,7 +119,7 @@ class EPModel[Datum, Augment <: AnyRef](maxEPIter: Int, initFeatureValue: Featur val toUse = new ArrayBuffer[Int]() var inferences = ArrayBuffer.tabulate(models.length) { i => // hack, for now. - if(dropOutFraction > 0 && Rand.uniform.get < dropOutFraction) + if (dropOutFraction > 0 && Rand.uniform.get < dropOutFraction) null:ProjectableInference[Datum, Augment] else { toUse += i @@ -130,7 +127,7 @@ class EPModel[Datum, Augment <: AnyRef](maxEPIter: Int, initFeatureValue: Featur } } - if(!inferences.exists(_ ne null)) { + if (!inferences.exists(_ ne null)) { toUse.clear() inferences = ArrayBuffer.tabulate(models.length) { i => toUse += i @@ -140,8 +137,6 @@ class EPModel[Datum, Augment <: AnyRef](maxEPIter: Int, initFeatureValue: Featur if (dropOutFraction != 0.0) logger.info("Using inferences for models " + toUse.mkString(", ")) - - new EPInference(inferences, maxEPIter, epInGold = epInGold) } diff --git a/src/main/scala/epic/framework/EvaluableModel.scala b/src/main/scala/epic/framework/EvaluableModel.scala index 7fd2d8c6..dc83f93d 100644 --- a/src/main/scala/epic/framework/EvaluableModel.scala +++ b/src/main/scala/epic/framework/EvaluableModel.scala @@ -18,6 +18,6 @@ trait EvaluableModel[Datum] extends Model[Datum] { self => data.par.aggregate(None:Option[EvaluationResult])({(res, datum) => val result = evaluate(inf.annotate(datum, inf.marginal(datum)), datum, logResults) Some(res.foldLeft(result)(_ + _)) - }, {(a,b) => if(a.isEmpty) b else if(b.isEmpty) a else Some(a.get + b.get)}).get + }, {(a,b) => if (a.isEmpty) b else if (b.isEmpty) a else Some(a.get + b.get)}).get } } diff --git a/src/main/scala/epic/framework/EvaluationResult.scala b/src/main/scala/epic/framework/EvaluationResult.scala index fc8f1909..0dc02d22 100644 --- a/src/main/scala/epic/framework/EvaluationResult.scala +++ b/src/main/scala/epic/framework/EvaluationResult.scala @@ -1,6 +1,5 @@ package epic.framework - /** * Marker for the output of an evaluation routine. * @tparam R self type diff --git a/src/main/scala/epic/framework/Example.scala b/src/main/scala/epic/framework/Example.scala index bc16139e..bd3367a0 100644 --- a/src/main/scala/epic/framework/Example.scala +++ b/src/main/scala/epic/framework/Example.scala @@ -24,7 +24,6 @@ trait Example[+L,+T] extends Observation[T] with Labeled[L] with Serializable {o val features = outer.features } - override def toString = { "Example { id =" + id + ", label = " + label + ", features = " + features + "}" } diff --git a/src/main/scala/epic/framework/Inference.scala b/src/main/scala/epic/framework/Inference.scala index fe323670..51de925d 100644 --- a/src/main/scala/epic/framework/Inference.scala +++ b/src/main/scala/epic/framework/Inference.scala @@ -61,8 +61,6 @@ trait Inference[Datum] extends Serializable { def forTesting: Inference[Datum] = this } - - /** * AugmentableInference is an [[epic.framework.Inference]] that can support injecting * additional information into the structure computation. This can include diff --git a/src/main/scala/epic/framework/LossAugmentation.scala b/src/main/scala/epic/framework/LossAugmentation.scala index f6b8269d..fcfabfda 100644 --- a/src/main/scala/epic/framework/LossAugmentation.scala +++ b/src/main/scala/epic/framework/LossAugmentation.scala @@ -8,6 +8,5 @@ package epic.framework **/ trait LossAugmentation[Datum, Augment] extends (Datum=>Augment) { def lossAugmentation(datum: Datum):Augment - def apply(datum: Datum): Augment = lossAugmentation(datum) } diff --git a/src/main/scala/epic/framework/Model.scala b/src/main/scala/epic/framework/Model.scala index c304f883..59c83dfe 100644 --- a/src/main/scala/epic/framework/Model.scala +++ b/src/main/scala/epic/framework/Model.scala @@ -21,7 +21,6 @@ import breeze.linalg._ import breeze.util.Index import epic.util.{SafeLogging, WeightsCache} - /** * A Model represents a class for turning weight vectors into [[epic.framework.Inference]]s. * It's main job is to hook up with a [[epic.framework.ModelObjective]] and mediate @@ -43,7 +42,6 @@ trait Model[Datum] extends SafeLogging { self => def emptyCounts: ExpectedCounts def accumulateCounts(inf: Inference, s: Scorer, d: Datum, m: Marginal, accum: ExpectedCounts, scale: Double):Unit - final def expectedCounts(inf: Inference, d: Datum, scale: Double = 1.0):ExpectedCounts = { val ec = emptyCounts accumulateCounts(inf, d, ec, scale) @@ -82,7 +80,7 @@ trait Model[Datum] extends SafeLogging { self => def readCachedFeatureWeights(suffix:String=""):Option[DenseVector[Double]] = { val file = new File(weightsCacheName+suffix+".txt.gz") logger.info(s"Reading old weights from $file") - if(file.exists) { + if (file.exists) { Some(WeightsCache.read(file, featureIndex)) } else { None diff --git a/src/main/scala/epic/framework/ModelFactory.scala b/src/main/scala/epic/framework/ModelFactory.scala index 79b9818c..ca179c65 100644 --- a/src/main/scala/epic/framework/ModelFactory.scala +++ b/src/main/scala/epic/framework/ModelFactory.scala @@ -23,8 +23,4 @@ import breeze.util._ * Interface for producing Models from training data. * @author dlwh */ -trait ModelFactory[Datum] { - - - -} +trait ModelFactory[Datum] \ No newline at end of file diff --git a/src/main/scala/epic/framework/ModelObjective.scala b/src/main/scala/epic/framework/ModelObjective.scala index c789d51c..b6e14f35 100644 --- a/src/main/scala/epic/framework/ModelObjective.scala +++ b/src/main/scala/epic/framework/ModelObjective.scala @@ -40,7 +40,7 @@ class ModelObjective[Datum](val model: Model[Datum], case Some(vector) => vector case None => Encoder.fromIndex(featureIndex).tabulateDenseVector(f => model.initialValueForFeature(f)) } - if(randomize) { + if (randomize) { // Control the seed of the RNG for the weights val rng = new scala.util.Random(0) v += DenseVector(Array.tabulate(numFeatures)(i => rng.nextDouble * 2.0 * scale - scale)) @@ -51,7 +51,7 @@ class ModelObjective[Datum](val model: Model[Datum], var timeSinceLastWrite = 0L var nextSave = 5L * 20 * 1000 def calculate(x: DenseVector[Double], batch: IndexedSeq[Int]) = { - if(timeSinceLastWrite > nextSave) { + if (timeSinceLastWrite > nextSave) { logger.info("Saving feature weights...") val timeIn = System.currentTimeMillis() model.cacheFeatureWeights(x) @@ -73,10 +73,10 @@ class ModelObjective[Datum](val model: Model[Datum], } catch { case e: Exception => e.printStackTrace() -// new Exception("While processing " + datum, e).printStackTrace() + // new Exception("While processing " + datum, e).printStackTrace() _countsSoFar } - },{ (a,b) => if(a eq null) b else if (b eq null) a else b += a}) + },{ (a,b) => if (a eq null) b else if (b eq null) a else b += a}) val timeOut = System.currentTimeMillis() timeSinceLastWrite += timeOut - timeIn logger.info(f"Inference took: ${(timeOut - timeIn) * 1.0/1000}%.3fs" ) diff --git a/src/main/scala/epic/framework/OneBestInferenceAdaptor.scala b/src/main/scala/epic/framework/OneBestInferenceAdaptor.scala index b059fd1c..a46dbfd6 100644 --- a/src/main/scala/epic/framework/OneBestInferenceAdaptor.scala +++ b/src/main/scala/epic/framework/OneBestInferenceAdaptor.scala @@ -12,13 +12,10 @@ class OneBestInferenceAdaptor[Datum](val inference: AnnotatingInference[Datum]) type Marginal = inference.Marginal type Scorer = inference.Scorer - - def scorer(v: Datum): Scorer = inference.scorer(v) def goldMarginal(scorer: Scorer, v: Datum): Marginal = inference.goldMarginal(scorer, v) - /** * Produces the "guess marginal" which is the marginal conditioned on only the input data * @param v the example @@ -29,10 +26,8 @@ class OneBestInferenceAdaptor[Datum](val inference: AnnotatingInference[Datum]) goldMarginal(scorer, inference.annotate(v, m)) } - } - class OneBestModelAdaptor[Datum](val model: Model[Datum] { type Inference <: AnnotatingInference[Datum]}) extends Model[Datum] { type ExpectedCounts = model.ExpectedCounts type Marginal = model.Marginal @@ -40,7 +35,6 @@ class OneBestModelAdaptor[Datum](val model: Model[Datum] { type Inference <: Ann type Inference = OneBestInferenceAdaptor[Datum] { type Marginal = model.Marginal; type Scorer = model.Scorer} def emptyCounts: ExpectedCounts = model.emptyCounts - def accumulateCounts(inf: Inference, s: Scorer, d: Datum, m: Marginal, accum: ExpectedCounts, scale: Double) { model.accumulateCounts(inf.inference.asInstanceOf[model.Inference], s, d, m, accum, scale) } diff --git a/src/main/scala/epic/framework/StructSVM.scala b/src/main/scala/epic/framework/StructSVM.scala index 6b7b2896..d8c802f0 100644 --- a/src/main/scala/epic/framework/StructSVM.scala +++ b/src/main/scala/epic/framework/StructSVM.scala @@ -25,7 +25,7 @@ class StructSVM[Datum](val model: Model[Datum], for(i <- 0 until maxIter if !converged) { val newWeights = weights.copy for(i <- 0 until numBatches) { - val smoTol = if(i < 5) math.pow(10, -(i + 1)) else 1E-6 + val smoTol = if (i < 5) math.pow(10, -(i + 1)) else 1E-6 val inf = model.inferenceFromWeights(newWeights) val batch = Rand.subsetsOfSize(data, batchSize).draw() constraints ++= findNewConstraints(inf, batch) @@ -88,10 +88,10 @@ class StructSVM[Datum](val model: Model[Datum], val newAlphas = Array.newBuilder[Double] val newConstraints = new ArrayBuffer[Constraint]() for( i <- 0 until alphas.length) { - if(alphas(i).abs < 1E-5) constraints(i).age += 1 + if (alphas(i).abs < 1E-5) constraints(i).age += 1 else constraints(i).age = 0 - if(constraints(i).age < MAX_CONSTRAINT_AGE) { + if (constraints(i).age < MAX_CONSTRAINT_AGE) { newConstraints += constraints(i) newAlphas += alphas(i) } @@ -107,11 +107,11 @@ class StructSVM[Datum](val model: Model[Datum], alphas: DenseVector[Double], constraints: IndexedSeq[Constraint], smoTol: Double): Unit = { - if(alphas.sum < C) { + if (alphas.sum < C) { alphas += (C-alphas.sum)/alphas.length } for(i <- 0 until alphas.length) { - if(alphas(i) != 0.0) { + if (alphas(i) != 0.0) { constraints(i).axpy(alphas(i), weights) } } @@ -124,11 +124,11 @@ class StructSVM[Datum](val model: Model[Datum], val oldA1 = alphas(i) val j = perm(i) val oldA2 = alphas(j) - if( (oldA1 != 0 && oldA2 != 0)) { + if ( (oldA1 != 0 && oldA2 != 0)) { val con2 = constraints(j) var t = ((con1.loss - con2.loss) - ( (con2.dot(weights)) - (con1.dot(weights))))/(con1.ftf + con2.ftf) val tt = t - if(!t.isNaN && t != 0.0) { + if (!t.isNaN && t != 0.0) { t = t max (-oldA1) val newA1 = (oldA1 + t) min (oldA1 + oldA2) val newA2 = (oldA2 - t) max 0 diff --git a/src/main/scala/epic/framework/StructuredPerceptron.scala b/src/main/scala/epic/framework/StructuredPerceptron.scala index 5166276e..30708e5a 100644 --- a/src/main/scala/epic/framework/StructuredPerceptron.scala +++ b/src/main/scala/epic/framework/StructuredPerceptron.scala @@ -45,7 +45,7 @@ class StructuredPerceptron[Datum](model: Model[Datum], maxPasses: Int = 100, bat logger.info(f"this instance ${ec.loss}%.2f loss, ${numBad.get}/${batch.size} instances were not right!") } - if(totalCounts.isEmpty) + if (totalCounts.isEmpty) logger.info(f"this instance everything was fine!") } diff --git a/src/main/scala/epic/inference/ExpectationPropagation.scala b/src/main/scala/epic/inference/ExpectationPropagation.scala index 564dd3d8..457c8b13 100644 --- a/src/main/scala/epic/inference/ExpectationPropagation.scala +++ b/src/main/scala/epic/inference/ExpectationPropagation.scala @@ -40,7 +40,7 @@ class ExpectationPropagation[F,Q <: AnyRef](project: (Q,F)=>(Q,Double), criterio } def next() = { - if(consumed) hasNext + if (consumed) hasNext consumed = true cur } @@ -49,8 +49,6 @@ class ExpectationPropagation[F,Q <: AnyRef](project: (Q,F)=>(Q,Double), criterio it } - - } object ExpectationPropagation extends App { @@ -115,7 +113,5 @@ object ExpectationPropagation extends App { assert(!state.logPartition.isNaN, state.q.s + " " + state.q.b) } - - } diff --git a/src/main/scala/epic/inference/Factor.scala b/src/main/scala/epic/inference/Factor.scala index e2d7635d..15172b91 100644 --- a/src/main/scala/epic/inference/Factor.scala +++ b/src/main/scala/epic/inference/Factor.scala @@ -1,19 +1,15 @@ package epic.inference trait Factor[F] { this: F => - /** Pointwise multiplication */ def *(f: F):F /** Pointwise division */ def /(f: F):F - /** May be infinite */ def logPartition: Double - - def isConvergedTo(f: F, diff: Double=1E-4):Boolean + def isConvergedTo(f: F, diff: Double=1E-4): Boolean } - trait ExpFactor[F] extends Factor[F] { this: F => /** Exponentiation */ def **(f: Double):F diff --git a/src/main/scala/epic/lexicon/SignatureLexicon.scala b/src/main/scala/epic/lexicon/SignatureLexicon.scala index b8ff8cb0..17e8abae 100644 --- a/src/main/scala/epic/lexicon/SignatureLexicon.scala +++ b/src/main/scala/epic/lexicon/SignatureLexicon.scala @@ -7,8 +7,7 @@ import epic.util.SafeLogging * A simple lexicon that thresholds to decide when to open up the rare word to all (open) tags */ @SerialVersionUID(1L) -class SignatureLexicon[L, W](val labelIndex: Index[L], allowed: Map[W, Set[Int]], signature: W=>W) extends Lexicon[L, W] with Serializable with SafeLogging { - +class SignatureLexicon[L, W](val labelIndex: Index[L], allowed: Map[W, Set[Int]], signature: W => W) extends Lexicon[L, W] with Serializable with SafeLogging { override def morePermissive: Lexicon[L, W] = { new SignatureLexicon(labelIndex, Map.empty[W, Set[Int]].withDefaultValue(allTags), signature) @@ -28,7 +27,6 @@ class SignatureLexicon[L, W](val labelIndex: Index[L], allowed: Map[W, Set[Int]] def length: Int = words.length } - } diff --git a/src/main/scala/epic/lexicon/SignatureTagScorer.scala b/src/main/scala/epic/lexicon/SignatureTagScorer.scala index 650526ef..4345a70a 100644 --- a/src/main/scala/epic/lexicon/SignatureTagScorer.scala +++ b/src/main/scala/epic/lexicon/SignatureTagScorer.scala @@ -18,7 +18,6 @@ package epic.lexicon import math.log import breeze.linalg._ - /** * @param counts * @tparam L @@ -26,9 +25,7 @@ import breeze.linalg._ class SignatureTagScorer[L, String](counts: Counter2[L, String, Double], signature: String=>String) extends TagScorer[L, String] { def anchor(w: IndexedSeq[String]):Anchoring = new Anchoring { def words: IndexedSeq[String] = w - - val sigs = w.map(x => if(counts(::, x).valuesIterator.nonEmpty) x else signature(x)) - + val sigs = w.map(x => if (counts(::, x).valuesIterator.nonEmpty) x else signature(x)) def scoreTag(pos: Int, l: L) = { counts(l, sigs(pos)) } diff --git a/src/main/scala/epic/lexicon/TagScorer.scala b/src/main/scala/epic/lexicon/TagScorer.scala index 0e186711..89b62d8f 100644 --- a/src/main/scala/epic/lexicon/TagScorer.scala +++ b/src/main/scala/epic/lexicon/TagScorer.scala @@ -82,14 +82,14 @@ class SimpleTagScorer[L, W](counts: Counter2[L, W, Double]) extends TagScorer[L, var cWord = wordCounts(w) var cTagWord = counts(l, w) var pTag = labelCounts(l) / totalCount - if(pTag == 0.0) { + if (pTag == 0.0) { pTag = 1.0 } assert(cWord >= cTagWord) - if(cWord < 10 || cTagWord == 0.0) { + if (cWord < 10 || cTagWord == 0.0) { cWord += 1.0 cTagWord += counts(l, ::).size.toDouble / wordCounts.size - if(cTagWord == 0.0) { + if (cTagWord == 0.0) { cTagWord = 1.0 } } diff --git a/src/main/scala/epic/models/LanguageSpecific.scala b/src/main/scala/epic/models/LanguageSpecific.scala index e82b54e8..15d66cc6 100644 --- a/src/main/scala/epic/models/LanguageSpecific.scala +++ b/src/main/scala/epic/models/LanguageSpecific.scala @@ -1,13 +1,10 @@ package epic.models -trait LanguageSpecific { this:ModelLoader[_] => - +trait LanguageSpecific { this: ModelLoader[_] => def language: String - - def capabilities():Array[String] = Array(s"language:$language") + def capabilities(): Array[String] = Array(s"language:$language") } - -trait EnglishModel extends LanguageSpecific { this:ModelLoader[_] => +trait EnglishModel extends LanguageSpecific { this: ModelLoader[_] => def language = "en" } \ No newline at end of file diff --git a/src/main/scala/epic/models/ModelLoader.scala b/src/main/scala/epic/models/ModelLoader.scala index 5f3754d3..fc9926c7 100644 --- a/src/main/scala/epic/models/ModelLoader.scala +++ b/src/main/scala/epic/models/ModelLoader.scala @@ -9,10 +9,8 @@ import java.util.zip.GZIPInputStream * @author dlwh **/ trait ModelLoader[+T] { outer => - def load():T - - def capabilities:Array[String] - + def load(): T + def capabilities: Array[String] } abstract class ClassPathModelLoader[+T](modelPath: String = "model.ser.gz") extends ModelLoader[T] { @@ -27,10 +25,8 @@ abstract class ClassPathModelLoader[+T](modelPath: String = "model.ser.gz") exte } } - /* this class exists as a hack to get around limitations in service loader*/ class DelegatingLoader[+T](outer: ModelLoader[T]) extends ModelLoader[T] { def load() = outer.load() def capabilities = outer.capabilities - } diff --git a/src/main/scala/epic/models/ModelSelector.scala b/src/main/scala/epic/models/ModelSelector.scala index f1e78cd2..ac1b4bfe 100644 --- a/src/main/scala/epic/models/ModelSelector.scala +++ b/src/main/scala/epic/models/ModelSelector.scala @@ -16,15 +16,14 @@ trait ModelSelector[+T, Loader <: ModelLoader[T]] { private lazy val serviceLoader = ServiceLoader.load(manifest.runtimeClass.asInstanceOf[Class[Loader]], classLoader) - def findModel(features: String*):Option[Loader] = { - findModel{x => lazy val a = x.capabilities.toSet; features.forall(a)} + def findModel(features: String*): Option[Loader] = { + findModel{ x => lazy val a = x.capabilities.toSet; features.forall(a) } } - def findModel(filter: Loader=>Boolean) = serviceLoader.synchronized { + def findModel(filter: Loader => Boolean) = serviceLoader.synchronized { serviceLoader.asScala.find(filter) } - } diff --git a/src/main/scala/epic/models/NerModelLoader.scala b/src/main/scala/epic/models/NerModelLoader.scala index 8fd052f3..a7e03f5c 100644 --- a/src/main/scala/epic/models/NerModelLoader.scala +++ b/src/main/scala/epic/models/NerModelLoader.scala @@ -3,12 +3,10 @@ package epic.models import scala.reflect.ClassTag import epic.sequences.SemiCRF - trait NerModelLoader extends ModelLoader[SemiCRF[Any, String]] object NerSelector extends ModelSelector[SemiCRF[Any, String], NerModelLoader] { override protected def manifest: ClassTag[NerModelLoader] = scala.reflect.classTag[NerModelLoader] - def loadNer(language: String = "en"): Option[SemiCRF[Any, String]] = this.findModel(s"language:$language").map(_.load()) } diff --git a/src/main/scala/epic/models/ParserSelector.scala b/src/main/scala/epic/models/ParserSelector.scala index a4711c16..9ce8d5cb 100644 --- a/src/main/scala/epic/models/ParserSelector.scala +++ b/src/main/scala/epic/models/ParserSelector.scala @@ -11,7 +11,6 @@ import scala.reflect.ClassTag **/ object ParserSelector extends ModelSelector[Parser[AnnotatedLabel, String], ParserModelLoader] { override protected def manifest: ClassTag[ParserModelLoader] = scala.reflect.classTag[ParserModelLoader] - def loadParser(language: String = "en"): Option[Parser[AnnotatedLabel, String]] = this.findModel(s"language:$language").map(_.load()) } diff --git a/src/main/scala/epic/models/PosTagModelLoader.scala b/src/main/scala/epic/models/PosTagModelLoader.scala index 838a93de..8d8edd07 100644 --- a/src/main/scala/epic/models/PosTagModelLoader.scala +++ b/src/main/scala/epic/models/PosTagModelLoader.scala @@ -9,7 +9,6 @@ trait PosTagModelLoader extends ModelLoader[CRF[AnnotatedLabel, String]] object PosTagSelector extends ModelSelector[CRF[AnnotatedLabel, String], PosTagModelLoader] { override protected def manifest: ClassTag[PosTagModelLoader] = scala.reflect.classTag[PosTagModelLoader] - def loadTagger(language: String = "en"): Option[CRF[AnnotatedLabel, String]] = this.findModel(s"language:$language").map(_.load()) } diff --git a/src/main/scala/epic/models/package.scala b/src/main/scala/epic/models/package.scala index ab8716a6..6ba8b7af 100644 --- a/src/main/scala/epic/models/package.scala +++ b/src/main/scala/epic/models/package.scala @@ -12,7 +12,7 @@ import scala.util.{Success, Try} **/ package object models { - def deserialize[T](model: String):T = deserialize[T](model, new File(System.getProperty("user.dir"))) + def deserialize[T](model: String): T = deserialize[T](model, new File(System.getProperty("user.dir"))) def readFromJar[T](model: String, file: File): T = { val zip = new ZipFile(file) @@ -20,9 +20,7 @@ package object models { case e if e.getName == model || e.getName.endsWith("model.ser.gz") => breeze.util.nonstupidObjectInputStream(new GZIPInputStream(zip.getInputStream(e))).readObject().asInstanceOf[T] } - obj.getOrElse(throw new RuntimeException(s"Could not find model $model in jar $file")) - } /** @@ -33,10 +31,10 @@ package object models { * @tparam T * @return */ - def deserialize[T](model: String, path: File):T = { - if(!path.exists()) { + def deserialize[T](model: String, path: File): T = { + if (!path.exists()) { throw new FileNotFoundException(path.toString) - } else if(!path.isDirectory) { + } else if (!path.isDirectory) { try { readFromJar(model, path) } catch { @@ -60,7 +58,6 @@ package object models { case ex: Exception => throw new RuntimeException(s"Could not find model $model in path $path", ex) } - } case None => // look for jar files, try to read from there @@ -71,12 +68,8 @@ package object models { }.collectFirst { case Success(r) => r }.getOrElse { throw new RuntimeException(s"Could not find model $model in path $path") } - } - } - } - } diff --git a/src/main/scala/epic/ontonotes/ConllOntoReader.scala b/src/main/scala/epic/ontonotes/ConllOntoReader.scala index ae67980b..392005d3 100644 --- a/src/main/scala/epic/ontonotes/ConllOntoReader.scala +++ b/src/main/scala/epic/ontonotes/ConllOntoReader.scala @@ -31,14 +31,14 @@ import scala.io.Source object ConllOntoReader { - def readDocuments(file: File):IndexedSeq[Document] = try { + def readDocuments(file: File): IndexedSeq[Document] = try { val docIterator = new RawDocumentIterator(Source.fromFile(file).getLines()) - for ( (rawSentences_ :IndexedSeq[IndexedSeq[String]], docIndex: Int) <- docIterator.zipWithIndex.toIndexedSeq) yield { + for ((rawSentences_ :IndexedSeq[IndexedSeq[String]], docIndex: Int) <- docIterator.zipWithIndex.toIndexedSeq) yield { val rawSentences = rawSentences_.collect { case seq if seq.nonEmpty => seq.map(_.split("\\s+").toIndexedSeq) } - val sentences = for( (s,sentenceIndex) <- rawSentences.zipWithIndex) yield { + val sentences = for( (s,sentenceIndex) <- rawSentences.zipWithIndex) yield { val words = s.map(_(3)) val tags = s.map(_(4)) @@ -56,13 +56,13 @@ object ConllOntoReader { var currentChunkType = NerType.OutsideSentence s.indices.foreach { i => val chunk = s(i)(10) - if(chunk.startsWith("(")) { + if (chunk.startsWith("(")) { assert(currentChunkStart < 0) currentChunkStart = i currentChunkType = NerType.fromString(chunk.replaceAll("[()*]","")) } - if(chunk.endsWith(")")) { + if (chunk.endsWith(")")) { assert(currentChunkStart >= 0) entities += ((currentChunkStart -> (i+1)) -> currentChunkType) currentChunkStart = -1 @@ -82,12 +82,11 @@ object ConllOntoReader { for(name <- trimmed.split("[(]")) lastValue.push(name.trim -> i) } - if (s(i)(column).endsWith(")")) { for(close <- 0 until s(i)(column).count(_ == ')')) { assert(lastValue.nonEmpty, s.map(_(column)).mkString(",") + " " + i) val (name, start) = lastValue.pop() - if(name == "V") { + if (name == "V") { assert(start == i) verb = i } else { @@ -95,7 +94,6 @@ object ConllOntoReader { } } } - } assert(verb != -1, s.map(_(column)).mkString(",") ) @@ -111,13 +109,13 @@ object ConllOntoReader { } s.indices.foreach { i => val chunk = s(i).last - if(chunk != "-") + if (chunk != "-") for( id <- chunk.split("\\|")) { val tid = id.replaceAll("[()*]","").toInt - if(id.startsWith("(")) { + if (id.startsWith("(")) { stack(tid).push(i) } - if(id.endsWith(")")) { + if (id.endsWith(")")) { val start = stack(tid).pop() mentions(start -> (i+1)) = mention(tid) } @@ -131,16 +129,12 @@ object ConllOntoReader { val speaker = s.map(_(9)).find(_ != "-") val annotations = OntoAnnotations(tree, ner, coref, srl, speaker) - - - Sentence(docId, sentenceIndex,words, annotations) } Document(s"${file.toString}-$docIndex",sentences.toIndexedSeq) } - } catch { case ex: MalformedInputException => throw new RuntimeException("Error while processing " + file, ex) @@ -148,28 +142,28 @@ object ConllOntoReader { private val mentionCache = Array.tabulate(100)(i => Mention(i)) - private def mention(id: Int) = if(id < mentionCache.length) mentionCache(id) else Mention(id) + private def mention(id: Int) = if (id < mentionCache.length) mentionCache(id) else Mention(id) private class RawDocumentIterator(it: Iterator[String]) extends Iterator[IndexedSeq[IndexedSeq[String]]] { def hasNext = it.hasNext - def next():IndexedSeq[IndexedSeq[String]] = { + def next(): IndexedSeq[IndexedSeq[String]] = { var doneOuter = false val outBuf = new ArrayBuffer[IndexedSeq[String]] - while(it.hasNext && !doneOuter) { + while (it.hasNext && !doneOuter) { val buf = new ArrayBuffer[String] var done = false var seenSomethingNotBlank = false - while(it.hasNext && !done) { + while (it.hasNext && !done) { val next = it.next() - if(next.startsWith("#begin")) { + if (next.startsWith("#begin")) { // pass - } else if(next.startsWith("#end")) { + } else if (next.startsWith("#end")) { doneOuter = true - } else if(next.trim != "") { + } else if (next.trim != "") { seenSomethingNotBlank = true buf += next.trim - } else if(seenSomethingNotBlank) { + } else if (seenSomethingNotBlank) { done = true } } diff --git a/src/main/scala/epic/ontonotes/DSpan.scala b/src/main/scala/epic/ontonotes/DSpan.scala index c50ff74c..f5c6a08e 100644 --- a/src/main/scala/epic/ontonotes/DSpan.scala +++ b/src/main/scala/epic/ontonotes/DSpan.scala @@ -15,8 +15,8 @@ case class DSpan(doc: String, sentence: Int, begin: Int, end: Int) { * @param doc * @return */ - def render(doc: Document):String = render(doc.sentences.map(_.words)) - def render(doc: IndexedSeq[IndexedSeq[String]]):String = getYield(doc).mkString("[",", ", "]") + def render(doc: Document): String = render(doc.sentences.map(_.words)) + def render(doc: IndexedSeq[IndexedSeq[String]]): String = getYield(doc).mkString("[",", ", "]") /** * Gets the words associated with this document. @@ -34,9 +34,9 @@ object DSpan { def compare(x: DSpan, y: DSpan): Int = { x.doc.compare(y.doc) match { case 0 => - if(x.sentence < y.sentence) -1 - else if(x.sentence > y.sentence) 1 - else if(x.begin < y.begin) -1 + if (x.sentence < y.sentence) -1 + else if (x.sentence > y.sentence) 1 + else if (x.begin < y.begin) -1 else if (x.begin > y.begin) 1 else x.end - y.end case z => z @@ -53,14 +53,13 @@ case class DPos(doc: String, sentence: Int, pos: Int) { def asDSpan = DSpan(doc, sentence, pos, pos + 1) } - object DPos { implicit val ordering: Ordering[DPos] = new Ordering[DPos] { def compare(x: DPos, y: DPos): Int = { x.doc.compare(y.doc) match { case 0 => - if(x.sentence < y.sentence) -1 - else if(x.sentence > y.sentence) 1 + if (x.sentence < y.sentence) -1 + else if (x.sentence > y.sentence) 1 else x.pos - y.pos case z => z } diff --git a/src/main/scala/epic/ontonotes/Document.scala b/src/main/scala/epic/ontonotes/Document.scala index e9b57408..520a2a66 100644 --- a/src/main/scala/epic/ontonotes/Document.scala +++ b/src/main/scala/epic/ontonotes/Document.scala @@ -25,17 +25,11 @@ import epic.trees.{AnnotatedLabel, Tree} */ case class Document(id: String, sentences: IndexedSeq[Sentence]) extends Example[IndexedSeq[OntoAnnotations], IndexedSeq[IndexedSeq[String]]] { def dspans = sentences.flatMap(_.dspans) - def words: IndexedSeq[IndexedSeq[String]] = sentences.map(_.words) - def features = words - lazy val label: IndexedSeq[OntoAnnotations] = sentences.map(_.label) - lazy val trees: IndexedSeq[Tree[AnnotatedLabel]] = sentences.map(_.tree) - lazy val ner: Map[DSpan, NerType.Value] = sentences.map(_.ner).reduceLeft(_ ++ _) - lazy val coref: Map[DSpan, Mention] = sentences.map(_.coref).reduceLeft(_ ++ _) } diff --git a/src/main/scala/epic/ontonotes/Sentence.scala b/src/main/scala/epic/ontonotes/Sentence.scala index c5426127..022a685b 100644 --- a/src/main/scala/epic/ontonotes/Sentence.scala +++ b/src/main/scala/epic/ontonotes/Sentence.scala @@ -83,8 +83,8 @@ case class Frame(lemma: String, pos: Int, sense: Int, args: IndexedSeq[Argument] val newArgs = mutable.Stack[Argument]() val sorted = args.sortBy(a => (a.span.begin, -a.span.length))(Ordering.Tuple2) for(arg <- sorted) { - if(newArgs.isEmpty || !newArgs.top.span.contains(arg.span)) { // don't overlap at all - while(newArgs.nonEmpty && arg.span.contains(newArgs.top.span)) { + if (newArgs.isEmpty || !newArgs.top.span.contains(arg.span)) { // don't overlap at all + while (newArgs.nonEmpty && arg.span.contains(newArgs.top.span)) { newArgs.pop() } assert(newArgs.isEmpty || !arg.span.crosses(newArgs.top.span)) @@ -102,14 +102,14 @@ case class Frame(lemma: String, pos: Int, sense: Int, args: IndexedSeq[Argument] var last = 0 for( arg <- sorted ) { assert(last <= arg.span.begin) - while(arg.span.begin != last) { + while (arg.span.begin != last) { out += (Some(outside) -> Span(last,last+1)) last += 1 } out += (Some(arg.arg) -> Span(arg.span.begin, arg.span.end)) last = arg.span.end } - while(words.length != last) { + while (words.length != last) { out += (Some(outside) -> Span(last,last+1)) last += 1 } diff --git a/src/main/scala/epic/package.scala b/src/main/scala/epic/package.scala index 9b836f78..0799b3af 100644 --- a/src/main/scala/epic/package.scala +++ b/src/main/scala/epic/package.scala @@ -9,7 +9,6 @@ import scala.collection.mutable */ package object epic { - implicit class AwesomeBitSet(val bs: java.util.BitSet) extends AnyVal { def apply(r: Int) = bs.get(r) @@ -27,7 +26,7 @@ package object epic { def foreach[U](f: Int=>U) { var i = bs.nextSetBit(0) - while(i != -1) { + while (i != -1) { f(i) i = bs.nextSetBit(i+1) } diff --git a/src/main/scala/epic/parser/ChartDecoder.scala b/src/main/scala/epic/parser/ChartDecoder.scala index 7be32eb7..d6ca6f92 100644 --- a/src/main/scala/epic/parser/ChartDecoder.scala +++ b/src/main/scala/epic/parser/ChartDecoder.scala @@ -76,7 +76,7 @@ case class ViterbiDecoder[L, W]() extends ChartDecoder[L, W] with Serializable w val b = topology.child(r) val refB = refined.childRefinement(r, refR) val score = ruleScore + insideBotScore(begin, end, b, refB) - if(score > maxScore) { + if (score > maxScore) { maxScore = score maxChild = b maxChildRef = refB @@ -84,7 +84,7 @@ case class ViterbiDecoder[L, W]() extends ChartDecoder[L, W] with Serializable w } } - if(maxScore == Double.NegativeInfinity) { + if (maxScore == Double.NegativeInfinity) { throw new ParseExtractionException(s"Couldn't find a tree! [$begin,$end) ${topology.labelIndex.get(root)}", words) } @@ -101,7 +101,7 @@ case class ViterbiDecoder[L, W]() extends ChartDecoder[L, W] with Serializable w var maxSplit = -1 var maxRule = -1 - if(begin + 1 == end) { + if (begin + 1 == end) { return NullaryTree(labelIndex.get(root) -> rootRef, Span(begin, end)) } @@ -122,7 +122,7 @@ case class ViterbiDecoder[L, W]() extends ChartDecoder[L, W] with Serializable w + marginal.insideTopScore(split, end, c, refC) + spanScore ) - if(score > maxScore) { + if (score > maxScore) { maxScore = score maxLeft = b maxLeftRef = refB @@ -133,15 +133,13 @@ case class ViterbiDecoder[L, W]() extends ChartDecoder[L, W] with Serializable w } } - if(maxScore == Double.NegativeInfinity) { + if (maxScore == Double.NegativeInfinity) { throw new ParseExtractionException(s"Couldn't find a tree! [$begin,$end) ${topology.labelIndex.get(root)}\n", marginal.words) } else { val lchild = buildTreeUnary(begin, maxSplit, maxLeft, maxLeftRef) val rchild = buildTreeUnary(maxSplit, end, maxRight, maxRightRef) BinaryTree(labelIndex.get(root) -> rootRef, lchild, rchild, Span(begin, end)) } - - } val maxRootRef = refined.validLabelRefinements(0, length, rootIndex).maxBy(ref => insideTopScore(0, length, rootIndex, ref)) @@ -187,12 +185,9 @@ case class MaxVariationalDecoder[L, W]() extends ProjectingChartDecoder[L, W](ne class MaxConstituentDecoder[L, W] extends ChartDecoder[L, W] { def extractBestParse(marginal: ParseMarginal[L, W]): BinarizedTree[L] = { - - val length = marginal.length import marginal.topology - val spanMarginals = new AnchoredSpanProjector().projectSpanPosteriors(marginal) val maxSplit = TriangularArray.fill[Int](length+1)(0) val maxBotLabel = TriangularArray.fill[Int](length+1)(-1) @@ -202,7 +197,6 @@ class MaxConstituentDecoder[L, W] extends ChartDecoder[L, W] { val numLabels = topology.labelIndex.size - for { span <- 1 to length begin <- 0 to (length - span) @@ -214,7 +208,7 @@ class MaxConstituentDecoder[L, W] extends ChartDecoder[L, W] { maxTopLabel(begin, end) = argmax(spanMarginals.topType(begin, end).slice(0, numLabels)) maxTopScore(begin, end) = spanMarginals.botType(begin, end)(maxBotLabel(begin, end)) + maxBotScore(begin, end) - if(end - begin > 1) { + if (end - begin > 1) { val (split, splitScore) = (for (split <- begin + 1 until end) yield { val score = maxTopScore(begin, split) + maxTopScore(split, end) (split, score) @@ -260,9 +254,9 @@ class MaxConstituentDecoder[L, W] extends ChartDecoder[L, W] { def extract(begin: Int, end: Int):BinarizedTree[L] = { val bestBot = maxBotLabel(begin, end) - val lower = if(begin + 1== end) { -// if(maxBotScore(begin, end) == Double.NegativeInfinity) -// throw new RuntimeException(s"Couldn't make a good score for ${(begin, end)}. InsideIndices: ${inside.bot.enteredLabelIndexes(begin, end).toIndexedSeq}\noutside: ${outside.bot.enteredLabelIndexes(begin, end).toIndexedSeq} logPartition: $logPartition") + val lower = if (begin + 1== end) { + // if (maxBotScore(begin, end) == Double.NegativeInfinity) + // throw new RuntimeException(s"Couldn't make a good score for ${(begin, end)}. InsideIndices: ${inside.bot.enteredLabelIndexes(begin, end).toIndexedSeq}\noutside: ${outside.bot.enteredLabelIndexes(begin, end).toIndexedSeq} logPartition: $logPartition") NullaryTree(topology.labelIndex.get(bestBot), Span(begin, end)) } else { val split = maxSplit(begin, end) diff --git a/src/main/scala/epic/parser/GenerativeParser.scala b/src/main/scala/epic/parser/GenerativeParser.scala index 7abede9c..477c94a9 100644 --- a/src/main/scala/epic/parser/GenerativeParser.scala +++ b/src/main/scala/epic/parser/GenerativeParser.scala @@ -173,7 +173,7 @@ object GenerativeTrainer extends ParserPipeline { val refinedGrammar = Grammar.generative(xbar, xbarLexicon, indexedRefinements, binaryCounts, initUnaries, scorer) - if(params.grammarDumpPath != null) { + if (params.grammarDumpPath != null) { val out = new BufferedWriter(new FileWriter(params.grammarDumpPath)) refinedGrammar.prettyPrint(out) out.close() diff --git a/src/main/scala/epic/parser/Grammar.scala b/src/main/scala/epic/parser/Grammar.scala index 0a9896bb..754548d7 100644 --- a/src/main/scala/epic/parser/Grammar.scala +++ b/src/main/scala/epic/parser/Grammar.scala @@ -48,7 +48,6 @@ object Grammar { def topology = f1.topology def lexicon = f1.lexicon - override def withPermissiveLexicon: Grammar[L, W] = product(f1.withPermissiveLexicon, f2.withPermissiveLexicon) def anchor(words: IndexedSeq[W], @@ -63,7 +62,6 @@ object Grammar { def lexicon = l - override def withPermissiveLexicon: Grammar[L, W] = identity(ruleTopology, l.morePermissive) override def anchor(words: IndexedSeq[W], constraints: ChartConstraints[L]): GrammarAnchoring[L, W] = { diff --git a/src/main/scala/epic/parser/GrammarAnchoring.scala b/src/main/scala/epic/parser/GrammarAnchoring.scala index c7e0be62..cff71edc 100644 --- a/src/main/scala/epic/parser/GrammarAnchoring.scala +++ b/src/main/scala/epic/parser/GrammarAnchoring.scala @@ -37,7 +37,6 @@ trait GrammarAnchoring[L, W] { def logPartition: Double = marginal.logPartition - private lazy val lexLoc = lexicon.anchor(words) def tagConstraints: TagConstraints[L] = lexLoc @@ -85,8 +84,6 @@ trait GrammarAnchoring[L, W] { else new ProductGrammarAnchoring(this,other) } - - /** * Computes the pointwise division of two grammars, augmenting * their refinement space to reflect this. If they share the same annotationTag, @@ -104,7 +101,7 @@ trait GrammarAnchoring[L, W] { def maxMarginal = RefinedChartMarginal(this, maxMarginal = true) def marginal = RefinedChartMarginal(this, maxMarginal = false) - def isConvergedTo(f: GrammarAnchoring[L, W], diff: Double):Boolean = { + def isConvergedTo(f: GrammarAnchoring[L, W], diff: Double): Boolean = { import scala.util.control.Breaks._ var converged = true breakable { @@ -112,9 +109,6 @@ trait GrammarAnchoring[L, W] { def visitBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int, score: Double) { val myScore = scoreBinaryRule(begin, split, end, rule, ref) val theirScore = f.scoreBinaryRule(begin, split, end, rule, ref) - - - if (myScore != theirScore) { if (theirScore.isInfinite || myScore.isInfinite) { converged = false @@ -126,15 +120,12 @@ trait GrammarAnchoring[L, W] { break() } } - } def visitUnaryRule(begin: Int, end: Int, rule: Int, ref: Int, score: Double) { val myScore = scoreUnaryRule(begin, end, rule, ref) val theirScore = f.scoreUnaryRule(begin, end, rule, ref) assert(!myScore.isInfinite) - - if (myScore != theirScore) { if (theirScore.isInfinite || myScore.isInfinite) { converged = false @@ -146,14 +137,11 @@ trait GrammarAnchoring[L, W] { break() } } - } def visitSpan(begin: Int, end: Int, tag: Int, ref: Int, score: Double) = { val myScore = scoreSpan(begin, end, tag, ref) val theirScore = f.scoreSpan(begin, end, tag, ref) - - if (myScore != theirScore) { if (theirScore.isInfinite || myScore.isInfinite) { converged = true @@ -165,16 +153,14 @@ trait GrammarAnchoring[L, W] { break() } } - } } } -// println(converged) + // println(converged) converged } - /** * The annotationTag controls if two grammars are over the same refinements. * If they are, then * and / can be much faster. @@ -200,11 +186,9 @@ trait GrammarAnchoring[L, W] { def maxLabelRefinements: Int = (0 until topology.labelIndex.size).map(numValidRefinements _).max - def numValidRefinements(label: Int):Int - - def numValidRuleRefinements(rule: Int):Int - + def numValidRefinements(label: Int): Int + def numValidRuleRefinements(rule: Int): Int /** * For a given span and the parent's refinement, what refinements to the rule are allowed? @@ -221,10 +205,10 @@ trait GrammarAnchoring[L, W] { def validRuleRefinementsGivenRightChild(completionBegin: Int, completionEnd: Int, split: Int, end: Int, rule: Int, childRef: Int):Array[Int] def validUnaryRuleRefinementsGivenChild(begin: Int, end: Int, rule: Int, childRef: Int):Array[Int] - def leftChildRefinement(rule: Int, ruleRef: Int):Int - def rightChildRefinement(rule: Int, ruleRef: Int):Int - def parentRefinement(rule: Int, ruleRef: Int):Int - def childRefinement(rule: Int, ruleRef: Int):Int + def leftChildRefinement(rule: Int, ruleRef: Int): Int + def rightChildRefinement(rule: Int, ruleRef: Int): Int + def parentRefinement(rule: Int, ruleRef: Int): Int + def childRefinement(rule: Int, ruleRef: Int): Int /** * Returns the refined rule given parent and child refinements for a unary rule. @@ -234,7 +218,7 @@ trait GrammarAnchoring[L, W] { * @param refB child index * @return rule refinement id, or -1 if rule is not allowed with those refinements */ - def ruleRefinementFromRefinements(r: Int, refA: Int, refB: Int):Int + def ruleRefinementFromRefinements(r: Int, refA: Int, refB: Int): Int /** * Returns the refined rule given parent and child refinements for a unary rule. @@ -245,7 +229,7 @@ trait GrammarAnchoring[L, W] { * @param refC right child index * @return rule refinement id, or -1 if rule is not allowed with those refinements */ - def ruleRefinementFromRefinements(r: Int, refA: Int, refB: Int, refC: Int):Int + def ruleRefinementFromRefinements(r: Int, refA: Int, refB: Int, refC: Int): Int def validCoarseRulesGivenParentRefinement(a: Int, refA: Int): Array[Int] @@ -262,8 +246,6 @@ object GrammarAnchoring { UnrefinedGrammarAnchoring.identity[L, W](topology, lexicon, words, constraints) } - - trait StructureDelegatingAnchoring[L, W] extends GrammarAnchoring[L, W] { protected def baseAnchoring: GrammarAnchoring[L, W] @@ -272,11 +254,11 @@ object GrammarAnchoring { def words: IndexedSeq[W] = baseAnchoring.words -// def scoreSpan(begin: Int, end: Int, label: Int, ref: Int): Double = baseAnchoring.scoreSpan(begin: Int, end: Int, label: Int, ref: Int) + // def scoreSpan(begin: Int, end: Int, label: Int, ref: Int): Double = baseAnchoring.scoreSpan(begin: Int, end: Int, label: Int, ref: Int) -// def scoreBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int): Double = baseAnchoring.scoreBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int) + // def scoreBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int): Double = baseAnchoring.scoreBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int) -// def scoreUnaryRule(begin: Int, end: Int, rule: Int, ref: Int): Double = baseAnchoring.scoreUnaryRule(begin: Int, end: Int, rule: Int, ref: Int) + // def scoreUnaryRule(begin: Int, end: Int, rule: Int, ref: Int): Double = baseAnchoring.scoreUnaryRule(begin: Int, end: Int, rule: Int, ref: Int) def validLabelRefinements(begin: Int, end: Int, label: Int): Array[Int] = baseAnchoring.validLabelRefinements(begin: Int, end: Int, label: Int) @@ -300,7 +282,6 @@ object GrammarAnchoring { def childRefinement(rule: Int, ruleRef: Int): Int = baseAnchoring.childRefinement(rule: Int, ruleRef: Int) - def ruleRefinementFromRefinements(r: Int, refA: Int, refB: Int): Int = baseAnchoring.ruleRefinementFromRefinements(r: Int, refA: Int, refB: Int) def ruleRefinementFromRefinements(r: Int, refA: Int, refB: Int, refC: Int): Int = baseAnchoring.ruleRefinementFromRefinements(r: Int, refA: Int, refB: Int, refC: Int) diff --git a/src/main/scala/epic/parser/LatentTreeMarginal.scala b/src/main/scala/epic/parser/LatentTreeMarginal.scala index 03c838df..7e0b0a1c 100644 --- a/src/main/scala/epic/parser/LatentTreeMarginal.scala +++ b/src/main/scala/epic/parser/LatentTreeMarginal.scala @@ -35,7 +35,6 @@ case class LatentTreeMarginal[L, W](anchoring: GrammarAnchoring[L, W], private val stree = insideScores() outsideScores(stree) - def isMaxMarginal: Boolean = false private val z = stree.label.inside.sum @@ -60,18 +59,18 @@ case class LatentTreeMarginal[L, W](anchoring: GrammarAnchoring[L, W], } case t@UnaryTree(Beliefs(aLabels, _, _, aScores, aScale), Tree(Beliefs(cLabels, cScores,cScale, _, _), _, _), chain, span) => var pi = 0 - while(pi < aLabels.size) { + while (pi < aLabels.size) { val (a, aRef) = aLabels(pi) val opScore = aScores(pi) pi += 1 var ci = 0 - while(ci < cLabels.size) { + while (ci < cLabels.size) { val (c, cRef) = cLabels(ci) val icScore = cScores(ci) ci += 1 val rule = topology.index(UnaryRule(topology.labelIndex.get(a), topology.labelIndex.get(c), chain)) val ruleRef = anchoring.ruleRefinementFromRefinements(rule, aRef, cRef) - if(ruleRef != -1 ) { + if (ruleRef != -1 ) { val rs = math.exp(anchoring.scoreUnaryRule(t.span.begin, t.span.end, rule, ruleRef)) // exp! val ruleScore = Scaling.unscaleValue(opScore / z * rs * icScore, aScale + cScale - rootScale) assert(!ruleScore.isNaN) @@ -101,7 +100,6 @@ case class LatentTreeMarginal[L, W](anchoring: GrammarAnchoring[L, W], } } - // private stuff to do the computation private def insideScores() = { @@ -123,46 +121,44 @@ case class LatentTreeMarginal[L, W](anchoring: GrammarAnchoring[L, W], assert(!wScore.isNaN) foundOne = true } - if(!foundOne) { + if (!foundOne) { sys.error(s"Trouble with lexical $words(t.span.begin)") } t.label.scaleInside(0) case t@UnaryTree(Beliefs(aLabels, aScores, _, _, _), Tree(Beliefs(cLabels, cScores, cScale, _, _), _, _), chain, span) => var foundOne = false var ai = 0 - while(ai < aLabels.length) { + while (ai < aLabels.length) { val (a, aRef) = aLabels(ai) - var sum = 0.0 var ci = 0 - while(ci < cLabels.length) { + while (ci < cLabels.length) { val (c, cRef) = cLabels(ci) val rule = topology.index(UnaryRule(topology.labelIndex.get(a), topology.labelIndex.get(c), chain)) - if(rule != -1) { + if (rule != -1) { val ruleRef = anchoring.ruleRefinementFromRefinements(rule, aRef, cRef) if (ruleRef != -1) { val score = anchoring.scoreUnaryRule(t.span.begin, t.span.end, rule, ruleRef) val ruleScore = cScores(ci) * math.exp(score) // exp! sum += ruleScore assert(!ruleScore.isNaN) - if(score != Double.NegativeInfinity && math.exp(score) == 0.0) { + if (score != Double.NegativeInfinity && math.exp(score) == 0.0) { println("Underflow!!!") } - if(ruleScore != 0.0) { + if (ruleScore != 0.0) { foundOne = true } } } ci += 1 } - aScores(ai) = sum ai += 1 } - if(!foundOne) { + if (!foundOne) { sys.error("unary problems") -// sys.error(s"Trouble with unary $t.render(words)} ${grammar.labelIndex.get(a)} ${grammar.labelIndex.get(c)} $rule ${anchoring.scoreUnaryRule(t.span.begin, t.span.end, rule, 0)}") + // sys.error(s"Trouble with unary $t.render(words)} ${grammar.labelIndex.get(a)} ${grammar.labelIndex.get(c)} $rule ${anchoring.scoreUnaryRule(t.span.begin, t.span.end, rule, 0)}") } t.label.scaleInside(cScale) case t@BinaryTree(Beliefs(aLabels, aScores, _, _, _), @@ -173,21 +169,21 @@ case class LatentTreeMarginal[L, W](anchoring: GrammarAnchoring[L, W], val split = t.leftChild.span.end val end = span.end var ai = 0 - while(ai < aScores.length) { + while (ai < aScores.length) { var sum = 0.0 val (a, aRef) = aLabels(ai) var bi = 0 - while(bi < bLabels.length) { + while (bi < bLabels.length) { val (b, bRef) = bLabels(bi) var ci = 0 - while(ci < cLabels.length) { + while (ci < cLabels.length) { val (c, cRef) = cLabels(ci) val rule = topology.index(BinaryRule(topology.labelIndex.get(a), topology.labelIndex.get(b), topology.labelIndex.get(c))) - if(rule != -1) { + if (rule != -1) { val ruleRef = anchoring.ruleRefinementFromRefinements(rule, aRef, bRef, cRef) - if(ruleRef != -1) { + if (ruleRef != -1) { val spanScore = anchoring.scoreSpan(begin, end, a, aRef) sum += ( bScores(bi) * cScores(ci) @@ -201,15 +197,15 @@ case class LatentTreeMarginal[L, W](anchoring: GrammarAnchoring[L, W], bi += 1 } aScores(ai) = sum - if(aScores(ai) != 0) foundOne = true + if (aScores(ai) != 0) foundOne = true ai += 1 } - if(!foundOne) { -// val r = (BinaryRule(grammar.labelIndex.get(a), -// grammar.labelIndex.get(b), -// grammar.labelIndex.get(c))) -// sys.error(s"Trouble with binary ${t.render(words)}\n\n$r $rule $ai") + if (!foundOne) { + // val r = (BinaryRule(grammar.labelIndex.get(a), + // grammar.labelIndex.get(b), + // grammar.labelIndex.get(c))) + // sys.error(s"Trouble with binary ${t.render(words)}\n\n$r $rule $ai") } t.label.scaleInside(cScale + bScale) case _ => sys.error("bad tree!") @@ -257,7 +253,7 @@ case class LatentTreeMarginal[L, W](anchoring: GrammarAnchoring[L, W], } { val rule = topology.index(UnaryRule(topology.labelIndex.get(a), topology.labelIndex.get(c), chain)) val ruleRef = anchoring.ruleRefinementFromRefinements(rule, aRef, cRef) - if(ruleRef != -1) { + if (ruleRef != -1) { val ruleScore = anchoring.scoreUnaryRule(span.begin, span.end, rule, ruleRef) sum += aScore * math.exp(ruleScore) // exp! } @@ -265,9 +261,6 @@ case class LatentTreeMarginal[L, W](anchoring: GrammarAnchoring[L, W], child.label.outside(ci) = sum } child.label.scaleOutside(t.label.oscale) - - - } } @@ -302,7 +295,6 @@ case class LatentTreeMarginal[L, W](anchoring: GrammarAnchoring[L, W], } case _ => Double.NegativeInfinity } - } // override def marginalAt(begin: Int, end: Int): Counter2[L, Int, Double] = { @@ -357,8 +349,8 @@ object LatentTreeMarginal { private object Beliefs { private[LatentTreeMarginal] def apply[L](labels: IndexedSeq[(Int, Int)]):Beliefs[L] = { val r = new Beliefs[L](labels, new Array[Double](labels.length), 0, new Array[Double](labels.length), 0) -// Arrays.fill(r.inside, Double.NegativeInfinity) -// Arrays.fill(r.outside, Double.NegativeInfinity) + // Arrays.fill(r.inside, Double.NegativeInfinity) + // Arrays.fill(r.outside, Double.NegativeInfinity) r } } diff --git a/src/main/scala/epic/parser/ParseEval.scala b/src/main/scala/epic/parser/ParseEval.scala index d833a4b7..96a6f5e7 100644 --- a/src/main/scala/epic/parser/ParseEval.scala +++ b/src/main/scala/epic/parser/ParseEval.scala @@ -15,7 +15,6 @@ package epic.parser limitations under the License. */ - import epic.trees._ import java.io.BufferedOutputStream import java.io.File @@ -29,7 +28,6 @@ import com.typesafe.scalalogging.slf4j.LazyLogging import java.text.DecimalFormat import epic.util.ProgressLog - /** * Hack approximation to true parse eval. Gives Labeled Precision * and Labeled Recall. @@ -43,10 +41,8 @@ class ParseEval[L](ignoredLabels: Set[L]) { * guess/gold pair of trees. */ def apply(guessgold: Iterator[(Tree[L],Tree[L])]):Statistics = { - val allStats = for( (guess,gold) <- guessgold) yield { apply(guess,gold) } - + val allStats = for((guess,gold) <- guessgold) yield { apply(guess,gold) } val stats = allStats.reduceLeft(_ + _) - stats } @@ -54,14 +50,13 @@ class ParseEval[L](ignoredLabels: Set[L]) { val guessSet = labeledConstituents(guess) val goldSet = labeledConstituents(gold) val inter = guessSet intersect goldSet - val exact = if(goldSet.size == inter.size && guessSet.size == inter.size) 1 else 0 + val exact = if (goldSet.size == inter.size && guessSet.size == inter.size) 1 else 0 val guessLeaves = guess.leaves val goldLeaves = gold.leaves - val numRight = goldLeaves.zip(guessLeaves).foldLeft(0) { (acc,gg) => if(gg._1.label == gg._2.label) acc + 1 else acc} + val numRight = goldLeaves.zip(guessLeaves).foldLeft(0) { (acc,gg) => if (gg._1.label == gg._2.label) acc + 1 else acc} Statistics(guessSet.size, goldSet.size, inter.size, exact, numRight, guess.span.end, 1) } - private def labeledConstituents(tree: Tree[L]) = Set() ++ { for(child <- tree.preorder if !ignoredLabels.contains(child.label) && !child.isLeaf) @@ -84,8 +79,8 @@ object ParseEval extends LazyLogging { numParses + stats.numParses) } - def precision = if(guess == 0) 1.0 else right * 1.0 / guess - def recall = if(guess == 0) 1.0 else right * 1.0 / gold + def precision = if (guess == 0) 1.0 else right * 1.0 / guess + def recall = if (guess == 0) 1.0 else right * 1.0 / gold def exact = numExact * 1.0 / numParses def tagAccuracy = tagsRight * 1.0 / numWords def f1 = (2 * precision * recall)/(precision + recall) @@ -143,7 +138,7 @@ object ParseEval extends LazyLogging { nthreads: Int = -1)(implicit deb: Debinarizer[L]) = { val parsedir = new File(evalDir) - if(!parsedir.exists() && !parsedir.mkdirs()) { + if (!parsedir.exists() && !parsedir.mkdirs()) { throw new RuntimeException("Couldn't make directory: " + parsedir) } val goldOut = new PrintStream(new BufferedOutputStream(new FileOutputStream(new File(parsedir,"gold")))) diff --git a/src/main/scala/epic/parser/ParseMarginal.scala b/src/main/scala/epic/parser/ParseMarginal.scala index bb77eb8c..00f1c640 100644 --- a/src/main/scala/epic/parser/ParseMarginal.scala +++ b/src/main/scala/epic/parser/ParseMarginal.scala @@ -83,7 +83,6 @@ trait ParseMarginal[L, W] extends VisitableMarginal[AnchoredVisitor[L]] { object ParseMarginal { - trait Factory[L, W] { def apply(w: IndexedSeq[W], constraints: ChartConstraints[L]):ParseMarginal[L, W] } @@ -123,11 +122,10 @@ object ParseMarginal { case class StandardChartFactory[L, W](refinedGrammar: Grammar[L, W], maxMarginal: Boolean = false) extends ParseMarginal.Factory[L, W] { def apply(w: IndexedSeq[W], constraints: ChartConstraints[L]):RefinedChartMarginal[L, W] = { val marg = RefinedChartMarginal(refinedGrammar.anchor(w, constraints), maxMarginal = maxMarginal) - if(!marg.logPartition.isInfinite) { + if (!marg.logPartition.isInfinite) { marg } else { RefinedChartMarginal(refinedGrammar.withPermissiveLexicon.anchor(w, constraints), maxMarginal = maxMarginal) } - } } diff --git a/src/main/scala/epic/parser/ParseText.scala b/src/main/scala/epic/parser/ParseText.scala index dab50ef6..01ef3d0f 100644 --- a/src/main/scala/epic/parser/ParseText.scala +++ b/src/main/scala/epic/parser/ParseText.scala @@ -10,7 +10,6 @@ import epic.models.ParserSelector */ object ParseText extends ProcessTextMain[Parser[AnnotatedLabel, String], Tree[AnnotatedLabel]] { - override def render(model: Parser[AnnotatedLabel, String], ann: Tree[AnnotatedLabel], tokens: IndexedSeq[String]): String = { ann.render(tokens, newline = false) } diff --git a/src/main/scala/epic/parser/Parser.scala b/src/main/scala/epic/parser/Parser.scala index ae3f85e2..eb71f984 100644 --- a/src/main/scala/epic/parser/Parser.scala +++ b/src/main/scala/epic/parser/Parser.scala @@ -62,17 +62,12 @@ final case class Parser[L,W](topology: RuleTopology[L], } } - - - - } object Parser { def apply[L, W](grammar: Grammar[L, W])(implicit deb: Debinarizer[L]): Parser[L, W]= { Parser(grammar.topology, grammar.lexicon, ChartConstraints.Factory.noSparsity, StandardChartFactory(grammar), ChartDecoder()) - } def apply[L, W](refined: Grammar[L, W], decoder: ChartDecoder[L, W])(implicit deb: Debinarizer[L]): Parser[L, W] = { @@ -83,7 +78,6 @@ object Parser { new Parser(refinedGrammar.topology, refinedGrammar.lexicon, ChartConstraints.Factory.noSparsity[L, W], new SimpleChartMarginal.SimpleChartFactory(refinedGrammar, decoder.wantsMaxMarginal), decoder) } - def apply[L, W](core: ChartConstraints.Factory[L, W], grammar: Grammar[L, W], decoder: ChartDecoder[L, W])(implicit deb: Debinarizer[L]): Parser[L, W] = { Parser(grammar.topology, grammar.lexicon, core, StandardChartFactory(grammar, decoder.wantsMaxMarginal), decoder) } diff --git a/src/main/scala/epic/parser/ParserAnnotator.scala b/src/main/scala/epic/parser/ParserAnnotator.scala index cebb9b52..b8a6eecd 100644 --- a/src/main/scala/epic/parser/ParserAnnotator.scala +++ b/src/main/scala/epic/parser/ParserAnnotator.scala @@ -11,16 +11,13 @@ import epic.trees.Tree **/ class ParserAnnotator[L](parser: Parser[L, String]) extends StringAnalysisFunction[Token with Sentence, Tree[L]] { - def apply[In <: Token with Sentence](slab: StringSlab[In]):StringSlab[In with epic.trees.Tree[L]] = { val annotatedSentences = for((span, sent) <- slab.iterator[Sentence].toIndexedSeq.par) yield { val tokens = slab.covered[Token](span) val tree = parser(tokens.map(_._2.token)) span -> tree } - slab.addLayer[Tree[L]](annotatedSentences.seq) } - } diff --git a/src/main/scala/epic/parser/ParserPipeline.scala b/src/main/scala/epic/parser/ParserPipeline.scala index 2d3057c2..9c79093a 100644 --- a/src/main/scala/epic/parser/ParserPipeline.scala +++ b/src/main/scala/epic/parser/ParserPipeline.scala @@ -48,7 +48,7 @@ object ParserParams { val g = RuleTopology(AnnotatedLabel.TOP, xbarBinaries.keysIterator.map(_._2) ++ xbarUnaries.keysIterator.map(_._2)) val lex = new SimpleLexicon(g.labelIndex, words) - if(path ne null) + if (path ne null) writeObject(path, g -> lex) g -> lex @@ -83,11 +83,8 @@ trait ParserPipeline extends LazyLogging { validate: Parser[AnnotatedLabel, String]=>ParseEval.Statistics, params: Params):Iterator[(String, Parser[AnnotatedLabel, String])] - def trainParser(treebank: ProcessedTreebank, params: Params):Iterator[(String, Parser[AnnotatedLabel, String])] = { import treebank._ - - val validateTrees = devTrees.take(100) def validate(parser: Parser[AnnotatedLabel, String]) = { ParseEval.evaluate[AnnotatedLabel](validateTrees, parser, asString={(l:AnnotatedLabel)=>l.label}, nthreads=params.threads) @@ -104,7 +101,7 @@ trait ParserPipeline extends LazyLogging { val params = CommandLineParser.readIn[JointParams[Params]](args) -// logger.info("Command line arguments for recovery:\n" + Configuration.fromObject(params).toCommandLineString) + // logger.info("Command line arguments for recovery:\n" + Configuration.fromObject(params).toCommandLineString) logger.info("Training Parser...") val parsers = trainParser(params.treebank, params.trainer) @@ -133,7 +130,6 @@ trait ParserPipeline extends LazyLogging { } } - def evalParser(testTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], parser: Parser[AnnotatedLabel, String], name: String):ParseEval.Statistics = { diff --git a/src/main/scala/epic/parser/ProductChartFactory.scala b/src/main/scala/epic/parser/ProductChartFactory.scala index 317ca2b9..fee1725b 100644 --- a/src/main/scala/epic/parser/ProductChartFactory.scala +++ b/src/main/scala/epic/parser/ProductChartFactory.scala @@ -13,11 +13,10 @@ class ProductChartFactory[L, W](grammars: IndexedSeq[Grammar[L, W]], maxIteratio def apply(words: IndexedSeq[W], initialCore: ChartConstraints[L]): RefinedChartMarginal[L, W] = { val anchorings = grammars.map(_.anchor(words, initialCore)) - if(anchorings.length == 1) { + if (anchorings.length == 1) { return RefinedChartMarginal(anchorings.head) } - val proj = new AnchoredRuleMarginalProjector[L, W] val augments = anchorings.map(_.marginal).map(proj.project(_)) val marg = augments.reduceLeft[UnrefinedGrammarAnchoring[L, W]](_ * _).marginal diff --git a/src/main/scala/epic/parser/ProductGrammarAnchoring.scala b/src/main/scala/epic/parser/ProductGrammarAnchoring.scala index c8183aec..9dcfcb8f 100644 --- a/src/main/scala/epic/parser/ProductGrammarAnchoring.scala +++ b/src/main/scala/epic/parser/ProductGrammarAnchoring.scala @@ -44,47 +44,47 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], override val sparsityPattern: ChartConstraints[L] = s1.sparsityPattern & s2.sparsityPattern override def annotationTag = { - if(refinementController == null) -1 + if (refinementController == null) -1 else refinementController.annotationTag } def scoreSpan(begin: Int, end: Int, label: Int, ref: Int) = { val r1 = s1.scoreSpan(begin, end, label, label1Ref(label, ref)) - if(r1 == Double.NegativeInfinity) r1 + if (r1 == Double.NegativeInfinity) r1 else r1 + alpha * s2.scoreSpan(begin, end, label, label2Ref(label, ref)) } def scoreBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int) = { val r1 = s1.scoreBinaryRule(begin, split, end, rule, rule1Ref(rule, ref)) - if(r1 == Double.NegativeInfinity) r1 + if (r1 == Double.NegativeInfinity) r1 else r1 + alpha * s2.scoreBinaryRule(begin, split, end, rule, rule2Ref(rule, ref)) } def scoreUnaryRule(begin: Int, end: Int, rule: Int, ref: Int) = { val r1 = s1.scoreUnaryRule(begin, end, rule, rule1Ref(rule, ref)) - if(r1 == Double.NegativeInfinity) r1 + if (r1 == Double.NegativeInfinity) r1 else r1 + alpha * s2.scoreUnaryRule(begin, end, rule, rule2Ref(rule, ref)) } def validLabelRefinements(begin: Int, end: Int, label: Int) = { - if(refinementController ne null) refinementController.validLabelRefinements(begin, end, label) + if (refinementController ne null) refinementController.validLabelRefinements(begin, end, label) else for(a <- s1.validLabelRefinements(begin, end, label); b <- s2.validLabelRefinements(begin, end, label)) yield a * s2.numValidRefinements(label) + b } def numValidRefinements(label: Int) = { - if(refinementController ne null) refinementController.numValidRefinements(label) + if (refinementController ne null) refinementController.numValidRefinements(label) else s1.numValidRefinements(label) * s2.numValidRefinements(label) } def numValidRuleRefinements(rule: Int) = { - if(refinementController ne null) refinementController.numValidRuleRefinements(rule) + if (refinementController ne null) refinementController.numValidRuleRefinements(rule) else s1.numValidRuleRefinements(rule) * s2.numValidRuleRefinements(rule) } def validRuleRefinementsGivenParent(begin: Int, end: Int, rule: Int, parentRef: Int) = { - if(refinementController ne null) refinementController.validRuleRefinementsGivenParent(begin, end, rule, parentRef) + if (refinementController ne null) refinementController.validRuleRefinementsGivenParent(begin, end, rule, parentRef) else { val parent = topology.parent(rule) val bRefinements = s2.validRuleRefinementsGivenParent(begin, end, rule, label2Ref(parent, parentRef)) @@ -95,7 +95,7 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], } def validRuleRefinementsGivenLeftChild(begin: Int, split: Int, completionBegin: Int, completionEnd: Int, rule: Int, leftChildRef: Int): Array[Int] = { - if(refinementController ne null) refinementController.validRuleRefinementsGivenLeftChild(begin, split, completionBegin, completionEnd, rule, leftChildRef) + if (refinementController ne null) refinementController.validRuleRefinementsGivenLeftChild(begin, split, completionBegin, completionEnd, rule, leftChildRef) else { val leftChild = topology.leftChild(rule) val bRefinements = s2.validRuleRefinementsGivenLeftChild(begin, split, completionBegin, completionEnd, rule, label2Ref(leftChild, leftChildRef)) @@ -106,7 +106,7 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], } def validRuleRefinementsGivenRightChild(completionBegin: Int, completionEnd: Int, split: Int, end: Int, rule: Int, rightChildRef: Int): Array[Int] = { - if(refinementController ne null) refinementController.validRuleRefinementsGivenRightChild(completionBegin, completionEnd, split, end, rule, rightChildRef) + if (refinementController ne null) refinementController.validRuleRefinementsGivenRightChild(completionBegin, completionEnd, split, end, rule, rightChildRef) else { val rightChild = topology.rightChild(rule) val bRefinements = s2.validRuleRefinementsGivenRightChild(completionBegin, completionEnd, split, end, rule, label2Ref(rightChild, rightChildRef)) @@ -116,9 +116,8 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], } } - def validUnaryRuleRefinementsGivenChild(begin: Int, end: Int, rule: Int, childRef: Int) = { - if(refinementController ne null) refinementController.validUnaryRuleRefinementsGivenChild(begin, end, rule, childRef) + if (refinementController ne null) refinementController.validUnaryRuleRefinementsGivenChild(begin, end, rule, childRef) else { val child = topology.child(rule) val bRefinements = s2.validUnaryRuleRefinementsGivenChild(begin, end, rule, label2Ref(child, childRef)) @@ -129,7 +128,7 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], } def leftChildRefinement(rule: Int, ruleRef: Int) = { - if(refinementController ne null) refinementController.leftChildRefinement(rule,ruleRef) + if (refinementController ne null) refinementController.leftChildRefinement(rule,ruleRef) else { val l1 = s1.leftChildRefinement(rule, rule1Ref(rule, ruleRef)) val l2 = s2.leftChildRefinement(rule, rule2Ref(rule, ruleRef)) @@ -139,7 +138,7 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], def rightChildRefinement(rule: Int, ruleRef: Int) = { - if(refinementController ne null) refinementController.rightChildRefinement(rule,ruleRef) + if (refinementController ne null) refinementController.rightChildRefinement(rule,ruleRef) else { val l1 = s1.rightChildRefinement(rule, rule1Ref(rule, ruleRef)) val l2 = s2.rightChildRefinement(rule, rule2Ref(rule, ruleRef)) @@ -149,7 +148,7 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], def parentRefinement(rule: Int, ruleRef: Int) = { - if(refinementController ne null) refinementController.parentRefinement(rule,ruleRef) + if (refinementController ne null) refinementController.parentRefinement(rule,ruleRef) else { val l1 = s1.parentRefinement(rule, rule1Ref(rule, ruleRef)) val l2 = s2.parentRefinement(rule, rule2Ref(rule, ruleRef)) @@ -158,7 +157,7 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], } def childRefinement(rule: Int, ruleRef: Int) = { - if(refinementController ne null) refinementController.childRefinement(rule,ruleRef) + if (refinementController ne null) refinementController.childRefinement(rule,ruleRef) else { val l1 = s1.childRefinement(rule, rule1Ref(rule, ruleRef)) val l2 = s2.childRefinement(rule, rule2Ref(rule, ruleRef)) @@ -167,7 +166,7 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], } def ruleRefinementFromRefinements(r: Int, refA: Int, refB: Int) = { - if(refinementController ne null) refinementController.ruleRefinementFromRefinements(r, refA, refB) + if (refinementController ne null) refinementController.ruleRefinementFromRefinements(r, refA, refB) else { val a1 = label1Ref(topology.parent(r), refA) val a2 = label2Ref(topology.parent(r), refA) @@ -175,13 +174,13 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], val b2 = label2Ref(topology.child(r), refB) val l1 = s1.ruleRefinementFromRefinements(r, a1, b1) val l2 = s2.ruleRefinementFromRefinements(r, a2, b2) - if(l1 < 0 || l2 < 0) -1 + if (l1 < 0 || l2 < 0) -1 else l1 * s2.numValidRuleRefinements(r) + l2 } } def ruleRefinementFromRefinements(r: Int, refA: Int, refB: Int, refC: Int) = { - if(refinementController ne null) refinementController.ruleRefinementFromRefinements(r, refA, refB, refC) + if (refinementController ne null) refinementController.ruleRefinementFromRefinements(r, refA, refB, refC) else { val a1 = label1Ref(topology.parent(r), refA) val a2 = label2Ref(topology.parent(r), refA) @@ -191,7 +190,7 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], val c2 = label2Ref(topology.rightChild(r), refC) val l1 = s1.ruleRefinementFromRefinements(r, a1, b1, c1) val l2 = s2.ruleRefinementFromRefinements(r, a2, b2, c2) - if(l1 < 0 || l2 < 0) -1 + if (l1 < 0 || l2 < 0) -1 else l1 * s2.numValidRuleRefinements(r) + l2 } @@ -208,7 +207,7 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], def validParentRefinementsGivenRule(begin: Int, splitBegin: Int, splitEnd: Int, end: Int, rule: Int): Array[Int] = { - if(refinementController ne null) refinementController.validParentRefinementsGivenRule(begin, splitBegin, splitEnd, end, rule) + if (refinementController ne null) refinementController.validParentRefinementsGivenRule(begin, splitBegin, splitEnd, end, rule) else { val r1arr = s1.validParentRefinementsGivenRule(begin, splitBegin, splitEnd, end, rule) val r2arr = s2.validParentRefinementsGivenRule(begin, splitBegin, splitEnd, end, rule) @@ -220,7 +219,7 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], def validLeftChildRefinementsGivenRule(begin: Int, end: Int, completionBegin: Int, completionEnd: Int, rule: Int): Array[Int] = { - if(refinementController ne null) refinementController.validLeftChildRefinementsGivenRule(begin, end, completionBegin, completionEnd, rule) + if (refinementController ne null) refinementController.validLeftChildRefinementsGivenRule(begin, end, completionBegin, completionEnd, rule) else { val r1arr = s1.validLeftChildRefinementsGivenRule(begin, end, completionBegin, completionEnd, rule) val r2arr = s2.validLeftChildRefinementsGivenRule(begin, end, completionBegin, completionEnd, rule) @@ -230,7 +229,7 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], } def validRightChildRefinementsGivenRule(completionBegin: Int, completionEnd: Int, begin: Int, end: Int, rule: Int): Array[Int] = { - if(refinementController ne null) refinementController.validRightChildRefinementsGivenRule(completionBegin, completionEnd, begin, end, rule) + if (refinementController ne null) refinementController.validRightChildRefinementsGivenRule(completionBegin, completionEnd, begin, end, rule) else { val r1arr = s1.validRightChildRefinementsGivenRule(completionBegin, completionEnd, begin, end, rule) val r2arr = s2.validRightChildRefinementsGivenRule(completionBegin, completionEnd, begin, end, rule) @@ -242,16 +241,16 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], abstract class ProductRefinementsHandler[L, W](s1: GrammarAnchoring[L, W], s2: GrammarAnchoring[L, W]) { protected final val refinementController: GrammarAnchoring[L, W] = { - if(s1.annotationTag == 0) s2 - else if(s2.annotationTag == 0) s1 + if (s1.annotationTag == 0) s2 + else if (s2.annotationTag == 0) s1 else if (s1.annotationTag < 0 || s2.annotationTag < 0) null - else if(s1.annotationTag == s2.annotationTag) s1 + else if (s1.annotationTag == s2.annotationTag) s1 else null } @inline protected final def label1Ref(label: Int, ref: Int): Int = { - if(refinementController != null) ref + if (refinementController != null) ref else { val num = s1.numValidRefinements(label) ref / num @@ -260,7 +259,7 @@ abstract class ProductRefinementsHandler[L, W](s1: GrammarAnchoring[L, W], s2: G @inline protected final def label2Ref(label: Int, ref: Int): Int = { - if(refinementController != null) ref + if (refinementController != null) ref else { val num = s1.numValidRefinements(label) ref % num @@ -269,7 +268,7 @@ abstract class ProductRefinementsHandler[L, W](s1: GrammarAnchoring[L, W], s2: G @inline protected final def rule1Ref(rule: Int, ref: Int): Int = { - if(refinementController != null) ref + if (refinementController != null) ref else { val num = s1.numValidRuleRefinements(rule) ref / num @@ -278,7 +277,7 @@ abstract class ProductRefinementsHandler[L, W](s1: GrammarAnchoring[L, W], s2: G @inline protected final def rule2Ref(rule: Int, ref: Int): Int = { - if(refinementController != null) ref + if (refinementController != null) ref else { val num = s1.numValidRuleRefinements(rule) ref % num diff --git a/src/main/scala/epic/parser/ProductRefinedFeaturizer.scala b/src/main/scala/epic/parser/ProductRefinedFeaturizer.scala index 39fad371..4e57f2ac 100644 --- a/src/main/scala/epic/parser/ProductRefinedFeaturizer.scala +++ b/src/main/scala/epic/parser/ProductRefinedFeaturizer.scala @@ -27,7 +27,6 @@ class ProductRefinedFeaturizer[L, W, Feat1, Feat2](sf1: Grammar[L, W], feat2: RefinedFeaturizer[L, W, Feat2]) extends RefinedFeaturizer[L, W, Either[Feat1, Feat2]] { def index: EitherIndex[Feat1, Feat2] = feat1.index | feat2.index - override def lock = new ProductRefinedFeaturizer(sf1, sf2, feat1.lock, feat2.lock) def anchor(w: IndexedSeq[W]):Anchoring = { diff --git a/src/main/scala/epic/parser/ProductUnrefinedGrammarAnchoring.scala b/src/main/scala/epic/parser/ProductUnrefinedGrammarAnchoring.scala index d8a832e8..d17c2324 100644 --- a/src/main/scala/epic/parser/ProductUnrefinedGrammarAnchoring.scala +++ b/src/main/scala/epic/parser/ProductUnrefinedGrammarAnchoring.scala @@ -27,7 +27,6 @@ final case class ProductUnrefinedGrammarAnchoring[L, W](s1: UnrefinedGrammarAnch s2: UnrefinedGrammarAnchoring[L, W], alpha: Double = 1.0) extends UnrefinedGrammarAnchoring[L, W] { - // def sparsityPattern = ChartConstraints.noSparsity[L] override def addConstraints(cs: ChartConstraints[L]): UnrefinedGrammarAnchoring[L, W] = copy(s1.addConstraints(cs)) @@ -39,9 +38,8 @@ final case class ProductUnrefinedGrammarAnchoring[L, W](s1: UnrefinedGrammarAnch def words = s1.words - -// override val sparsityPattern: ChartConstraints[L] = s1.sparsityPattern & s2.sparsityPattern -// def addConstraints(cs: ChartConstraints[L]): CoreAnchoring[L, W] = new ProductCoreAnchoring(s1.addConstraints(cs), s2, alpha) + // override val sparsityPattern: ChartConstraints[L] = s1.sparsityPattern & s2.sparsityPattern + // def addConstraints(cs: ChartConstraints[L]): CoreAnchoring[L, W] = new ProductCoreAnchoring(s1.addConstraints(cs), s2, alpha) def scoreSpan(begin: Int, end: Int, label: Int) = { val r1 = s1.scoreSpan(begin, end, label) diff --git a/src/main/scala/epic/parser/ProductionFeaturizer.scala b/src/main/scala/epic/parser/ProductionFeaturizer.scala index 22afdd11..8022db83 100644 --- a/src/main/scala/epic/parser/ProductionFeaturizer.scala +++ b/src/main/scala/epic/parser/ProductionFeaturizer.scala @@ -28,12 +28,12 @@ import epic.features.IndicatorFeature */ @SerialVersionUID(1L) class ProductionFeaturizer[L, L2, W](val topology: RuleTopology[L], refinements: GrammarRefinements[L, L2], - lGen: L2=>Seq[Feature] = {(x:L2)=>if(x.isInstanceOf[Feature]) Seq(x.asInstanceOf[Feature]) else Seq(IndicatorFeature(x))}, + lGen: L2=>Seq[Feature] = {(x:L2)=>if (x.isInstanceOf[Feature]) Seq(x.asInstanceOf[Feature]) else Seq(IndicatorFeature(x))}, rGen: Rule[L2] => Seq[Feature] = {(x: Rule[L2]) => Seq(x)}, filterRedundantFeatures: Boolean = false) extends RefinedFeaturizer[L, W, Feature] with Serializable { private val (index_ :Index[Feature], ruleFeatures: Array[Array[Int]], labelFeatures: Array[Array[Int]]) = { - if(filterRedundantFeatures) { + if (filterRedundantFeatures) { val index = epic.features.buildNonRedundantFeatureIndex[Either[Rule[L2], L2], Feature](refinements.rules.fineIndex.iterator.map(Left(_)) ++ refinements.labels.fineIndex.iterator.map(Right(_)), { case Left(r) => rGen(r) case Right(l) => lGen(l) @@ -63,7 +63,6 @@ class ProductionFeaturizer[L, L2, W](val topology: RuleTopology[L], refinements: def featuresForLabel(l: Int): Array[Int] = labelFeatures(l) - override def lock: RefinedFeaturizer[L, W, Feature] = this def anchor(w: IndexedSeq[W]) = new Anchoring { diff --git a/src/main/scala/epic/parser/ProjectionsGrammarAnchoring.scala b/src/main/scala/epic/parser/ProjectionsGrammarAnchoring.scala index 3f97b710..5b81b8ea 100644 --- a/src/main/scala/epic/parser/ProjectionsGrammarAnchoring.scala +++ b/src/main/scala/epic/parser/ProjectionsGrammarAnchoring.scala @@ -12,7 +12,6 @@ trait ProjectionsGrammarAnchoring[L, L2, W] extends GrammarAnchoring[L, W] { def refinements: GrammarRefinements[L, L2] def refinedTopology: RuleTopology[L2] - final def validLabelRefinements(begin: Int, end: Int, label: Int) = { refinements.labels.localRefinements(label) } @@ -60,7 +59,7 @@ trait ProjectionsGrammarAnchoring[L, L2, W] extends GrammarAnchoring[L, W] { val b2 = refinements.labels.globalize(b, refB) val rule = UnaryRule(refinements.labels.fineIndex.get(a2), refinements.labels.fineIndex.get(b2), topology.chain(r)) val refinedRuleIndex = refinements.rules.fineIndex(rule) - if(refinedRuleIndex < 0) { + if (refinedRuleIndex < 0) { -1 } else { refinements.rules.localize(refinedRuleIndex) @@ -79,7 +78,7 @@ trait ProjectionsGrammarAnchoring[L, L2, W] extends GrammarAnchoring[L, W] { refinements.labels.fineIndex.get(c2) ) val fi = refinements.rules.fineIndex(rule) - if(fi < 0) throw new RuntimeException(s"No such rule: $rule") + if (fi < 0) throw new RuntimeException(s"No such rule: $rule") refinements.rules.localize(fi) } @@ -100,6 +99,4 @@ trait ProjectionsGrammarAnchoring[L, L2, W] extends GrammarAnchoring[L, W] { refinements.rightChildRefinementsCompatibleWithRule(rule) } - - } diff --git a/src/main/scala/epic/parser/RefinedChartMarginal.scala b/src/main/scala/epic/parser/RefinedChartMarginal.scala index 076b576c..5e8ae9e8 100644 --- a/src/main/scala/epic/parser/RefinedChartMarginal.scala +++ b/src/main/scala/epic/parser/RefinedChartMarginal.scala @@ -41,7 +41,6 @@ final case class RefinedChartMarginal[L, W](anchoring: GrammarAnchoring[L, W], logPartition: Double, override val isMaxMarginal: Boolean) extends ParseMarginal[L, W] with SafeLogging { - override def insideTopScore(begin: Int, end: Int, sym: Int, ref: Int): Double = inside.top(begin, end, sym, ref) override def insideBotScore(begin: Int, end: Int, sym: Int, ref: Int): Double = inside.bot(begin, end, sym, ref) @@ -49,11 +48,9 @@ final case class RefinedChartMarginal[L, W](anchoring: GrammarAnchoring[L, W], val in = inside.bot.decodedLabelScores(begin, end) in += outside.bot.decodedLabelScores(begin, end) in -= logPartition - breeze.numerics.exp(in) } - def feasibleSplitPoints(begin: Int, end: Int, leftChild: Int, leftChildRef: Int, rightChild: Int, rightChildRef: Int):IndexedSeq[Int] = { inside.top.feasibleSplitPoints(begin, end, leftChild, leftChildRef, rightChild, rightChildRef).toIndexedSeq } @@ -62,8 +59,8 @@ final case class RefinedChartMarginal[L, W](anchoring: GrammarAnchoring[L, W], * Forest traversal that visits spans in a "bottom up" order. */ def visitPostorder(spanVisitor: AnchoredVisitor[L], spanThreshold: Double = Double.NegativeInfinity):Unit = { - if(logPartition.isInfinite) throw new RuntimeException("No parse for " + words) - if(logPartition.isNaN) throw new RuntimeException("NaN prob!") + if (logPartition.isInfinite) throw new RuntimeException("No parse for " + words) + if (logPartition.isNaN) throw new RuntimeException("NaN prob!") val itop = inside.top @@ -98,12 +95,12 @@ final case class RefinedChartMarginal[L, W](anchoring: GrammarAnchoring[L, W], val aOutside = outside.bot.labelScore(begin, end, a, refA) val labelMarginal = aOutside + inside.bot.labelScore(begin, end, a, refA) - logPartition val aScore = aOutside + anchoring.scoreSpan(begin, end, a, refA) - if(labelMarginal > spanThreshold) { + if (labelMarginal > spanThreshold) { spanVisitor.visitSpan(begin, end, a, refA, math.exp(labelMarginal)) - if(!spanVisitor.skipBinaryRules) { + if (!spanVisitor.skipBinaryRules) { val rules = anchoring.validCoarseRulesGivenParentRefinement(a, refA) - while(i < rules.length) { + while (i < rules.length) { val r = rules(i) val b = topology.leftChild(r) val c = topology.rightChild(r) @@ -111,10 +108,10 @@ final case class RefinedChartMarginal[L, W](anchoring: GrammarAnchoring[L, W], val feasibleCoarseRange = inside.top.feasibleSplitPoints(begin, end, b, c) - if(feasibleCoarseRange.nonEmpty) { + if (feasibleCoarseRange.nonEmpty) { val refinements = anchoring.validRuleRefinementsGivenParent(begin, end, r, refA) var ruleRefIndex = 0 - while(ruleRefIndex < refinements.length) { + while (ruleRefIndex < refinements.length) { val refR = refinements(ruleRefIndex) ruleRefIndex += 1 val refB = anchoring.leftChildRefinement(r, refR) @@ -125,7 +122,7 @@ final case class RefinedChartMarginal[L, W](anchoring: GrammarAnchoring[L, W], var split = feasibleSplitRange.begin val endSplit = feasibleSplitRange.end - while(split < endSplit) { + while (split < endSplit) { val bInside = itop.labelScore(begin, split, b, refB) val cInside = itop.labelScore(split, end, c, refC) val withoutRefined = bInside + cInside @@ -147,7 +144,7 @@ final case class RefinedChartMarginal[L, W](anchoring: GrammarAnchoring[L, W], } // Unaries - if(!spanVisitor.skipUnaryRules) + if (!spanVisitor.skipUnaryRules) for { span <- 1 to words.length begin <- 0 to (words.length - span) @@ -201,7 +198,6 @@ final case class RefinedChartMarginal[L, W](anchoring: GrammarAnchoring[L, W], this } - def checkForTreeOutside(tree: BinarizedTree[(L, Int)]) { for (t <- tree.allChildren) t match { case tree@UnaryTree( (label, ref), _, _, span) => @@ -243,14 +239,13 @@ object RefinedChartMarginal { apply(grammar.anchor(sent)) } - def apply[L, W](anchoring: GrammarAnchoring[L, W]): RefinedChartMarginal[L, W] = { apply(anchoring, false) } def apply[L, W](anchoring: GrammarAnchoring[L, W], maxMarginal: Boolean): RefinedChartMarginal[L, W] = { val sent = anchoring.words - val sum = if(maxMarginal) MaxSummer else LogSummer + val sum = if (maxMarginal) MaxSummer else LogSummer val inside = buildInsideChart(anchoring, sent, sum) val logPartition = rootScore(anchoring, inside, sum) val outside = buildOutsideChart(anchoring, inside, sum) @@ -269,7 +264,7 @@ object RefinedChartMarginal { private[parser] object MaxSummer extends Summer { def apply(a: Double, b: Double): Double = math.max(a,b) - def apply(a: Array[Double], length: Int): Double = if(length == 0) Double.NegativeInfinity else max.array(a, length) + def apply(a: Array[Double], length: Int): Double = if (length == 0) Double.NegativeInfinity else max.array(a, length) } private def rootScore[L, W](anchoring: GrammarAnchoring[L, W], inside: RefinedParseChart[L], sum: Summer): Double = { @@ -278,13 +273,13 @@ object RefinedChartMarginal { var offset = 0 for(ref <- inside.top.enteredLabelRefinements(0, inside.length, rootIndex)) { val score = inside.top.labelScore(0, inside.length, rootIndex, ref) - if(score != Double.NegativeInfinity) { + if (score != Double.NegativeInfinity) { rootScores(offset) = score offset += 1 } } val score = sum(rootScores, offset) -// assert(score != 0.0, rootScores.mkString(", ") + anchoring.words) + // assert(score != 0.0, rootScores.mkString(", ") + anchoring.words) assert(!score.isNaN, rootScores.mkString(", ")) score } @@ -329,7 +324,6 @@ object RefinedChartMarginal { } { val end = begin + span - for ( a <- 0 until anchoring.topology.labelIndex.size if anchoring.sparsityPattern.bot.isAllowedLabeledSpan(begin, end, a)) { val numValidLabelRefs = anchoring.numValidRefinements(a) java.util.Arrays.fill(offsets, 0) @@ -337,27 +331,26 @@ object RefinedChartMarginal { val rules = anchoring.topology.indexedBinaryRulesWithParent(a) var ruleIndex = 0 // into rules - while(ruleIndex < rules.length) { + while (ruleIndex < rules.length) { val r = rules(ruleIndex) val b = anchoring.topology.leftChild(r) val c = anchoring.topology.rightChild(r) ruleIndex += 1 - val feasibleCoarseRange = inside.top.feasibleSplitPoints(begin, end, b, c) - if(feasibleCoarseRange.nonEmpty) { + if (feasibleCoarseRange.nonEmpty) { val validA = anchoring.validParentRefinementsGivenRule(begin, feasibleCoarseRange.begin, feasibleCoarseRange.end, end, r) var ai = 0 - while(ai < validA.length) { + while (ai < validA.length) { val refA = validA(ai) ai += 1 val spanScore = anchoring.scoreSpan(begin, end, a, refA) - if(!spanScore.isInfinite) { + if (!spanScore.isInfinite) { val refinements = anchoring.validRuleRefinementsGivenParent(begin, end, r, refA) var ruleRefIndex = 0 - while(ruleRefIndex < refinements.length) { + while (ruleRefIndex < refinements.length) { val refR = refinements(ruleRefIndex) ruleRefIndex += 1 val refB = anchoring.leftChildRefinement(r, refR) @@ -368,21 +361,21 @@ object RefinedChartMarginal { var split = feasibleSplitRange.begin val endSplit = feasibleSplitRange.end - while(split < endSplit) { + while (split < endSplit) { val bScore = inside.top.labelScore(begin, split, b, refB) val cScore = inside.top.labelScore(split, end, c, refC) val withoutRule = bScore + cScore + spanScore - if(withoutRule != Double.NegativeInfinity) { + if (withoutRule != Double.NegativeInfinity) { val prob = withoutRule + anchoring.scoreBinaryRule(begin, split, end, r, refR) assert(!prob.isNaN, s"$withoutRule ${anchoring.scoreBinaryRule(begin, split, end, r, refR)} $bScore $cScore $spanScore") - if(prob != Double.NegativeInfinity) { + if (prob != Double.NegativeInfinity) { scoreArray(refA)(offsets(refA)) = prob offsets(refA) += 1 // buffer full - if(offsets(refA) == scoreArray(refA).length) { + if (offsets(refA) == scoreArray(refA).length) { scoreArray(refA)(0) = sum(scoreArray(refA), offsets(refA)) offsets(refA) = 1 } @@ -395,14 +388,12 @@ object RefinedChartMarginal { } // end a refinement - - } // end canBuildThisRule } // end rules enterScoresForLabelRefinements(sum, scoreArray, offsets, inside.bot, begin, end, a, numValidLabelRefs) // assert(rootScore(anchoring, inside, sum) != 0.0, (begin, end, a)) -// if(!foundSomething && refined.sparsityPattern != ChartConstraints.noSparsity) { +// if (!foundSomething && refined.sparsityPattern != ChartConstraints.noSparsity) { // logger.warn(s"Failed to replicate a span in ($begin, $end) of ${anchoring.words}. Label is ${anchoring.grammar.labelIndex.get(a)}") // // } @@ -413,7 +404,6 @@ object RefinedChartMarginal { inside } - private def enterScoresForLabelRefinements[L](sum: Summer, scoreArray: Array[Array[Double]], offsets: Array[Int], bot: RefinedParseChart[L]#ChartScores, begin: Int, end: Int, parent: Int, numValidLabelRefs: Int) { var foundSomething = false var ai = 0 @@ -436,7 +426,6 @@ object RefinedChartMarginal { val grammar = anchoring.topology val rootIndex = grammar.labelIndex(grammar.root) - val length = inside.length val outside = RefinedParseChart(grammar.labelIndex, Array.tabulate(grammar.labelIndex.size)(refined.numValidRefinements), @@ -458,7 +447,7 @@ object RefinedChartMarginal { val enteredTop = inside.top.enteredLabelIndexes(begin, end) var a = 0 - while(a < grammar.labelIndex.size) { + while (a < grammar.labelIndex.size) { // we're going to populate a by looking at rules p -> a rc, p -> lc a if (enteredTop.contains(a)) { java.util.Arrays.fill(offsets, 0) @@ -476,7 +465,6 @@ object RefinedChartMarginal { outside } - private def doOutsideLeftCompletionUpdates[W, L](inside: RefinedParseChart[L], outside: RefinedParseChart[L], anchoring: GrammarAnchoring[L, W], begin: Int, end: Int, @@ -486,7 +474,6 @@ object RefinedChartMarginal { val grammar = refined.topology val rules = anchoring.topology.indexedBinaryRulesWithLeftChild(label) - var br = 0 while (br < rules.length) { val r = rules(br) @@ -528,7 +515,7 @@ object RefinedChartMarginal { if (cInside != Double.NegativeInfinity && pOutside != Double.NegativeInfinity) { val ruleScore = refined.scoreBinaryRule(begin, end, completion, r, refR) val score = cInside + ruleScore + pOutside - if(score != Double.NegativeInfinity) { + if (score != Double.NegativeInfinity) { scoreArray(refA)(offsets(refA)) = score offsets(refA) += 1 // buffer full @@ -557,7 +544,6 @@ object RefinedChartMarginal { val rcMaxCompletion = inside.top.rightMostEndForBegin(end)(rc)(refC) val completionBegin = math.max(math.max(parentMinCompletion, rcMinCompletion), end + 1) val completionEnd = math.min(parentMaxCompletion, rcMaxCompletion) - Span(completionBegin, completionEnd) } @@ -568,7 +554,6 @@ object RefinedChartMarginal { val rcMaxCompletion = inside.top.coarseRightMostEndForBegin(end)(rc) val completionBegin = math.max(math.max(parentMinCompletion, rcMinCompletion), end + 1) val completionEnd = math.min(parentMaxCompletion, rcMaxCompletion) - Span(completionBegin, completionEnd) } @@ -621,7 +606,7 @@ object RefinedChartMarginal { if (bInside != Double.NegativeInfinity && pOutside != Double.NegativeInfinity) { val ruleScore = refined.scoreBinaryRule(completion, begin, end, r, refR) val score = bInside + ruleScore + pOutside - if(score != Double.NegativeInfinity) { + if (score != Double.NegativeInfinity) { scoreArray(refA)(offsets(refA)) = score offsets(refA) += 1 // buffer full @@ -642,7 +627,6 @@ object RefinedChartMarginal { } } - private def feasibleSpanForRightCompletion[L, W](begin: Int, end: Int, p: Int, refP: Int, lc: Int, refB: Int, inside: RefinedParseChart[L]) = { val parentMinCompletion = inside.bot.leftMostBeginForEnd(end)(p)(refP) val rcMinCompletion = inside.top.leftMostBeginForEnd(begin)(lc)(refB) @@ -674,13 +658,13 @@ object RefinedChartMarginal { for(bi <- chart.bot.enteredLabelIndexes(begin, end); refB <- chart.bot.enteredLabelRefinements(begin, end, bi)) { val b = bi val bScore = chart.bot.labelScore(begin, end, b, refB) - if(bScore != Double.NegativeInfinity) { + if (bScore != Double.NegativeInfinity) { val rules = grammar.indexedUnaryRulesWithChild(b) var j = 0 - while(j < rules.length) { + while (j < rules.length) { val r = rules(j) val a = grammar.parent(r) - if(refined.sparsityPattern.top.isAllowedLabeledSpan(begin, end, a)) { + if (refined.sparsityPattern.top.isAllowedLabeledSpan(begin, end, a)) { for (refR <- refined.validUnaryRuleRefinementsGivenChild(begin, end, r, refB)) { val refA = refined.parentRefinement(r, refR) val ruleScore: Double = refined.scoreUnaryRule(begin, end, r, refR) @@ -709,15 +693,15 @@ object RefinedChartMarginal { val bScore = chart.top.labelScore(begin, end, a, refA) val rules = grammar.indexedUnaryRulesWithParent(a) var j = 0 - while(j < rules.length) { + while (j < rules.length) { val r = rules(j) val b = grammar.child(r) - if(inside.bot.isLabelEntered(begin, end, b)) + if (inside.bot.isLabelEntered(begin, end, b)) for(refR <- refined.validRuleRefinementsGivenParent(begin, end, rules(j), refA)) { val refB = refined.childRefinement(rules(j), refR) val ruleScore: Double = refined.scoreUnaryRule(begin, end, rules(j), refR) val prob: Double = bScore + ruleScore - if(prob != Double.NegativeInfinity) { + if (prob != Double.NegativeInfinity) { chart.bot.enter(begin, end, b, refB, sum(chart.bot.labelScore(begin, end, b, refB), prob)) } } @@ -727,8 +711,6 @@ object RefinedChartMarginal { } - - } diff --git a/src/main/scala/epic/parser/RefinedFeaturizer.scala b/src/main/scala/epic/parser/RefinedFeaturizer.scala index a42cb491..7e81eb8b 100644 --- a/src/main/scala/epic/parser/RefinedFeaturizer.scala +++ b/src/main/scala/epic/parser/RefinedFeaturizer.scala @@ -19,7 +19,6 @@ import epic.trees.{LexicalProduction, Production, Rule} import breeze.util.Index import epic.framework.Feature - /** * * @author dlwh diff --git a/src/main/scala/epic/parser/RefinedParseChart.scala b/src/main/scala/epic/parser/RefinedParseChart.scala index beb0b077..89068b4b 100644 --- a/src/main/scala/epic/parser/RefinedParseChart.scala +++ b/src/main/scala/epic/parser/RefinedParseChart.scala @@ -48,15 +48,13 @@ class RefinedParseChart[L](val index: Index[L], /** (begin,end) -> label -> refinement -> score */ // fill in arrays for spans we might touch val score: TriangularArray[Array[Array[Double]]] = TriangularArray.tabulate(length+1){(begin, end) => - if(sparsity.isAllowedSpan(begin, end)) { + if (sparsity.isAllowedSpan(begin, end)) { makeGrammarScoreArray(begin, end) } else { null } } - - /** (begin,end) -> which labels are on */ val enteredLabels: Array[BitSet] = mkBitSetArray(TriangularArray.arraySize(length+1)) /** (begin,end) -> label -> which refinements of label are on */ @@ -94,19 +92,19 @@ class RefinedParseChart[L](val index: Index[L], def enteredLabelScores(begin: Int, end: Int) = { val scoreArray = score(begin, end) - if(scoreArray eq null) Iterator.empty + if (scoreArray eq null) Iterator.empty else enteredLabels(TriangularArray.index(begin, end)).iterator.map { i => (index.get(i), scoreArray(i))} } def decodedLabelScores(begin: Int, end: Int):Counter2[L,Int,Double] = { val scoreArray = score(begin, end) - if(scoreArray eq null) Counter2() + if (scoreArray eq null) Counter2() else { val ret = Counter2[L, Int, Double]() for(i <- enteredLabels(TriangularArray.index(begin, end))) { val l = index.get(i) for((v,s) <- scoreArray(i).zipWithIndex) { - if(v != zero) + if (v != zero) ret(l,s) = v } } @@ -116,18 +114,17 @@ class RefinedParseChart[L](val index: Index[L], def decodedLabelScores(begin: Int, end: Int, label: Int):Counter[Int,Double] = { val scoreArray = score(begin, end) - if(scoreArray == null || scoreArray(label) == null) Counter() + if (scoreArray == null || scoreArray(label) == null) Counter() else { val ret = Counter[Int, Double]() for((v,s) <- scoreArray(label).zipWithIndex) { - if(v != zero) + if (v != zero) ret(s) = v } ret } } - private def rawEnter(begin: Int, end: Int, parent: Int, ref: Int, w: Double) = { val arrx = score(begin, end) val arr = arrx(parent) @@ -136,11 +133,10 @@ class RefinedParseChart[L](val index: Index[L], oldScore } - def enter(begin: Int, end: Int, parent: Int, ref: Int, w: Double): Unit = { val oldScore = rawEnter(begin, end, parent, ref, w) - if(oldScore == zero) { + if (oldScore == zero) { val index = TriangularArray.index(begin, end) updateExtents(index, parent, ref, begin, end) } @@ -190,22 +186,19 @@ class RefinedParseChart[L](val index: Index[L], narrowR < end } - - def feasibleSplitPoints(begin: Int, end: Int, b: Int, c: Int) = { val narrowR = coarseLeftMostEndForBegin(begin)(b) val narrowL = coarseRightMostBeginForEnd(end)(c) var split = math.max(narrowR, coarseLeftMostBeginForEnd(end)(c)) val endSplit = math.min(coarseRightMostEndForBegin(begin)(b), narrowL) + 1 val canBuildThisRule = narrowR < end && narrowL >= narrowR && split <= narrowL && split < endSplit - if(!canBuildThisRule) + if (!canBuildThisRule) split = endSplit - Span(split, endSplit) } def feasibleSplitPoints(begin: Int, end: Int, b: Int, refB: Int, c: Int, refC: Int) = { - if(leftMostEndForBegin(begin)(b) == null || rightMostBeginForEnd(end)(c) == null) { + if (leftMostEndForBegin(begin)(b) == null || rightMostBeginForEnd(end)(c) == null) { Span(0,0) } else { val narrowR = leftMostEndForBegin(begin)(b)(refB) @@ -213,9 +206,8 @@ class RefinedParseChart[L](val index: Index[L], var split = math.max(narrowR, leftMostBeginForEnd(end)(c)(refC)) val endSplit = math.min(rightMostEndForBegin(begin)(b)(refB), narrowL) + 1 val canBuildThisRule = narrowR < end && narrowL >= narrowR && split <= narrowL && split < endSplit - if(!canBuildThisRule) + if (!canBuildThisRule) split = endSplit - Span(split, endSplit) } } @@ -228,7 +220,6 @@ class RefinedParseChart[L](val index: Index[L], arr } - private def makeGrammarScoreArray(begin: Int, end: Int): Array[Array[Double]] = { val arr = new Array[Array[Double]](index.size) var l = 0 @@ -245,15 +236,12 @@ class RefinedParseChart[L](val index: Index[L], protected final def zero = Double.NegativeInfinity } - - object RefinedParseChart { def apply[L](g: Index[L], refinements: Array[Int], length: Int, constraints: ChartConstraints[L]): RefinedParseChart[L] = { new RefinedParseChart(g, refinements, length, constraints) } - // all of these methods could be replaced by an Array.fill or Array.tabulate, but // those were showing up in the profile. @@ -269,13 +257,12 @@ object RefinedParseChart { Array.fill[Array[BitSet]](length)(mkBitSetArray(grammarSize)) } - private def makeRefinedExtentArray(len: Int, refinementsFor: Array[Int], fillValue: Int): Array[Array[Array[Int]]] = { val arr = Array.ofDim[Array[Int]](len, refinementsFor.length) var pos = 0 while (pos < len) { var l = 0 - while(l < refinementsFor.length) { + while (l < refinementsFor.length) { arr(pos)(l) = Array.fill[Int](refinementsFor(l))(fillValue) l += 1 } diff --git a/src/main/scala/epic/parser/RuleFeaturizer.scala b/src/main/scala/epic/parser/RuleFeaturizer.scala index 4cc87d57..2bfa3529 100644 --- a/src/main/scala/epic/parser/RuleFeaturizer.scala +++ b/src/main/scala/epic/parser/RuleFeaturizer.scala @@ -41,6 +41,5 @@ class RuleFeaturizer[L, W](grammar: RuleTopology[L]) extends RefinedFeaturizer[L } } - override def lock = this } diff --git a/src/main/scala/epic/parser/RuleTopology.scala b/src/main/scala/epic/parser/RuleTopology.scala index 7868fdee..e44cdba1 100644 --- a/src/main/scala/epic/parser/RuleTopology.scala +++ b/src/main/scala/epic/parser/RuleTopology.scala @@ -44,12 +44,11 @@ final class RuleTopology[L] private ( val rootIndex = labelIndex(root) - def labelEncoder = Encoder.fromIndex(labelIndex) // Accessors for properties of indexed rules /** Returns the parent label index from the rule index */ - def parent(r: Int):Int = indexedRules(r).parent + def parent(r: Int): Int = indexedRules(r).parent /** Returns the left child label index from the rule index */ def leftChild(r: Int): Int = indexedRules(r).asInstanceOf[BinaryRule[Int]].left /** Returns the right child label index from the rule index */ @@ -85,7 +84,7 @@ final class RuleTopology[L] private ( for( (parent,block) <- blocks) { var first = true for (r <- block) { - if(!first) + if (!first) builder ++= (" "*startLength) else builder ++= labelStrings(parent).padTo(startLength, ' ') @@ -94,7 +93,7 @@ final class RuleTopology[L] private ( r match { case UnaryRule(a, b, chain) => - if(chain.nonEmpty) + if (chain.nonEmpty) chain.addString(builder, "(", "^", ")^") builder ++= labelStrings(b) case BinaryRule(a, b, c) => @@ -187,8 +186,6 @@ object RuleTopology { unaryRulesByChild.map(_.toArray)) } - - @SerialVersionUID(1) private class SerializedForm[L](var root: L, var labelIndex: Index[L], var ri: Index[Rule[L]]) extends Serializable { @throws(classOf[ObjectStreamException]) diff --git a/src/main/scala/epic/parser/SimpleChartMarginal.scala b/src/main/scala/epic/parser/SimpleChartMarginal.scala index f6625608..f2da72ca 100644 --- a/src/main/scala/epic/parser/SimpleChartMarginal.scala +++ b/src/main/scala/epic/parser/SimpleChartMarginal.scala @@ -30,8 +30,8 @@ final case class SimpleChartMarginal[L, L2, W](anchoring: SimpleGrammar.Anchorin } override def visitPostorder(spanVisitor: AnchoredVisitor[L], spanThreshold: Double): Unit = { - if(logPartition.isInfinite) throw new RuntimeException("No parse for " + words) - if(logPartition.isNaN) throw new RuntimeException("NaN prob!") + if (logPartition.isInfinite) throw new RuntimeException("No parse for " + words) + if (logPartition.isNaN) throw new RuntimeException("NaN prob!") val refinedTopology = anchoring.refinedTopology @@ -62,20 +62,20 @@ final case class SimpleChartMarginal[L, L2, W](anchoring: SimpleGrammar.Anchorin val end = begin + span val aOutside = outside.bot(begin, end, parent) val labelMarginal = inside.bot(begin, end, parent) + aOutside - logPartition - if(labelMarginal > spanThreshold) { + if (labelMarginal > spanThreshold) { val aCoarse = anchoring.refinements.labels.project(parent) val aRef = anchoring.refinements.labels.localize(parent) spanVisitor.visitSpan(begin, end, aCoarse, aRef, math.exp(labelMarginal)) - if(!spanVisitor.skipBinaryRules) { + if (!spanVisitor.skipBinaryRules) { val rules = anchoring.refinedTopology.indexedBinaryRulesWithParent(parent) var i = 0 - while(i < rules.length) { + while (i < rules.length) { val r = rules(i) val b = refinedTopology.leftChild(r) val c = refinedTopology.rightChild(r) var split = begin + 1 - while(split < end) { + while (split < end) { val bInside = inside.top.labelScore(begin, split, b) val cInside = inside.top.labelScore(split, end, c) val ruleScore = anchoring.grammar.ruleScore(r) @@ -85,7 +85,7 @@ final case class SimpleChartMarginal[L, L2, W](anchoring: SimpleGrammar.Anchorin val margScore = bInside + cInside + ruleScore + aOutside - logPartition - if(margScore != Double.NegativeInfinity) { + if (margScore != Double.NegativeInfinity) { spanVisitor.visitBinaryRule(begin, split, end, coarseR, refR, math.exp(margScore)) } @@ -100,7 +100,7 @@ final case class SimpleChartMarginal[L, L2, W](anchoring: SimpleGrammar.Anchorin } } - if(!spanVisitor.skipUnaryRules) + if (!spanVisitor.skipUnaryRules) for { span <- 1 to words.length begin <- 0 to (words.length - span) @@ -136,7 +136,7 @@ object SimpleChartMarginal { } def apply[L, L2, W](anchoring: SimpleGrammar.Anchoring[L, L2, W], maxMarginal: Boolean): SimpleChartMarginal[L, L2, W] = { - val sum = if(maxMarginal) MaxSummer else LogSummer + val sum = if (maxMarginal) MaxSummer else LogSummer val inside = buildInsideChart(anchoring, sum) val outside = buildOutsideChart(anchoring, inside, sum) SimpleChartMarginal(anchoring, inside, outside, maxMarginal) @@ -188,24 +188,23 @@ object SimpleChartMarginal { val rdoff = rcell.offset var lc = 0 - while(lc < numSyms) { + while (lc < numSyms) { val lcSpan = tensor.leftChildRange(lc) var rcOff = lcSpan.begin val rcEnd = lcSpan.end val bInside = ldata(ldoff + lc) - if(bInside != Double.NegativeInfinity) { - while(rcOff < rcEnd) { + if (bInside != Double.NegativeInfinity) { + while (rcOff < rcEnd) { val rc = tensor.rightChildForOffset(rcOff) val cInside = rdata(rdoff + rc) val rcSpan = tensor.rightChildRange(rcOff) val withoutRule = bInside + cInside - - if(cInside != Double.NegativeInfinity) { + if (cInside != Double.NegativeInfinity) { var pOff = rcSpan.begin val pEnd = rcSpan.end - while(pOff < pEnd) { + while (pOff < pEnd) { val p = tensor.parentForOffset(pOff) val score = tensor.ruleScoreForOffset(pOff) + withoutRule pdata(p + pdoff) = sum(pdata(p + pdoff), score) @@ -215,8 +214,6 @@ object SimpleChartMarginal { } - - rcOff += 1 } } @@ -228,7 +225,6 @@ object SimpleChartMarginal { split += 1 } - updateInsideUnaries(chart, anchoring, begin, end, sum) } @@ -257,7 +253,7 @@ object SimpleChartMarginal { val pdoff = pcell.offset var a = 0 - while(a < numSyms) { + while (a < numSyms) { val outsideA = pdata(pdoff + a) if (outsideA != Double.NegativeInfinity) { val pSpan = tensor.leftChildRange(a) @@ -280,23 +276,23 @@ object SimpleChartMarginal { val ordoff = orcell.offset var lcOff = pSpan.begin - while(lcOff < lcEnd) { + while (lcOff < lcEnd) { val lc = tensor.rightChildForOffset(lcOff) val bInside = ldata(ldoff + lc) - if(bInside != Double.NegativeInfinity) { + if (bInside != Double.NegativeInfinity) { val lcSpan = tensor.rightChildRange(lcOff) var rcOff = lcSpan.begin val rcEnd = lcSpan.end - while(rcOff < rcEnd) { + while (rcOff < rcEnd) { val rc = tensor.parentForOffset(rcOff) val score = tensor.ruleScoreForOffset(rcOff) + outsideA val cInside = rdata(rdoff + rc) if (cInside != Double.NegativeInfinity) { oldata(oldoff + lc) = sum(oldata(oldoff + lc), cInside + score) ordata(ordoff + rc) = sum(ordata(ordoff + rc), bInside + score) -// outside.top.enter(begin, split, lc, sum(outside.top.labelScore(begin, split, lc), cInside + score)) -// outside.top.enter(split, end, rc, sum(outside.top.labelScore(split, end, rc), bInside + score)) + // outside.top.enter(begin, split, lc, sum(outside.top.labelScore(begin, split, lc), cInside + score)) + // outside.top.enter(split, end, rc, sum(outside.top.labelScore(split, end, rc), bInside + score)) } rcOff += 1 } @@ -312,7 +308,6 @@ object SimpleChartMarginal { outside } - private def updateInsideUnaries[L, L2, W](chart: SimpleParseChart[L2], anchoring: SimpleGrammar.Anchoring[L, L2, W], begin: Int, end: Int, sum: Summer) = { @@ -320,10 +315,8 @@ object SimpleChartMarginal { val parentCell = chart.top.cell(begin, end) val tensor = anchoring.grammar.insideTensor doMatrixMultiply(childCell, parentCell, tensor, sum) - } - private def doMatrixMultiply[W, L2, L](childCell: DenseVector[Double], parentCell: DenseVector[Double], tensor: SparseRuleTensor[L2], sum: RefinedChartMarginal.Summer) { val numSyms = childCell.size val cdata = childCell.data @@ -345,7 +338,6 @@ object SimpleChartMarginal { aOff += 1 } } - b += 1 } } @@ -357,7 +349,6 @@ object SimpleChartMarginal { val parentCell = outside.top.cell(begin, end) val tensor = anchoring.grammar.outsideTensor doMatrixMultiply(parentCell, childCell, tensor, sum) - } case class SimpleChartFactory[L, L2, W](refinedGrammar: SimpleGrammar[L, L2, W], maxMarginal: Boolean = false) extends ParseMarginal.Factory[L, W] { @@ -368,8 +359,6 @@ object SimpleChartMarginal { } - - @SerialVersionUID(1) final class SimpleParseChart[L](val index: Index[L], val length: Int) extends Serializable { @@ -386,7 +375,6 @@ final class SimpleParseChart[L](val index: Index[L], val length: Int) extends Se //scores(::, TriangularArray.index(begin, end)) } - def apply(begin: Int, end: Int, label: L):Double = apply(begin, end, index(label)) def labelScore(begin: Int, end: Int, label: L):Double = apply(begin, end, index(label)) @@ -397,6 +385,3 @@ final class SimpleParseChart[L](val index: Index[L], val length: Int) extends Se } } - - - diff --git a/src/main/scala/epic/parser/SimpleGrammar.scala b/src/main/scala/epic/parser/SimpleGrammar.scala index 61c14123..7ebb56ae 100644 --- a/src/main/scala/epic/parser/SimpleGrammar.scala +++ b/src/main/scala/epic/parser/SimpleGrammar.scala @@ -103,7 +103,6 @@ object SimpleGrammar { } - private def doCloseUnaries(matrix: DenseMatrix[Double], closureType: CloseUnaries.Value, syms: Index[AnnotatedLabel]): immutable.IndexedSeq[(UnaryRule[AnnotatedLabel], Double)] = closureType match { case CloseUnaries.None => val probs = breeze.numerics.log(matrix) @@ -161,13 +160,13 @@ object SimpleGrammar { val binaryIn = Source.fromInputStream(new FileInputStream(prefix+".binary")) for ( line <- binaryIn.getLines()) { val Array(_a,_b,_c, score) = line.split("\\s+") - val a = if(_a.startsWith("ROOT")) "TOP_0" else preprocessSymbol(_a) - val b = if(_b.startsWith("ROOT")) "TOP_0" else preprocessSymbol(_b) - val c = if(_c.startsWith("ROOT")) "TOP_0" else preprocessSymbol(_c) + val a = if (_a.startsWith("ROOT")) "TOP_0" else preprocessSymbol(_a) + val b = if (_b.startsWith("ROOT")) "TOP_0" else preprocessSymbol(_b) + val c = if (_c.startsWith("ROOT")) "TOP_0" else preprocessSymbol(_c) val logScore = math.log(score.toDouble) - if(logScore >= threshold) { + if (logScore >= threshold) { val ruleId = rules.index(BinaryRule(a,b,c).map(AnnotatedLabel(_))) - if(ruleId == ruleScores.length) + if (ruleId == ruleScores.length) ruleScores += logScore syms.index(AnnotatedLabel(a)) syms.index(AnnotatedLabel(b)) @@ -182,10 +181,10 @@ object SimpleGrammar { val unclosedUnaries: DenseMatrix[Double] = DenseMatrix.eye[Double](syms.size) for ( line <- unaryIn.getLines()) { val Array(_a, _b,score) = line.split("\\s+") - val a = if(_a.startsWith("ROOT")) "TOP_0" else preprocessSymbol(_a) - val b = if(_b.startsWith("ROOT")) "TOP_0" else preprocessSymbol(_b) + val a = if (_a.startsWith("ROOT")) "TOP_0" else preprocessSymbol(_a) + val b = if (_b.startsWith("ROOT")) "TOP_0" else preprocessSymbol(_b) val logScore = math.log(score.toDouble) - if(logScore >= threshold) { + if (logScore >= threshold) { val ai = syms(AnnotatedLabel(a)) val bi = syms(AnnotatedLabel(b)) require(ai >= 0 && bi >= 0, a + " " + b + " " + syms) @@ -231,7 +230,6 @@ object SimpleGrammar { scorer) } - def preprocessSymbol(_sym: String): String = { val sym = if (_sym == "ROOT") "TOP" else if (_sym == "PRT|ADVP") "PRT" else _sym sym.replaceAll("ROOT","TOP").replaceAll("PRT\\|ADVP_[0-9]*", "PRT_0") @@ -248,7 +246,6 @@ object SimpleGrammar { case class Anchoring[L, L2, W](grammar: SimpleGrammar[L, L2, W], words: IndexedSeq[W], override val sparsityPattern: ChartConstraints[L]) extends ProjectionsGrammarAnchoring[L, L2, W] { - override def addConstraints(constraints: ChartConstraints[L]): GrammarAnchoring[L, W] = copy(sparsityPattern = sparsityPattern & constraints) def topology = grammar.topology @@ -260,7 +257,7 @@ object SimpleGrammar { override def toString() = "SimpleRefinedGrammar.Anchoring(...)" def scoreSpan(begin: Int, end: Int, label: Int, ref: Int) = { - val baseScore = if(begin + 1 == end) { + val baseScore = if (begin + 1 == end) { val fullId = refinements.labels.globalize(label, ref) tagAnchoring.scoreTag(begin, refinements.labels.fineIndex.get(fullId)) } else { @@ -279,5 +276,4 @@ object SimpleGrammar { } - } diff --git a/src/main/scala/epic/parser/SparseRuleTensor.scala b/src/main/scala/epic/parser/SparseRuleTensor.scala index 6b9c6250..3332277a 100644 --- a/src/main/scala/epic/parser/SparseRuleTensor.scala +++ b/src/main/scala/epic/parser/SparseRuleTensor.scala @@ -37,7 +37,6 @@ final class SparseRuleTensor[L] private(val leftChildOffsets: Array[Int], java.lang.Double.longBitsToDouble((first.toLong << 32) | (second.toLong&0xFFFFFFFFL)) } - def unaryChildRange(lc: Int):Span = Span(unaryChildPtrs(lc), unaryChildPtrs(lc+1) ) def unaryParentForOffset(off: Int) = unaryParentIndicesAndScores(off * 3) @@ -57,7 +56,7 @@ final class SparseRuleTensor[L] private(val leftChildOffsets: Array[Int], p = parentForOffset(pOff) score = ruleScoreForOffset(pOff) } yield { - if(outside) + if (outside) BinaryRule(lc, rc, p) -> score else BinaryRule(p, lc, rc) -> score @@ -79,27 +78,26 @@ object SparseRuleTensor { var lastRcOffset = 0 var lastOffset = 0 -// leftChildOffsets += 0 + // leftChildOffsets += 0 for(r <- orderedRuleIndices) { val lc = leftChild(r) var endRightChild = false assert(lastLc <= lc) - while(lastLc != lc) { + while (lastLc != lc) { lastLc += 1 leftChildOffsets += lastRcOffset endRightChild = true } val rc = rightChild(r) - if(endRightChild || rc != lastRc) { + if (endRightChild || rc != lastRc) { rightChildIndicesAndOffsets += rc rightChildIndicesAndOffsets += lastOffset lastRc = rc lastRcOffset += 1 } - val p = parent(r) val rs = grammar.ruleScore(r) val span: Span = new Span(java.lang.Double.doubleToLongBits(rs)) @@ -123,7 +121,7 @@ object SparseRuleTensor { for(r <- unaryRules) { val lc = child(r) assert(lastLc <= lc) - while(lastLc != lc) { + while (lastLc != lc) { unaryChildOffsets += lastOffset lastLc += 1 } @@ -136,7 +134,7 @@ object SparseRuleTensor { unaryParentIndicesAndScores += (p, encodedFirst, encodedSecond) lastOffset += 1 } - while(lastLc <= grammar.refinedTopology.labelIndex.size) { + while (lastLc <= grammar.refinedTopology.labelIndex.size) { lastLc += 1 unaryChildOffsets += lastOffset } @@ -161,27 +159,26 @@ object SparseRuleTensor { var lastRcOffset = 0 var lastOffset = 0 -// leftChildOffsets += 0 + // leftChildOffsets += 0 for(r <- orderedRuleIndices) { val lc = parent(r) var endRightChild = false assert(lastLc <= lc) - while(lastLc != lc) { + while (lastLc != lc) { lastLc += 1 leftChildOffsets += lastRcOffset endRightChild = true } val rc = leftChild(r) - if(endRightChild || rc != lastRc) { + if (endRightChild || rc != lastRc) { rightChildIndicesAndOffsets += rc rightChildIndicesAndOffsets += lastOffset lastRc = rc lastRcOffset += 1 } - val p = rightChild(r) val rs = grammar.ruleScore(r) val span: Span = new Span(java.lang.Double.doubleToLongBits(rs)) @@ -204,7 +201,7 @@ object SparseRuleTensor { lastOffset = 0 for(r <- unaryRules) { val lc = parent(r) - while(lastLc != lc) { + while (lastLc != lc) { unaryChildOffsets += lastOffset lastLc += 1 } @@ -217,19 +214,17 @@ object SparseRuleTensor { unaryParentIndicesAndScores += (p, encodedFirst, encodedSecond) lastOffset += 1 } - while(lastLc <= labelIndex.size) { + while (lastLc <= labelIndex.size) { lastLc += 1 unaryChildOffsets += lastOffset } val ret = new SparseRuleTensor[L2](leftChildOffsets.toArray, rightChildIndicesAndOffsets.toArray, parentIndicesAndScores.toArray, unaryChildOffsets.toArray, unaryParentIndicesAndScores.toArray, true) - assert(ret.ruleIterator.map(_._1).toIndexedSeq == orderedRuleIndices.map(indexedRules(_)), s"\n${ret.ruleIterator.map(_._1).toIndexedSeq}\n${orderedRuleIndices.map(indexedRules(_))}") assert(ret.ruleIterator.map(_._2).toIndexedSeq == orderedRuleIndices.map(grammar.ruleScore(_))) ret } - } \ No newline at end of file diff --git a/src/main/scala/epic/parser/TreeMarginal.scala b/src/main/scala/epic/parser/TreeMarginal.scala index f66d084e..ed8090a9 100644 --- a/src/main/scala/epic/parser/TreeMarginal.scala +++ b/src/main/scala/epic/parser/TreeMarginal.scala @@ -38,16 +38,16 @@ case class TreeMarginal[L, W](anchoring: GrammarAnchoring[L, W], case n@NullaryTree( (a, ref), span ) => val aI = topology.labelIndex(a) score += anchoring.scoreSpan(span.begin, span.end, aI, ref) - if(score.isInfinite) throw new Exception(s"Could not score the terminal with tag ${a -> ref} at $span. $words") + if (score.isInfinite) throw new Exception(s"Could not score the terminal with tag ${a -> ref} at $span. $words") case UnaryTree( (a, refA), child@Tree((b, refB), _, _), chain, span) => val r = topology.index(UnaryRule(a, b, chain)) assert(r != -1, "Could not find rule " + UnaryRule(a, b, chain)) val ruleRef = anchoring.ruleRefinementFromRefinements(r, refA, refB) - if(ruleRef < 0) throw new Exception(s"Bad refined rule in gold tree!: ${UnaryRule(a, b, chain)} aRef: $refA bRef: $refB") + if (ruleRef < 0) throw new Exception(s"Bad refined rule in gold tree!: ${UnaryRule(a, b, chain)} aRef: $refA bRef: $refB") score += anchoring.scoreUnaryRule(t.span.begin, t.span.end, r, ruleRef) - if(score.isInfinite) throw new Exception(s"Could not score gold tree!\n Partial Tree: ${t.render(words)}\n Full Tree: ${tree.render(words)}\n ") + if (score.isInfinite) throw new Exception(s"Could not score gold tree!\n Partial Tree: ${t.render(words)}\n Full Tree: ${tree.render(words)}\n ") rec(child) case t@BinaryTree( (a, refA), bt@Tree( (b, refB), _, _), ct@Tree((c, refC), _, _), span) => val aI = topology.labelIndex(a) @@ -55,13 +55,12 @@ case class TreeMarginal[L, W](anchoring: GrammarAnchoring[L, W], val ruleRef = anchoring.ruleRefinementFromRefinements(rule, refA, refB, refC) score += anchoring.scoreSpan(t.span.begin, t.span.end, aI, refA) score += anchoring.scoreBinaryRule(t.span.begin, bt.span.end, t.span.end, rule, ruleRef) - if(score.isInfinite) throw new Exception("Could not score gold tree!" + t.render(words)) + if (score.isInfinite) throw new Exception("Could not score gold tree!" + t.render(words)) rec(bt) rec(ct) } rec(tree) - score } @@ -73,7 +72,7 @@ case class TreeMarginal[L, W](anchoring: GrammarAnchoring[L, W], case t@UnaryTree( (a, refA), Tree((b, refB), _, _), chain, span) => val r = topology.index(UnaryRule(a, b, chain)) val ruleRef = anchoring.ruleRefinementFromRefinements(r, refA, refB) - if(ruleRef < 0) throw new Exception(s"Bad refined rule in gold tree!: ${UnaryRule(a, b, chain)} aRef: $refA bRef: $refB") + if (ruleRef < 0) throw new Exception(s"Bad refined rule in gold tree!: ${UnaryRule(a, b, chain)} aRef: $refA bRef: $refB") visitor.visitUnaryRule(t.span.begin, t.span.end, r, ruleRef, 1.0) case t@BinaryTree( (a, refA), bt@Tree( (b, refB), _, _), Tree((c, refC), _, _), span) => val aI = topology.labelIndex(a) @@ -99,7 +98,6 @@ case class TreeMarginal[L, W](anchoring: GrammarAnchoring[L, W], case Some(UnaryTree(_, Tree(a, _, span2), chain, span)) => logI(a == (sym -> ref)) case _ => Double.NegativeInfinity } - } override def insideTopScore(begin: Int, end: Int, sym: Int, ref: Int): Double = { @@ -107,7 +105,6 @@ case class TreeMarginal[L, W](anchoring: GrammarAnchoring[L, W], case Some(UnaryTree(a, _, chain, span)) => logI(a == (sym -> ref)) case _ => Double.NegativeInfinity } - } def marginalAt(begin: Int, end: Int): Counter2[L, Int, Double] = { diff --git a/src/main/scala/epic/parser/UnrefinedGrammarAnchoring.scala b/src/main/scala/epic/parser/UnrefinedGrammarAnchoring.scala index 95be5b8e..4d66fc5d 100644 --- a/src/main/scala/epic/parser/UnrefinedGrammarAnchoring.scala +++ b/src/main/scala/epic/parser/UnrefinedGrammarAnchoring.scala @@ -63,12 +63,11 @@ trait UnrefinedGrammarAnchoring[L, W] extends GrammarAnchoring[L, W] with Factor override def *(other: UnrefinedGrammarAnchoring[L, W]): UnrefinedGrammarAnchoring[L, W] = { // hacky multimethod dispatch is hacky if (other eq null) this // ugh - else if(other.isInstanceOf[UnrefinedGrammarAnchoring.Identity[L, W]]) this.addConstraints(other.sparsityPattern) - else if(this.isInstanceOf[UnrefinedGrammarAnchoring.Identity[L, W]]) other.addConstraints(this.sparsityPattern) + else if (other.isInstanceOf[UnrefinedGrammarAnchoring.Identity[L, W]]) this.addConstraints(other.sparsityPattern) + else if (this.isInstanceOf[UnrefinedGrammarAnchoring.Identity[L, W]]) other.addConstraints(this.sparsityPattern) else new ProductUnrefinedGrammarAnchoring(this,other) } - /** * The annotationTag controls if two grammars are over the same refinements. * If they are, then * and / can be much faster. @@ -92,8 +91,8 @@ trait UnrefinedGrammarAnchoring[L, W] extends GrammarAnchoring[L, W] with Factor def /(other: UnrefinedGrammarAnchoring[L, W]) = { // hacky multimethod dispatch is hacky if (other eq null) this // ugh - else if(this eq other) new UnrefinedGrammarAnchoring.Identity[L, W](topology, lexicon, words, this.sparsityPattern) - else if(other.isInstanceOf[UnrefinedGrammarAnchoring.Identity[L, W]]) this.addConstraints(other.sparsityPattern) + else if (this eq other) new UnrefinedGrammarAnchoring.Identity[L, W](topology, lexicon, words, this.sparsityPattern) + else if (other.isInstanceOf[UnrefinedGrammarAnchoring.Identity[L, W]]) this.addConstraints(other.sparsityPattern) else new ProductUnrefinedGrammarAnchoring(this, other, -1) } @@ -195,23 +194,11 @@ object UnrefinedGrammarAnchoring { */ @SerialVersionUID(1L) case class Identity[L, W](topology: RuleTopology[L], lexicon: Lexicon[L, W], words: IndexedSeq[W], sparsityPattern: ChartConstraints[L]) extends UnrefinedGrammarAnchoring[L, W] { - - // def sparsityPattern = ChartConstraints.noSparsity[L] override def addConstraints(cs: ChartConstraints[L]): UnrefinedGrammarAnchoring[L, W] = copy(sparsityPattern = sparsityPattern & cs) - def scoreBinaryRule(begin: Int, split: Int, end: Int, rule: Int) = 0.0 - def scoreUnaryRule(begin: Int, end: Int, rule: Int) = 0.0 - def scoreSpan(begin: Int, end: Int, tag: Int) = 0.0 - - } } - - - - - diff --git a/src/main/scala/epic/parser/kbest/AStarKBestParser.scala b/src/main/scala/epic/parser/kbest/AStarKBestParser.scala index f337270a..1535e05f 100644 --- a/src/main/scala/epic/parser/kbest/AStarKBestParser.scala +++ b/src/main/scala/epic/parser/kbest/AStarKBestParser.scala @@ -4,7 +4,6 @@ package kbest import epic.parser.projections.{AnchoredRuleMarginalProjector, ChartProjector} import epic.trees.BinarizedTree - /** * Uses Top Down KBest A* (as implemented in [[epic.parser.kbest.TopDownKBestAStar]]) to generate * kbest lists. diff --git a/src/main/scala/epic/parser/kbest/KBestListMarginal.scala b/src/main/scala/epic/parser/kbest/KBestListMarginal.scala index 2b585fef..41b3fa6e 100644 --- a/src/main/scala/epic/parser/kbest/KBestListMarginal.scala +++ b/src/main/scala/epic/parser/kbest/KBestListMarginal.scala @@ -19,7 +19,6 @@ import epic.parser._ import breeze.linalg._ import breeze.numerics._ - case class KBestListMarginal[L, W](anchoring: GrammarAnchoring[L, W], marginals: IndexedSeq[ParseMarginal[L, W]]) extends ParseMarginal[L, W] { @@ -41,12 +40,10 @@ case class KBestListMarginal[L, W](anchoring: GrammarAnchoring[L, W], m.visitPostorder(new AnchoredVisitor[L] { def visitUnaryRule(begin: Int, end: Int, rule: Int, ref: Int, score: Double) { spanVisitor.visitUnaryRule(begin, end, rule, ref, score * probsPerTree(i)) - } def visitSpan(begin: Int, end: Int, tag: Int, ref: Int, score: Double) { spanVisitor.visitSpan(begin, end, tag, ref, score * probsPerTree(i)) - } def visitBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int, score: Double) { @@ -56,8 +53,6 @@ case class KBestListMarginal[L, W](anchoring: GrammarAnchoring[L, W], } } - - override def insideBotScore(begin: Int, end: Int, sym: Int, ref: Int): Double = ??? override def insideTopScore(begin: Int, end: Int, sym: Int, ref: Int): Double = ??? diff --git a/src/main/scala/epic/parser/kbest/KBestParseTreebank.scala b/src/main/scala/epic/parser/kbest/KBestParseTreebank.scala index 5d2bcd3c..339de1ec 100644 --- a/src/main/scala/epic/parser/kbest/KBestParseTreebank.scala +++ b/src/main/scala/epic/parser/kbest/KBestParseTreebank.scala @@ -44,7 +44,7 @@ object KBestParseTreebank { def parse(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], out: PrintWriter) = { val parred = trainTrees.par - if(params.threads > 0) + if (params.threads > 0) parred.tasksupport = new ForkJoinTaskSupport(new ForkJoinPool(params.threads)) parred .map(ti => ti.words -> kbest.bestKParses(ti.words, params.k)) diff --git a/src/main/scala/epic/parser/kbest/TopDownKBestAStar.scala b/src/main/scala/epic/parser/kbest/TopDownKBestAStar.scala index 4035f1b7..f9c33275 100644 --- a/src/main/scala/epic/parser/kbest/TopDownKBestAStar.scala +++ b/src/main/scala/epic/parser/kbest/TopDownKBestAStar.scala @@ -13,11 +13,10 @@ object TopDownKBestAStar { def apply[L, W](chart: RefinedChartMarginal[L, W], k: Int):IndexedSeq[(BinarizedTree[L], Double)] = { import chart._ val root = chart.topology.rootIndex - val kbestList = new ArrayBuffer[(BinarizedTree[L], Double)]() val queue = new mutable.PriorityQueue[TKAItem[(Int, Int)]] queue.enqueue(StartItem) - while(queue.nonEmpty && kbestList.size < k) { + while (queue.nonEmpty && kbestList.size < k) { queue.dequeue() match { case StartItem => val begin = 0 @@ -40,7 +39,7 @@ object TopDownKBestAStar { val chain = topology.chain(r) val refB = anchoring.childRefinement(r, refR) val bScore = inside.bot.labelScore(begin, end, b, refB) - if(!bScore.isInfinite) { + if (!bScore.isInfinite) { val rScore = anchoring.scoreUnaryRule(begin, end, r, refR) val newWeight = weight - aScore + bScore + rScore val newParentLabel = (b,refB) @@ -83,7 +82,7 @@ object TopDownKBestAStar { ) assert(score <= aScore + 1E-4, score -> aScore) val newWeight = weight - aScore + score - if(!newWeight.isInfinite) { + if (!newWeight.isInfinite) { val newZipper = zipper.copy(BinaryTree(zipper.tree.label, NullaryTree(b -> refB, Span(begin,split)), NullaryTree(c -> refC, Span(split, end)), zipper.tree.span)).down.get @@ -91,10 +90,7 @@ object TopDownKBestAStar { queue += TopItem(newZipper, newWeight) } } - - } - } kbestList } @@ -109,7 +105,6 @@ object TopDownKBestAStar { private case class BotItem[L](zipper: Zipper[L], weight: Double) extends TKAItem[L] private case class CompleteTreeItem[L](tree: BinarizedTree[L], weight: Double) extends TKAItem[L] - } diff --git a/src/main/scala/epic/parser/models/AnnotatedParserInference.scala b/src/main/scala/epic/parser/models/AnnotatedParserInference.scala index 3540fa6b..c32ca88f 100644 --- a/src/main/scala/epic/parser/models/AnnotatedParserInference.scala +++ b/src/main/scala/epic/parser/models/AnnotatedParserInference.scala @@ -36,7 +36,6 @@ case class AnnotatedParserInference[L, W](featurizer: RefinedFeaturizer[L, W, Fe grammar: Grammar[L, W], constrainer: ChartConstraints.Factory[L, W]) extends ParserInference[L, W] { - override def forTesting = copy(featurizer.forTesting, constrainer = ChartConstraints.Factory.noSparsity) def goldMarginal(scorer: Scorer, ti: TreeInstance[L, W], aug: UnrefinedGrammarAnchoring[L, W]): Marginal = { @@ -45,5 +44,4 @@ case class AnnotatedParserInference[L, W](featurizer: RefinedFeaturizer[L, W, Fe TreeMarginal(scorer, annotated) } - } diff --git a/src/main/scala/epic/parser/models/EPParserModelFactory.scala b/src/main/scala/epic/parser/models/EPParserModelFactory.scala index 24e5284a..ccd7910c 100644 --- a/src/main/scala/epic/parser/models/EPParserModelFactory.scala +++ b/src/main/scala/epic/parser/models/EPParserModelFactory.scala @@ -44,14 +44,12 @@ case class EPParserModelFactory(ep: EPParams, oldWeights: File = null) extends ParserExtractableModelFactory[AnnotatedLabel, String] { type MyModel = EPParserModel[AnnotatedLabel, String] - override def make(train: IndexedSeq[TreeInstance[AnnotatedLabel, String]], topology: RuleTopology[AnnotatedLabel], lexicon: Lexicon[AnnotatedLabel, String], constrainer: Factory[AnnotatedLabel, String]): MyModel = { type ModelType = EPModel.CompatibleModel[TreeInstance[AnnotatedLabel, String], UnrefinedGrammarAnchoring[AnnotatedLabel, String]] val models = model.filterNot(_ eq null) map { model => model.make(train, topology, lexicon, constrainer): ModelType } val featureCounter = readWeights(oldWeights) - new EPParserModel[AnnotatedLabel, String](topology, lexicon, constrainer, ep.maxIterations, featureCounter.get, false, ep.dropOutFraction)(models:_*) } } diff --git a/src/main/scala/epic/parser/models/FeaturizedLexicon.scala b/src/main/scala/epic/parser/models/FeaturizedLexicon.scala index f4cd26bf..63d99e13 100644 --- a/src/main/scala/epic/parser/models/FeaturizedLexicon.scala +++ b/src/main/scala/epic/parser/models/FeaturizedLexicon.scala @@ -26,7 +26,6 @@ import epic.lexicon.TagScorer class FeaturizedLexicon[L, L2, W](val weights: DenseVector[Double], val featureIndexer: IndexedFeaturizer[L, L2, W]) extends TagScorer[L2, W] { - def anchor(w: IndexedSeq[W]): Anchoring = new Anchoring { val fi = featureIndexer.anchor(w) def words: IndexedSeq[W] = w diff --git a/src/main/scala/epic/parser/models/IndexedFeaturizer.scala b/src/main/scala/epic/parser/models/IndexedFeaturizer.scala index 9c2b088f..41539e9c 100644 --- a/src/main/scala/epic/parser/models/IndexedFeaturizer.scala +++ b/src/main/scala/epic/parser/models/IndexedFeaturizer.scala @@ -38,7 +38,6 @@ class IndexedFeaturizer[L, L2, W](val index: CrossProductIndex[Feature, Feature] indexedProjections: GrammarRefinements[L, L2], ruleCache: Array[Array[Int]]) extends RefinedFeaturizer[L, W, Feature] with Encoder[Feature] with Serializable { outer => - import indexedProjections._ def labelIndex = labels.fineIndex @@ -49,7 +48,6 @@ class IndexedFeaturizer[L, L2, W](val index: CrossProductIndex[Feature, Feature] def anchor(words: IndexedSeq[W]) = new Spec(words) - case class Spec private[IndexedFeaturizer](words: IndexedSeq[W]) extends super.Anchoring { val anch = wGen.anchor(words) @@ -74,12 +72,9 @@ class IndexedFeaturizer[L, L2, W](val index: CrossProductIndex[Feature, Feature] index.crossProduct(feat.featuresForLabel(tag), anch.featuresForWord(pos), usePlainLabelFeatures = false) } - def computeWeight(pos: Int, l: Int, weights: DenseVector[Double]) = new FeatureVector(featuresFor(pos, l)) dot weights } - - } object IndexedFeaturizer { diff --git a/src/main/scala/epic/parser/models/LatentParserModel.scala b/src/main/scala/epic/parser/models/LatentParserModel.scala index 2b57e97c..a0e334b3 100644 --- a/src/main/scala/epic/parser/models/LatentParserModel.scala +++ b/src/main/scala/epic/parser/models/LatentParserModel.scala @@ -102,7 +102,6 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma oldWeights: File = null) extends ParserModelFactory[AnnotatedLabel, String] with SafeLogging { type MyModel = LatentParserModel[AnnotatedLabel, (AnnotatedLabel, Int), String] - override def make(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], topology: RuleTopology[AnnotatedLabel], lexicon: Lexicon[AnnotatedLabel, String], constrainer: Factory[AnnotatedLabel, String]): MyModel = { @@ -119,7 +118,7 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma AnnotatedLabel(split(0)) -> split(1).toInt } pairs.toMap + (xbarGrammar.root -> 1) - } else if(splitUselessStates) { + } else if (splitUselessStates) { Map(xbarGrammar.root -> 1) } else { LatentModelFactory.statesToNotSplit.iterator.map(s => AnnotatedLabel(s) -> 1).toMap + (xbarGrammar.root -> 1) diff --git a/src/main/scala/epic/parser/models/LexModel.scala b/src/main/scala/epic/parser/models/LexModel.scala index 3597d74d..b2f7f7fa 100644 --- a/src/main/scala/epic/parser/models/LexModel.scala +++ b/src/main/scala/epic/parser/models/LexModel.scala @@ -103,19 +103,14 @@ class IndexedLexFeaturizer[L, L2, W](grammar: RuleTopology[L], private val unaryOffset = index.componentOffset(2) private val splitOffset = index.componentOffset(3) - override def lock = this - def joinTagRef(head: Int, ref: Int, length: Int) : Int = { head + ref * length } - def anchor(datum: IndexedSeq[W]):Spec = new Spec(datum) - - class Spec(val words: IndexedSeq[W]) extends Anchoring { private val fspec = ruleFeaturizer.anchor(words) private val bilexSpec = bilexFeaturizer.anchor(words) @@ -127,34 +122,34 @@ class IndexedLexFeaturizer[L, L2, W](grammar: RuleTopology[L], def featuresForUnaryRule(begin: Int, end: Int, rule: Int, ref: Int) = { val head = unaryHeadIndex(ref) - if(head < begin || head >= end) throw new RuntimeException(s"Head $head not in bounds for rule $rule in span [$begin, $end)}") + if (head < begin || head >= end) throw new RuntimeException(s"Head $head not in bounds for rule $rule in span [$begin, $end)}") val ruleRef = unaryRuleRefinement(ref) val globalizedRule = refinements.rules.globalize(rule, ruleRef) var rcache = headCache(head) - if(rcache eq null) { + if (rcache eq null) { rcache = new OpenAddressHashArray[Array[Int]](refinements.rules.fineIndex.size) headCache(head) = rcache } var headCached = rcache(globalizedRule) - if(headCached == null) { + if (headCached == null) { val surfFeatures = unarySpec.featuresForWord(head) val rFeatures = fspec.featuresForUnaryRule(begin, end, rule, ruleRef) headCached = unaryFeatureIndex.crossProduct(rFeatures, surfFeatures, unaryOffset) rcache(globalizedRule) = headCached } - if(splitSpanSpec.isEmpty) { + if (splitSpanSpec.isEmpty) { headCached } else { var ucache = unarySpanCache(begin, end) - if(ucache eq null) { + if (ucache eq null) { ucache = new OpenAddressHashArray[Array[Int]](refinements.rules.fineIndex.size) unarySpanCache(begin, end) = ucache } var surfCached = ucache(globalizedRule) - if(surfCached == null) { + if (surfCached == null) { surfCached = splitSpanFeatureIndex.crossProduct(fspec.featuresForUnaryRule(begin, end, rule, ruleRef), getSpanFeatures(begin, end), splitOffset, true) ucache(globalizedRule) = surfCached @@ -168,27 +163,27 @@ class IndexedLexFeaturizer[L, L2, W](grammar: RuleTopology[L], val localTagRef = tagRefinement(ref) val refinedTag = refinements.labels.globalize(tag, localTagRef) val head = headTagIndex(ref) - if(head < begin || head >= end) throw new RuntimeException(s"Head $head not in bounds for tag $tag in span [$begin, $end)}") + if (head < begin || head >= end) throw new RuntimeException(s"Head $head not in bounds for tag $tag in span [$begin, $end)}") var rcache = wordCache(head) - if(rcache eq null) { + if (rcache eq null) { rcache = new OpenAddressHashArray[Array[Int]](refinements.labels.fineIndex.size, null:Array[Int], 2) wordCache(head) = rcache } var cache = rcache(refinedTag) - if(cache == null) { + if (cache == null) { cache = wordFeatureIndex.crossProduct(fspec.featuresForSpan(begin, end, tag, localTagRef), wordSpec.featuresForWord(head), offset = wordOffset, usePlainLabelFeatures = false) rcache(refinedTag) = cache } - if(splitSpanSpec.nonEmpty && begin < end - 1) { + if (splitSpanSpec.nonEmpty && begin < end - 1) { var labelCache = spanCache(begin, end) - if(labelCache eq null) { + if (labelCache eq null) { labelCache = new OpenAddressHashArray[Array[Int]](refinements.labels.fineIndex.size) spanCache(begin, end) = labelCache } var lcached = labelCache(refinedTag) - if(lcached == null) { + if (lcached == null) { val spanFeats: Array[Int] = fspec.featuresForSpan(begin, end, tag, localTagRef) lcached = splitSpanFeatureIndex.crossProduct(spanFeats, getSpanFeatures(begin, end), splitOffset, true) labelCache(refinedTag) = lcached @@ -198,9 +193,6 @@ class IndexedLexFeaturizer[L, L2, W](grammar: RuleTopology[L], cache } - - - def featuresForBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int) = { val head = headIndex(ref) val dep = depIndex(ref) @@ -211,11 +203,11 @@ class IndexedLexFeaturizer[L, L2, W](grammar: RuleTopology[L], val arrays = new ArrayBuffer[Array[Int]]() - if(useBilexRuleFeatures) { + if (useBilexRuleFeatures) { arrays += featuresForHeadDepRule(begin, split, end, head, dep, rule, ruleRef) } - if(splitSpanSpec.nonEmpty) { + if (splitSpanSpec.nonEmpty) { arrays += featuresForSplitRule(begin, split, end, rule, ruleRef) } @@ -223,7 +215,6 @@ class IndexedLexFeaturizer[L, L2, W](grammar: RuleTopology[L], val refinedTag = refinements.labels.globalize(tag, refinements.parentRefinement(rule, ruleRef)) arrays += featuresForAttach(head, dep, refinedTag) - Arrays.concatenate(arrays:_*) } @@ -242,7 +233,7 @@ class IndexedLexFeaturizer[L, L2, W](grammar: RuleTopology[L], bilexCache(head)(dep) = bilexFeatures } - val fi = Arrays.concatenate(rawLabelFeatures(refinedTag), if(head < dep) rawDirFeatures(0) else rawDirFeatures(1)) + val fi = Arrays.concatenate(rawLabelFeatures(refinedTag), if (head < dep) rawDirFeatures(0) else rawDirFeatures(1)) feats = bilexFeatureIndex.crossProduct(fi, bilexFeatures, offset = bilexOffset, usePlainLabelFeatures = false) cache(refinedTag) = feats @@ -250,17 +241,15 @@ class IndexedLexFeaturizer[L, L2, W](grammar: RuleTopology[L], feats } - def featuresForHeadDepRule(begin: Int, split: Int, end: Int, head: Int, dep: Int, rule: Int, ruleRef: Int): Array[Int] = { var cache = ruleCache(head)(dep) if (cache == null) { cache = new OpenAddressHashArray[Array[Int]](refinements.rules.fineIndex.size, null:Array[Int], 256) ruleCache(head)(dep) = cache } - -// val x = cache.activeSize * 1.0/cache.size -// val y = cache.activeSize * 1.0/cache.data.length -// if(math.random < .01) println(x + " " + y + " " + cache.size) + // val x = cache.activeSize * 1.0/cache.size + // val y = cache.activeSize * 1.0/cache.data.length + // if (math.random < .01) println(x + " " + y + " " + cache.size) var feats = cache(refinements.rules.globalize(rule, ruleRef)) if (feats == null) { var bilexFeatures: Array[Int] = bilexCache(head)(dep) @@ -268,18 +257,16 @@ class IndexedLexFeaturizer[L, L2, W](grammar: RuleTopology[L], bilexFeatures = bilexSpec.featuresForAttachment(head, dep) bilexCache(head)(dep) = bilexFeatures } - val fi = fspec.featuresForBinaryRule(begin, split, end, rule, ruleRef) feats = bilexFeatureIndex.crossProduct(fi, bilexFeatures, offset = bilexOffset, usePlainLabelFeatures = true) cache(refinements.rules.globalize(rule, ruleRef)) = feats - } feats } def featuresForSplitRule(begin: Int, split: Int, end: Int, rule: Int, ruleRef: Int): Array[Int] = { val globalizedRule = refinements.rules.globalize(rule, ruleRef) - + var ucache = binaryCache(begin, end) if (ucache eq null) { ucache = new Array[OpenAddressHashArray[Array[Int]]](end - begin) @@ -294,12 +281,12 @@ class IndexedLexFeaturizer[L, L2, W](grammar: RuleTopology[L], var lcached = scache(globalizedRule) if (lcached == null) { -// val spanFeatures = getSpanFeatures(begin, end) -// lcached = splitSpanFeatureIndex.crossProduct(fspec.featuresForBinaryRule(begin, split, end, rule, ruleRef), spanFeatures, splitOffset, true) + // val spanFeatures = getSpanFeatures(begin, end) + // lcached = splitSpanFeatureIndex.crossProduct(fspec.featuresForBinaryRule(begin, split, end, rule, ruleRef), spanFeatures, splitOffset, true) lcached = splitSpanFeatureIndex.crossProduct(fspec.featuresForBinaryRule(begin, split, end, rule, ruleRef), getSplitFeatures(begin, split, end), splitOffset, true) -// if (forSplit.length > 0) -// lcached = Arrays.concatenate(lcached, forSplit) + // if (forSplit.length > 0) + // lcached = Arrays.concatenate(lcached, forSplit) scache(globalizedRule) = lcached } lcached @@ -337,18 +324,17 @@ class IndexedLexFeaturizer[L, L2, W](grammar: RuleTopology[L], private def getSpanFeatures(begin: Int, end: Int):Array[Int] = { var cache = rawSpanCache(begin, end) - if(cache eq null) { + if (cache eq null) { cache = splitSpanSpec.get.featuresForSpan(begin, end) rawSpanCache(begin, end) = cache } cache } - private def getSplitFeatures(begin: Int, split: Int, end: Int):Array[Int] = { var cache = rawSplitCache(begin, end) - if(cache eq null) { + if (cache eq null) { cache = new Array[Array[Int]](end- begin) rawSplitCache(begin, end) = cache } @@ -424,7 +410,7 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L], private def dot(features: Array[Int]) = { var i = 0 var score = 0.0 - while(i < features.length) { + while (i < features.length) { score += weights(features(i)) i += 1 } @@ -439,7 +425,6 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L], dot(f.featuresForUnaryRule(begin, end, rule, ref)) } - val attachCache = Array.ofDim[OpenAddressHashArray[Double]](words.length, words.length) val ruleCache = new TriangularArray[Array[OpenAddressHashArray[Double]]](words.length + 1) def scoreBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int): Double = { @@ -459,13 +444,13 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L], val ruleRef = this.binaryRuleRef(ref) val refinedTag = refinements.labels.globalize(tag, refinements.parentRefinement(rule, ruleRef)) var attachScore = cache(refinedTag) - if(java.lang.Double.isNaN(attachScore)) { + if (java.lang.Double.isNaN(attachScore)) { attachScore = dot(f.featuresForAttach(head, dep, refinedTag)) cache(refinedTag) = attachScore } score += attachScore - if(f.splitSpanSpec.nonEmpty) { + if (f.splitSpanSpec.nonEmpty) { var ucache = ruleCache(begin, end) if (ucache eq null) { ucache = new Array[OpenAddressHashArray[Double]](end - begin) @@ -489,7 +474,7 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L], score += lcached } - if(featurizer.useBilexRuleFeatures) { + if (featurizer.useBilexRuleFeatures) { score += dot(f.featuresForHeadDepRule(begin, split, end, head, dep, rule, ruleRef)) } @@ -509,7 +494,6 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L], head + ref * words.length * words.length } - def joinUnaryRuleRef(head: Int, ref: Int) : Int = { head + ref * words.length } @@ -526,13 +510,12 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L], epic.util.Arrays.crossProduct(lexRefs, ruleRefs, words.length) } - def validLabelRefinements(begin: Int, end: Int, label: Int) = joinTagRefs(Array.range(begin,end), refinements.labels.localRefinements(label)) def numValidRefinements(label: Int) = joinTagRef(words.length, refinements.labels.numRefinements(label)) def numValidRuleRefinements(rule: Int): Int = { - if(binaries(rule)) { + if (binaries(rule)) { joinBinaryRuleRef(words.length * words.length, refinements.rules.numRefinements(rule)) } else { joinUnaryRuleRef(words.length, refinements.rules.numRefinements(rule)) @@ -540,12 +523,12 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L], } def validRuleRefinementsGivenParent(begin: Int, end: Int, rule: Int, parentRef: Int) = { - if(!binaries(rule)) { + if (!binaries(rule)) { val lexicalizedRefinements = Array(unaryHeadIndex(parentRef)) val ruleRefs = refinements.ruleRefinementsCompatibleWithParentRef(rule, tagRef(parentRef)) joinUnaryRuleRefs(lexicalizedRefinements, ruleRefs) } else { - val lexicalizedRefinements = if(isHeadOnLeftForRule(rule)) { + val lexicalizedRefinements = if (isHeadOnLeftForRule(rule)) { val head = unaryHeadIndex(parentRef) // val x = Array.range(0,numValidRuleRefinements(rule)).filter(x => leftChildRefinement(rule,x) == parentRef && rightChildRefinement(rule, x) > parentRef && rightChildRefinement(rule, x) < end) Array.fillWith[Int](end - (head + 1)) { i => head * words.length + head + 1 + i } @@ -562,13 +545,13 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L], } override def validRuleRefinementsGivenParent(begin: Int, splitBegin: Int, splitEnd: Int, end: Int, rule: Int, parentRef: Int): Array[Int] = { - if(!binaries(rule)) { + if (!binaries(rule)) { val lexicalizedRefinements = Array(parentRef:Int) val ruleRefs = refinements.ruleRefinementsCompatibleWithParentRef(rule, tagRef(parentRef)) joinUnaryRuleRefs(lexicalizedRefinements, ruleRefs) } else { val headIndex = unaryHeadIndex(parentRef) - val lexicalizedRefinements = if(isHeadOnLeftForRule(rule)) { + val lexicalizedRefinements = if (isHeadOnLeftForRule(rule)) { // if the head is on the left, then the dependent // can be in Span(math.max(splitBegin, ref1+1), end). // Further, if the ref1 is <= splitEnd, then @@ -577,7 +560,7 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L], // ^------ref1------^ // max: ^------^----dep---------^ // - if(splitEnd <= headIndex) return Array.empty + if (splitEnd <= headIndex) return Array.empty val firstPossibleStart = math.max(headIndex +1, splitBegin) Array.fillWith[Int](end - firstPossibleStart)(i => headIndex * words.length + firstPossibleStart + i) } else { @@ -589,12 +572,12 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L], // ^--------ref1------^ // ^-----------dep---^-----^ : min // - if(splitBegin >= headIndex) return Array.empty + if (splitBegin >= headIndex) return Array.empty val lastPossibleEnd = math.min(headIndex, splitEnd) Array.fillWith[Int](lastPossibleEnd - begin)(i => headIndex * words.length + begin + i) } - if(lexicalizedRefinements.isEmpty) { + if (lexicalizedRefinements.isEmpty) { lexicalizedRefinements } else { val ruleRefs = refinements.ruleRefinementsCompatibleWithParentRef(rule, tagRef(parentRef)) @@ -606,7 +589,7 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L], def validRuleRefinementsGivenLeftChild(begin: Int, split: Int, completionBegin:Int, completionEnd: Int, rule: Int, lcRef: Int) = { val lc = unaryHeadIndex(lcRef) - val lexicalizedRefinements = if(isHeadOnLeftForRule(rule)) + val lexicalizedRefinements = if (isHeadOnLeftForRule(rule)) Array.fillWith[Int](completionEnd - split)(i => lc * words.length + split + i) else Array.fillWith[Int](completionEnd - split)(i => (split + i) * words.length + lc) @@ -616,7 +599,7 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L], def validRuleRefinementsGivenRightChild(completionBegin: Int, completionEnd: Int, split: Int, end: Int, rule: Int, rcRef: Int): Array[Int] = { val rc = unaryHeadIndex(rcRef) - val lexicalizedRefinements = if(!isHeadOnLeftForRule(rule)) + val lexicalizedRefinements = if (!isHeadOnLeftForRule(rule)) Array.fillWith[Int](split - completionBegin)(i => rc * words.length + completionBegin + i) else Array.fillWith[Int](split - completionBegin)(i => (completionBegin + i) * words.length + rc) @@ -631,43 +614,38 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L], } def leftChildRefinement(rule: Int, ruleRef: Int) = { - val word = if(isHeadOnLeftForRule(rule)) { + val word = if (isHeadOnLeftForRule(rule)) { headIndex(ruleRef) } else { depIndex(ruleRef) } - val refinedRuleId = refinements.rules.globalize(rule, binaryRuleRef(ruleRef)) val tagref = refinements.labels.localize(refinedGrammar.leftChild(refinedRuleId)) - joinTagRef(word, tagref) } def rightChildRefinement(rule: Int, ruleRef: Int) = { - val word = if(isHeadOnRightForRule(rule)) { + val word = if (isHeadOnRightForRule(rule)) { headIndex(ruleRef) } else { depIndex(ruleRef) } - val refinedRuleId = refinements.rules.globalize(rule, binaryRuleRef(ruleRef)) val tagref = refinements.labels.localize(refinedGrammar.rightChild(refinedRuleId)) joinTagRef(word, tagref) } def parentRefinement(rule: Int, ruleRef: Int) = { - val word = if(binaries(rule)) { + val word = if (binaries(rule)) { headIndex(ruleRef) } else { unaryHeadIndex(ruleRef) } - - val rr = if(binaries(rule)) { + val rr = if (binaries(rule)) { binaryRuleRef(ruleRef) } else { unaryRuleRef(ruleRef) } - val refinedRuleId = refinements.rules.globalize(rule, rr) val tagref = refinements.labels.localize(refinedGrammar.parent(refinedRuleId)) joinTagRef(word, tagref) @@ -675,7 +653,6 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L], def childRefinement(rule: Int, ruleRef: Int) = { val word = unaryHeadIndex(ruleRef) - val refinedRuleId = refinements.rules.globalize(rule, unaryRuleRef(ruleRef)) val tagref = refinements.labels.localize(refinedGrammar.child(refinedRuleId)) joinTagRef(word, tagref) @@ -693,12 +670,11 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L], val b2 = refinements.labels.globalize(b, labelB) val rule = UnaryRule(refinements.labels.fineIndex.get(a2), refinements.labels.fineIndex.get(b2), topology.chain(r)) val refinedRuleIndex = refinements.rules.fineIndex(rule) - val refR = if(refinedRuleIndex < 0) { + val refR = if (refinedRuleIndex < 0) { -1 } else { refinements.rules.localize(refinedRuleIndex) } - joinUnaryRuleRef(hA, refR) } @@ -706,19 +682,16 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L], val hA = unaryHeadIndex(refA) val hB = unaryHeadIndex(refB) val hC = unaryHeadIndex(refC) - - val lexRef = if(isHeadOnLeftForRule(r)) { + val lexRef = if (isHeadOnLeftForRule(r)) { require(hA == hB) hA * words.length + hC } else { require(hA == hC) hA * words.length + hB } - val labelA = tagRef(refA) val labelB = tagRef(refB) val labelC = tagRef(refC) - val a = topology.parent(r) val b = topology.leftChild(r) val c = topology.rightChild(r) @@ -728,8 +701,7 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L], val refR = refinements.rules.localize(refinements.rules.fineIndex(BinaryRule(refinements.labels.fineIndex.get(a2), refinements.labels.fineIndex.get(b2), refinements.labels.fineIndex.get(c2) - )) ) - + ))) assert(headIndex(lexRef) == hA) joinBinaryRuleRef(lexRef, refR) } @@ -743,11 +715,9 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L], if (isHeadOnLeftForRule(rule)) Array.range(begin, splitEnd) else Array.range(splitBegin, end) } - joinTagRefs(lexRefs, refinements.parentRefinementsCompatibleWithRule(rule)) } - def validLeftChildRefinementsGivenRule(begin: Int, splitBegin: Int, splitEnd: Int, end: Int, rule: Int): Array[Int] = { val lexRefs = Array.range(begin, splitEnd) joinTagRefs(lexRefs, refinements.leftChildRefinementsCompatibleWithRule(rule)) @@ -773,7 +743,7 @@ case class LexGrammarBundle[L, L2, W](topology: RuleTopology[L], for( (rule@BinaryRule(a, b,c), r) <- bg.index.iterator.zipWithIndex) { binaries(r) = true val headChild = headFinder.findHeadChild(rule) - if(headChild == 0) { + if (headChild == 0) { leftRules(r) = true } else { rightRules(r) = true @@ -823,7 +793,7 @@ object IndexedLexFeaturizer extends LazyLogging { val words = hasWords.get(ti) val tree = ann(hasTree.get(ti), words) // returns head - def rec(t: BinarizedTree[L2]):Int= t match { + def rec(t: BinarizedTree[L2]): Int= t match { case NullaryTree(a, span) => val (ai, aref) = refinements.labels.indexAndLocalize(a) wordBuilder.add(ruleSpec.featuresForSpan(span.begin, span.end, ai, aref), @@ -835,14 +805,14 @@ object IndexedLexFeaturizer extends LazyLogging { val (ri, rref) = refinements.rules.indexAndLocalize(r) unaryBuilder.add(ruleSpec.featuresForUnaryRule(span.begin, span.end, ri, rref), unarySpec.featuresForWord(head)) - if(splitSpanSpec.nonEmpty) + if (splitSpanSpec.nonEmpty) splitBuilder.add(ruleSpec.featuresForUnaryRule(span.begin, span.end, ri, rref), splitSpanSpec.get.featuresForSpan(span.begin, span.end)) head case t@BinaryTree(a, b, c, span) => val (leftHead,rightHead) = (rec(t.leftChild), rec(t.rightChild)) val headIsLeft = headFinder.findHeadChild(t) == 0 - val (head, dep) = if(headIsLeft) leftHead -> rightHead else rightHead -> leftHead + val (head, dep) = if (headIsLeft) leftHead -> rightHead else rightHead -> leftHead val r = BinaryRule[L2](a, b.label, c.label) val (ri, rref) = refinements.rules.indexAndLocalize(r) val bilexFeatures = bilexSpec.featuresForAttachment(head, dep) @@ -852,18 +822,18 @@ object IndexedLexFeaturizer extends LazyLogging { wordSpec.featuresForWord(head)) val aglob = refinements.labels.fineIndex(a) - if(useBilexRuleFeatures) + if (useBilexRuleFeatures) bilexBuilder.add(ruleSpec.featuresForBinaryRule(span.begin, split, span.end, ri, rref), bilexFeatures) bilexBuilder.add(labelFeatures(aglob), bilexFeatures) - bilexBuilder.add(attachFeatures(if(headIsLeft) 0 else 1), bilexFeatures) + bilexBuilder.add(attachFeatures(if (headIsLeft) 0 else 1), bilexFeatures) - if(splitSpanFeaturizer.nonEmpty) splitBuilder.add(ruleSpec.featuresForBinaryRule(span.begin, t.splitPoint, span.end, ri, rref), + if (splitSpanFeaturizer.nonEmpty) splitBuilder.add(ruleSpec.featuresForBinaryRule(span.begin, t.splitPoint, span.end, ri, rref), splitSpanSpec.get.featuresForSpan(span.begin, span.end)) - if(splitSpanFeaturizer.nonEmpty) splitBuilder.add(ruleSpec.featuresForBinaryRule(span.begin, t.splitPoint, span.end, ri, rref), + if (splitSpanFeaturizer.nonEmpty) splitBuilder.add(ruleSpec.featuresForBinaryRule(span.begin, t.splitPoint, span.end, ri, rref), splitSpanSpec.get.featuresForSplit(span.begin, t.splitPoint, span.end)) - if(splitSpanFeaturizer.nonEmpty) splitBuilder.add(ruleSpec.featuresForSpan(span.begin, span.end, ai, aref), + if (splitSpanFeaturizer.nonEmpty) splitBuilder.add(ruleSpec.featuresForSpan(span.begin, span.end, ai, aref), splitSpanSpec.get.featuresForSpan(span.begin, span.end)) head } @@ -877,7 +847,6 @@ object IndexedLexFeaturizer extends LazyLogging { val ufi = unaryBuilder.result() val sfi = splitBuilder.result() - new IndexedLexFeaturizer(ruleFeaturizer.topology, labelFeatures, attachFeatures, @@ -904,7 +873,6 @@ case class LexModelFactory(@Help(text= "The kind of annotation to do on the refi useBilexRuleFeatures: Boolean = true) extends ParserModelFactory[AnnotatedLabel, String] with SafeLogging { type MyModel = LexModel[AnnotatedLabel, AnnotatedLabel, String] - override def make(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], topology: RuleTopology[AnnotatedLabel], lexicon: Lexicon[AnnotatedLabel, String], constrainer: Factory[AnnotatedLabel, String]): MyModel = { @@ -928,13 +896,12 @@ case class LexModelFactory(@Help(text= "The kind of annotation to do on the refi + offsets(dep) ) - bilexF = bilexF (wf, lfsuf + offsets, bilexF) } - val spanFeaturizer = if(!useSpanFeatures) { + val spanFeaturizer = if (!useSpanFeatures) { new ZeroSplitSpanFeaturizer[String] } else { val dsl = new WordFeaturizer.DSL(initLexicon) with SurfaceFeaturizer.DSL with SplitSpanFeaturizer.DSL @@ -955,20 +922,20 @@ case class LexModelFactory(@Help(text= "The kind of annotation to do on the refi featurizer += baseCat(end) // } -// if(useSplits) { +// if (useSplits) { featurizer += leftOfSplit featurizer += baseCat(split) // } -// if(useSpanLength) { +// if (useSpanLength) { featurizer += length // } -// if(useShape) { +// if (useShape) { featurizer += spanShape // } -// if(useBinaryLengths) { +// if (useBinaryLengths) { featurizer += distance[String](begin, split) featurizer += distance[String](split, end) // } @@ -986,19 +953,17 @@ case class LexModelFactory(@Help(text= "The kind of annotation to do on the refi } val indexedSplitSpanFeaturizer = { - if(useSpanFeatures) + if (useSpanFeatures) Some(IndexedSplitSpanFeaturizer.fromData(spanFeaturizer, trees)) else None } - - def labelFeaturizer(l: AnnotatedLabel) = Set(l, l.baseAnnotatedLabel).toSeq def ruleFeaturizer(r: Rule[AnnotatedLabel]) = r match { case r@BinaryRule(a,b,c) => val headIsLeft = headFinder.findHeadChild(r) == 0 - val dir = if(headIsLeft) AttachLeft else AttachRight + val dir = if (headIsLeft) AttachLeft else AttachRight Set(r, r.map(_.baseAnnotatedLabel), dir).toSeq case r@UnaryRule(a,b,c) => Set(r, r.map(_.baseAnnotatedLabel)).toSeq diff --git a/src/main/scala/epic/parser/models/NeuralModel.scala b/src/main/scala/epic/parser/models/NeuralModel.scala index 3a22fb77..f2dae6c2 100644 --- a/src/main/scala/epic/parser/models/NeuralModel.scala +++ b/src/main/scala/epic/parser/models/NeuralModel.scala @@ -40,8 +40,6 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma topology: RuleTopology[AnnotatedLabel], lexicon: Lexicon[AnnotatedLabel, String], constrainer: Factory[AnnotatedLabel, String]): MyModel = { - - val annTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]] = trainTrees.map(annotator(_)) println("Here's what the annotation looks like on the first few trees") annTrees.slice(0, Math.min(3, annTrees.size)).foreach(tree => println(tree.render(false))) @@ -61,12 +59,10 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma span += new SingleWordSpanFeaturizer[String](wf) - val indexedSurface = IndexedSplitSpanFeaturizer.fromData(span, annTrees, bloomFilter = false) - def labelFeaturizer(l: AnnotatedLabel) = Set(l, l.baseAnnotatedLabel).toSeq - def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if(r.isInstanceOf[UnaryRule[AnnotatedLabel]]) Set(r.parent, r.parent.baseAnnotatedLabel).toSeq else Seq.empty + def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if (r.isInstanceOf[UnaryRule[AnnotatedLabel]]) Set(r.parent, r.parent.baseAnnotatedLabel).toSeq else Seq.empty val featurizer = new ProductionFeaturizer[AnnotatedLabel, AnnotatedLabel, String](xbarGrammar, indexedRefinements, lGen=labelFeaturizer, @@ -75,15 +71,18 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma val transform = new AffineTransform( featurizer.index.size, numOutputs, - new TanhTransform(new AffineTransform(numOutputs, numHidden, - new TanhTransform[FeatureVector](numHidden, indexedSurface.featureIndex.size, true)))) - + new TanhTransform( + new AffineTransform(numOutputs, numHidden, + new TanhTransform[FeatureVector](numHidden, indexedSurface.featureIndex.size, true))) + ) - new TransformModel(annotator.latent, + new TransformModel( + annotator.latent, constrainer, topology, lexicon, refGrammar, indexedRefinements, featurizer, indexedSurface, - transform) + transform + ) } } \ No newline at end of file diff --git a/src/main/scala/epic/parser/models/NeuralParserTrainer.scala b/src/main/scala/epic/parser/models/NeuralParserTrainer.scala index 049fbeeb..e6f59e1e 100644 --- a/src/main/scala/epic/parser/models/NeuralParserTrainer.scala +++ b/src/main/scala/epic/parser/models/NeuralParserTrainer.scala @@ -82,7 +82,7 @@ object NeuralParserTrainer extends epic.parser.ParserPipeline with LazyLogging { import params._ import extraPTParams._ -// if(threads >= 1) +// if (threads >= 1) // collection.parallel.ForkJoinTasks.defaultForkJoinPool.setParallelism(params.threads) val initialParser = params.parser match { @@ -107,14 +107,14 @@ object NeuralParserTrainer extends epic.parser.ParserPipeline with LazyLogging { var theTrees = trainTrees.toIndexedSeq.filterNot(sentTooLong(_, params.maxParseLength)) - if(useConstraints && enforceReachability) { + if (useConstraints && enforceReachability) { val treebankGrammar = GenerativeParser.annotated(initialParser.topology, initialParser.lexicon, TreeAnnotator.identity, trainTrees) val markovizedGrammar = GenerativeParser.annotated(initialParser.topology, initialParser.lexicon, annotator, trainTrees) val proj = new OracleParser(treebankGrammar, markovizedGrammar) theTrees = theTrees.par.map(ti => ti.copy(tree=proj.forTree(ti.tree, ti.words, constraints.constraints(ti.words)))).seq.toIndexedSeq } - val baseMeasure = if(useConstraints) { + val baseMeasure = if (useConstraints) { constraints } else { ChartConstraints.Factory.noSparsity[AnnotatedLabel, String] @@ -126,9 +126,9 @@ object NeuralParserTrainer extends epic.parser.ParserPipeline with LazyLogging { val cachedObj = new CachedBatchDiffFunction(obj) println("Initializing weights custom for model " + model.getClass) val init = model.initialWeightVector(initWeightsScale, initializerSpec) - if(checkGradient) { + if (checkGradient) { val cachedObj2 = new CachedBatchDiffFunction(new ModelObjective(model, theTrees.take(opt.batchSize), params.threads)) - val defaultIndices = (0 until 10).map(i => if(i < 0) model.featureIndex.size + i else i) + val defaultIndices = (0 until 10).map(i => if (i < 0) model.featureIndex.size + i else i) val indices = if (model.transforms.nonEmpty) { model.transforms(0).getInterestingWeightIndicesForGradientCheck(0) } else { @@ -152,7 +152,6 @@ object NeuralParserTrainer extends epic.parser.ParserPipeline with LazyLogging { } } - val name = Option(params.name).orElse(Option(model.getClass.getSimpleName).filter(_.nonEmpty)).getOrElse("DiscrimParser") val itr: Iterator[FirstOrderMinimizer[DenseVector[Double], BatchDiffFunction[DenseVector[Double]]]#State] = if (determinizeTraining) { val scanningBatchesObj = cachedObj.withScanningBatches(params.opt.batchSize) @@ -208,7 +207,7 @@ object NeuralParserTrainer extends epic.parser.ParserPipeline with LazyLogging { def evaluateNow = { val sentinel = new File("EVALUATE_NOW") - if(sentinel.exists()) { + if (sentinel.exists()) { sentinel.delete() logger.info("Evaluating now!!!!") true diff --git a/src/main/scala/epic/parser/models/ParserExtractable.scala b/src/main/scala/epic/parser/models/ParserExtractable.scala index 3b6517d0..4f8a17f7 100644 --- a/src/main/scala/epic/parser/models/ParserExtractable.scala +++ b/src/main/scala/epic/parser/models/ParserExtractable.scala @@ -36,11 +36,10 @@ trait ParserExtractable[L, W] { def extractParser(weights: DenseVector[Double])(implicit deb: Debinarizer[L]): Parser[L, W] } - trait ParserExtractableModelFactory[L,W] { def make(train: IndexedSeq[TreeInstance[L, W]], topology: RuleTopology[L], lexicon: Lexicon[L, W], constraintsFactory: ChartConstraints.Factory[L, W]): MyModel - def readWeights(in: File):Counter[Feature, Double] = if(in != null && in.exists) { + def readWeights(in: File):Counter[Feature, Double] = if (in != null && in.exists) { try { val ctr = breeze.util.readObject[Counter[Feature, Double]](in) ctr @@ -53,7 +52,6 @@ trait ParserExtractableModelFactory[L,W] { type MyModel <: Model[TreeInstance[L,W]] with ParserExtractable[L,W] - protected def extractBasicCounts[L, W](trees: IndexedSeq[TreeInstance[L, W]]): (Counter2[L, W, Double], Counter2[L, BinaryRule[L], Double], Counter2[L, UnaryRule[L], Double]) = { GenerativeParser.extractCounts(trees) } diff --git a/src/main/scala/epic/parser/models/ParserModel.scala b/src/main/scala/epic/parser/models/ParserModel.scala index 45a9ce62..274ef114 100644 --- a/src/main/scala/epic/parser/models/ParserModel.scala +++ b/src/main/scala/epic/parser/models/ParserModel.scala @@ -35,12 +35,8 @@ trait ParserModel[L, W] extends epic.framework.StandardExpectedCounts.Model[Tree val inf = inferenceFromWeights(weights).forTesting Parser(constrainer, inf.grammar, ChartDecoder[L, W]()) } - - - } - trait ParserInference[L, W] extends ProjectableInference[TreeInstance[L, W], UnrefinedGrammarAnchoring[L, W]] { type ExpectedCounts = StandardExpectedCounts[Feature] type Marginal = epic.parser.ParseMarginal[L, W] @@ -51,12 +47,10 @@ trait ParserInference[L, W] extends ProjectableInference[TreeInstance[L, W], Unr override def forTesting: ParserInference[L, W] = this - def scorer(v: TreeInstance[L, W]): Scorer = { grammar.anchor(v.words, constrainer.constraints(v.words)) } - /** * Produces the "guess marginal" which is the marginal conditioned on only the input data * @param v the example @@ -80,7 +74,6 @@ trait ParserInference[L, W] extends ProjectableInference[TreeInstance[L, W], Unr def baseAugment(v: TreeInstance[L, W]) = UnrefinedGrammarAnchoring.identity(grammar.topology, grammar.lexicon, v.words, ChartConstraints.noSparsity) - def project(v: TreeInstance[L, W], s: Scorer, m: Marginal, oldAugment: UnrefinedGrammarAnchoring[L, W]): UnrefinedGrammarAnchoring[L, W] = { projector.project(this, v, m) } @@ -90,7 +83,6 @@ trait ParserInference[L, W] extends ProjectableInference[TreeInstance[L, W], Unr trait ParserModelFactory[L, W] extends ParserExtractableModelFactory[L, W] { type MyModel <: ParserModel[L, W] - } diff --git a/src/main/scala/epic/parser/models/ParserTrainer.scala b/src/main/scala/epic/parser/models/ParserTrainer.scala index 931da01f..303ff4a7 100644 --- a/src/main/scala/epic/parser/models/ParserTrainer.scala +++ b/src/main/scala/epic/parser/models/ParserTrainer.scala @@ -87,7 +87,7 @@ object ParserTrainer extends epic.parser.ParserPipeline with LazyLogging { validate: (Parser[AnnotatedLabel, String]) => Statistics, params: Params) = { import params._ -// if(threads >= 1) +// if (threads >= 1) // collection.parallel.ForkJoinTasks.defaultForkJoinPool.setParallelism(params.threads) val initialParser = params.parser match { @@ -112,14 +112,14 @@ object ParserTrainer extends epic.parser.ParserPipeline with LazyLogging { var theTrees = trainTrees.toIndexedSeq.filterNot(sentTooLong(_, params.maxParseLength)) - if(useConstraints && enforceReachability) { + if (useConstraints && enforceReachability) { val treebankGrammar = GenerativeParser.annotated(initialParser.topology, initialParser.lexicon, TreeAnnotator.identity, trainTrees) val markovizedGrammar = GenerativeParser.annotated(initialParser.topology, initialParser.lexicon, annotator, trainTrees) val proj = new OracleParser(treebankGrammar, markovizedGrammar) theTrees = theTrees.par.map(ti => ti.copy(tree=proj.forTree(ti.tree, ti.words, constraints.constraints(ti.words)))).seq.toIndexedSeq } - val baseMeasure = if(useConstraints) { + val baseMeasure = if (useConstraints) { constraints } else { ChartConstraints.Factory.noSparsity[AnnotatedLabel, String] @@ -129,9 +129,9 @@ object ParserTrainer extends epic.parser.ParserPipeline with LazyLogging { val obj = new ModelObjective(model, theTrees, params.threads) val cachedObj = new CachedBatchDiffFunction(obj) val init = obj.initialWeightVector(randomize) - if(checkGradient) { + if (checkGradient) { val cachedObj2 = new CachedBatchDiffFunction(new ModelObjective(model, theTrees.take(opt.batchSize), params.threads)) - val indices = (0 until 10).map(i => if(i < 0) model.featureIndex.size + i else i) + val indices = (0 until 10).map(i => if (i < 0) model.featureIndex.size + i else i) println("testIndices: " + indices) GradientTester.testIndices(cachedObj2, obj.initialWeightVector(randomize = true), indices, toString={(i: Int) => model.featureIndex.get(i).toString}, skipZeros = true) println("test") @@ -150,7 +150,6 @@ object ParserTrainer extends epic.parser.ParserPipeline with LazyLogging { } } - val name = Option(params.name).orElse(Option(model.getClass.getSimpleName).filter(_.nonEmpty)).getOrElse("DiscrimParser") val itr: Iterator[FirstOrderMinimizer[DenseVector[Double], BatchDiffFunction[DenseVector[Double]]]#State] = if (determinizeTraining) { val scanningBatchesObj = cachedObj.withScanningBatches(params.opt.batchSize) @@ -190,14 +189,13 @@ object ParserTrainer extends epic.parser.ParserPipeline with LazyLogging { def evaluateNow = { val sentinel = new File("EVALUATE_NOW") - if(sentinel.exists()) { + if (sentinel.exists()) { sentinel.delete() logger.info("Evaluating now!!!!") true } else { false } - } def computeLL(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], model: Model[TreeInstance[AnnotatedLabel, String]], weights: DenseVector[Double]) { @@ -216,25 +214,18 @@ object ParserTrainer extends epic.parser.ParserPipeline with LazyLogging { } } - object Suffixes extends LazyLogging { + def main(args: Array[String]):Unit = { val tb = CommandLineParser.readIn[ProcessedTreebank](args) - val counts = GenerativeParser.extractCounts(tb.trainTrees)._1 - val marginalized: Counter[String, Double] = sum(counts(::, *)) - val lfs = LongestFrequentSuffixFeaturizer(marginalized) - for(ti <- tb.trainTrees) { val suffixes = lfs.lookupSentence(ti.words) println("original: " + ti.words.mkString(" ")) println("suffixes: " + suffixes.mkString(" ")) } - - } - } diff --git a/src/main/scala/epic/parser/models/PositionalNeuralModel.scala b/src/main/scala/epic/parser/models/PositionalNeuralModel.scala index 8e78b327..7f9ec2a6 100644 --- a/src/main/scala/epic/parser/models/PositionalNeuralModel.scala +++ b/src/main/scala/epic/parser/models/PositionalNeuralModel.scala @@ -67,7 +67,7 @@ class PositionalNeuralModel[L, L2, W](annotator: (BinarizedTree[L], IndexedSeq[W override type Inference = PositionalNeuralModel.Inference[L, L2, W] override def accumulateCounts(inf: Inference, s: Scorer, d: TreeInstance[L, W], m: Marginal, accum: ExpectedCounts, scale: Double): Unit = { -// println("Extracting ecounts") + // println("Extracting ecounts") inf.grammar.extractEcounts(m, accum.counts, scale) if (maybeSparseSurfaceFeaturizer.isDefined) { @@ -78,7 +78,7 @@ class PositionalNeuralModel[L, L2, W](annotator: (BinarizedTree[L], IndexedSeq[W val totalTransformSize = transforms.map(_.index.size).sum + depTransforms.map(_.index.size).sum + decoupledTransforms.map(_.index.size).sum accum.counts += DenseVector.vertcat(DenseVector.zeros[Double](totalTransformSize), innerAccum.counts) } -// println("Ecounts extracted") + // println("Ecounts extracted") accum.loss += scale * m.logPartition } @@ -195,7 +195,6 @@ object PositionalNeuralModel { depFeaturizer, layers, penultimateLayers, depLayers, maybeSparseSurfaceFeaturizer, decoupledLayers, penultimateDecoupledLayers, weights, origPTModel) } - /** * N.B. does not extracted expected counts for sparse features; this is done outside this loop */ @@ -206,7 +205,7 @@ object PositionalNeuralModel { val depSpec = depFeaturizer.anchor(w) val lspec = labelFeaturizer.anchor(w) -// val maxTetraLen = ((w.size + 2) * (w.size + 3) * (w.size + 4))/6 + ((w.size + 1) * (w.size + 2))/2 + w.size + 2 + // val maxTetraLen = ((w.size + 2) * (w.size + 3) * (w.size + 4))/6 + ((w.size + 1) * (w.size + 2))/2 + w.size + 2 def tetra(begin: Int, split: Int, end: Int) = { (end * (end + 1) * (end + 2))/6 + ((split + 1) * split / 2 + begin) @@ -217,10 +216,10 @@ object PositionalNeuralModel { val unaryRuleCountsPerState = new HashMap[Int,SparseVector[Double]] val binaryRuleCountsPerState = new HashMap[Int,SparseVector[Double]] val spanCountsPerState = new HashMap[Int,SparseVector[Double]] -// val ruleCountsPerState = Array.fill(maxTetraLen)(SparseVector.zeros[Double](labelFeaturizer.index.size)) -// val countsPerHeadDepPair = Array.tabulate(w.size, w.size)((i, j) => 0.0) -// val statesUsed = Array.fill(maxTetraLen)(false) -// val untetra = Array.fill(maxTetraLen)((-1, -1, -1)) + // val ruleCountsPerState = Array.fill(maxTetraLen)(SparseVector.zeros[Double](labelFeaturizer.index.size)) + // val countsPerHeadDepPair = Array.tabulate(w.size, w.size)((i, j) => 0.0) + // val statesUsed = Array.fill(maxTetraLen)(false) + // val untetra = Array.fill(maxTetraLen)((-1, -1, -1)) val untetra = new HashMap[Int,(Int,Int,Int)] m visit new AnchoredVisitor[L] { @@ -407,7 +406,7 @@ object PositionalNeuralModel { var i = 0 var score = 0.0 val wdata = weights.data - while(i < features.length) { + while (i < features.length) { score += wdata(features(i) + sparseFeaturesOffset) i += 1 } diff --git a/src/main/scala/epic/parser/models/PositionalNeuralModelFactory.scala b/src/main/scala/epic/parser/models/PositionalNeuralModelFactory.scala index 04e5b2a9..68d897f2 100644 --- a/src/main/scala/epic/parser/models/PositionalNeuralModelFactory.scala +++ b/src/main/scala/epic/parser/models/PositionalNeuralModelFactory.scala @@ -112,8 +112,6 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma type MyModel = PositionalNeuralModel[AnnotatedLabel, AnnotatedLabel, String] - - override def make(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], topology: RuleTopology[AnnotatedLabel], lexicon: Lexicon[AnnotatedLabel, String], @@ -141,7 +139,6 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma val prodFeaturizer = new ProductionFeaturizer[AnnotatedLabel, AnnotatedLabel, String](xbarGrammar, indexedRefinements, lGen=labelFeaturizer, rGen=ruleFeaturizer) - /////////////////////// // READ IN WORD VECTORS val tagCountsLexicon = TagSpanShapeGenerator.makeStandardLexicon(annTrees) @@ -163,7 +160,7 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma } // Convert Array[Float] values to Array[Double] values and rescale them val word2vecDoubleVect = word2vec.map(keyValue => keyValue._1 -> keyValue._2.map(_.toDouble * vectorRescaling)) -// val word2vecDoubleVect = word2vec.map(keyValue => (keyValue._1 -> new DenseVector[Double](keyValue._2.map(_.toDouble)))) + // val word2vecDoubleVect = word2vec.map(keyValue => (keyValue._1 -> new DenseVector[Double](keyValue._2.map(_.toDouble)))) val word2vecIndexed: Word2VecIndexed[String] = if (embeddingType == "normalpos") { Word2VecIndexed(word2vecDoubleVect, (str: String) => Word2Vec.convertWord(str, lowercasedVectors)).augment(freqTagger.tagTypesIdx.size, freqTagger.convertToFeaturizer) } else { @@ -229,7 +226,7 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma annotator.latent, indexedRefinements, xbarGrammar, - if(dummyFeats < 0) HashFeature.Absolute(-dummyFeats.toInt) else HashFeature.Relative(dummyFeats), + if (dummyFeats < 0) HashFeature.Absolute(-dummyFeats.toInt) else HashFeature.Relative(dummyFeats), filterUnseenFeatures = false, minFeatCount = 1, trainTrees) @@ -289,8 +286,7 @@ object PositionalNeuralModelFactory { val innerTransform = buildNetInnerTransforms(word2vecIndexed, inputSize, numHidden, numHiddenLayers, nonLinType, dropoutRate, backpropIntoEmbeddings) new AffineOutputTransform(outputSize, if (numHiddenLayers >= 1) numHidden else inputSize, innerTransform) } - - + def buildNetOutputEmbedding(word2vecIndexed: Word2VecIndexed[String], inputSize: Int, numHidden: Int, diff --git a/src/main/scala/epic/parser/models/ProductParserModelFactory.scala b/src/main/scala/epic/parser/models/ProductParserModelFactory.scala index b3551d9b..20a2dc1e 100644 --- a/src/main/scala/epic/parser/models/ProductParserModelFactory.scala +++ b/src/main/scala/epic/parser/models/ProductParserModelFactory.scala @@ -45,11 +45,10 @@ case class ProductParserModelFactory(annotator: TreeAnnotator[AnnotatedLabel, St oldWeights: File = null, splitFactor: Int = 1) extends ParserModelFactory[AnnotatedLabel, String] with SafeLogging { - type MyModel = LatentParserModel[AnnotatedLabel, (AnnotatedLabel, Seq[Int]), String] def genSplits(numModels: Int, numStates: Int):Seq[IndexedSeq[Int]] = { - if(numModels == 0) Seq(IndexedSeq.empty) + if (numModels == 0) Seq(IndexedSeq.empty) else for(r <- genSplits(numModels -1, numStates); i <- 0 until numStates) yield i +: r } @@ -59,7 +58,6 @@ case class ProductParserModelFactory(annotator: TreeAnnotator[AnnotatedLabel, St def unsplit(x: (AnnotatedLabel, Seq[Int])) = x._1 - def splitRule[L, L2](r: Rule[L], split: L=>Seq[L2]):Seq[Rule[L2]] = r match { case BinaryRule(a, b, c) => for(aa <- split(a); bb <- split(b); cc <- split(c)) yield BinaryRule(aa, bb, cc) // don't allow non-identity rule refinements for identity rewrites @@ -67,12 +65,10 @@ case class ProductParserModelFactory(annotator: TreeAnnotator[AnnotatedLabel, St case UnaryRule(a, b, chain) => for(aa <- split(a); bb <- split(b)) yield UnaryRule(aa, bb, chain) } - override def make(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], topology: RuleTopology[AnnotatedLabel], lexicon: Lexicon[AnnotatedLabel, String], constrainer: Factory[AnnotatedLabel, String]): MyModel = { val annTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]] = trainTrees.map(annotator(_)) val (annWords, annBinaries, annUnaries) = this.extractBasicCounts(annTrees) - val (xbarGrammar, xbarLexicon) = topology -> lexicon val cFactory = constrainer @@ -112,7 +108,6 @@ case class ProductParserModelFactory(annotator: TreeAnnotator[AnnotatedLabel, St Counter[Feature, Double]() } - def latentAnnotator(t: BinarizedTree[AnnotatedLabel], w: IndexedSeq[String]) = { annotator(t, w).map(finalRefinements.labels.refinementsOf) } diff --git a/src/main/scala/epic/parser/models/SpanModel.scala b/src/main/scala/epic/parser/models/SpanModel.scala index 353ba26f..510e4055 100644 --- a/src/main/scala/epic/parser/models/SpanModel.scala +++ b/src/main/scala/epic/parser/models/SpanModel.scala @@ -55,7 +55,6 @@ class SpanModel[L, L2, W](val featurizer: RefinedFeaturizer[L, W, Feature], initialFeatureVal: (Feature => Option[Double]) = { _ => None }) extends ParserModel[L, W] with Serializable { type Inference = LatentParserInference[L, L2, W] - override def initialValueForFeature(f: Feature) = initialFeatureVal(f) getOrElse 0.0 def inferenceFromWeights(weights: DenseVector[Double]) = { @@ -63,13 +62,11 @@ class SpanModel[L, L2, W](val featurizer: RefinedFeaturizer[L, W, Feature], new LatentParserInference(featurizer, annotator, dpGrammar, constrainer, refinements) } - def accumulateCounts(inf: Inference, s: Scorer, d: TreeInstance[L, W], m: Marginal, accum: ExpectedCounts, scale: Double): Unit = { m.expectedCounts(featurizer, accum, scale) } } - @SerialVersionUID(4749637878577393596L) class DotProductGrammar[L, L2, W, Feature](val topology: RuleTopology[L], val lexicon: Lexicon[L, W], @@ -78,14 +75,12 @@ class DotProductGrammar[L, L2, W, Feature](val topology: RuleTopology[L], val weights: DenseVector[Double], val featurizer: RefinedFeaturizer[L, W, Feature]) extends Grammar[L, W] { - override def withPermissiveLexicon: Grammar[L, W] = { new DotProductGrammar(topology, lexicon.morePermissive, refinedTopology, refinements, weights, featurizer) } def anchor(w: IndexedSeq[W], cons: ChartConstraints[L]):GrammarAnchoring[L, W] = new ProjectionsGrammarAnchoring[L, L2, W] { - override def addConstraints(constraints: ChartConstraints[L]): GrammarAnchoring[L, W] = { anchor(w, cons & constraints) } @@ -117,14 +112,13 @@ class DotProductGrammar[L, L2, W, Feature](val topology: RuleTopology[L], var i = 0 var score = 0.0 val wdata = weights.data - while(i < features.length) { + while (i < features.length) { score += wdata(features(i)) i += 1 } score } - } } @@ -162,14 +156,14 @@ case class IndexedSpanFeaturizer[L, L2, W](wordFeatureIndex: CrossProductIndex[F val ind = TriangularArray.index(begin, end) var rcache = spanCache(ind) - if(rcache eq null) { + if (rcache eq null) { rcache = new OpenAddressHashArray[Array[Int]](refinements.labels.fineIndex.size) spanCache(ind) = rcache } var cache = rcache(globalized) - if(cache == null) { + if (cache == null) { val spanFeats: Array[Int] = fspec.featuresForSpan(begin, end, tag, ref) - cache = if(begin + 1 == end) { + cache = if (begin + 1 == end) { wordFeatureIndex.crossProduct(spanFeats, wspec.featuresForWord(begin), wordOffset) } else { require(rspec.featuresForSpan(begin, end, tag, ref).isEmpty, "Span features on the extraProductionFeaturizer currently unsupported") @@ -184,12 +178,12 @@ case class IndexedSpanFeaturizer[L, L2, W](wordFeatureIndex: CrossProductIndex[F val globalized = refinements.rules.globalize(rule, ref) val ind = TriangularArray.index(begin, end) var rcache = unaryCache(ind) - if(rcache eq null) { + if (rcache eq null) { rcache = new OpenAddressHashArray[Array[Int]](refinements.rules.fineIndex.size) unaryCache(ind) = rcache } var cache = rcache(globalized) - if(cache == null) { + if (cache == null) { require(rspec.featuresForUnaryRule(begin, end, rule, ref).isEmpty, "Span features on the extraProductionFeaturizer currently unsupported") cache = spanFeatureIndex.crossProduct(fspec.featuresForUnaryRule(begin, end, rule, ref), getSpanFeatures(begin, end), spanOffset, true) @@ -202,24 +196,24 @@ case class IndexedSpanFeaturizer[L, L2, W](wordFeatureIndex: CrossProductIndex[F val globalized = refinements.rules.globalize(rule, ref) val ind = TriangularArray.index(begin, end) var rcache = binaryCache(ind) - if(rcache eq null) { + if (rcache eq null) { rcache = new Array[OpenAddressHashArray[Array[Int]]](end - begin) binaryCache(ind) = rcache } var scache = rcache(split - begin) - if(scache eq null) { + if (scache eq null) { scache = new OpenAddressHashArray[Array[Int]](refinements.rules.fineIndex.size) rcache(split - begin) = scache } var cache = scache(globalized) - if(cache == null) { + if (cache == null) { val spanFeatures = getSpanFeatures(begin, end) cache = spanFeatureIndex.crossProduct(fspec.featuresForBinaryRule(begin, split, end, rule, ref),spanFeatures, spanOffset, true) -// val forSplit = spanFeatureIndex.crossProduct(fspec.featuresForBinaryRule(begin, split, end, rule, ref), sspec.featuresForSplit(begin, split, end), spanOffset, false) + // val forSplit = spanFeatureIndex.crossProduct(fspec.featuresForBinaryRule(begin, split, end, rule, ref), sspec.featuresForSplit(begin, split, end), spanOffset, false) val ruleAndSpansFeatures = RuleAndSpansFeaturizer.indexAndOffset(ruleAndSpansFeatureIndex, rspec.featuresForBinaryRule(begin, split, end, rule, ref), ruleAndSpansOffset) val forSplit = Arrays.concatenate(spanFeatureIndex.crossProduct(fspec.featuresForBinaryRule(begin, split, end, rule, ref), sspec.featuresForSplit(begin, split, end), spanOffset, false), ruleAndSpansFeatures) - if(forSplit.length > 0) + if (forSplit.length > 0) cache = Arrays.concatenate(cache, forSplit) scache(globalized) = cache } @@ -230,7 +224,7 @@ case class IndexedSpanFeaturizer[L, L2, W](wordFeatureIndex: CrossProductIndex[F private def getSpanFeatures(begin: Int, end: Int):Array[Int] = { val ind = TriangularArray.index(begin, end) var cache = rawSpanCache(ind) - if(cache eq null) { + if (cache eq null) { cache = sspec.featuresForSpan(begin, end) rawSpanCache(ind) = cache } @@ -263,7 +257,7 @@ object IndexedSpanFeaturizer { minFeatCount: Int, trees: Traversable[TreeInstance[L, W]]): IndexedSpanFeaturizer[L, L2, W] = { - def seenSet = if(filterUnseenFeatures) new ThreadLocalBloomFilter[Long](8 * 1024 * 1024 * 50, 3) else AlwaysSeenSet + def seenSet = if (filterUnseenFeatures) new ThreadLocalBloomFilter[Long](8 * 1024 * 1024 * 50, 3) else AlwaysSeenSet val spanBuilder = new CrossProductIndex.Builder(featurizer.index, surfaceFeaturizer.featureIndex, dummyFeatScale, seenSet = seenSet, minCount = minFeatCount) val wordBuilder = new CrossProductIndex.Builder(featurizer.index, wordFeaturizer.featureIndex, dummyFeatScale, seenSet = seenSet, includeLabelOnlyFeatures = false) @@ -285,7 +279,7 @@ object IndexedSpanFeaturizer { for(a <- as; b <- bs.label) { val r = UnaryRule(a, b, chain) val (ri, rref) = refinements.rules.indexAndLocalize(r) - if(rref != -1) { + if (rref != -1) { spanBuilder.add(spec.featuresForUnaryRule(span.begin, span.end, ri, rref), sspec.featuresForSpan(span.begin, span.end)) RuleAndSpansFeaturizer.addToIndex(ruleAndSpansIndex, rspec.featuresForUnaryRule(span.begin, span.end, ri, rref)) } @@ -295,7 +289,7 @@ object IndexedSpanFeaturizer { val (ai, aref) = refinements.labels.indexAndLocalize(a) val r = BinaryRule(a, b, c) val (ri, rref) = refinements.rules.indexAndLocalize(r) - if(rref != -1) { + if (rref != -1) { spanBuilder.add(spec.featuresForBinaryRule(span.begin, t.splitPoint, span.end, ri, rref), sspec.featuresForSpan(span.begin, span.end)) spanBuilder.add(spec.featuresForBinaryRule(span.begin, t.splitPoint, span.end, ri, rref), @@ -313,7 +307,6 @@ object IndexedSpanFeaturizer { } } - @SerialVersionUID(-155022487059445275L) case class ExtraParams(useHackyLexicalFeatures:Boolean = false, hackyLexicalFeatureDesc:String = "", @@ -349,7 +342,6 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma type MyModel = SpanModel[AnnotatedLabel, AnnotatedLabel, String] - override def make(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], topology: RuleTopology[AnnotatedLabel], lexicon: Lexicon[AnnotatedLabel, String], @@ -376,35 +368,31 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma var wf = posFeaturizer.getOrElse( SpanModelFactory.defaultPOSFeaturizer(annWords)) - if(useMorph) + if (useMorph) wf += mf - - - var span: SplitSpanFeaturizer[String] = spanFeaturizer.getOrElse(SpanModelFactory.goodFeaturizer(annWords, commonWordThreshold, useShape = useShape)) - if(useRichSpanContext) + if (useRichSpanContext) span += spanShapeBetter - if(useNGrams) + if (useNGrams) span += ngramF - if(useTagSpanShape) + if (useTagSpanShape) span += tagSpanShape - if(useFullShape) + if (useFullShape) span += fullShape val indexedWord = IndexedWordFeaturizer.fromData(wf, annTrees.map{_.words}, deduplicateFeatures = pruneRedundantFeatures) val surface = IndexedSplitSpanFeaturizer.fromData(span, annTrees, bloomFilter = false, deduplicateFeatures = pruneRedundantFeatures) - - + def labelFeaturizer(l: AnnotatedLabel) = Set(l, l.baseAnnotatedLabel).toSeq -// def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if(useGrammar) Set(r, r.map(_.baseAnnotatedLabel)).toSeq else if(r.isInstanceOf[UnaryRule[AnnotatedLabel]]) Set(r.parent, r.parent.baseAnnotatedLabel).toSeq else Seq.empty - def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if(useGrammar) { +// def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if (useGrammar) Set(r, r.map(_.baseAnnotatedLabel)).toSeq else if (r.isInstanceOf[UnaryRule[AnnotatedLabel]]) Set(r.parent, r.parent.baseAnnotatedLabel).toSeq else Seq.empty + def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if (useGrammar) { if (useChildFeats && r.isInstanceOf[BinaryRule[AnnotatedLabel]]) { Set(r, r.map(_.baseAnnotatedLabel), @@ -413,13 +401,12 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma } else { Set(r, r.map(_.baseAnnotatedLabel)).toSeq } - } else if(r.isInstanceOf[UnaryRule[AnnotatedLabel]]) { + } else if (r.isInstanceOf[UnaryRule[AnnotatedLabel]]) { Set(r.parent, r.parent.baseAnnotatedLabel).toSeq } else { Seq.empty } - - + val featurizer = new ProductionFeaturizer[AnnotatedLabel, AnnotatedLabel, String](xbarGrammar, indexedRefinements, lGen=labelFeaturizer, rGen=ruleFeaturizer, filterRedundantFeatures = pruneRedundantFeatures) @@ -439,7 +426,7 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma annotator.latent, indexedRefinements, xbarGrammar, - if(dummyFeats < 0) HashFeature.Absolute(-dummyFeats.toInt) else HashFeature.Relative(dummyFeats), + if (dummyFeats < 0) HashFeature.Absolute(-dummyFeats.toInt) else HashFeature.Relative(dummyFeats), filterUnseenFeatures = false, minFeatCount, trainTrees) @@ -453,13 +440,8 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma new SpanModel[AnnotatedLabel, AnnotatedLabel, String](indexed, indexed.index, annotator.latent, constrainer, xbarGrammar, xbarLexicon, refGrammar, indexedRefinements,featureCounter.get(_)) } - - } - - - case class LatentSpanModelFactory(inner: SpanModelFactory, @Help(text="Path to substates to use for each symbol. Uses numStates for missing states.") substates: File = null, @@ -470,7 +452,6 @@ case class LatentSpanModelFactory(inner: SpanModelFactory, type MyModel = SpanModel[AnnotatedLabel, (AnnotatedLabel, Int), String] - override def make(train: IndexedSeq[TreeInstance[AnnotatedLabel, String]], topology: RuleTopology[AnnotatedLabel], lexicon: Lexicon[AnnotatedLabel, String], constrainer: Factory[AnnotatedLabel, String]): MyModel = { import inner.{logger => _, _} import extraParams._ @@ -482,7 +463,6 @@ case class LatentSpanModelFactory(inner: SpanModelFactory, val xbarLexicon = lexicon - val substateMap = if (substates != null && substates.exists) { val in = Source.fromFile(substates).getLines() val pairs = for (line <- in) yield { @@ -490,7 +470,7 @@ case class LatentSpanModelFactory(inner: SpanModelFactory, AnnotatedLabel(split(0)) -> split(1).toInt } pairs.toMap + (topology.root -> 1) - } else if(splitUselessStates) { + } else if (splitUselessStates) { Map(topology.root -> 1) } else { LatentModelFactory.statesToNotSplit.iterator.map(s => AnnotatedLabel(s) -> 1).toMap + (topology.root -> 1) @@ -540,22 +520,20 @@ case class LatentSpanModelFactory(inner: SpanModelFactory, var span: SplitSpanFeaturizer[String] = SpanModelFactory.goodFeaturizer(annWords, commonWordThreshold) - if(useRichSpanContext) + if (useRichSpanContext) span += spanShapeBetter - if(useNGrams) + if (useNGrams) span += ngramF - - if(useFullShape) + if (useFullShape) span += fullShape val indexedWord = IndexedWordFeaturizer.fromData(wf, annTrees.map{_.words}) val surface = IndexedSplitSpanFeaturizer.fromData(span, annTrees) - def labelFeaturizer(l: (AnnotatedLabel, Int)) = Set[Feature](IndicatorFeature(l), l._1, l._1.baseAnnotatedLabel).toSeq - def ruleFeaturizer(r: Rule[(AnnotatedLabel, Int)]) = if(useGrammar) Set(r, r.map(_._1)).toSeq else if(r.isInstanceOf[UnaryRule[(AnnotatedLabel, Int)]]) labelFeaturizer(r.parent) else Seq.empty + def ruleFeaturizer(r: Rule[(AnnotatedLabel, Int)]) = if (useGrammar) Set(r, r.map(_._1)).toSeq else if (r.isInstanceOf[UnaryRule[(AnnotatedLabel, Int)]]) labelFeaturizer(r.parent) else Seq.empty val featurizer = new ProductionFeaturizer[AnnotatedLabel, (AnnotatedLabel, Int), String](topology, finalRefinements, lGen=labelFeaturizer, @@ -573,23 +551,23 @@ case class LatentSpanModelFactory(inner: SpanModelFactory, annotator(t, w).map(finalRefinements.labels.refinementsOf) } - val indexed = IndexedSpanFeaturizer.extract[AnnotatedLabel, (AnnotatedLabel, Int), String](indexedWord, + val indexed = IndexedSpanFeaturizer.extract[AnnotatedLabel, (AnnotatedLabel, Int), String]( + indexedWord, surface, featurizer, ruleAndSpansFeaturizer, latentAnnotator, finalRefinements, topology, - if(dummyFeats < 0) HashFeature.Absolute(-dummyFeats.toInt) else HashFeature.Relative(dummyFeats), -// filterUnseenFeatures = true, + if (dummyFeats < 0) HashFeature.Absolute(-dummyFeats.toInt) else HashFeature.Relative(dummyFeats), + // filterUnseenFeatures = true, filterUnseenFeatures = false, - 1, - train) - + 1, + train + ) val featureCounter = this.readWeights(oldWeights) - val refGrammar = RuleTopology(finalRefinements.labels.refinementsOf(topology.root)(0), finalRefinements.labels.fineIndex, finalRefinements.rules.fineIndex) @@ -640,7 +618,7 @@ object SpanModelFactory { featurizer += distance[String](begin, split) featurizer += distance[String](split, end) - if(useShape) + if (useShape) featurizer += spanShape featurizer } @@ -677,7 +655,6 @@ object SpanModelFactory { new CachedChartConstraintsFactory[AnnotatedLabel, String](uncached) } - val mf = new SpanModelFactory(annotator = annotator, posFeaturizer = posFeaturizer, spanFeaturizer = spanFeaturizer).make(trees, topo, lexicon, constraints) val mobj = new ModelObjective(mf, trees) diff --git a/src/main/scala/epic/parser/models/StructModel.scala b/src/main/scala/epic/parser/models/StructModel.scala index d8bb0a9c..3a28762e 100644 --- a/src/main/scala/epic/parser/models/StructModel.scala +++ b/src/main/scala/epic/parser/models/StructModel.scala @@ -56,7 +56,6 @@ class StructModel[L, L2, W](indexedFeatures: IndexedFeaturizer[L, L2, W], initialFeatureVal: (Feature => Option[Double]) = { _ => None }) extends ParserModel[L, W] with Serializable { type Inference = AnnotatedParserInference[L, W] - def featureIndex = indexedFeatures.index override def initialValueForFeature(f: Feature) = initialFeatureVal(f) getOrElse 0.0 @@ -71,7 +70,6 @@ class StructModel[L, L2, W](indexedFeatures: IndexedFeaturizer[L, L2, W], new AnnotatedParserInference(indexedFeatures, reannotate, grammar, constrainer) } - def accumulateCounts(inf: Inference, s: Scorer, d: TreeInstance[L, W], m: Marginal, accum: ExpectedCounts, scale: Double): Unit = { m.expectedCounts(indexedFeatures, accum, scale) } @@ -84,14 +82,13 @@ case class StructModelFactory(@Help(text= "The kind of annotation to do on the r annotatedTreesDumpPath: File = null) extends ParserModelFactory[AnnotatedLabel, String] { type MyModel = StructModel[AnnotatedLabel, AnnotatedLabel, String] - override def make(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], topology: RuleTopology[AnnotatedLabel], lexicon: Lexicon[AnnotatedLabel, String], constrainer: Factory[AnnotatedLabel, String]): MyModel = { val transformed = trainTrees.par.map(annotator).seq.toIndexedSeq - if(annotatedTreesDumpPath != null) { + if (annotatedTreesDumpPath != null) { val ps = new PrintStream(new FileOutputStream(annotatedTreesDumpPath)) for( (x,y) <- trainTrees zip transformed) { ps.println("Treebank:\n" + x.render() + "\nAnnotated:\n" + y.render() + "\n==========\n") @@ -106,13 +103,10 @@ case class StructModelFactory(@Help(text= "The kind of annotation to do on the r val surfaceFeaturizer = { val dsl = new WordFeaturizer.DSL(initLexicon) import dsl._ - - ( - unigrams(word + clss, 1) - + suffixes() - + prefixes() - + props - ) + unigrams(word + clss, 1) + + suffixes() + + prefixes() + + props } val wordFeaturizer = IndexedWordFeaturizer.fromData(surfaceFeaturizer, transformed.map{_.words}) def labelFlattener(l: AnnotatedLabel): Seq[AnnotatedLabel] = { diff --git a/src/main/scala/epic/parser/models/ThreePointModel.scala b/src/main/scala/epic/parser/models/ThreePointModel.scala index 8edd3d7e..72bc18ba 100644 --- a/src/main/scala/epic/parser/models/ThreePointModel.scala +++ b/src/main/scala/epic/parser/models/ThreePointModel.scala @@ -35,8 +35,8 @@ class ThreePointModel[L, L2, W](annotator: (BinarizedTree[L], IndexedSeq[W]) => labelFeaturizer: RefinedFeaturizer[L, W, Feature], wordFeaturizer: IndexedWordFeaturizer[W], rank: Int) extends ParserModel[L, W] { - override type Inference = ThreePointModel.ThreePointInference[L, L2, W] + override type Inference = ThreePointModel.ThreePointInference[L, L2, W] override def accumulateCounts(inf: Inference, s: Scorer, d: TreeInstance[L, W], m: Marginal, accum: ExpectedCounts, scale: Double): Unit = { inf.grammar.extractEcounts(m, accum.counts, scale) @@ -44,9 +44,8 @@ class ThreePointModel[L, L2, W](annotator: (BinarizedTree[L], IndexedSeq[W]) => } override val featureIndex = new SegmentedIndex(new AffineTransform.Index(rank, labelFeaturizer.index.size, false) +: IndexedSeq.fill(3)(new AffineTransform.Index(rank, wordFeaturizer.featureIndex.size, false))) - override def inferenceFromWeights(weights: DenseVector[Double]): Inference = { - + override def inferenceFromWeights(weights: DenseVector[Double]): Inference = { val grammar = new ThreePointModel.Grammar[L, L2, W](topology, lexicon, refinedTopology, refinements, labelFeaturizer, wordFeaturizer, featureIndex, @@ -57,6 +56,7 @@ class ThreePointModel[L, L2, W](annotator: (BinarizedTree[L], IndexedSeq[W]) => } override def initialValueForFeature(f: Feature): Double = f.hashCode().toDouble / 1000 % 2 + } object ThreePointModel { @@ -79,8 +79,6 @@ object ThreePointModel { LatentTreeMarginal(product, annotated) } - - } @SerialVersionUID(1L) @@ -95,7 +93,6 @@ object ThreePointModel { val IndexedSeq(ruleMatrix, wordMatrices@ _*) = reshapeWeightMatrices(weights) assert(wordMatrices.length == 3) - private def reshapeWeightMatrices(weights: DenseVector[Double]): IndexedSeq[DenseMatrix[Double]] = { val segments = featureIndex.shardWeights(weights) (featureIndex.indices zip segments).map { case (index, segment) => index.makeMatrix(segment)} @@ -136,12 +133,12 @@ object ThreePointModel { // doesn't include split point, which we'll do online val precachedSpanActivations = TriangularArray.tabulate(words.length + 1) { (i, j) => - if(sparsityPattern.isAllowedSpan(i, j) && i != j) { + if (sparsityPattern.isAllowedSpan(i, j) && i != j) { val result = DenseVector.ones[Double](wordActivations.head.head.size) result :*= actForPos(i, Point.First) result :*= actForPos(j - 1, Point.Last) -// println(result) + // println(result) result } else { @@ -149,10 +146,9 @@ object ThreePointModel { } } - def scoreBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int) = { val surfaceAct = precachedSpanActivations(begin, end) - if(surfaceAct == null) { + if (surfaceAct == null) { Double.NegativeInfinity } else { val rfeats = lspec.featuresForBinaryRule(begin, split, end, rule, ref) @@ -162,7 +158,7 @@ object ThreePointModel { def scoreUnaryRule(begin: Int, end: Int, rule: Int, ref: Int) = { val surfaceAct = precachedSpanActivations(begin, end) - if(surfaceAct == null) { + if (surfaceAct == null) { Double.NegativeInfinity } else { val rfeats = lspec.featuresForUnaryRule(begin, end, rule, ref) @@ -172,7 +168,7 @@ object ThreePointModel { def scoreSpan(begin: Int, end: Int, tag: Int, ref: Int) = { val surfaceAct = precachedSpanActivations(begin, end) - if(surfaceAct == null) { + if (surfaceAct == null) { Double.NegativeInfinity } else { val rfeats = lspec.featuresForSpan(begin, end, tag, ref) @@ -199,9 +195,9 @@ object ThreePointModel { def checkFlush(begin: Int, split: Int, end: Int) { val state: (Int, Int) = (begin, end) val oldState: (Int, Int) = states(split) - if(oldState != state) { - if(oldState != UNUSED) { - val ffeats = if(split >= length) sspec.featuresForSpan(oldState._1, oldState._2) else sspec.featuresForSplit(oldState._1, split, oldState._2) + if (oldState != state) { + if (oldState != UNUSED) { + val ffeats = if (split >= length) sspec.featuresForSpan(oldState._1, oldState._2) else sspec.featuresForSplit(oldState._1, split, oldState._2) layer.tallyDerivative(deriv, ruleCountsPerState(split) *= scale, new FeatureVector(ffeats)) ruleCountsPerState(split) := 0.0 } @@ -252,8 +248,6 @@ object ThreePointModel { for(f <- sspec.featuresForWord(end - 1)) { axpy(score * scale, actWithoutEnd, dWeights(Point.Last.id)(::, f)) } - - } override def visitBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int, score: Double): Unit = { @@ -279,15 +273,12 @@ object ThreePointModel { for(f <- sspec.featuresForWord(split)) { axpy(score * scale, splitAct, dWeights(Point.Split.id)(::, f)) } - } } } } - - } case class ThreePointModelFactory(@Help(text= @@ -311,8 +302,6 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma type MyModel = ThreePointModel[AnnotatedLabel, AnnotatedLabel, String] - - override def make(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], topology: RuleTopology[AnnotatedLabel], lexicon: Lexicon[AnnotatedLabel, String], @@ -342,19 +331,18 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma } } - if(useMorph) + if (useMorph) wf += MorphFeaturizer(pathsToMorph.split(",")) val indexedWord = IndexedWordFeaturizer.fromData(wf, annTrees.map{_.words}) def labelFeaturizer(l: AnnotatedLabel) = Set(l, l.baseAnnotatedLabel).toSeq - def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if(useGrammar) Set(r, r.map(_.baseAnnotatedLabel)).toSeq else if(r.isInstanceOf[UnaryRule[AnnotatedLabel]]) Set(r.parent, r.parent.baseAnnotatedLabel).toSeq else Seq.empty + def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if (useGrammar) Set(r, r.map(_.baseAnnotatedLabel)).toSeq else if (r.isInstanceOf[UnaryRule[AnnotatedLabel]]) Set(r.parent, r.parent.baseAnnotatedLabel).toSeq else Seq.empty val featurizer = new ProductionFeaturizer[AnnotatedLabel, AnnotatedLabel, String](xbarGrammar, indexedRefinements, lGen=labelFeaturizer, rGen=ruleFeaturizer) - new ThreePointModel(annotator.latent, constrainer, topology, lexicon, @@ -364,6 +352,4 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma } - - } diff --git a/src/main/scala/epic/parser/models/TransformModel.scala b/src/main/scala/epic/parser/models/TransformModel.scala index e86cd986..cc35e123 100644 --- a/src/main/scala/epic/parser/models/TransformModel.scala +++ b/src/main/scala/epic/parser/models/TransformModel.scala @@ -34,7 +34,6 @@ class TransformModel[L, L2, W](annotator: (BinarizedTree[L], IndexedSeq[W]) => B val transform: Transform[FeatureVector, Vector[Double]]) extends ParserModel[L, W] { override type Inference = TransformModel.Inference[L, L2, W, transform.type] - override def accumulateCounts(inf: Inference, s: Scorer, d: TreeInstance[L, W], m: Marginal, accum: ExpectedCounts, scale: Double): Unit = { inf.grammar.extractEcounts(m, accum.counts, scale) accum.loss += scale * m.logPartition @@ -87,7 +86,6 @@ object TransformModel { new TransformGrammar(topology, lexicon.morePermissive, refinedTopology, refinements, labelFeaturizer, surfaceFeaturizer, layer) } - def extractEcounts(m: ParseMarginal[L, W], deriv: DenseVector[Double], scale: Double): Unit = { val w = m.words val length = w.length @@ -105,9 +103,9 @@ object TransformModel { def checkFlush(begin: Int, split: Int, end: Int) { val state: (Int, Int) = (begin, end) val oldState: (Int, Int) = states(split) - if(oldState != state) { - if(oldState != UNUSED) { - val ffeats = if(split >= length) sspec.featuresForSpan(oldState._1, oldState._2) else sspec.featuresForSplit(oldState._1, split, oldState._2) + if (oldState != state) { + if (oldState != UNUSED) { + val ffeats = if (split >= length) sspec.featuresForSpan(oldState._1, oldState._2) else sspec.featuresForSplit(oldState._1, split, oldState._2) layer.tallyDerivative(deriv, ruleCountsPerState(split) *= scale, new FeatureVector(ffeats)) ruleCountsPerState(split) := 0.0 } @@ -120,21 +118,20 @@ object TransformModel { override def visitUnaryRule(begin: Int, end: Int, rule: Int, ref: Int, score: Double): Unit = { checkFlush(begin, length, end) axpy(score, new FeatureVector(lspec.featuresForUnaryRule(begin, end, rule, ref)), ruleCountsPerState(length)) -// val ffeats = sspec.featuresForSpan(begin, end) -// layer.tallyDerivative(deriv, SparseVector(labelFeaturizer.index.size)(lspec.featuresForUnaryRule(begin, end, rule, ref).map(_ -> (scale * score)):_*), new FeatureVector(ffeats)) + // val ffeats = sspec.featuresForSpan(begin, end) + // layer.tallyDerivative(deriv, SparseVector(labelFeaturizer.index.size)(lspec.featuresForUnaryRule(begin, end, rule, ref).map(_ -> (scale * score)):_*), new FeatureVector(ffeats)) } override def visitSpan(begin: Int, end: Int, tag: Int, ref: Int, score: Double): Unit = { checkFlush(begin, length + 1, end) axpy(score, new FeatureVector(lspec.featuresForSpan(begin, end, tag, ref)), ruleCountsPerState(length + 1)) -// val ffeats = sspec.featuresForSpan(begin, end) -// layer.tallyDerivative(deriv, SparseVector(labelFeaturizer.index.size)(lspec.featuresForSpan(begin, end, tag, ref).map(_ -> (scale * score)):_*), new FeatureVector(ffeats)) - + // val ffeats = sspec.featuresForSpan(begin, end) + // layer.tallyDerivative(deriv, SparseVector(labelFeaturizer.index.size)(lspec.featuresForSpan(begin, end, tag, ref).map(_ -> (scale * score)):_*), new FeatureVector(ffeats)) } override def visitBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int, score: Double): Unit = { -// val ffeats = sspec.featuresForSplit(begin, split, end) -// layer.tallyDerivative(deriv, SparseVector(labelFeaturizer.index.size)(lspec.featuresForBinaryRule(begin, split, end, rule, ref).map(_ -> (scale * score)):_*), new FeatureVector(ffeats)) + // val ffeats = sspec.featuresForSplit(begin, split, end) + // layer.tallyDerivative(deriv, SparseVector(labelFeaturizer.index.size)(lspec.featuresForBinaryRule(begin, split, end, rule, ref).map(_ -> (scale * score)):_*), new FeatureVector(ffeats)) checkFlush(begin, split, end) axpy(score, new FeatureVector(lspec.featuresForBinaryRule(begin, split, end, rule, ref)), ruleCountsPerState(split)) } @@ -147,7 +144,6 @@ object TransformModel { def anchor(w: IndexedSeq[W], cons: ChartConstraints[L]):GrammarAnchoring[L, W] = new ProjectionsGrammarAnchoring[L, L2, W] { - override def addConstraints(constraints: ChartConstraints[L]): GrammarAnchoring[L, W] = { anchor(w, cons & constraints) } @@ -168,7 +164,7 @@ object TransformModel { private def tetra(begin: Int, split: Int, end: Int) = { (end.toLong * (end + 1) * (end + 2))/6 + ((split + 1) * split / 2 + begin) -// (begin, split, end) + // (begin, split, end) } def scoreBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int) = { @@ -176,11 +172,10 @@ object TransformModel { val sfeats = sspec.featuresForSplit(begin, split, end) layer.activations(new FeatureVector(sfeats)) }) -// if(fs != layer.activations(new FeatureVector( sspec.featuresForSplit(begin, split, end)))) { -// println("!!!!") -// } + // if (fs != layer.activations(new FeatureVector( sspec.featuresForSplit(begin, split, end)))) { + // println("!!!!") + // } val rfeats = lspec.featuresForBinaryRule(begin, split, end, rule, ref) - new FeatureVector(rfeats) dot fs } @@ -205,12 +200,8 @@ object TransformModel { } } - - - } - case class TransformModelFactory(@Help(text= """The kind of annotation to do on the refined grammar. Default uses just parent annotation. You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Manning 2003. @@ -232,8 +223,6 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma type MyModel = TransformModel[AnnotatedLabel, AnnotatedLabel, String] - - override def make(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], topology: RuleTopology[AnnotatedLabel], lexicon: Lexicon[AnnotatedLabel, String], @@ -255,41 +244,37 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma val summedWordCounts: Counter[String, Double] = sum(annWords, Axis._0) lazy val ngramF = new NGramSpanFeaturizer(summedWordCounts, NGramSpanFeaturizer.countBigrams(annTrees), annTrees.map(_.words), ngramCountThreshold, maxNGramOrder, useNot = false) lazy val tagSpanShape = new TagSpanShapeFeaturizer(TagSpanShapeGenerator.makeBaseLexicon(trainTrees)) -// lazy val fullShape = new FullWordSpanShapeFeaturizer(summedWordCounts.iterator.filter(_._2 > commonWordThreshold * 10).map(_._1).toSet, numSpanContextWords, useRichSpanContext) + // lazy val fullShape = new FullWordSpanShapeFeaturizer(summedWordCounts.iterator.filter(_._2 > commonWordThreshold * 10).map(_._1).toSet, numSpanContextWords, useRichSpanContext) var wf = posFeaturizer.getOrElse( SpanModelFactory.defaultPOSFeaturizer(annWords)) - if(useMorph) + if (useMorph) wf += mf - var span: SplitSpanFeaturizer[String] = spanFeaturizer.getOrElse(SpanModelFactory.goodFeaturizer(annWords, commonWordThreshold, useShape = false)) - if(useNGrams) + if (useNGrams) span += ngramF span += new SingleWordSpanFeaturizer[String](wf) - val indexedSurface = IndexedSplitSpanFeaturizer.fromData(span, annTrees, bloomFilter = false) - def labelFeaturizer(l: AnnotatedLabel) = Set(l, l.baseAnnotatedLabel).toSeq - def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if(useGrammar) Set(r, r.map(_.baseAnnotatedLabel)).toSeq else if(r.isInstanceOf[UnaryRule[AnnotatedLabel]]) Set(r.parent, r.parent.baseAnnotatedLabel).toSeq else Seq.empty + def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if (useGrammar) Set(r, r.map(_.baseAnnotatedLabel)).toSeq else if (r.isInstanceOf[UnaryRule[AnnotatedLabel]]) Set(r.parent, r.parent.baseAnnotatedLabel).toSeq else Seq.empty val featurizer = new ProductionFeaturizer[AnnotatedLabel, AnnotatedLabel, String](xbarGrammar, indexedRefinements, lGen=labelFeaturizer, rGen=ruleFeaturizer) - - new TransformModel(annotator.latent, + new TransformModel( + annotator.latent, constrainer, topology, lexicon, refGrammar, indexedRefinements, featurizer, indexedSurface, - new AffineTransform(featurizer.index.size, rank, new AffineTransform(rank, indexedSurface.featureIndex.size, new IdentityTransform[FeatureVector]()))) + new AffineTransform(featurizer.index.size, rank, new AffineTransform(rank, indexedSurface.featureIndex.size, new IdentityTransform[FeatureVector]())) + ) } - - } \ No newline at end of file diff --git a/src/main/scala/epic/parser/projections/AnchoredForestProjector.scala b/src/main/scala/epic/parser/projections/AnchoredForestProjector.scala index 0b3c19f5..b8e34c96 100644 --- a/src/main/scala/epic/parser/projections/AnchoredForestProjector.scala +++ b/src/main/scala/epic/parser/projections/AnchoredForestProjector.scala @@ -46,7 +46,7 @@ class AnchoredForestProjector(threshold: Double) extends Serializable { } def getOrElseUpdate[T<:AnyRef](arr: Array[T], i: Int, t : =>T) = { - if(arr(i) == null) { + if (arr(i) == null) { arr(i) = t } arr(i) @@ -63,10 +63,10 @@ class AnchoredForestProjector(threshold: Double) extends Serializable { val visitor = new AnchoredVisitor[L] { def visitSpan(begin: Int, end: Int, tag: Int, ref: Int, score: Double) { // fill in spans with 0 if they're active - if(score > 0.0) { + if (score > 0.0) { val index = TriangularArray.index(begin, end) getOrElseUpdate(lexicalScores, index, projVector())(tag) = 1.0 - if(totals(index) eq null) { + if (totals(index) eq null) { totals(index) = projVector() } totals(index)(tag) += score @@ -74,16 +74,16 @@ class AnchoredForestProjector(threshold: Double) extends Serializable { } def visitBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int, count: Double) { - if(count > 0.0) { + if (count > 0.0) { val index = TriangularArray.index(begin, end) var forSpan = binaryScores(index) - if(forSpan eq null) { + if (forSpan eq null) { val numSplits = end - begin forSpan = new Array[OpenAddressHashArray[Double]](numSplits) binaryScores(index) = forSpan } - val parentArray = if(forSpan(split-begin) eq null) { + val parentArray = if (forSpan(split-begin) eq null) { forSpan(split-begin) = projRuleVector() forSpan(split-begin) } else { @@ -95,14 +95,14 @@ class AnchoredForestProjector(threshold: Double) extends Serializable { def visitUnaryRule(begin: Int, end: Int, rule: Int, ref: Int, count: Double) { val index = TriangularArray.index(begin, end) - val parentArray = if(unaryScores(index) eq null) { + val parentArray = if (unaryScores(index) eq null) { unaryScores(index) = projRuleVector() unaryScores(index) } else { unaryScores(index) } parentArray(rule) += count - if(totalsUnaries(index) eq null) { + if (totalsUnaries(index) eq null) { totalsUnaries(index) = projVector() } totalsUnaries(index)(charts.topology.parent(rule)) += count @@ -116,7 +116,6 @@ class AnchoredForestProjector(threshold: Double) extends Serializable { } } - object AnchoredForestProjector { /** diff --git a/src/main/scala/epic/parser/projections/AnchoredSpanProjector.scala b/src/main/scala/epic/parser/projections/AnchoredSpanProjector.scala index 2d3a6fc0..bf139ecb 100644 --- a/src/main/scala/epic/parser/projections/AnchoredSpanProjector.scala +++ b/src/main/scala/epic/parser/projections/AnchoredSpanProjector.scala @@ -45,21 +45,17 @@ class AnchoredSpanProjector(threshold: Double = Double.NegativeInfinity) extends val totals = TriangularArray.fill[DenseVector[Double]](length+1)(labelBeliefs) val totalsUnaries = TriangularArray.fill[DenseVector[Double]](length+1)(labelBeliefs) - val visitor = new AnchoredVisitor[L] { def visitSpan(begin: Int, end: Int, tag: Int, ref: Int, score: Double): Unit = { // fill in spans with 0 if they're active - if(score > 0.0) { + if (score > 0.0) { totals(begin, end)(tag) += score } } - override def skipBinaryRules: Boolean = true - def visitBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int, count: Double): Unit = { - - } + def visitBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int, count: Double): Unit = () def visitUnaryRule(begin: Int, end: Int, rule: Int, ref: Int, count: Double): Unit = { if (count > 0.0) @@ -68,18 +64,12 @@ class AnchoredSpanProjector(threshold: Double = Double.NegativeInfinity) extends } - charts.visitPostorder(visitor, threshold) new AnchoredSpanProjector.AnchoredData(totalsUnaries, totals) } } - - - - - object AnchoredSpanProjector { /** diff --git a/src/main/scala/epic/parser/projections/ChartProjector.scala b/src/main/scala/epic/parser/projections/ChartProjector.scala index 5cd07f80..8a21b074 100644 --- a/src/main/scala/epic/parser/projections/ChartProjector.scala +++ b/src/main/scala/epic/parser/projections/ChartProjector.scala @@ -33,7 +33,7 @@ trait ChartProjector[L, W] { def project(charts: ParseMarginal[L, W], goldTagPolicy: GoldTagPolicy[L] = GoldTagPolicy.noGoldTags[L]):MyAnchoring = { - if(charts.logPartition.isInfinite) throw new NoParseException("infinite partition", charts.words) + if (charts.logPartition.isInfinite) throw new NoParseException("infinite partition", charts.words) val ruleData = proj.projectRulePosteriors(charts, goldTagPolicy) createAnchoring(charts, ruleData, charts.logPartition) } diff --git a/src/main/scala/epic/parser/projections/ConstraintAnchoring.scala b/src/main/scala/epic/parser/projections/ConstraintAnchoring.scala index 099832c9..fe55d70c 100644 --- a/src/main/scala/epic/parser/projections/ConstraintAnchoring.scala +++ b/src/main/scala/epic/parser/projections/ConstraintAnchoring.scala @@ -61,7 +61,6 @@ class ParserChartConstraintsFactory[L, W](val parser: Parser[L, W], val prunedtags = new AtomicInteger(0) val notprunedtags = new AtomicInteger(0) - private val synthetics = BitSet.empty ++ (0 until topology.labelIndex.size).filter(l => isIntermediate(labelIndex.get(l))) def constraints(w: IndexedSeq[W]):ChartConstraints[L] = constraints(w, GoldTagPolicy.noGoldTags[L]) @@ -74,7 +73,7 @@ class ParserChartConstraintsFactory[L, W](val parser: Parser[L, W], logger.debug(s"Building Constraints for ${marg.words}") assert(marg.isMaxMarginal) val length = marg.length - if(marg.logPartition.isInfinite) + if (marg.logPartition.isInfinite) throw new NoParseException("No parse for sentence we're trying to constrain!", marg.words) val (botLabelScores, unaryScores) = computeScores(length, marg) @@ -92,20 +91,19 @@ class ParserChartConstraintsFactory[L, W](val parser: Parser[L, W], assert(labelThresholds(i, i+1) != null && labelThresholds(i,i+1).nonEmpty, "label thresholds" + labelThresholds(i, i+1)) assert(topLabelThresholds(i, i+1) != null && topLabelThresholds(0,length).nonEmpty, "top label thresholds" + topLabelThresholds(i, i+1)) } - if(topLabelThresholds(0,length) == null || !topLabelThresholds(0,length).contains(marg.topology.rootIndex)) + if (topLabelThresholds(0,length) == null || !topLabelThresholds(0,length).contains(marg.topology.rootIndex)) throw new NoParseException("No score at the root!", marg.words) -// val hasMaximalProjection: BitSet = BitSet.empty ++ (0 to length).filter{ i => -// ((labelThresholds(i) ne null) && (topLabelThresholds(i) ne null)) && ((labelThresholds(i)|topLabelThresholds(i)) -- synthetics).nonEmpty -// } + // val hasMaximalProjection: BitSet = BitSet.empty ++ (0 to length).filter{ i => + // ((labelThresholds(i) ne null) && (topLabelThresholds(i) ne null)) && ((labelThresholds(i)|topLabelThresholds(i)) -- synthetics).nonEmpty + // } - //, hasMaximalProjection) + //, hasMaximalProjection) val con = ChartConstraints[L](topLabelThresholds, labelThresholds) -// PrecacheConstraints.checkConstraints(TreeInstance("viterbi", vit, marg.words), con, this) + // PrecacheConstraints.checkConstraints(TreeInstance("viterbi", vit, marg.words), con, this) con } - private def extractLabelThresholds(length: Int, numLabels: Int, scores: Array[Array[Double]], index: Index[_], @@ -120,20 +118,20 @@ class ParserChartConstraintsFactory[L, W](val parser: Parser[L, W], }) } - if(arr ne null) - if(j == i) { - } else if(j - i > 1) { + if (arr ne null) + if (j == i) { + } else if (j - i > 1) { this.notpruned.addAndGet(thresholdedTags.size) this.pruned.addAndGet(arr.count(_ != 0.0) - thresholdedTags.size) } else { - if(thresholdedTags.isEmpty) assert(false, arr.toIndexedSeq) + if (thresholdedTags.isEmpty) assert(false, arr.toIndexedSeq) this.notprunedtags.addAndGet(thresholdedTags.size) this.prunedtags.addAndGet(arr.count(_ != 0.0) - thresholdedTags.size) } val goldTags = (0 until numLabels).filter { isGold(i, j, _) } for(t <- goldTags if arr == null || arr(t) < threshold) { - if(arr == null) { + if (arr == null) { logger.warn(s"Can't even construct span that has gold tag ${labelIndex.get(t)}!") } else { logger.warn(s"Got a below threshold for a goldTag! ${arr(t)} $threshold ${labelIndex.get(t)} " @@ -166,8 +164,8 @@ class ParserChartConstraintsFactory[L, W](val parser: Parser[L, W], for(c <- 0 until topology.labelIndex.size) { thresholds += arr(c) nConstructed += 1 - if(gold.isGoldBotTag(i, j, c)) { - if(arr(c) != 0) + if (gold.isGoldBotTag(i, j, c)) { + if (arr(c) != 0) nGoldConstructed += 1 else { throw new RuntimeException("Can't construct gold tree for " + " " + marg.words) @@ -182,8 +180,8 @@ class ParserChartConstraintsFactory[L, W](val parser: Parser[L, W], for(c <- 0 until grammar.labelIndex.size) { thresholds += arr(c) nConstructed += 1 - if(gold.isGoldTopTag(i, j, c)) { - if(arr(c) != 0) + if (gold.isGoldTopTag(i, j, c)) { + if (arr(c) != 0) nGoldConstructed += 1 else counts(c) += 1 gThresholds += arr(c) @@ -202,7 +200,6 @@ class ParserChartConstraintsFactory[L, W](val parser: Parser[L, W], val visitor = new AnchoredVisitor[L] { def visitBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int, score: Double) {} - override def skipBinaryRules: Boolean = true def visitUnaryRule(begin: Int, end: Int, rule: Int, ref: Int, score: Double) { @@ -215,7 +212,6 @@ class ParserChartConstraintsFactory[L, W](val parser: Parser[L, W], } } - def visitSpan(begin: Int, end: Int, tag: Int, ref: Int, score: Double) { val index = TriangularArray.index(begin, end) if (score != 0.0) { @@ -239,7 +235,7 @@ object ParserChartConstraintsFactory { case class PruningStatistics(data: Array[Double], nConstructed: Double, pruningCounts: DenseVector[Double]) { def merge(other: PruningStatistics, nAllowed:Int = data.length): PruningStatistics = { - if(nAllowed >= data.length + other.data.length) { + if (nAllowed >= data.length + other.data.length) { PruningStatistics(data ++ other.data, this.nConstructed + other.nConstructed, pruningCounts + other.pruningCounts) } else { val subsetThisSize = new Binomial(nAllowed, nConstructed/(other.nConstructed + nConstructed)).draw() @@ -256,9 +252,6 @@ object ParserChartConstraintsFactory { } - - - /** * Object for creating [[epic.constraints.CachedChartConstraintsFactory]] * from a parser and prepopulating it with the contents of a treebank. @@ -285,7 +278,7 @@ object PrecacheConstraints extends LazyLogging { treebank.devTrees.par.foreach { ti => logger.info(s"Ensuring existing constraint for dev tree ${ti.id} ${ti.words}") val constraints = cached.constraints(ti.words) - if(verifyNoGoldPruningInTrain) + if (verifyNoGoldPruningInTrain) checkConstraints(ti.copy(tree = ti.tree.map(_.baseAnnotatedLabel)), constraints, constrainer) } treebank.testTrees.par.foreach { ti => @@ -295,7 +288,6 @@ object PrecacheConstraints extends LazyLogging { cached } - /** * Method for creating [[epic.constraints.CachedChartConstraintsFactory]] * from a parser and prepopulating it with constraints for a training set. @@ -315,16 +307,16 @@ object PrecacheConstraints extends LazyLogging { logger.info(s"Building constraints for ${ti.id} ${ti.words}") constrainer.constraints(ti.words) }) - if(located) { + if (located) { logger.info(s"Already had constraints for ${ti.id} ${ti.words}.") - } else if(verifyNoGoldPruning) { + } else if (verifyNoGoldPruning) { checkConstraints(ti, constraints, constrainer) } val count: Int = parsed.incrementAndGet() - if(count % 10 == 0) { + if (count % 10 == 0) { logger.info("Pruning statistics so far: " + constrainer.overallStatistics) } - if(count % 100 == 0) { + if (count % 100 == 0) { logger.info(s"Parsed $count/$len.") } @@ -337,7 +329,6 @@ object PrecacheConstraints extends LazyLogging { new CachedChartConstraintsFactory(constrainer, cache) } - def checkConstraints[W, L](ti: TreeInstance[L, W], constraints: ChartConstraints[L], constrainer: ParserChartConstraintsFactory[L, W]) { // val decoded = new ViterbiDecoder[L, W].extractBestParse(marg) var printTree = true diff --git a/src/main/scala/epic/parser/projections/EnumeratedAnchoring.scala b/src/main/scala/epic/parser/projections/EnumeratedAnchoring.scala index 60fd88a8..0757fb4f 100644 --- a/src/main/scala/epic/parser/projections/EnumeratedAnchoring.scala +++ b/src/main/scala/epic/parser/projections/EnumeratedAnchoring.scala @@ -31,12 +31,12 @@ case class AnchoredPCFGProjector[L, W](threshold: Double = Double.NegativeInfini type MyAnchoring = EnumeratedAnchoring[L, W] private def normalize(grammar: RuleTopology[L], ruleScores: OpenAddressHashArray[Double], totals: OpenAddressHashArray[Double]):OpenAddressHashArray[Double] = { - if(ruleScores eq null) null + if (ruleScores eq null) null else { val r = new OpenAddressHashArray[Double](ruleScores.length, Double.NegativeInfinity, ruleScores.activeSize) for( (rule, score) <- ruleScores.activeIterator) { val parent = grammar.parent(rule) - if(score > 0) + if (score > 0) r(rule) = math.log(score) - math.log(totals(parent)) } r @@ -44,7 +44,7 @@ case class AnchoredPCFGProjector[L, W](threshold: Double = Double.NegativeInfini } private def logify(ruleScores: OpenAddressHashArray[Double]):OpenAddressHashArray[Double] = { - if(ruleScores eq null) null + if (ruleScores eq null) null else { val r = new OpenAddressHashArray[Double](ruleScores.length, Double.NegativeInfinity, ruleScores.activeSize) for( (rule, score) <- ruleScores.activeIterator) { @@ -61,7 +61,7 @@ case class AnchoredPCFGProjector[L, W](threshold: Double = Double.NegativeInfini } val normBinaries:Array[Array[OpenAddressHashArray[Double]]] = for ((splits, totals) <- binaryScores zip totalsBinaries) yield { - if(splits eq null) null + if (splits eq null) null else for(ruleScores <- splits) yield normalize(charts.topology, ruleScores, totals) } val sparsity = charts.anchoring.sparsityPattern @@ -70,7 +70,6 @@ case class AnchoredPCFGProjector[L, W](threshold: Double = Double.NegativeInfini } - /** * Creates anchorings for a set of trees from some parser using p(rule | sentence) marginals. * @author dlwh @@ -78,7 +77,7 @@ case class AnchoredPCFGProjector[L, W](threshold: Double = Double.NegativeInfini @SerialVersionUID(469174684243960202L) case class AnchoredRuleMarginalProjector[L, W](threshold: Double = Double.NegativeInfinity) extends ChartProjector[L, W] { private def normalize(ruleScores: OpenAddressHashArray[Double]):OpenAddressHashArray[Double] = { - if(ruleScores eq null) null + if (ruleScores eq null) null else { val r = new OpenAddressHashArray[Double](ruleScores.length, Double.NegativeInfinity, ruleScores.activeSize) for( (rule, score) <- ruleScores.activeIterator) { @@ -97,7 +96,7 @@ case class AnchoredRuleMarginalProjector[L, W](threshold: Double = Double.Negati val normUnaries:Array[OpenAddressHashArray[Double]] = unaryScores.map(normalize) val normBinaries:Array[Array[OpenAddressHashArray[Double]]] = for (splits <- binaryScores) yield { - if(splits eq null) null + if (splits eq null) null else splits.map(normalize) } val sparsity = charts.anchoring.sparsityPattern @@ -127,7 +126,6 @@ case class EnumeratedAnchoring[L, W](topology: RuleTopology[L], override def addConstraints(cs: ChartConstraints[L]): UnrefinedGrammarAnchoring[L, W] = copy(sparsityPattern = sparsityPattern & cs) - /** * Computes the pointwise division of two grammars, augmenting * their refinement space to reflect this. If they share the same annotationTag, @@ -143,7 +141,6 @@ case class EnumeratedAnchoring[L, W](topology: RuleTopology[L], } } - /** * Computes the point-wise division of this grammar with some other grammar. * @@ -156,12 +153,11 @@ case class EnumeratedAnchoring[L, W](topology: RuleTopology[L], case that: EnumeratedAnchoring[L, W] => EnumeratedAnchoring.divide(this, that) case _ => super./(other) } - } def scoreUnaryRule(begin: Int, end: Int, rule: Int) = { val forSpan = unaryScores(TriangularArray.index(begin, end)) - if(forSpan eq null) Double.NegativeInfinity + if (forSpan eq null) Double.NegativeInfinity else forSpan(rule) } @@ -169,24 +165,22 @@ case class EnumeratedAnchoring[L, W](topology: RuleTopology[L], val ti = TriangularArray.index(begin, end) val forSpan = binaryScores(ti) val cached = checkCache(split, rule, ti) - if(!java.lang.Double.isNaN(cached)) { + if (!java.lang.Double.isNaN(cached)) { cached - } else if(forSpan eq null) { + } else if (forSpan eq null) { Double.NegativeInfinity } else { val forSplit = forSpan(split - begin) - val result = if(forSplit eq null) Double.NegativeInfinity + val result = if (forSplit eq null) Double.NegativeInfinity else forSplit(rule) - updateCache(split, rule, ti, result) - result } } def scoreSpan(begin: Int, end: Int, tag: Int): Double = { val scores = spanScores(TriangularArray.index(begin, end)) - if(scores ne null) scores(tag) + if (scores ne null) scores(tag) else Double.NegativeInfinity } @@ -196,7 +190,7 @@ case class EnumeratedAnchoring[L, W](topology: RuleTopology[L], private def checkCache(splitPoint: Int, rule: Int, ti: Int) = { val crule = cache(splitPoint * 4) val cti = cache(splitPoint * 4 + 1) - if(rule == crule && cti == ti) { + if (rule == crule && cti == ti) { java.lang.Double.longBitsToDouble(Span(cache(splitPoint * 4 + 2), cache(splitPoint * 4 + 3)).encoded) } else { Double.NaN @@ -218,7 +212,7 @@ object EnumeratedAnchoring { val newSpanScores = Array.tabulate(a.spanScores.length) { i => val oldA = a.spanScores(i) val oldB = b.spanScores(i) - if(null == oldA || null == oldB) { + if (null == oldA || null == oldB) { null } else { doDivide(oldA, oldB) @@ -228,24 +222,23 @@ object EnumeratedAnchoring { val newUnaryScores = Array.tabulate(a.unaryScores.length) { i => val oldA = a.unaryScores(i) val oldB = b.unaryScores(i) - if(null == oldA || null == oldB) { + if (null == oldA || null == oldB) { null } else { doDivide(oldA, oldB) } } - val newBinaryScores = Array.tabulate(a.binaryScores.length) { i => val aArray = a.binaryScores(i) val bArray = b.binaryScores(i) - if(null == aArray || null == bArray) { + if (null == aArray || null == bArray) { null } else { Array.tabulate(aArray.length) { split => val oldA = aArray(split) val oldB = bArray(split) - if(null == oldA || null == oldB) { + if (null == oldA || null == oldB) { null } else { doDivide(oldA, oldB) @@ -259,21 +252,19 @@ object EnumeratedAnchoring { } private def doDivide(a: OpenAddressHashArray[Double], b: OpenAddressHashArray[Double]) = { - if(a == null || b == null) { + if (a == null || b == null) { null } else { val oah = new OpenAddressHashArray[Double](a.size, a.default, a.activeSize min b.activeSize) - var off = 0 - while(off < a.iterableSize) { - if(a.isActive(off)) { + while (off < a.iterableSize) { + if (a.isActive(off)) { val aa = a.valueAt(off) val ii = a.indexAt(off) val bb = b(ii) - if(aa != Double.NegativeInfinity && bb != Double.NegativeInfinity) { + if (aa != Double.NegativeInfinity && bb != Double.NegativeInfinity) { oah(ii) = aa - bb } - } off += 1 } diff --git a/src/main/scala/epic/parser/projections/GoldTagPolicy.scala b/src/main/scala/epic/parser/projections/GoldTagPolicy.scala index 3ac8a080..cf560e28 100644 --- a/src/main/scala/epic/parser/projections/GoldTagPolicy.scala +++ b/src/main/scala/epic/parser/projections/GoldTagPolicy.scala @@ -27,7 +27,7 @@ import breeze.collection.mutable.TriangularArray * @tparam L */ trait GoldTagPolicy[L] { - def isGoldSpan(start: Int, end: Int):Boolean + def isGoldSpan(start: Int, end: Int): Boolean def isGoldTopTag(start: Int, end: Int, tag: Int): Boolean def isGoldBotTag(start: Int, end: Int, tag: Int): Boolean } @@ -36,16 +36,16 @@ object GoldTagPolicy { def noGoldTags[L]:GoldTagPolicy[L] = new GoldTagPolicy[L] { def isGoldTopTag(start: Int, end: Int, tag: Int): Boolean = false def isGoldBotTag(start: Int, end: Int, tag: Int): Boolean = false - def isGoldSpan(start: Int, end: Int):Boolean = false + def isGoldSpan(start: Int, end: Int): Boolean = false } def goldTreeForcing[L](trees: BinarizedTree[Int]*):GoldTagPolicy[L] ={ val goldTop = TriangularArray.raw(trees.head.span.end+1,collection.mutable.BitSet()) val goldBot = TriangularArray.raw(trees.head.span.end+1,collection.mutable.BitSet()) for(tree <- trees) { - if(tree != null) { + if (tree != null) { for( t <- tree.allChildren if t.label != -1) { - if(t.children.size == 1) + if (t.children.size == 1) goldTop(TriangularArray.index(t.span.begin,t.span.end)) += t.label else goldBot(TriangularArray.index(t.span.begin,t.span.end)) += t.label @@ -53,7 +53,7 @@ object GoldTagPolicy { } } new GoldTagPolicy[L] { - def isGoldSpan(start: Int, end: Int):Boolean = { + def isGoldSpan(start: Int, end: Int): Boolean = { val set = goldTop(TriangularArray.index(start,end)) set != null && set.nonEmpty } diff --git a/src/main/scala/epic/parser/projections/GrammarRefinements.scala b/src/main/scala/epic/parser/projections/GrammarRefinements.scala index 4e7ef8b4..91c8e15c 100644 --- a/src/main/scala/epic/parser/projections/GrammarRefinements.scala +++ b/src/main/scala/epic/parser/projections/GrammarRefinements.scala @@ -69,19 +69,15 @@ final case class GrammarRefinements[C, F](labels: ProjectionIndexer[C, F], rules } /** Gives the localized refinement of each parent */ - def parentRefinement(r: Int, ref: Int):Int = parentRefinements(r)(ref) + def parentRefinement(r: Int, ref: Int): Int = parentRefinements(r)(ref) private val parentRefinements: Array[Array[Int]] = Array.tabulate(rules.coarseIndex.size) { r => val parent = labels.coarseIndex(rules.coarseIndex.get(r).parent) - rules.refinementsOf(r).map { ref => labels.localize(rules.fineIndex.get(ref).parent)._2 } - - } - // rule -> parentRef -> [ruleRef] private val parentCompatibleRefinements: Array[Array[Array[Int]]] = Array.tabulate(rules.coarseIndex.size) { r => val parent = labels.coarseIndex(rules.coarseIndex.get(r).parent) @@ -94,7 +90,7 @@ final case class GrammarRefinements[C, F](labels: ProjectionIndexer[C, F], rules } private val leftChildCompatibleRefinements: Array[Array[Array[Int]]] = Array.tabulate(rules.coarseIndex.size) { r => - if(rules.coarseIndex.get(r).isInstanceOf[UnaryRule[C]]) { + if (rules.coarseIndex.get(r).isInstanceOf[UnaryRule[C]]) { null } else { val leftChild = labels.coarseIndex(rules.coarseIndex.get(r).asInstanceOf[BinaryRule[C]].left) @@ -108,10 +104,9 @@ final case class GrammarRefinements[C, F](labels: ProjectionIndexer[C, F], rules } private val rightChildCompatibleRefinements: Array[Array[Array[Int]]] = Array.tabulate(rules.coarseIndex.size) { r => - if(rules.coarseIndex.get(r).isInstanceOf[UnaryRule[C]]) { + if (rules.coarseIndex.get(r).isInstanceOf[UnaryRule[C]]) { null } else { - val rightChild = labels.coarseIndex(rules.coarseIndex.get(r).asInstanceOf[BinaryRule[C]].right) val rightChildRefs = Array.fill(labels.refinementsOf(rightChild).length){ArrayBuffer[Int]()} for(ruleRef <- rules.refinementsOf(r)) { @@ -124,7 +119,7 @@ final case class GrammarRefinements[C, F](labels: ProjectionIndexer[C, F], rules // rule -> parentRef -> [ruleRef] private val childCompatibleRefinements: Array[Array[Array[Int]]] = Array.tabulate(rules.coarseIndex.size) { r => - if(rules.coarseIndex.get(r).isInstanceOf[UnaryRule[C]]) { + if (rules.coarseIndex.get(r).isInstanceOf[UnaryRule[C]]) { val child = labels.coarseIndex(rules.coarseIndex.get(r).asInstanceOf[UnaryRule[C]].child) val childRefs = Array.fill(labels.refinementsOf(child).length){ArrayBuffer[Int]()} for(ruleRef <- rules.refinementsOf(r)) { @@ -138,14 +133,13 @@ final case class GrammarRefinements[C, F](labels: ProjectionIndexer[C, F], rules } private val coarseRulesGivenParentRefinement = Array.tabulate(labels.coarseIndex.size) { p => - // refinement -> rules + // refinement -> rules val result = Array.fill(labels.refinementsOf(p).length)(ArrayBuffer[Int]()) for( (rule, r) <- rules.coarseIndex.pairs if labels.coarseIndex(rule.parent) == p && rule.isInstanceOf[BinaryRule[_]]; ref <- result.indices) { - if(parentCompatibleRefinements(r)(ref).nonEmpty) { + if (parentCompatibleRefinements(r)(ref).nonEmpty) { result(ref) += r } } - result.map(_.toArray) } @@ -156,7 +150,7 @@ final case class GrammarRefinements[C, F](labels: ProjectionIndexer[C, F], rules } private val leftChildRefinementsGivenCoarseRule:Array[Array[Int]] = Array.tabulate(rules.coarseIndex.size) { r => - if(rules.coarseIndex.get(r).isInstanceOf[UnaryRule[_]]) Array.empty + if (rules.coarseIndex.get(r).isInstanceOf[UnaryRule[_]]) Array.empty else { def fineLeftChild(r: Int) = labels.fineIndex(rules.fineIndex.get(r).asInstanceOf[BinaryRule[F]].left) rules.refinementsOf(r).map(fineLeftChild).toSet.toArray.map(labels.localize).sorted @@ -164,7 +158,7 @@ final case class GrammarRefinements[C, F](labels: ProjectionIndexer[C, F], rules } private val rightChildRefinementsGivenCoarseRule:Array[Array[Int]] = Array.tabulate(rules.coarseIndex.size) { r => - if(rules.coarseIndex.get(r).isInstanceOf[UnaryRule[_]]) Array.empty + if (rules.coarseIndex.get(r).isInstanceOf[UnaryRule[_]]) Array.empty else { def fineRightChild(r: Int) = labels.fineIndex(rules.fineIndex.get(r).asInstanceOf[BinaryRule[F]].right) rules.refinementsOf(r).map(fineRightChild).toSet.toArray.map(labels.localize).sorted diff --git a/src/main/scala/epic/parser/projections/LabeledSpanProjector.scala b/src/main/scala/epic/parser/projections/LabeledSpanProjector.scala index ad347718..9369f4ba 100644 --- a/src/main/scala/epic/parser/projections/LabeledSpanProjector.scala +++ b/src/main/scala/epic/parser/projections/LabeledSpanProjector.scala @@ -29,14 +29,14 @@ case class LabeledSpanProjector[L, W](topology: RuleTopology[L], threshold: Doub type MyAnchoring = SpanAnchoring[L, W] private def normalize(ruleScores: OpenAddressHashArray[Double], totals: OpenAddressHashArray[Double]):OpenAddressHashArray[Double] = { - if(ruleScores eq null) null + if (ruleScores eq null) null else { val r = new OpenAddressHashArray[Double](ruleScores.length, Double.NegativeInfinity) for( (rule, score) <- ruleScores.activeIterator) { val parent = topology.parent(rule) - if(score > 0.9999999) { + if (score > 0.9999999) { r(rule) = 10 - } else if(score > 0) { + } else if (score > 0) { r(rule) = math.log(score) - math.log1p(-score) } } @@ -45,13 +45,13 @@ case class LabeledSpanProjector[L, W](topology: RuleTopology[L], threshold: Doub } private def normalizeSpans(totals: OpenAddressHashArray[Double]):OpenAddressHashArray[Double] = { - if(totals eq null) null + if (totals eq null) null else { val r = new OpenAddressHashArray[Double](totals.length, Double.NegativeInfinity) for( (parent, score) <- totals.activeIterator) { - if(score > 0.9999999) { + if (score > 0.9999999) { r(parent) = 10 - } else if(score > 0) { + } else if (score > 0) { r(parent) = math.log(score) - math.log1p(-score) } } @@ -97,13 +97,13 @@ case class SpanAnchoring[L, W](topology: RuleTopology[L], def scoreUnaryRule(begin: Int, end: Int, rule: Int) = { val forSpan = unaryScores(TriangularArray.index(begin, end)) - if(forSpan eq null) Double.NegativeInfinity + if (forSpan eq null) Double.NegativeInfinity else forSpan(rule) } def scoreSpan(begin: Int, end: Int, tag: Int) = { val scores = spanScores(TriangularArray.index(begin, end)) - if(scores ne null) scores(tag) + if (scores ne null) scores(tag) else Double.NegativeInfinity } } diff --git a/src/main/scala/epic/parser/projections/OracleParser.scala b/src/main/scala/epic/parser/projections/OracleParser.scala index 7ec154eb..4a2b3aeb 100644 --- a/src/main/scala/epic/parser/projections/OracleParser.scala +++ b/src/main/scala/epic/parser/projections/OracleParser.scala @@ -52,7 +52,7 @@ class OracleParser[L, L2, W](val grammar: SimpleGrammar[L, L2, W], backupGrammar val projectedTree: BinarizedTree[L] = tree.map(grammar.refinements.labels.project) cache.getOrElseUpdate(words, { val treeconstraints = ChartConstraints.fromTree(grammar.topology.labelIndex, projectedTree) - if(constraints.top.containsAll(treeconstraints.top) && constraints.bot.containsAll(treeconstraints.bot)) { + if (constraints.top.containsAll(treeconstraints.top) && constraints.bot.containsAll(treeconstraints.bot)) { synchronized(total += 1) tree } else try { @@ -97,7 +97,6 @@ class OracleParser[L, L2, W](val grammar: SimpleGrammar[L, L2, W], backupGrammar throw ex } - def makeGoldPromotingAnchoring(grammar: SimpleGrammar[L, L2, W], w: IndexedSeq[W], tree: BinarizedTree[L2], @@ -118,7 +117,6 @@ class OracleParser[L, L2, W](val grammar: SimpleGrammar[L, L2, W], backupGrammar makeGoldPromotingAnchoring(grammar, w, tree, treeconstraints, constraints & cs) } - override def sparsityPattern: ChartConstraints[L] = constraints def scoreBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int): Double = { @@ -148,7 +146,6 @@ class OracleParser[L, L2, W](val grammar: SimpleGrammar[L, L2, W], backupGrammar } } - } def oracleMarginalFactory(trees: IndexedSeq[TreeInstance[L2, W]]):ParseMarginal.Factory[L, W] = new ParseMarginal.Factory[L, W] { @@ -227,7 +224,6 @@ object OracleParser { case e: Exception => e.printStackTrace() } - val name = params.name println("Parser " + name) @@ -239,7 +235,6 @@ object OracleParser { println(stats) } - } } diff --git a/src/main/scala/epic/parser/projections/ProjectingCoreGrammar.scala b/src/main/scala/epic/parser/projections/ProjectingCoreGrammar.scala index f7ea0062..84be4b32 100644 --- a/src/main/scala/epic/parser/projections/ProjectingCoreGrammar.scala +++ b/src/main/scala/epic/parser/projections/ProjectingCoreGrammar.scala @@ -25,12 +25,10 @@ import epic.constraints.ChartConstraints case class ProjectingCoreGrammar[L, W](parser: Parser[L, W], projector: ChartProjector[L, W]) extends Grammar[L, W] { - def topology = parser.topology def lexicon = parser.lexicon - override def withPermissiveLexicon: Grammar[L, W] = { ??? } @@ -46,4 +44,3 @@ case class ProjectingCoreGrammar[L, W](parser: Parser[L, W], } } - diff --git a/src/main/scala/epic/parser/projections/ProjectionIndexer.scala b/src/main/scala/epic/parser/projections/ProjectionIndexer.scala index e60b5d43..e9e837d2 100644 --- a/src/main/scala/epic/parser/projections/ProjectionIndexer.scala +++ b/src/main/scala/epic/parser/projections/ProjectionIndexer.scala @@ -41,37 +41,37 @@ final class ProjectionIndexer[C, F] private (val coarseIndex: Index[C], Array.range(0, arr.length) } - def localize(f: Int):Int = localizationArray(f) - def globalize(c: Int, f: Int):Int = globalRefinements(c)(f) + def localize(f: Int): Int = localizationArray(f) + def globalize(c: Int, f: Int): Int = globalRefinements(c)(f) def globalize(c: C, f: Int):F = fineIndex.get(globalRefinements(coarseIndex(c))(f)) def indexAndLocalize(f: F):(Int, Int) = { val glob = fineIndex(f) - if(glob < 0) (-1, -1) + if (glob < 0) (-1, -1) else project(glob) -> localize(glob) } - def localize(f: F):(C, Int) = { val i = fineIndex(f) - if(i < 0) throw new RuntimeException(s"Not in fine index: $f") + if (i < 0) throw new RuntimeException(s"Not in fine index: $f") coarseIndex.get(indexedProjections(i)) -> localizationArray(i) } def refinementsOf(c: Int):Array[Int] = globalRefinements(c) + def localRefinements(c: Int):Array[Int] = perSymbolRefinements(c) def numRefinements(c: Int): Int = perSymbolRefinements(c).length def refinementsOf(c: C):IndexedSeq[F] = { val ci = coarseIndex(c) - if(ci < 0) throw new RuntimeException("Not a coarse symbol: " + c) + if (ci < 0) throw new RuntimeException("Not a coarse symbol: " + c) globalRefinements(ci).map(fineIndex.get _) } /** * Computes the projection of the indexed fine label f to an indexed coarse label. */ - def project(f: Int):Int = indexedProjections(f) + def project(f: Int): Int = indexedProjections(f) def project(f: F):C = coarseIndex.get(project(fineIndex(f))) @@ -93,7 +93,6 @@ final class ProjectionIndexer[C, F] private (val coarseIndex: Index[C], coarseIndex.map(x => x -> refinementsOf(x)).mkString("ProjectionIndexer(", ", ", ")") } - def localizeArray[T:ClassTag](array: Array[T]):Array[Array[T]] = { require(array.length == fineIndex.size) Array.tabulate(coarseIndex.size) { c => @@ -110,8 +109,8 @@ object ProjectionIndexer { val indexedProjections = Encoder.fromIndex(fineIndex).fillArray(-1) for( (l, idx) <- fineIndex.zipWithIndex) { val projectedIdx = coarseIndex(proj(l)) - if(projectedIdx < 0) { - if(!skipMissingCoarse) + if (projectedIdx < 0) { + if (!skipMissingCoarse) throw new RuntimeException("error while indexing" + l + " to " + proj(l) + fineIndex(l)) } else { indexedProjections(idx) = projectedIdx @@ -130,7 +129,6 @@ object ProjectionIndexer { } } new ProjectionIndexer(coarseIndex, fineIndex, indexedProjections) - } def fromSplitter[C, F](coarseIndex: Index[C], split: C=>Seq[F]) = { @@ -141,7 +139,6 @@ object ProjectionIndexer { indexedProjections += cf } new ProjectionIndexer(coarseIndex, fineIndex, indexedProjections.toArray) - } } diff --git a/src/main/scala/epic/parser/repl/DSLGrammar.scala b/src/main/scala/epic/parser/repl/DSLGrammar.scala index e22563fe..46537bd2 100644 --- a/src/main/scala/epic/parser/repl/DSLGrammar.scala +++ b/src/main/scala/epic/parser/repl/DSLGrammar.scala @@ -38,8 +38,6 @@ object DSLGrammar { case DSLLex(a, word, w) => lexicon(a, word) = w } - - val grammar = RuleTopology("S", binaryProductions, unaryProductions) val unsmoothed = new UnsmoothedLexicon(grammar.labelIndex, lexicon.keySet.toSet) Grammar.generative(grammar, unsmoothed, binaryProductions, unaryProductions, lexicon) diff --git a/src/main/scala/epic/preprocess/JavaWordTokenizer.scala b/src/main/scala/epic/preprocess/JavaWordTokenizer.scala index ab27eb61..0d7a068b 100644 --- a/src/main/scala/epic/preprocess/JavaWordTokenizer.scala +++ b/src/main/scala/epic/preprocess/JavaWordTokenizer.scala @@ -34,7 +34,6 @@ import epic.slab.Sentence class JavaWordTokenizer(locale: Locale) extends Tokenizer { def this() = this(Locale.getDefault) - override def apply[In <: Sentence](slab: StringSlab[In]): StringSlab[In with Token] = { slab.addLayer[Token](slab.iterator[Sentence].flatMap { s => val breaker = BreakIterator.getWordInstance(locale) diff --git a/src/main/scala/epic/preprocess/MLSentenceSegmenter.scala b/src/main/scala/epic/preprocess/MLSentenceSegmenter.scala index 626ecfba..a604c39a 100644 --- a/src/main/scala/epic/preprocess/MLSentenceSegmenter.scala +++ b/src/main/scala/epic/preprocess/MLSentenceSegmenter.scala @@ -27,9 +27,9 @@ class MLSentenceSegmenter(inf: MLSentenceSegmenter.ClassificationInference) exte slab.addLayer[Sentence]( Iterators.fromProducer { def rec():Option[(Span, Sentence)] = { - if(iter.hasNext) { + if (iter.hasNext) { val pos = iter.next() - if(!iter.hasNext || inf.classify(MLSentenceSegmenter.featuresForEndPointDetection(text, pos))) { + if (!iter.hasNext || inf.classify(MLSentenceSegmenter.featuresForEndPointDetection(text, pos))) { val res = Some(Span(lastOffset, math.min(pos + 1, text.length)) -> Sentence()) lastOffset = pos + 1 res @@ -43,7 +43,6 @@ class MLSentenceSegmenter(inf: MLSentenceSegmenter.ClassificationInference) exte rec() }.filterNot(s => text.substring(s._1.begin, s._1.end).forall(_.isWhitespace)) ) - } override def toString = "MLSentenceSegmenter(...)" } @@ -57,7 +56,7 @@ object MLSentenceSegmenter { val oin = new ObjectInputStream(new GZIPInputStream(strm)) oin.readObject().asInstanceOf[MLSentenceSegmenter] } finally { - if(strm != null) + if (strm != null) strm.close() } } @@ -67,11 +66,11 @@ object MLSentenceSegmenter { breeze.util.readObject[MLSentenceSegmenter](file) } - def nextPotentialSentenceBoundary(text: String, offset: Int):Int = { + def nextPotentialSentenceBoundary(text: String, offset: Int): Int = { var start = offset + 1 while (start < text.length) { val codepoint = text.codePointAt(start) - if(isPotentialSentenceBoundary(text, start, codepoint)) { + if (isPotentialSentenceBoundary(text, start, codepoint)) { return start } start += Character.charCount(codepoint) @@ -80,15 +79,13 @@ object MLSentenceSegmenter { } def codepointToString(cp: Int) = { - if(Character.charCount(cp) == 1 && !Character.isISOControl(cp) && !Character.isSpaceChar(cp)) { + if (Character.charCount(cp) == 1 && !Character.isISOControl(cp) && !Character.isSpaceChar(cp)) { cp.toChar.toString } else { Character.getName(cp) } - } - case class CodePointFeature(cp: String, offset: Int = 0) extends Feature case class NextRealLetterFeature(ct: Int) extends Feature { override def toString = { @@ -115,7 +112,7 @@ object MLSentenceSegmenter { case class JavaDistFeature(x: Int) extends Feature case object LineIsShortFeature extends Feature - private def stringForCharType(ct: Int):String = { + private def stringForCharType(ct: Int): String = { val characterClass = Class.forName("java.lang.Character") val fields = characterClass.getDeclaredFields() for (f <- fields) { @@ -136,16 +133,16 @@ object MLSentenceSegmenter { Array(BiasFeature, EOFFeature) } else { val buf = new ArrayBuffer[Feature] -// val break = BreakIterator.getSentenceInstance -// break.setText(text) -// val pos = break.following(math.max(offset - 3, 0)) -// buf += JavaDistFeature(math.min(pos - offset, 5)) + // val break = BreakIterator.getSentenceInstance + // break.setText(text) + // val pos = break.following(math.max(offset - 3, 0)) + // buf += JavaDistFeature(math.min(pos - offset, 5)) buf += BiasFeature // baseline features for the current char val curCharFeatures: IndexedSeq[Feature] = addCharFeatures(text, offset, 0) buf ++= curCharFeatures - if(previousLineIsShort(text, offset)) { + if (previousLineIsShort(text, offset)) { buf += LineIsShortFeature for(m <- curCharFeatures) { buf += CrossProductFeature(LineIsShortFeature, m) @@ -176,34 +173,31 @@ object MLSentenceSegmenter { buf += CrossProductFeature(f1, CrossProductFeature(fmid, f2)) } - for(f1 <- addCharFeatures(text, offset, -1); f2 <- addCharFeatures(text, offset, 2)) { buf += CrossProductFeature(f1, f2) } - val prevSpace = math.max(text.lastIndexWhere(!_.isLetterOrDigit, offset - 2), -1) // -1 is ok, assume BOS is space buf += ContextWord(text.substring(prevSpace + 1, offset)) buf += LastWordLength(offset - prevSpace) val nextNotSpace = text.indexWhere(c => !c.isSpaceChar && !c.isControl, offset + 1) - if(nextNotSpace >= 0) { + if (nextNotSpace >= 0) { val nextWordEnd = if (text.charAt(nextNotSpace).isLetterOrDigit){ text.indexWhere(c => !c.isLetterOrDigit, nextNotSpace + 1) } else { text.indexWhere(c => Character.getType(c) != text.charAt(nextNotSpace), nextNotSpace + 1) } buf += ContextWord(text.substring(prevSpace + 1, prevSpace + 2)+"--" + text.substring(nextNotSpace, nextNotSpace + 1), -3) -// if(nextWordEnd >= 0) { -// buf += ContextWord(text.substring(nextNotSpace, nextWordEnd), 1) -// } + // if (nextWordEnd >= 0) { + // buf += ContextWord(text.substring(nextNotSpace, nextWordEnd), 1) + // } } val nextLetterPos = text.indexWhere(_.isLetterOrDigit, offset + 1) - if(nextLetterPos >= 0) { + if (nextLetterPos >= 0) { buf += NextRealLetterFeature(Character.getType(text.charAt(nextLetterPos))) } - buf += SurroundingCharFeature(if (offset == 0) "BOS" else codepointToString(text.codePointBefore(offset)), if (nextNotSpace < 0) "EOS" else codepointToString(text.codePointAt(nextNotSpace))) @@ -215,15 +209,13 @@ object MLSentenceSegmenter { buf.toArray } - } - def addCharFeatures(text: String, base: Int, rel: Int): IndexedSeq[Feature] = { val buf = new ArrayBuffer[Feature] - val next = try {text.offsetByCodePoints(base, rel)} catch { case ex: IndexOutOfBoundsException => if(rel > 0) text.length else 0} + val next = try {text.offsetByCodePoints(base, rel)} catch { case ex: IndexOutOfBoundsException => if (rel > 0) text.length else 0} val (cp, cps) = - if(next < 0 || next >= text.length) { + if (next < 0 || next >= text.length) { 0 -> "###" } else { val cp = text.codePointAt(next) @@ -242,12 +234,11 @@ object MLSentenceSegmenter { case Character.OTHER_PUNCTUATION if ch == '\'' || ch == '"' => true case _ => false } - } // http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/SentenceBreakProperty.txt // http://www.unicode.org/reports/tr29/#Sentence_Boundaries - def isPotentialSentenceBoundary(text: String, offset: Int, codepoint: Int):Boolean = { + def isPotentialSentenceBoundary(text: String, offset: Int, codepoint: Int): Boolean = { Character.getType(codepoint) match { case Character.OTHER_PUNCTUATION => codepoint != ',' && isProbablyNotContraction(text, offset, codepoint, '\'') case Character.INITIAL_QUOTE_PUNCTUATION => true @@ -265,7 +256,7 @@ object MLSentenceSegmenter { } case Character.CONTROL => isControl(codepoint) && (offset == 0 || -// !isPotentialSentenceBoundary(text, offset - Character.charCount(codepoint), text.codePointBefore(offset)) + // !isPotentialSentenceBoundary(text, offset - Character.charCount(codepoint), text.codePointBefore(offset)) text.codePointBefore(offset)!= ',' && (offset == text.length - 1 || isControl(text.codePointAt(offset + 1)) || previousLineIsShort(text, offset) || Character.isUpperCase(text.codePointAt(offset + 1))) ) @@ -275,18 +266,15 @@ object MLSentenceSegmenter { } - def isControl(codepoint: Int): Boolean = { codepoint == '\r' || codepoint == '\n' || codepoint == '\t' } - def previousLineIsShort(s: String, pos: Int): Boolean = { val SHORT_LINE = 35 // in characters (pos - s.lastIndexOf('\n', pos - 1) ) < SHORT_LINE } - def isProbablyNotContraction(text: String, offset: Int, codepoint: Int, quote: Char): Boolean = { codepoint != quote || offset >= text.length - 1 || offset == 0 || !Character.isLetterOrDigit(text.codePointAt(offset + 1)) || !Character.isLetterOrDigit(text.codePointBefore(offset)) } @@ -309,43 +297,41 @@ object MLSentenceSegmenter { var lastSpan = Span(0, 0) val mapped = for(s@Span(begin, _p) <- endPoints if !lastSpan.crosses(s) && !lastSpan.contains(s)) yield { var p = math.max(_p, 0) - var cp = text.codePointAt(p) - if(p > 0 && !Character.isSpaceChar(cp) && !isPotentialSentenceBoundary(text, p, cp)) { + if (p > 0 && !Character.isSpaceChar(cp) && !isPotentialSentenceBoundary(text, p, cp)) { p -= Character.charCount(cp) cp = text.codePointAt(p) } var earliestControlChar = p val nextNonSpacePos = text.indexWhere(!_.isSpaceChar, p) - if(nextNonSpacePos > p) { + if (nextNonSpacePos > p) { val ccp = text.charAt(nextNonSpacePos) if (ccp == '\n' || ccp == '\t' || ccp == '\r') { earliestControlChar = nextNonSpacePos } } - while(p > 0 && (Character.isSpaceChar(cp) || cp == '\n' || cp == '\t' || cp == '\r')) { - if(!Character.isSpaceChar(cp)) { + while (p > 0 && (Character.isSpaceChar(cp) || cp == '\n' || cp == '\t' || cp == '\r')) { + if (!Character.isSpaceChar(cp)) { earliestControlChar = p } p -= Character.charCount(cp) cp = text.codePointAt(p) } - - if(!isPotentialSentenceBoundary(text, p, cp)) { + if (!isPotentialSentenceBoundary(text, p, cp)) { p += Character.charCount(cp) cp = text.codePointAt(p) } - if(Character.isSpaceChar(cp) && p < text.length) { + if (Character.isSpaceChar(cp) && p < text.length) { p = earliestControlChar cp = text.codePointAt(p) } - if(lastSpan.crosses(s) || lastSpan.contains(s)) { + if (lastSpan.crosses(s) || lastSpan.contains(s)) { println(text.substring(lastSpan.begin, lastSpan.end)) println(text.substring(s.begin, s.end)) println(text.charAt(p)) @@ -382,7 +368,6 @@ object MLSentenceSegmenter { def main(args: Array[String]):Unit = { val mascDir = new File(args(0)) - var sentenceBoundaryProblems = for(dir <- new File(new File(mascDir,"data"), "written").listFiles() if !dir.toString.contains("twitter") && dir.isDirectory; f <- dir.listFiles(new FilenameFilter { @@ -391,19 +376,18 @@ object MLSentenceSegmenter { val slab = MascSlab(f.toURI.toURL) val slabWithSentences = MascSlab.s(slab) - val guessPoints: IndexedSeq[Int] = potentialSentenceBoundariesIterator(slabWithSentences.content).toIndexedSeq val text = slab.content val goldPoints = adjustGoldSentenceBoundaries(text, slabWithSentences.iterator[Sentence].map(_._1)) -// println("<<<<" + f ) -// printOutSentenceBoundaries(text, guessPoints.toSet, goldPoints) + // println("<<<<" + f ) + // printOutSentenceBoundaries(text, guessPoints.toSet, goldPoints) for(guess <- guessPoints) yield { val contextBegin = math.max(0, guess - 50) val contextEnd = math.min(text.length, guess + 50) - val context = if(guess != text.length) { + val context = if (guess != text.length) { text.substring(contextBegin, guess) + "[[" + text.charAt(guess) + "]]" + text.substring(guess + 1, contextEnd) } else { text.substring(contextBegin, guess) + "[[]]" @@ -415,14 +399,13 @@ object MLSentenceSegmenter { } } - val extraInstances = { for ( (text, goldPoints) <- extraExamples) yield { val guessPoints: IndexedSeq[Int] = potentialSentenceBoundariesIterator(text).toIndexedSeq for (guess <- guessPoints) yield { val contextBegin = math.max(0, guess - 50) val contextEnd = math.min(text.length, guess + 50) - val context = if(guess != text.length) { + val context = if (guess != text.length) { text.substring(contextBegin, guess) + "[[" + text.charAt(guess) + "]]" + text.substring(guess + 1, contextEnd) } else { text.substring(contextBegin, guess) + "[[]]" @@ -464,7 +447,6 @@ object MLSentenceSegmenter { println("Special") evalDev(inf, extraInstances.flatten, decoded) - val segmenter: MLSentenceSegmenter = new MLSentenceSegmenter(inf) breeze.util.writeObject(new File("en-sent-segmenter.model.ser.gz"), segmenter) @@ -516,7 +498,6 @@ object MLSentenceSegmenter { } } - case class Marginal(prob: Double, logPartition: Double) extends epic.framework.Marginal class ClassificationModel(val featureIndex: Index[Feature]) extends StandardExpectedCounts.Model[SentenceDecisionInstance] { @@ -526,7 +507,6 @@ object MLSentenceSegmenter { type Inference = MLSentenceSegmenter.ClassificationInference type Scorer = ClassificationInference - override def inferenceFromWeights(weights: DenseVector[Double]): Inference = new ClassificationInference(featureIndex, weights) override def accumulateCounts(inf: Inference, s: Scorer, d: SentenceDecisionInstance, m: Marginal, accum: ExpectedCounts, scale: Double): Unit = { @@ -536,7 +516,6 @@ object MLSentenceSegmenter { } } - @SerialVersionUID(1L) case class ClassificationInference(featureIndex: Index[Feature], weights: DenseVector[Double]) extends epic.framework.Inference[SentenceDecisionInstance] { type Scorer = ClassificationInference @@ -565,7 +544,6 @@ object MLSentenceSegmenter { val fs = new FeatureVector(v.features.map(featureIndex).filterNot(_ == -1)) val act = weights dot fs val prob = sigmoid(act) - Marginal(prob, -log1p(-prob)) } } diff --git a/src/main/scala/epic/preprocess/NewLineSentenceSegmenter.scala b/src/main/scala/epic/preprocess/NewLineSentenceSegmenter.scala index ccddd041..75f4ab4b 100644 --- a/src/main/scala/epic/preprocess/NewLineSentenceSegmenter.scala +++ b/src/main/scala/epic/preprocess/NewLineSentenceSegmenter.scala @@ -19,15 +19,14 @@ class NewLineSentenceSegmenter(locale: Locale = Locale.getDefault) extends Sente val spans = new ArrayBuffer[(Span, Sentence)]() var start = 0 - while(m.find()) { + while (m.find()) { val end = m.end() - if(end - start > 1) + if (end - start > 1) spans += (Span(start, end) -> Sentence()) start = end } spans += Span(start, slab.content.length) -> Sentence() - slab.addLayer[Sentence](spans) } } diff --git a/src/main/scala/epic/preprocess/RegexSearchTokenizer.scala b/src/main/scala/epic/preprocess/RegexSearchTokenizer.scala index 32160148..f2d54840 100644 --- a/src/main/scala/epic/preprocess/RegexSearchTokenizer.scala +++ b/src/main/scala/epic/preprocess/RegexSearchTokenizer.scala @@ -28,19 +28,14 @@ import epic.trees.Span case class RegexSearchTokenizer(pattern : String) extends Tokenizer { private val compiled = pattern.r - def apply[In <: Sentence](slab:StringSlab[In]):StringSlab[In with Token] = { slab.addLayer[Token](slab.iterator[Sentence].flatMap { s => compiled.findAllMatchIn(slab.spanned(s._1)).map{ m => Span(m.start, m.end) -> new Token(m.group(0))} }) } - - - - -// override def apply(doc : String) = new Iterable[String] { -// override def iterator = (pattern.r.findAllIn(doc)); -// } + // override def apply(doc : String) = new Iterable[String] { + // override def iterator = (pattern.r.findAllIn(doc)); + // } override def toString: String = ScalaRunTime._toString(this) } diff --git a/src/main/scala/epic/preprocess/RegexSplitTokenizer.scala b/src/main/scala/epic/preprocess/RegexSplitTokenizer.scala index 9fd3a7f4..115c257f 100644 --- a/src/main/scala/epic/preprocess/RegexSplitTokenizer.scala +++ b/src/main/scala/epic/preprocess/RegexSplitTokenizer.scala @@ -45,7 +45,7 @@ case class RegexSplitTokenizer(pattern : String) extends Tokenizer { spans += (Span(start, end) -> Token(slab.content.substring(start, end))) start = m.end() } - if(start != slab.content.length) + if (start != slab.content.length) spans += Span(start, slab.content.length) -> Token(slab.content.substring(start, slab.content.length)) slab.addLayer[Token](spans) } diff --git a/src/main/scala/epic/preprocess/SentenceSegmenter.scala b/src/main/scala/epic/preprocess/SentenceSegmenter.scala index 040db71b..9bb0361f 100644 --- a/src/main/scala/epic/preprocess/SentenceSegmenter.scala +++ b/src/main/scala/epic/preprocess/SentenceSegmenter.scala @@ -20,7 +20,6 @@ trait SentenceSegmenter extends StringAnalysisFunction[Any, Sentence] with (Stri } - object SegmentSentences { case class Params(splitOnNewline: Boolean = false) def main(_args: Array[String]):Unit = { @@ -28,7 +27,7 @@ object SegmentSentences { val params = config.readIn[Params]() import params._ - val ins = if(args.isEmpty) IndexedSeq(System.in) else args.toStream.map(new FileInputStream(_)) + val ins = if (args.isEmpty) IndexedSeq(System.in) else args.toStream.map(new FileInputStream(_)) val streaming = new StreamSentenceSegmenter(MLSentenceSegmenter.bundled().get, segmentOnNewLines = params.splitOnNewline) for(in <- ins) { try { diff --git a/src/main/scala/epic/preprocess/StreamSentenceSegmenter.scala b/src/main/scala/epic/preprocess/StreamSentenceSegmenter.scala index add7d5bf..c03be66c 100644 --- a/src/main/scala/epic/preprocess/StreamSentenceSegmenter.scala +++ b/src/main/scala/epic/preprocess/StreamSentenceSegmenter.scala @@ -39,11 +39,11 @@ class StreamSentenceSegmenter(val baseSegmenter: SentenceSegmenter, segmentOnNew val buffer = new Array[Char](1024 * 1024) var done = false Iterators.fromProducer { - if(done) { + if (done) { None } else { val numRead = reader.read(buffer) - if(numRead == -1) { + if (numRead == -1) { done = true None } else { diff --git a/src/main/scala/epic/preprocess/TextExtractor.scala b/src/main/scala/epic/preprocess/TextExtractor.scala index be42595f..f399be49 100644 --- a/src/main/scala/epic/preprocess/TextExtractor.scala +++ b/src/main/scala/epic/preprocess/TextExtractor.scala @@ -73,14 +73,11 @@ object TextExtractor { val textHandler = new ToTextContentHandler() { override def ignorableWhitespace(ch: Array[Char], start: Int, length: Int): Unit = characters(ch, start, length) - override def startElement(uri: String, localName: String, qName: String, attributes: Attributes): Unit = { super.startElement(uri, localName, qName, attributes) - if (newLineTags(qName.toLowerCase)) { ignorableWhitespace(Array('\n'), 0, 1) } - } override def endElement(uri: String, localName: String, qName: String): Unit = { @@ -91,7 +88,7 @@ object TextExtractor { } } - val handler = if(extractMainContentOnly) { + val handler = if (extractMainContentOnly) { new BoilerpipeContentHandler(textHandler, ArticleExtractor.getInstance()) { // stupid handler doesn't pass whitespace /* @@ -103,9 +100,6 @@ object TextExtractor { } } */ - - - setIncludeMarkup(true) } } else { @@ -125,14 +119,11 @@ object TextExtractor { stream.close() } - val content = textHandler.toString.trim Slab(content).addLayer(Span(0, content.length) -> epic.slab.Source(url)) } - - /* TODO: I'd like to be able to keep the XHTML formatting in the text, but right now that looks like it's going to cause problems with the way slabs work. (Namely, we'll get discontiguous blocks of text, even in the middle of words. * Uses boilerpipe to extract the content from an XHTML document @@ -168,11 +159,9 @@ object TextExtractor { sb.append("\"") } - sb.append('>') } - override def endElement(uri: String, localName: String, qName: String): Unit = { sb.append(" Content(getLabelsForTextElement(doc, index))) } } @@ -204,7 +193,6 @@ object TextExtractor { } - private def getLabelsForTextElement(doc: TextDocument, index: Int): Set[String] = { doc.getTextBlocks.asScala.find(_.getContainedTextElements.get(index)).map(b => Option(b.getLabels).map(_.asScala).iterator.flatten.toSet).getOrElse(Set.empty) } @@ -213,12 +201,9 @@ object TextExtractor { def extractXHTML(url: URL) = { val metadata = new Metadata() val stream: InputStream = TikaInputStream.get(url, metadata) - val loader = new Loader() new Tika().getParser.parse(stream, loader, metadata, new ParseContext) - loader.value - } def foo(url: URL)= { @@ -241,7 +226,7 @@ import scala.xml._ override def endDocument() { newAdapter.endDocument() // the pdf parser sends two end documents... - if(newAdapter.scopeStack.nonEmpty) + if (newAdapter.scopeStack.nonEmpty) newAdapter.scopeStack.pop() } override def endElement(uri: String, localName: String, qName: String) { @@ -261,8 +246,6 @@ import scala.xml._ override def ignorableWhitespace(ch: Array[Char], start: Int, length: Int): Unit = { characters(ch, start, length) } - - } def hasTika = { diff --git a/src/main/scala/epic/preprocess/Textify.scala b/src/main/scala/epic/preprocess/Textify.scala index aef153be..40702ca2 100644 --- a/src/main/scala/epic/preprocess/Textify.scala +++ b/src/main/scala/epic/preprocess/Textify.scala @@ -18,17 +18,12 @@ object Textify { println(f) val out = new File(outdir, f.getName) val toks = preprocess(f) - - val oo = new PrintWriter(new FileWriter(out)) - for(line <- toks) { oo.println(line.mkString("\t")) } - oo.close() } - } } diff --git a/src/main/scala/epic/preprocess/TreebankTokenizer.scala b/src/main/scala/epic/preprocess/TreebankTokenizer.scala index 656fb5f9..470d7576 100644 --- a/src/main/scala/epic/preprocess/TreebankTokenizer.scala +++ b/src/main/scala/epic/preprocess/TreebankTokenizer.scala @@ -29,7 +29,6 @@ class TreebankTokenizer() extends Tokenizer with Serializable { }) } - } object TreebankTokenizer extends TreebankTokenizer { @@ -71,7 +70,6 @@ object TreebankTokenizer extends TreebankTokenizer { slabWithTokens.iterator[Sentence].map{sent => val gold = slabWithTokens.covered[Segment](sent._1).map { case (span, tok) => slab.spanned(span)} val guess = TreebankTokenizer(slab.spanned(sent._1)) - (gold, guess, slab.spanned(sent._1)) } } diff --git a/src/main/scala/epic/preprocess/WhitespaceTokenizer.scala b/src/main/scala/epic/preprocess/WhitespaceTokenizer.scala index 1f08ad4f..facc4cfc 100644 --- a/src/main/scala/epic/preprocess/WhitespaceTokenizer.scala +++ b/src/main/scala/epic/preprocess/WhitespaceTokenizer.scala @@ -24,10 +24,8 @@ class WhitespaceTokenizer() extends RegexSplitTokenizer("\\s+") object WhitespaceTokenizer { def apply() : WhitespaceTokenizer = new WhitespaceTokenizer - private val _instance : WhitespaceTokenizer = apply() def apply(in : String) : Iterable[String] = _instance(in) - } diff --git a/src/main/scala/epic/preprocess/package.scala b/src/main/scala/epic/preprocess/package.scala index b01f062f..da3ac3f3 100644 --- a/src/main/scala/epic/preprocess/package.scala +++ b/src/main/scala/epic/preprocess/package.scala @@ -11,7 +11,7 @@ import java.net.URL package object preprocess { def tokenize(sentence: String): IndexedSeq[String] = TreebankTokenizer(sentence) - def loadContent(url: URL):String = TextExtractor.extractText(url) + def loadContent(url: URL): String = TextExtractor.extractText(url) def preprocess(url: URL):IndexedSeq[IndexedSeq[String]] = { preprocess(loadContent(url)) @@ -25,7 +25,6 @@ package object preprocess { preprocess(file.toURI.toURL) } - private lazy val _seg = MLSentenceSegmenter.bundled().get } diff --git a/src/main/scala/epic/sentiment/SentimentLossAugmentation.scala b/src/main/scala/epic/sentiment/SentimentLossAugmentation.scala index e877c2cb..8d37d037 100644 --- a/src/main/scala/epic/sentiment/SentimentLossAugmentation.scala +++ b/src/main/scala/epic/sentiment/SentimentLossAugmentation.scala @@ -20,7 +20,7 @@ case class SentimentLossAugmentation[W](trainTrees: IndexedSeq[TreeInstance[Anno val losses = Array.tabulate(5,5)(loss) - def projectedLabel(l: AnnotatedLabel) = if(l == AnnotatedLabel.TOP) -1 else l.label.toInt + def projectedLabel(l: AnnotatedLabel) = if (l == AnnotatedLabel.TOP) -1 else l.label.toInt val sentimentScores: Array[Int] = topology.labelEncoder.tabulateArray(projectedLabel) val trainingMap = trainTrees.iterator.map(ti => ti.words -> ti).toMap @@ -28,11 +28,9 @@ case class SentimentLossAugmentation[W](trainTrees: IndexedSeq[TreeInstance[Anno def lossAugmentation(datum: TreeInstance[AnnotatedLabel, W]): UnrefinedGrammarAnchoring[AnnotatedLabel, W] = { // drop the root val goldMap = datum.tree.map(projectedLabel).preorder.filter(_.label != -1).map{t => t.span -> t.label}.toMap - new SentimentLossAnchoring(topology, lexicon, datum.words, goldMap, constraintFactory.constraints(datum.words)) } - /** * Returns a [[epic.parser.UnrefinedGrammarAnchoring]] for this particular sentence. * @param words @@ -60,7 +58,7 @@ case class SentimentLossAugmentation[W](trainTrees: IndexedSeq[TreeInstance[Anno case Some(goldLabel) => assert(goldLabel != -1) val guessLabel = sentimentScores(tag) - if(guessLabel == -1) { + if (guessLabel == -1) { breeze.numerics.I(goldLabel == guessLabel) * 10000 } else { losses(goldLabel)(guessLabel) * (if (begin == 0 && end == words.size) rootLossScaling else 1.0) @@ -72,7 +70,6 @@ case class SentimentLossAugmentation[W](trainTrees: IndexedSeq[TreeInstance[Anno } - } object SentimentLossAugmentation { diff --git a/src/main/scala/epic/sentiment/SentimentTreebankPipeline.scala b/src/main/scala/epic/sentiment/SentimentTreebankPipeline.scala index 61f56a07..7b27dc21 100644 --- a/src/main/scala/epic/sentiment/SentimentTreebankPipeline.scala +++ b/src/main/scala/epic/sentiment/SentimentTreebankPipeline.scala @@ -46,22 +46,20 @@ object SentimentTreebankPipeline extends LazyLogging { rootLossScaling: Double = 1.0, computeTrainLL: Boolean = false) - def main(args: Array[String]):Unit = { val params = CommandLineParser.readIn[Options](args) val treebank = new ProcessedTreebank(params.path, treebankType = "simple") var trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]] = treebank.trainTrees - if(params.evalOnTest && params.includeDevInTrain) + if (params.evalOnTest && params.includeDevInTrain) trainTrees ++= treebank.devTrees println(trainTrees.size + " train trees, " + treebank.devTrees.size + " dev trees, " + treebank.testTrees.size + " test trees") val gen = GenerativeParser.fromTrees(trainTrees) - class GoldBracketingsConstraints extends ChartConstraints.Factory[AnnotatedLabel, String] { val trees = (trainTrees ++ treebank.devTrees ++ treebank.testTrees).map(ti => ti.words -> ti.tree).toMap -// val trees = ((if (params.includeDevInTrain) trainTrees else trainTrees ++ treebank.devTrees) ++ treebank.testTrees).map(ti => ti.words -> ti.tree).toMap + // val trees = ((if (params.includeDevInTrain) trainTrees else trainTrees ++ treebank.devTrees) ++ treebank.testTrees).map(ti => ti.words -> ti.tree).toMap def constraints(w: IndexedSeq[String]): ChartConstraints[AnnotatedLabel] = { val constraints = SpanConstraints.fromTree(trees.getOrElse(w, gen.bestBinarizedTree(w))) @@ -87,7 +85,7 @@ object SentimentTreebankPipeline extends LazyLogging { sentimentLoss, params.rootLossScaling) -// val model = new SpanModelFactory(annotator = GenerativeParser.defaultAnnotator(vertical = params.v), dummyFeats = 0.5).make(trainTrees, constrainer) + // val model = new SpanModelFactory(annotator = GenerativeParser.defaultAnnotator(vertical = params.v), dummyFeats = 0.5).make(trainTrees, constrainer) val model = params.modelFactory.make(trainTrees, gen.topology, gen.lexicon, new GoldBracketingsConstraints) val obj = new ModelObjective(model, trainTrees) @@ -100,10 +98,10 @@ object SentimentTreebankPipeline extends LazyLogging { for ((state, iter) <- itr.take(params.maxIterations).zipWithIndex if iter % params.iterationsPerEval == 0) try { val parser = model.extractParser(state.x).copy(decoder=new MaxConstituentDecoder[AnnotatedLabel, String]) -// if(params.evalOnTest) -// println("Eval: " + evaluate(s"$name-$iter", parser, treebank.testTrees)) -// else -// println("Eval: " + evaluate(s"$name-$iter", parser, treebank.devTrees)) + // if (params.evalOnTest) + // println("Eval: " + evaluate(s"$name-$iter", parser, treebank.testTrees)) + // else + // println("Eval: " + evaluate(s"$name-$iter", parser, treebank.devTrees)) if (params.computeTrainLL) { computeLL(trainTrees, model, state.x) } @@ -119,12 +117,10 @@ object SentimentTreebankPipeline extends LazyLogging { case e: Exception => e.printStackTrace(); throw e } - } def renderArr(arr: Array[Array[Int]]) = arr.map(_.map(_.toString).reduce(_ + "\t" + _)).reduce(_ + "\n" + _) - class Model[L, W](val inner: ParserModel[L, W]) extends epic.framework.Model[TreeInstance[L, W]] { type ExpectedCounts = inner.ExpectedCounts type Marginal = inner.Marginal @@ -133,7 +129,6 @@ object SentimentTreebankPipeline extends LazyLogging { def emptyCounts = inner.emptyCounts - def accumulateCounts(inf: Inference, s: Scorer, d: TreeInstance[L, W], m: Marginal, accum: ExpectedCounts, scale: Double): Unit = { inner.accumulateCounts(inf.pm.asInstanceOf[inner.Inference], s, d, m, accum, scale) } @@ -164,7 +159,6 @@ object SentimentTreebankPipeline extends LazyLogging { pm.goldMarginal(scorer, v) } - def marginal(anch: Scorer, v: TreeInstance[L, W]): Inference[L, W]#Marginal = { LatentTreeMarginal[L, W](anch, v.tree.map(l => labels:scala.collection.IndexedSeq[(L, Int)])) } @@ -191,14 +185,12 @@ object SentimentTreebankPipeline extends LazyLogging { rootsRightBinary + stats.rootsRightBinary, numBinaryRoots + stats.numBinaryRoots) - override def toString = { "Spans: " + SentimentEvaluator.renderNumerDenom(spansRight, numSpans) + " (Ternary: " + SentimentEvaluator.renderNumerDenom(spansRightTernary, numSpans) + "), Roots: " + SentimentEvaluator.renderNumerDenom(rootsRight, numRoots) + " (Ternary: " + SentimentEvaluator.renderNumerDenom(rootsRightTernary, numRoots) + ")" } - - -// override def toString = f"Stats(cspans=${coarseSpansRight.toDouble/coarseSpans}%.4f: $coarseSpansRight/$coarseSpans spans=${spansRight.toDouble/numSpans}%.4f: $spansRight/$numSpans, coarseRoots=${coarseRootsRight.toDouble/numCoarseRoots}: $coarseRootsRight/$numCoarseRoots , roots=${rootsRight.toDouble/numRoots}%.4f: $rootsRight/$numRoots)" + + // override def toString = f"Stats(cspans=${coarseSpansRight.toDouble/coarseSpans}%.4f: $coarseSpansRight/$coarseSpans spans=${spansRight.toDouble/numSpans}%.4f: $spansRight/$numSpans, coarseRoots=${coarseRootsRight.toDouble/numCoarseRoots}: $coarseRootsRight/$numCoarseRoots , roots=${rootsRight.toDouble/numRoots}%.4f: $rootsRight/$numRoots)" } object DecodeType extends Enumeration { @@ -291,7 +283,6 @@ object SentimentTreebankPipeline extends LazyLogging { val neg = summed(AnnotatedLabel("0")) + summed(AnnotatedLabel("1")) val pos = summed(AnnotatedLabel("3")) + summed(AnnotatedLabel("4")) val neutral = summed(AnnotatedLabel("2")) - if(neg > pos && neg > neutral) { AnnotatedLabel("0") } else if (pos > neg && pos > neutral) { @@ -304,8 +295,7 @@ object SentimentTreebankPipeline extends LazyLogging { } } } - - + def computeLL(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], model: SpanModel[AnnotatedLabel,AnnotatedLabel,String], weights: DenseVector[Double]) { println("Computing final log likelihood on the whole training set...") val inf = model.inferenceFromWeights(weights) diff --git a/src/main/scala/epic/sequences/CRF.scala b/src/main/scala/epic/sequences/CRF.scala index 252f7c2e..5685cc08 100644 --- a/src/main/scala/epic/sequences/CRF.scala +++ b/src/main/scala/epic/sequences/CRF.scala @@ -83,7 +83,6 @@ object CRF { buildSimple(fixedData, false, gazetteer, opt = opt) } - trait Anchoring[L, W] extends TagConstraints[L] { def words : IndexedSeq[W] def length: Int = words.length @@ -92,7 +91,6 @@ object CRF { def startSymbol: L def validSymbols(pos: Int): Set[Int] - override def allowedTags(pos: Int): Set[Int] = validSymbols(pos) def *(other: Anchoring[L, W]):Anchoring[L, W] = { @@ -104,7 +102,6 @@ object CRF { } } - trait Marginal[L, W] extends VisitableMarginal[TransitionVisitor[L, W]] { def anchoring: Anchoring[L, W] @@ -125,7 +122,7 @@ object CRF { var prev = 0 val numLabels: Int = anchoring.labelIndex.size var sum = 0.0 - while(prev < numLabels) { + while (prev < numLabels) { sum += transitionMarginal(pos, prev, label) prev += 1 } @@ -133,7 +130,6 @@ object CRF { } } - object Marginal { def apply[L, W](scorer: Anchoring[L, W]):Marginal[L, W] = { @@ -143,7 +139,6 @@ object CRF { val partition = softmax(forwardScores.last) val _s = scorer - new Marginal[L, W] { def anchoring: Anchoring[L, W] = _s @@ -154,11 +149,11 @@ object CRF { while (pos < length) { var label = 0 while (label < numLabels) { - if(!backwardScore(pos+1)(label).isInfinite) { + if (!backwardScore(pos+1)(label).isInfinite) { var prevLabel = 0 while (prevLabel < numLabels) { val score = transitionMarginal(pos, prevLabel, label) - if(score != 0.0) + if (score != 0.0) f(pos, prevLabel, label, score) prevLabel += 1 } @@ -170,24 +165,19 @@ object CRF { } - /** Log-normalized probability of seing segment with transition */ def transitionMarginal(pos: Int, prev: Int, cur: Int): Double = { val withoutTrans = forwardScores(pos)(prev) + backwardScore(pos+1)(cur) - if(withoutTrans.isInfinite) 0.0 + if (withoutTrans.isInfinite) 0.0 else math.exp(withoutTrans + anchoring.scoreTransition(pos, prev, cur) - logPartition) } - - def logPartition: Double = partition // println(words + " " + partition) } } - - def goldMarginal[L, W](scorer: Anchoring[L, W], tags: IndexedSeq[L]):Marginal[L, W] = { var lastSymbol = scorer.labelIndex(scorer.startSymbol) var score = 0.0 @@ -220,14 +210,10 @@ object CRF { numerics.I(prev == indexedSymbols(pos) && cur == indexedSymbols(pos + 1)) } - def logPartition: Double = score } } - - - /** * * @param scorer @@ -247,20 +233,17 @@ object CRF { val cur = forwardScores(i+1) for ( next <- scorer.validSymbols(i)) { var offset = 0 - for ( previous <- if(i == 0) IndexedSeq(scorer.labelIndex(scorer.startSymbol)) else scorer.validSymbols(i-1)) { + for ( previous <- if (i == 0) IndexedSeq(scorer.labelIndex(scorer.startSymbol)) else scorer.validSymbols(i-1)) { val score = scorer.scoreTransition(i, previous, next) + forwardScores(i)(previous) - if(score != Double.NegativeInfinity) { + if (score != Double.NegativeInfinity) { cache(offset) = score offset += 1 } } cur(next) = softmax.array(cache, offset) } - - } - forwardScores } @@ -288,33 +271,27 @@ object CRF { for( next <- scorer.validSymbols(i)) { val nextScore = backwardScores(i+1)(next) val score = scorer.scoreTransition(i, curLabel, next) + nextScore - if(score != Double.NegativeInfinity) { + if (score != Double.NegativeInfinity) { accumArray(offset) = score offset += 1 } } cur(curLabel) = softmax(new DenseVector(accumArray, 0, 1, offset)) - } } backwardScores } - - } - trait TransitionVisitor[L, W] { def apply(pos: Int, prev: Int, cur: Int, count: Double) } trait IndexedFeaturizer[L, W] { def anchor(w: IndexedSeq[W]):AnchoredFeaturizer[L, W] - def startSymbol: L - def labelIndex: Index[L] def featureIndex: Index[Feature] } @@ -325,7 +302,6 @@ object CRF { def validSymbols(pos: Int):Set[Int] } - def viterbi[L, W](scorer: Anchoring[L ,W], id: String=""):TaggedSequence[L, W] = { val length = scorer.length val numLabels = scorer.labelIndex.size @@ -334,7 +310,6 @@ object CRF { forwardScores(0)(scorer.labelIndex(scorer.startSymbol)) = 0.0 val backPointer = Array.fill(length, numLabels)(-1) - // forward for(i <- 0 until length) { val cur = forwardScores(i+1) @@ -345,7 +320,7 @@ object CRF { for ( previous <- scorer.validSymbols(i-1)) { val score = scorer.scoreTransition(i, previous, next) + forwardScores(i)(previous) - if(score > currentMax) { + if (score > currentMax) { currentMax = score currentArgMax = previous } @@ -361,29 +336,27 @@ object CRF { def rec(end: Int, label: Int) { tags += scorer.labelIndex.get(label) - if(end > 0) { + if (end > 0) { val bestCurrentLabel = backPointer(end)(label) rec(end-1, bestCurrentLabel) } - } + rec(length-1, (0 until numLabels).maxBy(forwardScores(length)(_))) assert(tags.length == scorer.words.length, tags.reverse + " " + scorer.words) TaggedSequence(tags.reverse, scorer.words, id) } - def posteriorDecode[L, W](m: Marginal[L, W], id: String = "") = { val length = m.length val labels = (0 until length).map(pos => (0 until m.anchoring.labelIndex.size).maxBy(m.positionMarginal(pos, _))) - TaggedSequence(labels.map(m.anchoring.labelIndex.get), m.words, id) } case class ProductAnchoring[L, W](a: Anchoring[L ,W], b: Anchoring[L, W]) extends Anchoring[L, W] { - if((a.labelIndex ne b.labelIndex) && (a.labelIndex != b.labelIndex)) throw new IllegalArgumentException("Elements of product anchoring must have the same labelIndex!") - if(a.startSymbol != b.startSymbol) throw new IllegalArgumentException("Elements of product anchoring must have the same startSymbol!") + if ((a.labelIndex ne b.labelIndex) && (a.labelIndex != b.labelIndex)) throw new IllegalArgumentException("Elements of product anchoring must have the same labelIndex!") + if (a.startSymbol != b.startSymbol) throw new IllegalArgumentException("Elements of product anchoring must have the same startSymbol!") def words: IndexedSeq[W] = a.words @@ -402,11 +375,9 @@ object CRF { class IdentityAnchoring[L, W](val words: IndexedSeq[W], val validSyms: IndexedSeq[Set[Int]], val labelIndex: Index[L], val startSymbol: L) extends Anchoring[L, W] { def scoreTransition(pos: Int, prev: Int, cur: Int): Double = 0.0 - - def validSymbols(pos: Int): Set[Int] = validSyms(pos) - def canStartLongSegment(pos: Int): Boolean = true } + } diff --git a/src/main/scala/epic/sequences/CRFModel.scala b/src/main/scala/epic/sequences/CRFModel.scala index 103471eb..e04ee21a 100644 --- a/src/main/scala/epic/sequences/CRFModel.scala +++ b/src/main/scala/epic/sequences/CRFModel.scala @@ -43,7 +43,7 @@ class CRFModel[L, W](val featureIndex: Index[Feature], def apply(pos: Int, prev: Int, cur: Int, count: Double) { val feats = localization.featuresForTransition(pos, prev, cur) - if(count != 0) assert(feats ne null, (pos, prev, cur, marg.length, marg.anchoring.validSymbols(pos), marg.anchoring.validSymbols(pos-1))) + if (count != 0) assert(feats ne null, (pos, prev, cur, marg.length, marg.anchoring.validSymbols(pos), marg.anchoring.validSymbols(pos-1))) axpy(scale * count, feats, counts) } } @@ -59,16 +59,12 @@ class CRFInference[L, W](val weights: DenseVector[Double], val lexicon: TagConstraints.Factory[L, W], featurizer: CRF.IndexedFeaturizer[L, W]) extends AugmentableInference[TaggedSequence[L, W], CRF.Anchoring[L, W]] with CRF[L, W] with AnnotatingInference[TaggedSequence[L, W]] with Serializable { - - - def scorer(v: TaggedSequence[L, W]): Scorer = new Anchoring(v.words) def viterbi(sentence: IndexedSeq[W], anchoring: CRF.Anchoring[L, W]): TaggedSequence[L, W] = { CRF.viterbi(new Anchoring(sentence) * anchoring) } - def annotate(datum: TaggedSequence[L, W], m: Marginal): TaggedSequence[L, W] = { CRF.posteriorDecode(m) } @@ -81,21 +77,17 @@ class CRFInference[L, W](val weights: DenseVector[Double], def anchor(w: IndexedSeq[W]) = new Anchoring(w) - def labelIndex = featurizer.labelIndex def startSymbol = featurizer.startSymbol - def marginal(scorer: Scorer, v: TaggedSequence[L, W], aug: CRF.Anchoring[L, W]): CRFInference[L, W]#Marginal = { CRF.Marginal(scorer * aug) } - def goldMarginal(scorer: Scorer, v: TaggedSequence[L, W], aug: CRF.Anchoring[L, W]): Marginal = { CRF.Marginal.goldMarginal[L, W](new Anchoring(v.words) * aug, v.label) } - private val allLabels = (0 until labelIndex.size).toSet def baseAugment(v: TaggedSequence[L, W]): CRF.Anchoring[L, W] = { @@ -109,13 +101,11 @@ class CRFInference[L, W](val weights: DenseVector[Double], for(a <- transCache; b <- a) util.Arrays.fill(b, Double.NegativeInfinity) for(i <- 0 until length; c <- validSymbols(i); p <- validSymbols(i-1)) { val feats = localization.featuresForTransition(i, p, c) - if(feats ne null) + if (feats ne null) transCache(p)(c)(i) = weights dot feats else transCache(p)(c)(i) = Double.NegativeInfinity } - - def validSymbols(pos: Int): Set[Int] = localization.validSymbols(pos) def scoreTransition(pos: Int, prev: Int, cur: Int): Double = { @@ -127,7 +117,6 @@ class CRFInference[L, W](val weights: DenseVector[Double], def startSymbol = featurizer.startSymbol } - def posteriorDecode(m: Marginal):TaggedSequence[L, W] = { CRF.posteriorDecode(m) } @@ -146,10 +135,8 @@ class TaggedSequenceModelFactory[L](val startSymbol: L, val labelIndex: Index[L] = Index[L](Iterator(startSymbol) ++ train.iterator.flatMap(_.label)) val counts: Counter2[L, String, Double] = Counter2.count(train.flatMap(p => p.label zip p.words)).mapValues(_.toDouble) - val lexicon:TagConstraints.Factory[L, String] = new SimpleLexicon[L, String](labelIndex, counts) - var featurizer: WordFeaturizer[String] = wordFeaturizer.getOrElse(WordFeaturizer.goodPOSTagFeaturizer(counts)) featurizer = gazetteer.foldLeft(featurizer)(_ + _) val l2featurizer: WordFeaturizer[String] = transitionFeaturizer.getOrElse(WordFeaturizer.goodPOSTagTransitionFeaturizer(counts)) @@ -175,8 +162,8 @@ class TaggedSequenceModelFactory[L](val startSymbol: L, l <- lexLoc.allowedTags(b) } { lfBuilder.add(l, loc.featuresForWord(b)) - if(lexLoc.allowedTags(b).size > 1) { - for(prevTag <- if(b == 0) Set(labelIndex(startSymbol)) else lexLoc.allowedTags(b-1)) { + if (lexLoc.allowedTags(b).size > 1) { + for(prevTag <- if (b == 0) Set(labelIndex(startSymbol)) else lexLoc.allowedTags(b-1)) { l2Builder.add(label2Features(prevTag)(l), l2loc.featuresForWord(b)) } } @@ -184,7 +171,6 @@ class TaggedSequenceModelFactory[L](val startSymbol: L, progress.info(s"${lfBuilder.size + l2Builder.size}") } - val indexed = new IndexedStandardFeaturizer[L, String](indexedFeaturizer, indexedL2featurizer, lexicon, startSymbol, labelIndex, label2Features, lfBuilder.result(), l2Builder.result()) @@ -198,7 +184,6 @@ class TaggedSequenceModelFactory[L](val startSymbol: L, object TaggedSequenceModelFactory { - @SerialVersionUID(1L) class IndexedStandardFeaturizer[L, String](wordFeaturizer: IndexedWordFeaturizer[String], l2WordFeaturizer: IndexedWordFeaturizer[String], @@ -214,7 +199,6 @@ object TaggedSequenceModelFactory { private val loff = featureIndex.componentOffset(0) private val l2off = featureIndex.componentOffset(1) - private val startSymbolSet = Set(labelIndex(startSymbol)) def anchor(w: IndexedSeq[String]): AnchoredFeaturizer[L, String] = new AnchoredFeaturizer[L, String] { @@ -223,12 +207,10 @@ object TaggedSequenceModelFactory { val lexLoc = lexicon.anchor(w) def featureIndex: Index[Feature] = outer.featureIndex - def validSymbols(pos: Int): Set[Int] = if(pos < 0 || pos >= w.length) startSymbolSet else lexLoc.allowedTags(pos) + def validSymbols(pos: Int): Set[Int] = if (pos < 0 || pos >= w.length) startSymbolSet else lexLoc.allowedTags(pos) def length = w.length - - val featureArray = Array.ofDim[FeatureVector](length, labelIndex.size, labelIndex.size) private val posNeedsAmbiguity = Array.tabulate(length)(i => validSymbols(i).size > 1) for { @@ -239,7 +221,7 @@ object TaggedSequenceModelFactory { prevTag <- validSymbols(pos-1) } { val l2feats = l2loc.featuresForWord(pos) - val feats = if(posNeedsAmbiguity(pos)) { + val feats = if (posNeedsAmbiguity(pos)) { justLabel++ label2FeatureIndex.crossProduct(Array(label2Features(prevTag)(curTag)), l2feats, offset = l2off, usePlainLabelFeatures = true) } else { justLabel @@ -255,5 +237,4 @@ object TaggedSequenceModelFactory { } } - } diff --git a/src/main/scala/epic/sequences/GoldSegmentPolicy.scala b/src/main/scala/epic/sequences/GoldSegmentPolicy.scala index 0651b919..7a7f4ee7 100644 --- a/src/main/scala/epic/sequences/GoldSegmentPolicy.scala +++ b/src/main/scala/epic/sequences/GoldSegmentPolicy.scala @@ -30,7 +30,7 @@ object GoldSegmentPolicy { def goldSegmentForcing[L](trees: IndexedSeq[(Int,Span)]*):GoldSegmentPolicy[L] ={ val gold = TriangularArray.raw(trees.last.last._2.end+1,collection.mutable.BitSet()) for(tree <- trees) { - if(tree != null) { + if (tree != null) { for( (label, span) <- tree) { gold(TriangularArray.index(span.begin,span.end)) += label } diff --git a/src/main/scala/epic/sequences/HMM.scala b/src/main/scala/epic/sequences/HMM.scala index bafab0c2..f8d0ee4c 100644 --- a/src/main/scala/epic/sequences/HMM.scala +++ b/src/main/scala/epic/sequences/HMM.scala @@ -38,7 +38,7 @@ object HMM { val wcs = w.map(wordCounts(_)) val validSyms = w.map { w => - if(wordCounts(w) >= 10) { + if (wordCounts(w) >= 10) { emissions(::, w).findAll( _ > 0).map(labelIndex(_)).toSet } else { allSyms @@ -57,12 +57,12 @@ object HMM { emitScore + encodedTransitions(prev, cur) } - def scoreEmission(cur: Int, pos: Int): Double = if(smoothEmissions) { + def scoreEmission(cur: Int, pos: Int): Double = if (smoothEmissions) { val w = words(pos) var cWord = wcs(pos) var cTagWord = emissions(labelIndex.get(cur), w) assert(cWord >= cTagWord) - if(cWord < 10) { + if (cWord < 10) { cWord += 1.0 cTagWord += indexedLabelCounts(cur) / wordCounts.size } diff --git a/src/main/scala/epic/sequences/HammingLossAugmentation.scala b/src/main/scala/epic/sequences/HammingLossAugmentation.scala index ffeaeeed..f9c7025c 100644 --- a/src/main/scala/epic/sequences/HammingLossAugmentation.scala +++ b/src/main/scala/epic/sequences/HammingLossAugmentation.scala @@ -29,7 +29,7 @@ object HammingLossAugmentation { def scoreTransition(prev: Int, cur: Int, begin: Int, end: Int): Double = { - if(gt.isGoldSegment(begin, end, cur)) -precisionScale + if (gt.isGoldSegment(begin, end, cur)) -precisionScale else recallScale } diff --git a/src/main/scala/epic/sequences/SegmentText.scala b/src/main/scala/epic/sequences/SegmentText.scala index 7cd32980..3dcdba2a 100644 --- a/src/main/scala/epic/sequences/SegmentText.scala +++ b/src/main/scala/epic/sequences/SegmentText.scala @@ -9,7 +9,6 @@ import epic.util.ProcessTextMain */ object SegmentText extends ProcessTextMain[SemiCRF[Any, String], Segmentation[Any, String]] { - override def render(model: SemiCRF[Any, String], ann: Segmentation[Any, String], tokens: IndexedSeq[String]): String = { ann.render } diff --git a/src/main/scala/epic/sequences/Segmentation.scala b/src/main/scala/epic/sequences/Segmentation.scala index ecb87eac..fe366695 100644 --- a/src/main/scala/epic/sequences/Segmentation.scala +++ b/src/main/scala/epic/sequences/Segmentation.scala @@ -12,7 +12,6 @@ case class Segmentation[+L, +W](segments: IndexedSeq[(L, Span)], words: IndexedSeq[W], id: String = "") extends Example[IndexedSeq[(L, Span)], IndexedSeq[W]] { - def render: String = { segmentsWithOutside.map { case (None, span) => words.slice(span.begin, span.end).mkString(" ") @@ -20,7 +19,6 @@ case class Segmentation[+L, +W](segments: IndexedSeq[(L, Span)], }.mkString(" ") } - def features = words def length: Int = words.length @@ -41,7 +39,6 @@ case class Segmentation[+L, +W](segments: IndexedSeq[(L, Span)], for (i <- 0 until length) { - if (currentSegment < segments.length && segments(currentSegment)._2.end == i) { if (newSpanBegin != newOffset) newSegments += (segments(currentSegment)._1 -> Span(newSpanBegin, newOffset)) @@ -63,16 +60,14 @@ case class Segmentation[+L, +W](segments: IndexedSeq[(L, Span)], new Segmentation(newSegments, newWords, s"$id-filtered") } - def segmentsWithOutside: Iterator[(Option[L], Span)] = { val segs = for { qq@IndexedSeq((pL, pSpan), (l, span)) <- (segments.headOption.map(pair => pair._1 -> Span(0, 0)).toIndexedSeq ++ segments).sliding(2) padding = Iterator.range(pSpan.end, span.begin).map(i => None -> Span(i, i + 1)) pair <- padding ++ Iterator((Some(l), span)) } yield { - pair - } - + pair + } val lastSpanEnd = segments.lastOption match { case Some((_, Span(_, end))) => end @@ -82,23 +77,21 @@ case class Segmentation[+L, +W](segments: IndexedSeq[(L, Span)], segs ++ (lastSpanEnd until length).map(i => None -> Span(i, i + 1)) } - def asBIOSequence[LL>:L](outsideLabel: LL): TaggedSequence[BIOETag[L], W] = { val outLabels = new ArrayBuffer[BIOETag[L]]() for((l,span) <- segments if !span.isEmpty) { - while(outLabels.length < span.begin) { + while (outLabels.length < span.begin) { outLabels += BIOETag.Outside } - - if(l == outsideLabel) + if (l == outsideLabel) outLabels += BIOETag.Outside else outLabels += BIOETag.Begin(l) for(i <- (span.begin+1) until (span.end) ) { - outLabels += {if(l != outsideLabel) BIOETag.Inside(l) else BIOETag.Outside} + outLabels += {if (l != outsideLabel) BIOETag.Inside(l) else BIOETag.Outside} } } - while(outLabels.length < words.length) { + while (outLabels.length < words.length) { outLabels += BIOETag.Outside } assert(outLabels.length == words.length) @@ -108,7 +101,7 @@ case class Segmentation[+L, +W](segments: IndexedSeq[(L, Span)], def asFlatTaggedSequence[LL>:L]: TaggedSequence[Option[LL], W] = { val outLabels = new ArrayBuffer[Option[LL]]() for((l,span) <- segments if !span.isEmpty) { - while(outLabels.length < span.begin) { + while (outLabels.length < span.begin) { outLabels += None } @@ -116,7 +109,7 @@ case class Segmentation[+L, +W](segments: IndexedSeq[(L, Span)], outLabels += Some(l) } } - while(outLabels.length < words.length) { + while (outLabels.length < words.length) { outLabels += None } assert(outLabels.length == words.length) @@ -127,15 +120,14 @@ case class Segmentation[+L, +W](segments: IndexedSeq[(L, Span)], val outLabels = new ArrayBuffer[(LL, Span)]() for((l,span) <- segments if !span.isEmpty) { val lastEnd = outLabels.lastOption.map(_._2.end).getOrElse(0) - if(lastEnd < span.begin) { + if (lastEnd < span.begin) { outLabels += (outsideLabel -> Span(lastEnd, span.begin)) } - outLabels += (l -> span) } val lastEnd = outLabels.lastOption.map(_._2.end).getOrElse(0) - if(lastEnd < words.length) { + if (lastEnd < words.length) { outLabels += (outsideLabel -> Span(lastEnd, words.length)) } @@ -154,20 +146,20 @@ object Segmentation { for(i <- 0 until seq.length) { seq.label(i) match { case Begin(l) => - if(currentStart < i) + if (currentStart < i) spans += (currentLabel -> Span(currentStart, i)) currentStart = i currentLabel = l case Inside(l) => - if(currentLabel != l) { - if(currentStart < i) + if (currentLabel != l) { + if (currentStart < i) spans += (currentLabel -> Span(currentStart, i)) currentStart = i currentLabel = l } case End(l) => - if(currentLabel != l) { - if(currentStart < i) + if (currentLabel != l) { + if (currentStart < i) spans += (currentLabel -> Span(currentStart, i)) currentStart = i currentLabel = l @@ -175,18 +167,17 @@ object Segmentation { spans += (currentLabel -> Span(currentStart, i+1)) currentStart = i + 1 case Outside => - if(currentLabel != outsideLabel) { - if(currentStart < i) + if (currentLabel != outsideLabel) { + if (currentStart < i) spans += (currentLabel -> Span(currentStart, i)) currentStart = i currentLabel = outsideLabel } spans += (currentLabel -> Span(currentStart, i+1)) currentStart = i + 1 - } } - if(currentStart < seq.length) + if (currentStart < seq.length) spans += (currentLabel -> Span(currentStart, seq.length)) Segmentation(spans, seq.words, seq.id.replaceAll("-bio","-seg")) } diff --git a/src/main/scala/epic/sequences/SegmentationEval.scala b/src/main/scala/epic/sequences/SegmentationEval.scala index b000ff53..d0a3ea98 100644 --- a/src/main/scala/epic/sequences/SegmentationEval.scala +++ b/src/main/scala/epic/sequences/SegmentationEval.scala @@ -11,11 +11,12 @@ import com.typesafe.scalalogging.slf4j.LazyLogging * @author dlwh */ object SegmentationEval extends LazyLogging { + def eval[L ,W](crf: SemiCRF[L, W], examples: IndexedSeq[Segmentation[L, W]], logOnlyErrors: Boolean = true):Stats = { examples.par.aggregate(new Stats(0,0,0)) ({ (stats, gold )=> val guess = crf.bestSequence(gold.words, gold.id +"-guess") try { - if(guess.label != gold.label) + if (guess.label != gold.label) logger.trace(s"gold = $gold guess = $guess " + s"guess logPartition = ${crf.goldMarginal(guess.segments, guess.words).logPartition} " + s"gold logPartition =${crf.goldMarginal(gold.segments, gold.words).logPartition}") @@ -23,11 +24,10 @@ object SegmentationEval extends LazyLogging { case ex: Exception => logger.debug("Can't recover gold for " + gold) } val myStats = evaluateExample(Set(), guess, gold) - if(!logOnlyErrors || myStats.f1 < 1.0) + if (!logOnlyErrors || myStats.f1 < 1.0) logger.info("Guess:\n" + guess.render + "\n Gold:\n" + gold.render+ "\n" + myStats) stats + myStats }, {_ + _}) - } def evaluateExample[W, L](outsideLabel: Set[L], guess: Segmentation[L, W], gold: Segmentation[L, W]): SegmentationEval.Stats = { diff --git a/src/main/scala/epic/sequences/Segmenter.scala b/src/main/scala/epic/sequences/Segmenter.scala index 0c031fd8..9048356a 100644 --- a/src/main/scala/epic/sequences/Segmenter.scala +++ b/src/main/scala/epic/sequences/Segmenter.scala @@ -14,6 +14,7 @@ import scala.reflect.ClassTag * @author dlwh **/ trait Segmenter[Tag] extends StringAnalysisFunction[Sentence with Token, Tag] with (IndexedSeq[String]=>IndexedSeq[(Tag, Span)]) { + implicit protected def tagTag: ClassTag[Tag] override def apply[In <: Sentence with Token](slab: StringSlab[In]): StringSlab[In with Tag] = { val annotatedSentences = for((span, sent) <- slab.iterator[Sentence]) yield { @@ -23,7 +24,6 @@ trait Segmenter[Tag] extends StringAnalysisFunction[Sentence with Token, Tag] wi Span(tokens(espan.begin)._1.begin, tokens(espan.end - 1)._1.end) -> lbl } } - slab.addLayer[Tag](annotatedSentences.flatten) } @@ -32,7 +32,6 @@ trait Segmenter[Tag] extends StringAnalysisFunction[Sentence with Token, Tag] wi object Segmenter { def nerSystem[L](crf: SemiCRF[L, String]) = fromCRF(crf, (a: L) => EntityMention(a.toString)) - def fromCRF[L, Tag:ClassTag](crf: SemiCRF[L, String], lToTag: L=>Tag):Segmenter[Tag] = new SemiCRFSegmenter(crf, lToTag) case class SemiCRFSegmenter[L, Tag:ClassTag] (crf: SemiCRF[L, String], lToTag: L=>Tag) extends Segmenter[Tag] { @@ -41,4 +40,5 @@ object Segmenter { crf.bestSequence(v1).segments.map { case (l, span) => lToTag(l) -> span} } } + } diff --git a/src/main/scala/epic/sequences/SemiCRF.scala b/src/main/scala/epic/sequences/SemiCRF.scala index bc986970..21423aa6 100644 --- a/src/main/scala/epic/sequences/SemiCRF.scala +++ b/src/main/scala/epic/sequences/SemiCRF.scala @@ -59,7 +59,7 @@ object SemiCRF { val obj = new ModelObjective(model, data) val cached = new CachedBatchDiffFunction(obj) val weights = opt.minimize(cached, obj.initialWeightVector(false)) -// GradientTester.test(cached, weights, randFraction = 1.0, toString={(i: Int) => model.featureIndex.get(i).toString}, tolerance=0.0) + // GradientTester.test(cached, weights, randFraction = 1.0, toString={(i: Int) => model.featureIndex.get(i).toString}, tolerance=0.0) val crf = model.extractCRF(weights) crf @@ -102,7 +102,6 @@ object SemiCRF { override def labelIndex: OptionIndex[L] = new OptionIndex(crf.labelIndex) } - /** * An Anchoring encodes all the information needed to score Semimarkov models. * @@ -175,13 +174,13 @@ object SemiCRF { val allowedLabels = spanMarginals.map { arr => BitSet.empty ++ (0 until arr.length).filter(i => arr(i) >= threshold) -// BitSet.empty ++ (0 until arr.length) + // BitSet.empty ++ (0 until arr.length) } LabeledSpanConstraints(allowedLabels) } - def hasSupportOver(m: Marginal[L, W]):Boolean = { + def hasSupportOver(m: Marginal[L, W]): Boolean = { object FailureException extends Exception try { m visit new TransitionVisitor[L, W] { @@ -224,7 +223,6 @@ object SemiCRF { val partition = softmax(forwardScores.last) val _s = scorer - new Marginal[L, W] { def anchoring: Anchoring[L, W] = _s @@ -270,7 +268,6 @@ object SemiCRF { } - /** Log-normalized probability of seing segment with transition */ def transitionMarginal(prev: Int, cur: Int, begin: Int, end: Int): Double = { val withoutTrans = forwardScores(begin)(prev) + backwardScore(end)(cur) @@ -330,7 +327,6 @@ object SemiCRF { numerics.I(goldEnds(begin) == end && goldLabels(begin) == cur && goldPrevLabels(begin) == prev) } - def logPartition: Double = score } } @@ -455,8 +451,6 @@ object SemiCRF { backwardScores } - - } trait ConstraintSemiCRF[L, W] extends SemiCRF[L, W] with LabeledSpanConstraints.Factory[L, W] { @@ -466,17 +460,15 @@ object SemiCRF { @SerialVersionUID(1L) class IdentityConstraintSemiCRF[L, W](val labelIndex: OptionIndex[L]) extends ConstraintSemiCRF[L, W] with Serializable { outer => + def scorer(w: IndexedSeq[W]) = new Anchoring[L,W]() { def words = w def scoreTransition(prev: Int, cur: Int, begin: Int, end: Int) = 0.0 def labelIndex = outer.labelIndex - def constraints: LabeledSpanConstraints[L] = NoConstraints } - def constraints(w: IndexedSeq[W]) = NoConstraints - def constraints(seg: Segmentation[L, W], keepGold: Boolean) = NoConstraints } @@ -502,7 +494,6 @@ object SemiCRF { c = crf.marginal(w).computeSpanConstraints(threshold) cache.put(w, c) } - c } @@ -515,34 +506,24 @@ object SemiCRF { } } - def scorer(w: IndexedSeq[W]): Anchoring[L, W] = { val c = constraints(w) new Anchoring[L, W] { def words: IndexedSeq[W] = w - - def constraints: LabeledSpanConstraints[L] = c - def labelIndex:OptionIndex[L] = crf.labelIndex - def scoreTransition(prev: Int, cur: Int, begin: Int, end: Int): Double = numerics.logI(c.isAllowedLabeledSpan(begin, end, cur)) - } } } - trait IndexedFeaturizer[L, W] { def anchor(w: IndexedSeq[W]):AnchoredFeaturizer[L, W] - - def labelIndex: OptionIndex[L] def featureIndex: Index[Feature] - def hasTransitionFeatures: Boolean = true } @@ -551,7 +532,6 @@ object SemiCRF { def featuresForTransition(prev: Int, cur: Int, begin: Int, end: Int):FeatureVector } - def viterbi[L, W](anchoring: Anchoring[L ,W], id: String=""):Segmentation[L, W] = { val length = anchoring.length val numLabels = anchoring.labelIndex.size @@ -608,7 +588,6 @@ object SemiCRF { Segmentation(segments.reverse, anchoring.words, id) } - def posteriorDecode[L, W](m: Marginal[L, W], id: String = "") = { val length = m.length val numLabels = m.anchoring.labelIndex.size @@ -681,7 +660,6 @@ object SemiCRF { class IdentityAnchoring[L, W](val words: IndexedSeq[W], val labelIndex: OptionIndex[L], val constraints: LabeledSpanConstraints[L]) extends Anchoring[L, W] { def scoreTransition(prev: Int, cur: Int, beg: Int, end: Int): Double = 0.0 - def canStartLongSegment(pos: Int): Boolean = true } diff --git a/src/main/scala/epic/sequences/SemiCRFModel.scala b/src/main/scala/epic/sequences/SemiCRFModel.scala index a9d9bf49..69944950 100644 --- a/src/main/scala/epic/sequences/SemiCRFModel.scala +++ b/src/main/scala/epic/sequences/SemiCRFModel.scala @@ -124,7 +124,7 @@ class SemiCRFInference[L, W](weights: DenseVector[Double], val m = SemiCRF.Marginal(aug * scorer) val partition: Double = m.logPartition val partition1: Double = SemiCRF.Marginal.goldMarginal[L, W](scorer * aug, v.label).logPartition - if(partition1 > partition) + if (partition1 > partition) println(v + " " + SemiCRF.posteriorDecode(m).render + " " + v.render + " " + partition + " " + partition1) m } @@ -181,13 +181,13 @@ class SemiCRFInference[L, W](weights: DenseVector[Double], private def cachedSpanScore(prev: Int, cur: Int, beg: Int, end: Int):Double = { val tind: Int = TriangularArray.index(beg, end) var spanCell = spanCache(tind) - if(spanCache(tind) == null) { + if (spanCache(tind) == null) { spanCell = new Array[Array[Double]](labelIndex.size) spanCache(tind) = spanCell } var curLabelCell = spanCell(cur) - if(curLabelCell == null) { + if (curLabelCell == null) { val span = localization.featuresForSpan(prev, cur, beg, end) if (span eq null) { @@ -264,8 +264,6 @@ class SegmentationModelFactory[L](wordFeaturizer: Optional[WordFeaturizer[String model } - - } object SegmentationModelFactory { @@ -297,13 +295,10 @@ object SegmentationModelFactory { case class TransitionFeature[L](label: L, label2: L) extends Feature case object OutsideFeature extends Feature - object FeatureKinds extends Enumeration { val Begin, Interior, Span, Label = Value } - - @SerialVersionUID(2L) class IndexedStandardFeaturizer[L, W] private (wordFeaturizer: IndexedWordFeaturizer[W], surfaceFeaturizer: IndexedSurfaceFeaturizer[W], @@ -345,7 +340,7 @@ object SegmentationModelFactory { } else { var features = spanFeatureIndex.crossProduct(bioeFeatures(cur)(Span.id), loc.featuresForSpan(begin, end), spanOffset) - if(end - begin == 1) { + if (end - begin == 1) { features ++= wordFeatureIndex.crossProduct(bioeFeatures(cur)(Span.id), wloc.featuresForWord(begin), wordOffset) } @@ -365,12 +360,12 @@ object SegmentationModelFactory { (data: IndexedSeq[Segmentation[L, W]]):IndexedStandardFeaturizer[L, W] = { val labelPartIndex = Index[Feature]() val outsideFeature = labelPartIndex.index(OutsideFeature) - val bioeFeatures = Array.tabulate(labelIndex.size, FeatureKinds.maxId)((i,j) => if(i == labelIndex.size - 1) Array.empty[Int] else Array(labelPartIndex.index(Label1Feature(labelIndex.get(i).get, FeatureKinds(j))))) + val bioeFeatures = Array.tabulate(labelIndex.size, FeatureKinds.maxId)((i,j) => if (i == labelIndex.size - 1) Array.empty[Int] else Array(labelPartIndex.index(Label1Feature(labelIndex.get(i).get, FeatureKinds(j))))) val transitionFeatures = Array.tabulate(labelIndex.size, labelIndex.size) { (i, j) => val li = labelIndex.get(i).fold(OutsideFeature:Any)(identity) val lj = labelIndex.get(j).fold(OutsideFeature:Any)(identity) - if(lj == OutsideFeature) + if (lj == OutsideFeature) Array(labelPartIndex.index(TransitionFeature(li, lj)), outsideFeature) else Array(labelPartIndex.index(TransitionFeature(li, lj))) @@ -397,7 +392,7 @@ object SegmentationModelFactory { } // span spanBuilder.add(bioeFeatures(li)(FeatureKinds.Span.id), feats.featuresForSpan(span.begin, span.end)) - if(span.length == 1) { + if (span.length == 1) { wordBuilder.add(bioeFeatures(li)(FeatureKinds.Span.id), wordFeats.featuresForWord(span.begin)) } last = li @@ -416,5 +411,4 @@ object SegmentationModelFactory { } - } diff --git a/src/main/scala/epic/sequences/SemiNERPipeline.scala b/src/main/scala/epic/sequences/SemiNERPipeline.scala index 3b4c89da..b5a29582 100644 --- a/src/main/scala/epic/sequences/SemiNERPipeline.scala +++ b/src/main/scala/epic/sequences/SemiNERPipeline.scala @@ -16,7 +16,6 @@ import epic.preprocess.TreebankTokenizer import epic.corpora.CONLLSequenceReader import epic.framework.Example - /** * * @author dlwh @@ -52,14 +51,13 @@ object SemiNerPipeline extends LazyLogging { instances.splitAt(instances.length * 9 / 10) } - val gazetteer = None//Gazetteer.ner("en") // build feature Index val model = new SegmentationModelFactory(gazetteer = gazetteer).makeModel(train) val obj = new ModelObjective(model, train, params.nthreads) val cached = new CachedBatchDiffFunction(obj) - if(params.checkGradient) { + if (params.checkGradient) { GradientTester.test(cached, obj.initialWeightVector(true), toString = {(x: Int) => model.featureIndex.get(x).toString}) } @@ -68,7 +66,7 @@ object SemiNerPipeline extends LazyLogging { println("Eval + " + (state.iter+1) + " " + SegmentationEval.eval(crf, test)) } - val finalState = params.opt.iterations(cached, obj.initialWeightVector(randomize=false)).tee(state => if((state.iter +1) % params.iterPerEval == 0) eval(state)).take(params.opt.maxIterations).last + val finalState = params.opt.iterations(cached, obj.initialWeightVector(randomize=false)).tee(state => if ((state.iter +1) % params.iterPerEval == 0) eval(state)).take(params.opt.maxIterations).last eval(finalState) breeze.util.writeObject(params.modelOut, model.extractCRF(finalState.x)) @@ -77,8 +75,6 @@ object SemiNerPipeline extends LazyLogging { } - - object SemiConllNerPipeline extends LazyLogging { def makeSegmentation(ex: Example[IndexedSeq[String],IndexedSeq[IndexedSeq[String]]]): Segmentation[String, String] = { @@ -88,22 +84,22 @@ object SemiConllNerPipeline extends LazyLogging { val out = new ArrayBuffer[(String, Span)]() var start = labels.length var i = 0 - while(i < labels.length) { + while (i < labels.length) { val l = labels(i) l(0) match { case 'O' => - if(start < i) + if (start < i) out += (labels(start).replaceAll(".-","").intern -> Span(start, i)) -// out += ("O".intern -> Span(i, i+1)) + // out += ("O".intern -> Span(i, i+1)) start = i + 1 case 'B' => - if(start < i) + if (start < i) out += (labels(start).replaceAll(".-","").intern -> Span(start, i)) start = i case 'I' => - if(start >= i) { + if (start >= i) { start = i - } else if(labels(start) != l){ + } else if (labels(start) != l){ out += (labels(start).replaceAll(".-","").intern -> Span(start, i)) start = i } // else, still in a field, do nothing. @@ -113,15 +109,13 @@ object SemiConllNerPipeline extends LazyLogging { i += 1 } - if(start < i) + if (start < i) out += (labels(start).replaceAll(".-","").intern -> Span(start, i)) -// assert(out.nonEmpty && out.last._2.end == words.length, out + " " + words + " " + labels) + // assert(out.nonEmpty && out.last._2.end == words.length, out + " " + words + " " + labels) Segmentation(out, words, ex.id) } - - case class Params(train: File, test: File, nsents: Int = 100000, @@ -141,13 +135,12 @@ object SemiConllNerPipeline extends LazyLogging { standardTrain.take(params.nsents).map(makeSegmentation) -> standardTest.map(makeSegmentation) } - // build feature Index val model: SemiCRFModel[String, String] = new SegmentationModelFactory(/*, gazetteer = Gazetteer.ner("en" )*/).makeModel(train) val obj = new ModelObjective(model, train, params.nthreads) val cached = new CachedBatchDiffFunction(obj) - if(params.checkGradient) { + if (params.checkGradient) { GradientTester.test(cached, obj.initialWeightVector(true), toString={(i: Int) => model.featureIndex.get(i).toString}) } @@ -163,12 +156,11 @@ object SemiConllNerPipeline extends LazyLogging { stats } - val weights = params.opt.iterations(cached, obj.initialWeightVector(randomize=false)).tee(state => if((state.iter +1) % params.iterPerEval == 0) eval(state)).take(params.opt.maxIterations).last + val weights = params.opt.iterations(cached, obj.initialWeightVector(randomize=false)).tee(state => if ((state.iter +1) % params.iterPerEval == 0) eval(state)).take(params.opt.maxIterations).last val stats = eval(weights) breeze.util.writeObject(params.modelOut, model.extractCRF(weights.x)) println(stats) - } } diff --git a/src/main/scala/epic/sequences/TagText.scala b/src/main/scala/epic/sequences/TagText.scala index 2a725246..0edebaee 100644 --- a/src/main/scala/epic/sequences/TagText.scala +++ b/src/main/scala/epic/sequences/TagText.scala @@ -10,7 +10,6 @@ import epic.trees.AnnotatedLabel */ object TagText extends ProcessTextMain[CRF[AnnotatedLabel, String], TaggedSequence[AnnotatedLabel, String]] { - override def render(model: CRF[AnnotatedLabel, String], ann: TaggedSequence[AnnotatedLabel, String], tokens: IndexedSeq[String]): String = ann.render override def annotate(model: CRF[AnnotatedLabel, String], text: IndexedSeq[String]): TaggedSequence[AnnotatedLabel, String] = { diff --git a/src/main/scala/epic/sequences/TaggedSequence.scala b/src/main/scala/epic/sequences/TaggedSequence.scala index adeddf72..4d04dbe2 100644 --- a/src/main/scala/epic/sequences/TaggedSequence.scala +++ b/src/main/scala/epic/sequences/TaggedSequence.scala @@ -14,17 +14,10 @@ case class TaggedSequence[+L, +W](tags: IndexedSeq[L], require(tags.length == words.length) - def render = { - (tags zip words map { case (t, w) => w +"/" + t}).mkString(" ") - } - + def render = (tags zip words map { case (t, w) => w +"/" + t}).mkString(" ") def pairs = tags zip words - def features = words - def length: Int = words.length - def label: IndexedSeq[L] = tags - def asSegmentation = Segmentation(tags.zipWithIndex.map{case (l, i) => (l -> Span(i, i+1))}, words, id+"-seg") } diff --git a/src/main/scala/epic/sequences/TaggedSequenceEval.scala b/src/main/scala/epic/sequences/TaggedSequenceEval.scala index 84b3a7f9..eaa46ef1 100644 --- a/src/main/scala/epic/sequences/TaggedSequenceEval.scala +++ b/src/main/scala/epic/sequences/TaggedSequenceEval.scala @@ -17,7 +17,7 @@ object TaggedSequenceEval { val myStats = evaluateExample(guess, gold) val sent = for( ((p,g),w) <- guess.label zip gold.label zip guess.words) yield if (g == p) s"$w/$g" else s"$w/[G:$g,P:$p]" - if(myStats.exact != 1) + if (myStats.exact != 1) println(sent.mkString(" ") + "\n" + myStats) stats + myStats }, {_ + _}) @@ -28,7 +28,7 @@ object TaggedSequenceEval { val confusion = Counter2({for( (p, g) <- guess.tags zip gold.tags if p != g) yield (p,g,1)}:_*) val nRight = guess.length - confusion.size val nTotal = guess.length - val myStats = new Stats(nRight, nTotal, if(nRight == nTotal) 1 else 0, 1, confusion) + val myStats = new Stats(nRight, nTotal, if (nRight == nTotal) 1 else 0, 1, confusion) myStats } diff --git a/src/main/scala/epic/sequences/Tagger.scala b/src/main/scala/epic/sequences/Tagger.scala index cf91e1b1..08774fe5 100644 --- a/src/main/scala/epic/sequences/Tagger.scala +++ b/src/main/scala/epic/sequences/Tagger.scala @@ -20,10 +20,8 @@ trait Tagger[Tag] extends StringAnalysisFunction[Sentence with Token, Tag] with val tagSeq = apply(tokens.map(_._2.token)) tokens.map(_._1) zip tagSeq } - slab.addLayer[Tag](annotatedSentences.flatten) } - } object Tagger { diff --git a/src/main/scala/epic/sequences/TrainPosTagger.scala b/src/main/scala/epic/sequences/TrainPosTagger.scala index a1144b90..0b2509a3 100644 --- a/src/main/scala/epic/sequences/TrainPosTagger.scala +++ b/src/main/scala/epic/sequences/TrainPosTagger.scala @@ -48,9 +48,9 @@ object SemiPOSTagger extends LazyLogging { val crf = SemiCRF.buildSimple(train, opt = opt) val inf = crf.asInstanceOf[SemiCRFInference[_, _]] -// val out = new PrintWriter(new BufferedOutputStream(new FileOutputStream("weights.txt"))) -// Encoder.fromIndex(inf.featureIndex).decode(inf.weights).iterator foreach {case (x, v) if v.abs > 1E-6 => out.println(x -> v) case _ => } -// out.close() + // val out = new PrintWriter(new BufferedOutputStream(new FileOutputStream("weights.txt"))) + // Encoder.fromIndex(inf.featureIndex).decode(inf.weights).iterator foreach {case (x, v) if v.abs > 1E-6 => out.println(x -> v) case _ => } + // out.close() val stats = SegmentationEval.eval(crf, test) println("Final Stats: " + stats) } diff --git a/src/main/scala/epic/slab/AnalysisFunction.scala b/src/main/scala/epic/slab/AnalysisFunction.scala index bd582d1d..0b15e599 100644 --- a/src/main/scala/epic/slab/AnalysisFunction.scala +++ b/src/main/scala/epic/slab/AnalysisFunction.scala @@ -33,7 +33,6 @@ case class ComposedAnalysisFunction[C, B, I, O, II >: (I with O), +OO](a: Analys } - object StringIdentityAnalyzer extends StringAnalysisFunction[Any, Any] { def apply[In](slab: StringSlab[In]):StringSlab[In] = slab } @@ -50,7 +49,6 @@ object RegexTokenizer extends Tokenizer { }) } - object AnalysisPipeline { import AnnotatedSpan._ @@ -82,5 +80,4 @@ object AnalysisPipeline { } - } diff --git a/src/main/scala/epic/slab/Slab.scala b/src/main/scala/epic/slab/Slab.scala index 9e776c9b..434ddf05 100644 --- a/src/main/scala/epic/slab/Slab.scala +++ b/src/main/scala/epic/slab/Slab.scala @@ -45,7 +45,7 @@ trait Slab[ContentType, RegionType, +AnnotationTypes] { /** useful for downcasting */ def checkedCast[A: ClassTag]:Option[Slab[ContentType, RegionType, AnnotationTypes with A]] = { - if(!hasLayer[A]) { + if (!hasLayer[A]) { None } else { Some(this.asInstanceOf[Slab[ContentType, RegionType, AnnotationTypes with A]]) @@ -81,10 +81,8 @@ trait Slab[ContentType, RegionType, +AnnotationTypes] { def stringRep[A >: AnnotationTypes: ClassTag] = { iterator[A].mkString("\n") } - - -} +} object AnnotatedSpan { @@ -124,7 +122,7 @@ case class EntityMention(entityType: String, id: Option[String] = None) object Slab { trait ExtractRegion[Region, T] { - def apply(region: Region, t: T):T + def apply(region: Region, t: T): T } implicit object SpanStringExtractRegion extends ExtractRegion[Span, String] { @@ -148,7 +146,6 @@ object Slab { val annotations: Map[Class[_], Vector[(Span, Any)]] = Map.empty, val reverseAnnotations: Map[Class[_], Vector[(Span, Any)]] = Map.empty)(implicit extract: ExtractRegion[Span, ContentType]) extends Slab[ContentType, Span, AnnotationType] { - override def spanned(region: Span): ContentType = extract(region, content) override def addLayer[A:ClassTag](annotations: TraversableOnce[(Span, A)]): Slab[ContentType, Span, AnnotationType with A] = { @@ -164,14 +161,12 @@ object Slab { new SortedSequenceSlab(content, newAnnotations, reverseAnnotations) } - override def removeLayer[A >: AnnotationType: ClassTag]: Slab[ContentType, Span, AnnotationType] = { new SortedSequenceSlab(content, annotations - implicitly[ClassTag[A]].runtimeClass, reverseAnnotations - implicitly[ClassTag[A]].runtimeClass) } - /** Queries whether we have annotations of this type, even if the slab * doesn't have this type. Sometimes you just have to cast... */ override def hasLayer[A: ClassTag]: Boolean = { @@ -181,23 +176,23 @@ object Slab { override def following[A >: AnnotationType: ClassTag](region: Span): Iterator[(Span, A)] = { val annotations = selectAnnotations[A] var pos = BinarySearch.interpolationSearch(annotations, (_:(Span, Any))._1.begin, region.end) - if(pos < 0) pos = ~pos + if (pos < 0) pos = ~pos annotations.view(pos, annotations.length).iterator } override def preceding[A >: AnnotationType : ClassTag](region: Span): Iterator[(Span, A)] = { val annotations = selectReverse[A] var pos = BinarySearch.interpolationSearch(annotations, (_:(Span, Any))._1.end, region.begin + 1) - if(pos < 0) pos = ~pos + if (pos < 0) pos = ~pos annotations.view(0, pos).reverseIterator } override def covered[A >: AnnotationType : ClassTag](region: Span): IndexedSeq[(Span, A)] = { val annotations = selectAnnotations[A] var begin = BinarySearch.interpolationSearch(annotations, (_:(Span, Any))._1.begin, region.begin) - if(begin < 0) begin = ~begin + if (begin < 0) begin = ~begin var end = annotations.indexWhere(_._1.end > region.end, begin) - if(end < 0) end = annotations.length + if (end < 0) end = annotations.length annotations.slice(begin, end) } @@ -223,7 +218,4 @@ object Slab { } - - - } diff --git a/src/main/scala/epic/slab/package.scala b/src/main/scala/epic/slab/package.scala index 414a99b5..4ecbe423 100644 --- a/src/main/scala/epic/slab/package.scala +++ b/src/main/scala/epic/slab/package.scala @@ -9,9 +9,6 @@ import epic.trees.Span **/ package object slab { // some type aliases - - type StringAnalysisFunction[I, O] = AnalysisFunction[String, Span, I, O] type StringSlab[+AnnotationTypes] = Slab[String, Span, AnnotationTypes] - } diff --git a/src/main/scala/epic/trees/AnnotatedLabel.scala b/src/main/scala/epic/trees/AnnotatedLabel.scala index 4ba2645c..ead8b6e2 100644 --- a/src/main/scala/epic/trees/AnnotatedLabel.scala +++ b/src/main/scala/epic/trees/AnnotatedLabel.scala @@ -45,7 +45,6 @@ case class AnnotatedLabel(label: String, index: Int = -1) extends Feature with CachedHashCode { def hasAnnotation(f: Annotation): Boolean = features.contains(f) - def annotate(sym: Annotation*) = copy(features = features ++ sym) def isIntermediate = label.nonEmpty && label.charAt(0) == '@' @@ -60,17 +59,16 @@ case class AnnotatedLabel(label: String, if (index != -1) { x += s"-$index" } - x } override def toString = { val components = new ArrayBuffer[String]() headTag.foreach(components += _) - if(parents.nonEmpty) { + if (parents.nonEmpty) { components += parents.mkString("^","^","") } - if(siblings.nonEmpty) { + if (siblings.nonEmpty) { val b = new StringBuilder() siblings foreach { case Left(sib) => @@ -82,13 +80,13 @@ case class AnnotatedLabel(label: String, } components += b.toString } - if(features.nonEmpty) + if (features.nonEmpty) components ++= features.iterator.map(_.toString) - if(index != -1) + if (index != -1) components += s"_$index" - if(components.nonEmpty) components.mkString(label+"[", ", ", "]") + if (components.nonEmpty) components.mkString(label+"[", ", ", "]") else label } } @@ -110,7 +108,7 @@ object AnnotatedLabel { label.split("[-=#]") } - if(label.isEmpty) return AnnotatedLabel.TOP + if (label.isEmpty) return AnnotatedLabel.TOP val tag = fields.head diff --git a/src/main/scala/epic/trees/Debinarizer.scala b/src/main/scala/epic/trees/Debinarizer.scala index 48d413a7..50f754db 100644 --- a/src/main/scala/epic/trees/Debinarizer.scala +++ b/src/main/scala/epic/trees/Debinarizer.scala @@ -8,20 +8,17 @@ import java.io.ObjectStreamException * * @author dlwh **/ -trait Debinarizer[L] extends (BinarizedTree[L]=>Tree[L]) with Serializable { - -} +trait Debinarizer[L] extends (BinarizedTree[L] => Tree[L]) with Serializable object Debinarizer { @SerialVersionUID(1L) implicit object AnnotatedLabelDebinarizer extends Debinarizer[AnnotatedLabel] { def apply(t: BinarizedTree[AnnotatedLabel]): Tree[AnnotatedLabel] = { - Trees.debinarize(replaceUnaries(t), {(_:AnnotatedLabel).isIntermediate}).map(_.baseAnnotatedLabel) + Trees.debinarize(replaceUnaries(t), {(_: AnnotatedLabel).isIntermediate}).map(_.baseAnnotatedLabel) } - - def replaceUnaries(t: Tree[AnnotatedLabel]):Tree[AnnotatedLabel] = t match { + def replaceUnaries(t: Tree[AnnotatedLabel]): Tree[AnnotatedLabel] = t match { case UnaryTree(a, child, chain, span) if a.label == child.label.label && chain.isEmpty => replaceUnaries(child) case UnaryTree(a, child, chain, span) => @@ -37,7 +34,6 @@ object Debinarizer { case _ => t } - } @SerialVersionUID(1L) @@ -47,7 +43,7 @@ object Debinarizer { Trees.debinarize(Trees.deannotate(replaceUnaries(t))) } - def replaceUnaries(t: Tree[String]):Tree[String] = t match { + def replaceUnaries(t: Tree[String]): Tree[String] = t match { case UnaryTree(a, child, chain, span) if a == child.label && chain.isEmpty => replaceUnaries(child) case UnaryTree(a, child, chain, span) => @@ -64,7 +60,4 @@ object Debinarizer { } } - - - } diff --git a/src/main/scala/epic/trees/DependencyTree.scala b/src/main/scala/epic/trees/DependencyTree.scala index 778a9175..d7c02c59 100644 --- a/src/main/scala/epic/trees/DependencyTree.scala +++ b/src/main/scala/epic/trees/DependencyTree.scala @@ -7,7 +7,7 @@ package epic.trees case class DependencyTree[+L, +W](dependencies: IndexedSeq[(L, Int)], words: IndexedSeq[W]) { def render : String = { for(((label, head),dep) <- dependencies.zipWithIndex) yield { - if(head == words.length) s"ROOT(${words(dep)}-$dep)" else s"$label(${words(head)}-$head, ${words(dep)}-$dep)" + if (head == words.length) s"ROOT(${words(dep)}-$dep)" else s"$label(${words(head)}-$head, ${words(dep)}-$dep)" } }.mkString("\n") diff --git a/src/main/scala/epic/trees/HeadFinder.scala b/src/main/scala/epic/trees/HeadFinder.scala index fd44272a..ca6d056e 100644 --- a/src/main/scala/epic/trees/HeadFinder.scala +++ b/src/main/scala/epic/trees/HeadFinder.scala @@ -30,17 +30,14 @@ import breeze.util.Lens */ object HeadFinder { def left[L]: HeadFinder[L] = new RuleBasedHeadFinder[L](Left, HeadRules.empty) - def right[L]: HeadFinder[L] = new RuleBasedHeadFinder[L](Right, HeadRules.empty) - val collins = new RuleBasedHeadFinder(Left, rules = HeadRules.collinsHeadRules) - implicit def lensed[L, U](hf: HeadFinder[L])(implicit lens: Lens[U, L]) = hf.projected(lens.get(_: U)) } trait HeadFinder[L] { - def findHeadChild(l: L, children: L*):Int + def findHeadChild(l: L, children: L*): Int def findHeadChild(r: Rule[L]): Int = r match { case UnaryRule(_, _, _) => 0 @@ -66,7 +63,6 @@ trait HeadFinder[L] { } } - def annotateHeadIndices[W](t: Tree[L]): Tree[(L, Int)] = t match { case t:BinarizedTree[L] => annotateHeadIndices(t) case Tree(l, children, span) if children.isEmpty => Tree(l -> t.span.begin, IndexedSeq.empty, t.span) @@ -76,7 +72,6 @@ trait HeadFinder[L] { Tree(l -> rec(headChild).label._2, rec, t.span) } - def annotateHeadIndices(t: BinarizedTree[L]): BinarizedTree[(L, Int)] = t match { case NullaryTree(l, span) => NullaryTree(l -> t.span.begin, t.span) case u@UnaryTree(a, b, chain, span) => @@ -86,7 +81,7 @@ trait HeadFinder[L] { val headChild = findHeadChild(t) val recB = annotateHeadIndices(b) val recC = annotateHeadIndices(c) - val head = if(headChild == 0) recB.label._2 else recC.label._2 + val head = if (headChild == 0) recB.label._2 else recC.label._2 BinaryTree(a -> head, recB, recC, t.span) } @@ -100,15 +95,13 @@ trait HeadFinder[L] { val headChild = findHeadChild(t) val recB: BinarizedTree[(L, L)] = annotateHeadTags(b) val recC: BinarizedTree[(L, L)] = annotateHeadTags(c) - val head = if(headChild == 0) recB.label._2 else recC.label._2 + val head = if (headChild == 0) recB.label._2 else recC.label._2 BinaryTree(a -> head, recB, recC, t.span) } - def projected[U](f: U => L): HeadFinder[U] } - /** * Can annotate a tree with the head word. Usually * you should just use HeadFinder.collinsHeadFinder @@ -134,7 +127,6 @@ class RuleBasedHeadFinder[L](defaultDirection: Dir = Left, rules: HeadRules[L]) Tree(l -> rec(headChild).label._2, rec, t.span) } - def projected[U](f: U => L): HeadFinder[U] = new RuleBasedHeadFinder[U](defaultDirection, rules.projected(f)) def lensed[U](implicit lens: Lens[U, L]) = HeadFinder.lensed(this) @@ -170,7 +162,6 @@ case class HeadRule[L](dir: Dir, dis: Boolean, heads: Seq[L]) { rule => } candidates.find(_ >= 0) getOrElse -1 } - } } @@ -203,25 +194,19 @@ trait HeadRules[L] extends Serializable { outer => object HeadRules { - /** * Search direction for the match. */ sealed trait Dir - case object Left extends Dir - case object Right extends Dir def empty[L]: HeadRules[L] = fromMap[L](Map.empty) def fromMap[L](map: Map[L, Seq[HeadRule[L]]]): HeadRules[L] = new HeadRules[L] { protected type InnerLabel = L - protected def findRules(l: L) = map.getOrElse(l, Seq.empty) - protected def proj(l: L) = l - } private def shr[L](dir: Dir, dis: Boolean, heads: L*) = HeadRule(dir, dis, heads) @@ -279,7 +264,6 @@ object HeadRules { shr(Right, true, "JJ", "JJS", "RB", "QP")) ) - //add in binarized symbols, and look for the binarized symbol first (basic ++ basic.map { case (k, v) => ("@" + k, v) @@ -293,32 +277,28 @@ object HeadRules { } : Map[String, Seq[HeadRule[String]]] } - - } /* - - object NegraHeadFinder extends HeadFinder[AnnotatedLabel] { def findHeadChild(l: AnnotatedLabel, children: AnnotatedLabel*): Int = l.label match { case "ISU" => var index = children.indexWhere(a => a.hasAnnotation(FunctionalTag("UC"))) - if(index < 0) { + if (index < 0) { children.length - 1 } else { index } case "DL" => var index = children.indexWhere(a => a.hasAnnotation(FunctionalTag("HD")) || a.hasAnnotation(FunctionalTag("DH"))) - if(index < 0) { + if (index < 0) { children.length - 1 } else { index } case _ => var index = children.indexWhere(a => a.hasAnnotation(FunctionalTag("HD")) || a.hasAnnotation(FunctionalTag("PH"))) - if(index < 0) { + if (index < 0) { index = children.length - 1 } index diff --git a/src/main/scala/epic/trees/PartialTreeProcessor.scala b/src/main/scala/epic/trees/PartialTreeProcessor.scala index ee3dd1e5..898a230a 100644 --- a/src/main/scala/epic/trees/PartialTreeProcessor.scala +++ b/src/main/scala/epic/trees/PartialTreeProcessor.scala @@ -7,7 +7,7 @@ case class PartialTreeProcessor() { def apply(tree: Tree[String]):Tree[String] = { var transformed = xox(ens(tree).get) - transformed = if(transformed.children.length != 1) { + transformed = if (transformed.children.length != 1) { Tree("", IndexedSeq(transformed), transformed.span) } else { transformed diff --git a/src/main/scala/epic/trees/PennTreeReader.scala b/src/main/scala/epic/trees/PennTreeReader.scala index ab0c2021..ada4b4a4 100644 --- a/src/main/scala/epic/trees/PennTreeReader.scala +++ b/src/main/scala/epic/trees/PennTreeReader.scala @@ -57,12 +57,10 @@ class PennTreeReader(reader: Reader, def next() = { if (!hasNext) throw new NoSuchElementException() val tree = nextTree - nextTree = readRootTree() - if(nextTree == null) { + if (nextTree == null) { in.close() } - tree } @@ -73,10 +71,8 @@ class PennTreeReader(reader: Reader, val tree = readTree(true, 0) tree } - } - private def readTree(isRoot : Boolean, pos : Int) : (Tree[String],IndexedSeq[String]) = { readLeftParen() val label = { @@ -110,7 +106,6 @@ class PennTreeReader(reader: Reader, ch = in.read() atLeastOne = false } - in.unread(ch) sb.toString() } @@ -144,7 +139,6 @@ class PennTreeReader(reader: Reader, } private def isTextParen() = { - var numRead = 0 var ch = in.read() while (isLeftParen(ch)) { @@ -167,9 +161,9 @@ class PennTreeReader(reader: Reader, private def readLeaf() = { var label = readText(true, true) - if(unescapeTokens) + if (unescapeTokens) label = TreebankTokenizer.treebankTokenToToken(label) - if(label.startsWith("/") && label.length == 2 && label(1) != '/') { + if (label.startsWith("/") && label.length == 2 && label(1) != '/') { label = label.drop(1) // ontonotes escapes periods as /. } label @@ -208,6 +202,3 @@ class PennTreeReader(reader: Reader, } } - - - diff --git a/src/main/scala/epic/trees/ProcessedTreebank.scala b/src/main/scala/epic/trees/ProcessedTreebank.scala index 2b37ee93..523d96f8 100644 --- a/src/main/scala/epic/trees/ProcessedTreebank.scala +++ b/src/main/scala/epic/trees/ProcessedTreebank.scala @@ -48,7 +48,7 @@ case class ProcessedTreebank(@Help(text="Location of the treebank directory") case "conllonto" => Treebank.fromOntonotesDirectory(path) case "spmrl" => var trainPath: File = new File(path, "train") - if(!trainPath.exists) + if (!trainPath.exists) trainPath = new File(path, "train5k") val train = trainPath.listFiles().filter(_.getName.endsWith("ptb")) val dev = new File(path, "dev").listFiles().filter(_.getName.endsWith("ptb")) @@ -66,17 +66,16 @@ case class ProcessedTreebank(@Help(text="Location of the treebank directory") lazy val trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]] = { var train = transformTrees(treebank.train, maxLength, collapseUnaries = true) - if(includeDevInTrain) train ++= transformTrees(treebank.dev, maxLength, collapseUnaries = true) + if (includeDevInTrain) train ++= transformTrees(treebank.dev, maxLength, collapseUnaries = true) train.take(numSentences) } lazy val devTrees = transformTrees(treebank.dev, 100000) lazy val testTrees = transformTrees(treebank.test, 1000000) - def transformTrees(portion: treebank.Portion, maxL: Int, collapseUnaries: Boolean = false): IndexedSeq[TreeInstance[AnnotatedLabel, String]] = { val binarizedAndTransformed = for ( ((tree, words), index) <- portion.trees.zipWithIndex if words.length <= maxL; - w2 = if(debuckwalterize) words.map(ArabicNormalization.buckwalterToUnicode) else words + w2 = if (debuckwalterize) words.map(ArabicNormalization.buckwalterToUnicode) else words ) yield { val name = s"${portion.name}-$index" makeTreeInstance(name, tree, w2, collapseUnaries) @@ -85,7 +84,6 @@ case class ProcessedTreebank(@Help(text="Location of the treebank directory") binarizedAndTransformed.toIndexedSeq } - def makeTreeInstance(name: String, tree: Tree[String], words: IndexedSeq[String], collapseUnaries: Boolean): TreeInstance[AnnotatedLabel, String] = { var transformed = process(tree.map(AnnotatedLabel.parseTreebank)) if (collapseUnaries) { diff --git a/src/main/scala/epic/trees/Rule.scala b/src/main/scala/epic/trees/Rule.scala index 146fa208..8ced6dba 100644 --- a/src/main/scala/epic/trees/Rule.scala +++ b/src/main/scala/epic/trees/Rule.scala @@ -27,33 +27,24 @@ sealed trait Production[@specialized(Int) +L, +W] extends Feature { sealed trait Rule[@specialized(Int) +L] extends Production[L, Nothing] { def parent: L - def children: Seq[L] - def symbols = parent +: children - def map[A](f: L => A): Rule[A] - def mapChildren[A >: L](f: L => A): Rule[A] } @SerialVersionUID(8613629952079423488L) final case class BinaryRule[@specialized(Int) +L](parent: L, left: L, right: L) extends Rule[L] { def children = Seq(left, right) - def map[A](f: L => A) = BinaryRule(f(parent), f(left), f(right)) - def mapChildren[A >: L](f: L => A) = BinaryRule(parent, f(left), f(right)) } @SerialVersionUID(8559479322874082992L) final case class UnaryRule[@specialized(Int) +L](parent: L, child: L, chain: IndexedSeq[String]) extends Rule[L] { def children = Seq(child) - def map[A](f: L => A) = UnaryRule(f(parent), f(child), chain) - def mapChildren[A >: L](f: L => A) = UnaryRule(parent, f(child), chain) - def isIdentity = chain.isEmpty && parent == child } @@ -65,7 +56,6 @@ case class NullRule[@specialized(Int) +L](parent: L) extends Production[L, Nothi def map[A](f: (L) => A): NullRule[A] = NullRule(f(parent)) } - object BinaryRule { def leftChildFirstOrdering[L:Ordering]:Ordering[BinaryRule[L]] = Ordering.Tuple3[L, L, L].on(br => (br.left, br.right, br.parent)) def parentFirstOrdering[L:Ordering]:Ordering[BinaryRule[L]] = Ordering.Tuple3[L, L, L].on(br => (br.parent, br.left, br.right)) diff --git a/src/main/scala/epic/trees/Span.scala b/src/main/scala/epic/trees/Span.scala index 0f8639a2..1b020bbc 100644 --- a/src/main/scala/epic/trees/Span.scala +++ b/src/main/scala/epic/trees/Span.scala @@ -53,7 +53,6 @@ class Span(val encoded: Long) extends AnyVal with Serializable { || (other.begin < begin && other.end < end && other.end > begin) ) - // override def hashCode(): Int = { // (begin, end).hashCode() // } diff --git a/src/main/scala/epic/trees/StandardTreeProcessor.scala b/src/main/scala/epic/trees/StandardTreeProcessor.scala index 63af9570..44fba167 100644 --- a/src/main/scala/epic/trees/StandardTreeProcessor.scala +++ b/src/main/scala/epic/trees/StandardTreeProcessor.scala @@ -42,15 +42,14 @@ case class StandardTreeProcessor(headFinder: HeadFinder[AnnotatedLabel] = HeadFi oin.defaultReadObject() } - def apply(rawTree: Tree[AnnotatedLabel]):BinarizedTree[AnnotatedLabel] = { -// val ann = tree.map { AnnotatedLabel.parseTreebank } + // val ann = tree.map { AnnotatedLabel.parseTreebank } var detraced = traceProcessor(rawTree) if (removeTraces) { detraced = detraced.map(_.copy(index = -1)) } var transformed = xox(detraced) - transformed = if(transformed.children.length != 1) { + transformed = if (transformed.children.length != 1) { Tree(AnnotatedLabel.TOP, IndexedSeq(transformed), transformed.span) } else { transformed @@ -70,7 +69,4 @@ case class StandardTreeProcessor(headFinder: HeadFinder[AnnotatedLabel] = HeadFi } } -object StandardTreeProcessor { - - -} +object StandardTreeProcessor diff --git a/src/main/scala/epic/trees/SubsampledTreebank.scala b/src/main/scala/epic/trees/SubsampledTreebank.scala index ddaf983c..8e63af08 100644 --- a/src/main/scala/epic/trees/SubsampledTreebank.scala +++ b/src/main/scala/epic/trees/SubsampledTreebank.scala @@ -36,7 +36,7 @@ class SubsampledTreebank(base: Treebank[String], numTrain: Int, numDev:Int, numT } private def downSample[K](trees: Iterator[K], num: Int) = { - if(num < 0) trees + if (num < 0) trees else { // TODO: maybe randomly sample trees.take(num) diff --git a/src/main/scala/epic/trees/SupervisedHeadFinder.scala b/src/main/scala/epic/trees/SupervisedHeadFinder.scala index 79e34075..9905f0c8 100644 --- a/src/main/scala/epic/trees/SupervisedHeadFinder.scala +++ b/src/main/scala/epic/trees/SupervisedHeadFinder.scala @@ -8,7 +8,7 @@ import java.io.InputStreamReader class SupervisedHeadFinder[L](innards: SupervisedHeadFinderInnards[L,_]) extends HeadFinder[L] { - def findHeadChild(l: L, children: L*):Int = { + def findHeadChild(l: L, children: L*): Int = { val head = innards.findHeadChild(l, children.toSeq) head } @@ -51,7 +51,7 @@ object SupervisedHeadFinderInnards extends Serializable { case class HeadDB[B](symbolArityHeadChildCounts: Counter2[(B,Int),Int,Int], ruleHeadChildCounts: Counter2[(B,Seq[B]),Int,Int], defaultToLeft: Boolean = true) { - def findHeadChild(l: B, children: Seq[B]):Int = { + def findHeadChild(l: B, children: Seq[B]): Int = { // Manual arg-max because I suck at using Counter2 var best = -1 var bestCount = 0 @@ -91,8 +91,7 @@ object SupervisedHeadFinder { } val symbolArityHeadChildCounts = Counter2[(String,Int),Int,Int]() val ruleHeadChildCounts = Counter2[(String,Seq[String]),Int,Int]() - - + def rec(tree: Tree[String], conllTree: Seq[Int]) { if (!tree.isLeaf) { val label = tree.label @@ -125,14 +124,14 @@ object SupervisedHeadFinder { } println("Head finder trained; lengths matched on " + numMatched + " / " + conllTrees.size + " trees") new SupervisedHeadFinder[String](SupervisedHeadFinderInnards.fromHeadDB(new HeadDB(symbolArityHeadChildCounts, ruleHeadChildCounts))) -// HeadFinder.collins + // HeadFinder.collins } // Reads in a vector of parents, 0-indexed, with the root being -1 def readDepTrees(conllPath: String): Seq[Seq[Int]] = { val in = breeze.io.FileStreams.input(new File(conllPath)) val br = new BufferedReader(new InputStreamReader(in, "UTF-8")) -// val sents = new ArrayBuffer[Seq[Seq[String]]]() + // val sents = new ArrayBuffer[Seq[Seq[String]]]() val trees = new ArrayBuffer[Seq[Int]]() var currSent = new ArrayBuffer[Seq[String]] var i = 0 diff --git a/src/main/scala/epic/trees/TestTreebank.scala b/src/main/scala/epic/trees/TestTreebank.scala index f9fff694..dca56c42 100644 --- a/src/main/scala/epic/trees/TestTreebank.scala +++ b/src/main/scala/epic/trees/TestTreebank.scala @@ -24,7 +24,6 @@ object TstTreebank { val train = TstTreebank.getClass.getClassLoader.getResource("smallbank/train") val test = TstTreebank.getClass.getClassLoader.getResource("smallbank/test") val dev = TstTreebank.getClass.getClassLoader.getResource("smallbank/dev") - new SimpleTreebank(Map("train"->train),Map("dev"->dev),Map("test"->test)) } } \ No newline at end of file diff --git a/src/main/scala/epic/trees/TraceRemover.scala b/src/main/scala/epic/trees/TraceRemover.scala index d9b23c2f..a893dbcd 100644 --- a/src/main/scala/epic/trees/TraceRemover.scala +++ b/src/main/scala/epic/trees/TraceRemover.scala @@ -21,8 +21,6 @@ class TraceRemover[T, W](emptyCategory: T=>Boolean) extends (Tree[T] =>Tree[T]) } } } - rec(tree).get } - } diff --git a/src/main/scala/epic/trees/TraceToSlashCategoryConverter.scala b/src/main/scala/epic/trees/TraceToSlashCategoryConverter.scala index 5f713e81..c77bf7bf 100644 --- a/src/main/scala/epic/trees/TraceToSlashCategoryConverter.scala +++ b/src/main/scala/epic/trees/TraceToSlashCategoryConverter.scala @@ -37,18 +37,12 @@ class TraceToSlashCategoryConverter extends (Tree[AnnotatedLabel] =>Tree[Annotat val (newChildren, gapsList) = tree.children.filterNot(_.label.label == "-NONE-").map(recursive(_, resolvedIndices ++ cCommandIndices)).unzip val gaps: IndexedSeq[(String, Int)] = gapsList.flatten.distinct val unresolvedGaps = gaps.filterNot(pair => resolvedIndices(pair._2)) - - val newLabel = label.copy(siblings = label.siblings ++ unresolvedGaps.map(pair => Left(pair._1))) - - -// if(unresolvedGaps.nonEmpty) { -// println(unresolvedGaps, newLabel) -// } - + // if (unresolvedGaps.nonEmpty) { + // println(unresolvedGaps, newLabel) + // } Tree(newLabel, newChildren, span) -> unresolvedGaps } - } val (newTree, gaps) = recursive(tree, Set.empty) @@ -57,7 +51,6 @@ class TraceToSlashCategoryConverter extends (Tree[AnnotatedLabel] =>Tree[Annotat } } - object TraceToSlashCategoryConverter { def main(args: Array[String]):Unit = { diff --git a/src/main/scala/epic/trees/Tree.scala b/src/main/scala/epic/trees/Tree.scala index 77f9b80e..4eaf3507 100644 --- a/src/main/scala/epic/trees/Tree.scala +++ b/src/main/scala/epic/trees/Tree.scala @@ -16,7 +16,6 @@ package epic.trees limitations under the License. */ - import java.io.StringReader import breeze.util.Lens @@ -48,7 +47,7 @@ trait Tree[+L] extends Serializable { children.last.span.end == this.span.end } - def leaves:Iterable[Tree[L]] = if(isLeaf) { + def leaves:Iterable[Tree[L]] = if (isLeaf) { IndexedSeq(this).view } else { children.map(_.leaves).foldLeft[Stream[Tree[L]]](Stream.empty){_ append _} @@ -60,7 +59,7 @@ trait Tree[+L] extends Serializable { */ def cutLeaves: (Tree[L],IndexedSeq[L]) = { def recCutLeaves(tree: Tree[L]): (Option[Tree[L]],IndexedSeq[L]) = { - if(tree.isLeaf) (None,IndexedSeq(tree.label)) + if (tree.isLeaf) (None,IndexedSeq(tree.label)) else { val fromChildren = tree.children.map(recCutLeaves _) Some(Tree(tree.label,fromChildren.flatMap(_._1), tree.span)) -> fromChildren.flatMap(_._2) @@ -70,7 +69,6 @@ trait Tree[+L] extends Serializable { treeOpt.get -> leaves } - def map[M](f: L=>M):Tree[M] = Tree( f(label), children map { _ map f}, span) def extend[B](f: Tree[L]=>B):Tree[B] = Tree(f(this),children map { _ extend f}, span) def relabelRoot[B>:L](f: L=>B):Tree[B] @@ -85,7 +83,7 @@ trait Tree[+L] extends Serializable { children.map(_.postorder).foldRight(Iterator(this)){_ ++ _} } - def leftHeight:Int = if(isLeaf) 0 else 1 + children(0).leftHeight + def leftHeight:Int = if (isLeaf) 0 else 1 + children(0).leftHeight import epic.trees.Tree._ override def toString = toString(false) @@ -105,7 +103,7 @@ object Tree { import tree._ sb append "( " append tree.label append " [" append span.begin append "," append span.end append "] " for( c <- tree.children ) { - if(newline && c.children.nonEmpty) sb append "\n" append " " * depth + if (newline && c.children.nonEmpty) sb append "\n" append " " * depth else sb.append(' ') recursiveToString(c,depth+1,newline, sb) } @@ -113,30 +111,26 @@ object Tree { sb } - private def recursiveRender[L,W](tree: Tree[L], depth: Int, words: Seq[W], newline: Boolean, sb: StringBuilder): StringBuilder = { import tree._ sb append "(" append tree.label - if(isLeaf) { + if (isLeaf) { sb append TreebankTokenizer.tokensToTreebankTokens(span.map(words).map(_.toString)).mkString(" "," ","") } else { val anyNonTerminals = children.exists(!_.isLeaf) //sb append "\n" for( c <- children ) { - if(newline && (c.span.length != words.length) && anyNonTerminals) sb append "\n" append " " * depth + if (newline && (c.span.length != words.length) && anyNonTerminals) sb append "\n" append " " * depth else sb.append(' ') recursiveRender(c,depth+1, words, newline, sb) } } sb append ')' - if(sb.length > 1 && sb(sb.length-2) != ')' && sb(sb.length-2) != ' ') + if (sb.length > 1 && sb(sb.length-2) != ')' && sb(sb.length-2) != ' ') sb append ' ' sb } - - - } case class NaryTree[L](label: L, children: IndexedSeq[Tree[L]], span: Span) extends Tree[L] { @@ -146,9 +140,9 @@ case class NaryTree[L](label: L, children: IndexedSeq[Tree[L]], span: Span) exte sealed trait BinarizedTree[+L] extends Tree[L] { def findSpan(begin: Int, end: Int): Option[Tree[L]] = this match { case t if t.span == Span(begin, end) => Some(t) - case t@BinaryTree(a, b, c, span) if end <= t.splitPoint => if(t.begin <= begin) b.findSpan(begin, end) else None - case t@BinaryTree(a, b, c, span) if t.splitPoint <= begin => if(t.end <= end) c.findSpan(begin, end) else None - case t@UnaryTree(a, b, chain, span) => if(span.contains(Span(begin, end))) b.findSpan(begin, end) else None + case t@BinaryTree(a, b, c, span) if end <= t.splitPoint => if (t.begin <= begin) b.findSpan(begin, end) else None + case t@BinaryTree(a, b, c, span) if t.splitPoint <= begin => if (t.end <= end) c.findSpan(begin, end) else None + case t@UnaryTree(a, b, chain, span) => if (span.contains(Span(begin, end))) b.findSpan(begin, end) else None case _ => None } @@ -211,7 +205,7 @@ object Trees { val right = binarized.drop(headChildIndex+1).foldLeft(headChild){ (tree,newArg) => // TODO ugh val intermediate = { - if(tree eq headChild) + if (tree eq headChild) makeIntermediate(l, headChild.label) else tree.label @@ -222,7 +216,7 @@ object Trees { // now fold in left args val fullyBinarized = binarized.take(headChildIndex).foldRight(right){(newArg,tree) => val intermediate = { - if(tree eq headChild) + if (tree eq headChild) makeIntermediate(l, headChild.label) else tree.label @@ -236,7 +230,7 @@ object Trees { def binarize(tree: Tree[String], headFinder: HeadFinder[String] = HeadFinder.collins):BinarizedTree[String] = { def stringBinarizer(currentLabel: String, headTag: String) = { - if(currentLabel.startsWith("@")) currentLabel + if (currentLabel.startsWith("@")) currentLabel else s"@$currentLabel[$headTag]" } @@ -253,7 +247,6 @@ object Trees { binarize[String](tree, stringBinarizer, extendIntermediate, headFinder) } - def deannotate(tree: Tree[String]):Tree[String] = tree.map(deannotateLabel _) def deannotate(tree: BinarizedTree[String]):BinarizedTree[String] = tree.map(deannotateLabel _) def deannotateLabel(l: String) = l.takeWhile(c => c != '^' && c != '>') @@ -288,13 +281,13 @@ object Trees { case BinaryTree(label, t1, t2, span) => val (newt1, _) = rec(t1) val (newt2, _) = rec(t2) - val newHistory = if(isIntermediate(label) && order > 0) IndexedSeq(Right(t2.label)) else IndexedSeq.empty - val newLabel = if(isIntermediate(label)) join(label, newHistory) else label + val newHistory = if (isIntermediate(label) && order > 0) IndexedSeq(Right(t2.label)) else IndexedSeq.empty + val newLabel = if (isIntermediate(label)) join(label, newHistory) else label BinaryTree(newLabel, newt1, newt2, tree.span) -> newHistory case UnaryTree(label, child, chain, span) => val (newt1, hist) = rec(child) - val newHistory = if(isIntermediate(label)) hist else IndexedSeq.empty - val newLabel = if(isIntermediate(label)) join(label, newHistory) else label + val newHistory = if (isIntermediate(label)) hist else IndexedSeq.empty + val newLabel = if (isIntermediate(label)) join(label, newHistory) else label UnaryTree(newLabel, newt1, chain, tree.span) -> newHistory case tree@NullaryTree(_, span) => tree -> IndexedSeq.empty } @@ -315,7 +308,7 @@ object Trees { val children = tree.children val buf = new ArrayBuffer[Tree[L]] for(c <- children) { - if(isBinarized(c.label)) { + if (isBinarized(c.label)) { buf ++= debinarize(c,isBinarized).children } else { buf += debinarize(c,isBinarized) @@ -327,7 +320,7 @@ object Trees { def debinarize(tree: Tree[String]):Tree[String] = debinarize(tree, (x:String) => x.startsWith("@")) def annotateParents[L](tree: Tree[L], join: (L,L)=>L, depth: Int, history: List[L] = List.empty):Tree[L] = { - if(depth == 0) tree + if (depth == 0) tree else { val newLabel = (tree.label :: history).iterator.take(depth).reduceLeft(join) Tree(newLabel,tree.children.map(c => annotateParents[L](c,join,depth,tree.label :: history.take(depth-1 max 0))), tree.span) @@ -350,9 +343,9 @@ object Trees { val ot = tree def rec(tree: BinarizedTree[L], history: List[L] = List.empty):BinarizedTree[L] = { import tree._ - val newLabel = if(dontAnnotate(tree)) { + val newLabel = if (dontAnnotate(tree)) { label - } else if(isIntermediate(label)) { + } else if (isIntermediate(label)) { assert(history.length > 1, label + " " + history + "\n\n\n" + tree + "\n\n\n" + ot) join(label, history drop 1 take depth) } else { @@ -362,14 +355,14 @@ object Trees { tree match { //invariant: history is the (depth) non-intermediate symbols, where we remove unary-identity transitions case BinaryTree(label, t1, t2, span) => - val newHistory = if(!isIntermediate(label)) label :: history else history + val newHistory = if (!isIntermediate(label)) label :: history else history val lchild = rec(t1,newHistory) val rchild = rec(t2,newHistory) BinaryTree(newLabel, lchild, rchild, span) case u@UnaryTree(label, child, chain, span) => - if(isIntermediate(label)) assert(history.nonEmpty, ot.toString(true) + "\n" + u.toString(true) ) - //if(isIntermediate(label)) assert(label != newLabel, label + " " + newLabel + " " + u + " " + history) - val newHistory = if(!isIntermediate(label) && label != child.label) label :: history else history + if (isIntermediate(label)) assert(history.nonEmpty, ot.toString(true) + "\n" + u.toString(true) ) + //if (isIntermediate(label)) assert(label != newLabel, label + " " + newLabel + " " + u + " " + history) + val newHistory = if (!isIntermediate(label) && label != child.label) label :: history else history UnaryTree(newLabel,rec(child,newHistory), chain, span) case NullaryTree(label, span) => NullaryTree(newLabel, span) @@ -388,11 +381,11 @@ object Trees { @SerialVersionUID(1L) class EmptyNodeStripper[T](implicit lens: Lens[T,String]) extends (Tree[T]=>Option[Tree[T]]) with Serializable { def apply(tree: Tree[T]):Option[Tree[T]] = { - if(lens.get(tree.label) == "-NONE-") None - else if(tree.span.begin == tree.span.end) None // screw stupid spans + if (lens.get(tree.label) == "-NONE-") None + else if (tree.span.begin == tree.span.end) None // screw stupid spans else { val newC = tree.children map this filter (None!=) - if(newC.isEmpty && !tree.isLeaf) None + if (newC.isEmpty && !tree.isLeaf) None else Some(Tree(tree.label,newC map (_.get), tree.span)) } } @@ -400,7 +393,7 @@ object Trees { class XOverXRemover[L] extends (Tree[L]=>Tree[L]) { def apply(tree: Tree[L]):Tree[L] = { - if(tree.children.size == 1 && tree.label == tree.children(0).label) { + if (tree.children.size == 1 && tree.label == tree.children(0).label) { this(tree.children(0)) } else { Tree(tree.label,tree.children.map(this), tree.span) @@ -415,7 +408,7 @@ object Trees { case "-RCB-" | "-RRB-" | "-LRB-" | "-LCB-" => label case "PRT|ADVP" => lens.set(label, "PRT") case x => - if(x.startsWith("--")) lens.set(label,x.replaceAll("---.*","--")) + if (x.startsWith("--")) lens.set(label,x.replaceAll("---.*","--")) else lens.set(label,x.replaceAll("[-|=].*","")) } } @@ -555,7 +548,7 @@ object Trees { // go up until we find a LeftChild, then go to its right child. // if we hit the root (that is, we only go up right children), there is no next. var cur:Option[Zipper[L]] = Some(this) - while(true) { + while (true) { cur match { case None => return None case Some(loc@Zipper(_, LeftChild(_, _, _))) => diff --git a/src/main/scala/epic/trees/Treebank.scala b/src/main/scala/epic/trees/Treebank.scala index 9b3a51f3..49e889e7 100644 --- a/src/main/scala/epic/trees/Treebank.scala +++ b/src/main/scala/epic/trees/Treebank.scala @@ -16,7 +16,6 @@ package epic.trees limitations under the License. */ - import java.io._ import epic.ontonotes.ConllOntoReader @@ -111,7 +110,7 @@ object Treebank { * of the parsed Treebank. */ def fromPennTreebankDir(dir: File):Treebank[String] = new Treebank[String] { - if(!dir.exists) throw new FileNotFoundException(dir.toString) + if (!dir.exists) throw new FileNotFoundException(dir.toString) def sections = dir.listFiles.filter(_.isDirectory).map(_.getName) val train = Portion("train", IndexedSeq.range(2,10).map("0" + _) ++ IndexedSeq.range(10,22).map(""+_)) @@ -135,7 +134,7 @@ object Treebank { def fromChineseTreebankDir(dir: File):Treebank[String] = new Treebank[String] { def sections = dir.listFiles.map(_.getName) - private def id_to_name(id: Int) = s"chtb_${if(id < 100) "0" + id else id}.mrg" + private def id_to_name(id: Int) = s"chtb_${if (id < 100) "0" + id else id}.mrg" val train = Portion("train",{(1 to 270) ++ (400 to 1151)} map id_to_name) val test = Portion("test", 271 to 300 map id_to_name) diff --git a/src/main/scala/epic/trees/UnaryChainCollapser.scala b/src/main/scala/epic/trees/UnaryChainCollapser.scala index 7282dda2..f5034748 100644 --- a/src/main/scala/epic/trees/UnaryChainCollapser.scala +++ b/src/main/scala/epic/trees/UnaryChainCollapser.scala @@ -30,12 +30,12 @@ object UnaryChainCollapser { def transform(t: BinarizedTree[AnnotatedLabel],parentWasUnary:Boolean):BinarizedTree[AnnotatedLabel] = t match { case UnaryTree(l,c, _chain, span) => val (chain,cn) = stripChain(c) - UnaryTree(l,transform(cn,true), if(keepChains) _chain ++ chain.toIndexedSeq else IndexedSeq.empty, t.span) + UnaryTree(l,transform(cn,true), if (keepChains) _chain ++ chain.toIndexedSeq else IndexedSeq.empty, t.span) case BinaryTree(l,lchild,rchild, span) => - if(parentWasUnary) BinaryTree(l,transform(lchild,false),transform(rchild,false), t.span) + if (parentWasUnary) BinaryTree(l,transform(lchild,false),transform(rchild,false), t.span) else UnaryTree(l,BinaryTree(l,transform(lchild,false),transform(rchild,false), t.span), IndexedSeq.empty, t.span) case NullaryTree(l, span) => - if(parentWasUnary) NullaryTree(l, t.span) + if (parentWasUnary) NullaryTree(l, t.span) else UnaryTree(l,NullaryTree(l, t.span), IndexedSeq.empty, t.span) case t => t } diff --git a/src/main/scala/epic/trees/annotations/KMAnnotator.scala b/src/main/scala/epic/trees/annotations/KMAnnotator.scala index 805bbbe6..56b76f38 100644 --- a/src/main/scala/epic/trees/annotations/KMAnnotator.scala +++ b/src/main/scala/epic/trees/annotations/KMAnnotator.scala @@ -21,6 +21,7 @@ package annotations */ case class KMAnnotator( horizontal: Int = 2, vertical: Int = 2) extends TreeAnnotator[AnnotatedLabel, String, AnnotatedLabel] { + val pipeline = FilterAnnotations[String](Set(FunctionalTag("TMP"))) andThen Markovize[String](horizontal,vertical) andThen diff --git a/src/main/scala/epic/trees/annotations/TreeAnnotations.scala b/src/main/scala/epic/trees/annotations/TreeAnnotations.scala index 7ca6058c..da5acd08 100644 --- a/src/main/scala/epic/trees/annotations/TreeAnnotations.scala +++ b/src/main/scala/epic/trees/annotations/TreeAnnotations.scala @@ -22,7 +22,6 @@ object TreeAnnotations { case class HeadTagAnnotation(tag: String) extends Annotation - // KM Annotations trait KMAnnotation extends Annotation diff --git a/src/main/scala/epic/trees/annotations/TreeAnnotator.scala b/src/main/scala/epic/trees/annotations/TreeAnnotator.scala index ce6bcc4f..ad5bc968 100644 --- a/src/main/scala/epic/trees/annotations/TreeAnnotator.scala +++ b/src/main/scala/epic/trees/annotations/TreeAnnotator.scala @@ -87,8 +87,6 @@ case class StripAnnotations[W]() extends TreeAnnotator[AnnotatedLabel, W, Annota } } - - /** * Removes all features from the [[epic.trees.AnnotatedLabel]] * @tparam W @@ -107,10 +105,9 @@ case class Markovize[W](horizontal: Int=0, vertical: Int=2) extends TreeAnnotato } } - case class ParentAnnotate[W](order: Int = 0, skipPunctTags: Boolean = true) extends TreeAnnotator[AnnotatedLabel, W, AnnotatedLabel] { def apply(tree: BinarizedTree[AnnotatedLabel], words: Seq[W]) = { - if(order == 0) { + if (order == 0) { tree } else { def join(base: AnnotatedLabel, parent: Seq[AnnotatedLabel]) = { @@ -122,15 +119,13 @@ case class ParentAnnotate[W](order: Int = 0, skipPunctTags: Boolean = true) ext case ex: AssertionError => throw new RuntimeException(s"While handling $words", ex) } - } } - } case class ParentAnnotatePosTags[W](order: Int = 1, skipPunctTags: Boolean = true) extends TreeAnnotator[AnnotatedLabel, W, AnnotatedLabel] { def apply(tree: BinarizedTree[AnnotatedLabel], words: Seq[W]) = { - if(order == 0) { + if (order == 0) { tree } else { def join(base: AnnotatedLabel, parent: Seq[AnnotatedLabel]) = { @@ -139,20 +134,17 @@ case class ParentAnnotatePosTags[W](order: Int = 1, skipPunctTags: Boolean = tr Trees.annotateParentsBinarized(tree, join, {(_:AnnotatedLabel).isIntermediate}, {(l:Tree[AnnotatedLabel])=> !(l.isLeaf || l.children.length == 1 && l.children.head.label.label == l.label.label && l.span.length == 1) || l.label.label.isEmpty || (l.label.label.head != '@' && !l.label.label.head.isLetterOrDigit)}, order) } } - } case class ForgetHeadTag[W]() extends TreeAnnotator[AnnotatedLabel, W, AnnotatedLabel] { def apply(tree: BinarizedTree[AnnotatedLabel], words: Seq[W]) = { tree.map(_.copy(headTag=None)) } - } - case class MarkovizeSiblings[W](order: Int=0) extends TreeAnnotator[AnnotatedLabel, W, AnnotatedLabel] { def apply(tree: BinarizedTree[AnnotatedLabel], words: Seq[W]) = { - if(order == 0) tree.map {l => l.copy(siblings = IndexedSeq.empty)} + if (order == 0) tree.map {l => l.copy(siblings = IndexedSeq.empty)} else tree.map { l => l.copy(siblings = l.siblings.takeRight(order)) } } @@ -169,7 +161,6 @@ case class MarkovizeSiblings[W](order: Int=0) extends TreeAnnotator[AnnotatedLab } */ - } /** * Marks verb tags based on the auxiliary @@ -210,7 +201,7 @@ case class SplitPunct() extends TreeAnnotator[AnnotatedLabel, String, AnnotatedL val w = words(span.begin) if (w.forall(!_.isLetterOrDigit) && label.baseLabel != w) label.annotate(Punct(w)) - else if(w.matches("-[LR].B-") && label.baseLabel != w) label.annotate(Punct(w)) + else if (w.matches("-[LR].B-") && label.baseLabel != w) label.annotate(Punct(w)) else label case NullaryTree(label, span) => val w = words(span.begin) @@ -229,7 +220,7 @@ case class SplitPunct() extends TreeAnnotator[AnnotatedLabel, String, AnnotatedL case class SplitVP() extends TreeAnnotator[AnnotatedLabel, String, AnnotatedLabel] { val finiteVerbs = Set("VBZ", "VBD", "VBP", "MD") def apply(tree: BinarizedTree[AnnotatedLabel], words: Seq[String]) = tree.extend { t => - if(t.label.baseLabel != "VP") { + if (t.label.baseLabel != "VP") { t.label } else { val headTag = HeadFinder.collins.lensed[AnnotatedLabel].findHeadTag(t) @@ -244,7 +235,6 @@ case class SplitVP() extends TreeAnnotator[AnnotatedLabel, String, AnnotatedLabe } - case class SplitIN[W]() extends TreeAnnotator[AnnotatedLabel, W, AnnotatedLabel] { def apply(tree: BinarizedTree[AnnotatedLabel], words: Seq[W]) = { def rec(tree: BinarizedTree[AnnotatedLabel], root: String, @@ -253,14 +243,14 @@ case class SplitIN[W]() extends TreeAnnotator[AnnotatedLabel, W, AnnotatedLabel] val blbl = tree.label.baseLabel tree match { case tree@NullaryTree(lbl, span) if blbl == "IN" => - if(grandParent.isEmpty || grandParent.contains(root) || parent.contains(root)) { + if (grandParent.isEmpty || grandParent.contains(root) || parent.contains(root)) { tree } else if (grandParent.exists(_(0) == 'N') && parent.exists(s => s(0) == 'P' || s(0) == 'A')) { tree.copy(lbl.annotate(IN_N), span) } else if (parent.exists(_(0) == 'Q') && grandParent.exists(s => s(0) == 'N' || s.startsWith("ADJP"))) { tree.copy(lbl.annotate(IN_Q), span) - } else if(grandParent.contains("S")) { - if(parent.contains("SBAR")) { + } else if (grandParent.contains("S")) { + if (parent.contains("SBAR")) { tree.copy(lbl.annotate(IN_SCC), span) } else { tree.copy(lbl.annotate(IN_SC), span) @@ -269,8 +259,8 @@ case class SplitIN[W]() extends TreeAnnotator[AnnotatedLabel, W, AnnotatedLabel] tree } case u @ UnaryTree(lbl, c, chain, span) => - if(blbl != "IN") { - if(parent.exists(_ != blbl)) + if (blbl != "IN") { + if (parent.exists(_ != blbl)) u.copy(lbl, rec(c, root, Some(blbl), parent)) else u.copy(lbl, rec(c, root, parent, grandParent)) @@ -290,7 +280,7 @@ case class SplitIN[W]() extends TreeAnnotator[AnnotatedLabel, W, AnnotatedLabel] case class SplitPossNP[W]() extends TreeAnnotator[AnnotatedLabel, W, AnnotatedLabel] { def apply(tree: BinarizedTree[AnnotatedLabel], words: Seq[W]) = tree.extend { t => - if(t.label.baseLabel != "NP") t.label + if (t.label.baseLabel != "NP") t.label else { val headTag = HeadFinder.collins.lensed[AnnotatedLabel].findHeadTag(t) if (headTag.baseLabel == "POS") { @@ -317,7 +307,7 @@ case class AnnotateBaseNP[W]() extends TreeAnnotator[AnnotatedLabel, W, Annotate t -> true case t@UnaryTree(lbl1, child, chain, span) => val (newchild, ok) = rec(child) - if(lbl1.baseLabel == "NP" && (ok || newchild.label.hasAnnotation(BaseNP))) { + if (lbl1.baseLabel == "NP" && (ok || newchild.label.hasAnnotation(BaseNP))) { UnaryTree(lbl1.annotate(BaseNP), newchild, chain, span) -> lbl1.isIntermediate } else { UnaryTree(lbl1, newchild, chain, span) -> false @@ -325,12 +315,11 @@ case class AnnotateBaseNP[W]() extends TreeAnnotator[AnnotatedLabel, W, Annotate case t@BinaryTree(lbl, lc, rc, span) => val (newlc, lok) = rec(lc) val (newrc, rok) = rec(rc) - if(lok && rok && lbl.baseLabel == "NP") { + if (lok && rok && lbl.baseLabel == "NP") { BinaryTree(lbl.annotate(BaseNP), newlc, newrc, span) -> lbl.isIntermediate } else { BinaryTree(lbl, newlc, newrc, span) -> false } - } rec(tree)._1 @@ -351,7 +340,7 @@ case class AnnotateRightRecNP[W]() extends TreeAnnotator[AnnotatedLabel, W, Anno def rec(tree: BinarizedTree[AnnotatedLabel]):BinarizedTree[AnnotatedLabel] = tree match { case t@UnaryTree(lbl1, child, chain, span) => val newchild = rec(child) - if(lbl1.baseLabel == "NP" && newchild.label.hasAnnotation(RightRecNP)) { + if (lbl1.baseLabel == "NP" && newchild.label.hasAnnotation(RightRecNP)) { UnaryTree(lbl1.annotate(RightRecNP), newchild, chain, span) } else { UnaryTree(lbl1, newchild, chain, span) @@ -360,7 +349,7 @@ case class AnnotateRightRecNP[W]() extends TreeAnnotator[AnnotatedLabel, W, Anno val newrc = rec(rc) val isRightRec = lbl.baseLabel == "NP" && (newrc.label.label == "NP" || (newrc.label.label == "@NP" && newrc.label.hasAnnotation(RightRecNP))) val newlc = rec(lc) - if(isRightRec) { + if (isRightRec) { val lclc = annotateDownwards(newlc) BinaryTree(lbl.annotate(RightRecNP), lclc, newrc, span) } else { @@ -376,7 +365,7 @@ case class AnnotateRightRecNP[W]() extends TreeAnnotator[AnnotatedLabel, W, Anno case UnaryTree(lbl, child, chain, span) if lbl.label == "@NP" => UnaryTree(lbl.annotate(RightRecNP), annotateDownwards(child), chain, span) case BinaryTree(lbl, lc, rc, span) if lbl.label == "@NP" => - BinaryTree(lbl.annotate(RightRecNP), if(lc.label.isIntermediate) annotateDownwards(lc) else lc, if(rc.label.isIntermediate) annotateDownwards(rc) else rc, span) + BinaryTree(lbl.annotate(RightRecNP), if (lc.label.isIntermediate) annotateDownwards(lc) else lc, if (rc.label.isIntermediate) annotateDownwards(rc) else rc, span) case _ => tree } rec(tree) @@ -394,9 +383,9 @@ case class AnnotateDomCC[W]() extends TreeAnnotator[AnnotatedLabel, W, Annotated def rec(tree: BinarizedTree[AnnotatedLabel]):BinarizedTree[AnnotatedLabel] = tree match { case t@UnaryTree(lbl1, child, chain, span) => val newchild = rec(child) - if(newchild.label.hasAnnotation(DomCCLeft)) { + if (newchild.label.hasAnnotation(DomCCLeft)) { UnaryTree(lbl1.annotate(DomCCLeft), newchild, chain, span) - } else if(newchild.label.hasAnnotation(DomCCRight)) { + } else if (newchild.label.hasAnnotation(DomCCRight)) { UnaryTree(lbl1.annotate(DomCCRight), newchild, chain, span) } else { UnaryTree(lbl1, newchild, chain, span) @@ -406,7 +395,7 @@ case class AnnotateDomCC[W]() extends TreeAnnotator[AnnotatedLabel, W, Annotated val newlc = rec(lc) val domsCCR = newrc.label.label == "CC" || (newrc.label.isIntermediate && newrc.label.hasAnnotation(DomCCRight)) val domsCCL = newlc.label.label == "CC" || (newlc.label.isIntermediate && newlc.label.hasAnnotation(DomCCLeft)) - val sym = if(domsCCL) lbl.annotate(DomCCLeft) else if(domsCCR) lbl.annotate(DomCCRight) else lbl + val sym = if (domsCCL) lbl.annotate(DomCCLeft) else if (domsCCR) lbl.annotate(DomCCRight) else lbl BinaryTree(sym, newlc, newrc, span) case _ => tree } @@ -429,7 +418,6 @@ case class MarkNonIdentityUnaries[W]() extends TreeAnnotator[AnnotatedLabel, W, else u.copy(child = rec(c)) } - rec(tree) } } @@ -449,7 +437,6 @@ case class MarkExternalUnaries[W]() extends TreeAnnotator[AnnotatedLabel, W, Ann else u.copy(child=rec(c)) } - rec(tree) } } @@ -495,17 +482,17 @@ case class MarkPreterminals[W]() extends TreeAnnotator[AnnotatedLabel, W, Annota } trait MarkDominates[W] extends TreeAnnotator[AnnotatedLabel, W, AnnotatedLabel] { - protected def dominates(x: Tree[AnnotatedLabel]):Boolean + protected def dominates(x: Tree[AnnotatedLabel]): Boolean protected def sym: String def apply(tree: BinarizedTree[AnnotatedLabel], words: Seq[W]) = tree.extend { t => - if(t eq tree) t.label - else if(dominates(t)) t.label.annotate(Dom(sym)) + if (t eq tree) t.label + else if (dominates(t)) t.label.annotate(Dom(sym)) else t.label } } case class DominatesV[W]() extends MarkDominates[W] { - protected def dominates(x: Tree[AnnotatedLabel]):Boolean = x.leaves.exists { t => t.label.label.startsWith("V") || t.label.label.startsWith("MD")} + protected def dominates(x: Tree[AnnotatedLabel]): Boolean = x.leaves.exists { t => t.label.label.startsWith("V") || t.label.label.startsWith("MD")} def sym = "V" } diff --git a/src/main/scala/epic/trees/util/FilterTreesByLength.scala b/src/main/scala/epic/trees/util/FilterTreesByLength.scala index e1fce074..30f6b64f 100644 --- a/src/main/scala/epic/trees/util/FilterTreesByLength.scala +++ b/src/main/scala/epic/trees/util/FilterTreesByLength.scala @@ -40,7 +40,7 @@ object FilterTreesByLength { for ( (gold, guess) <- getTrees(gold) zip getTrees(params.guess)) { assert(gold._2 == guess._2) - val len = (if(ignorePunct) gold._2.count(!_.forall(!_.isLetterOrDigit)) else gold._2.length)/bucketSize * bucketSize + val len = (if (ignorePunct) gold._2.count(!_.forall(!_.isLetterOrDigit)) else gold._2.length)/bucketSize * bucketSize goldOut(len).println(gold._1.render(gold._2, newline = false)) guessOut(len).println(guess._1.render(guess._2, newline = false)) } @@ -48,7 +48,6 @@ object FilterTreesByLength { goldOut.values.foreach(_.close()) guessOut.values.foreach(_.close()) - } def getTrees(file: File): PennTreeReader = { diff --git a/src/main/scala/epic/util/ArabicNormalization.scala b/src/main/scala/epic/util/ArabicNormalization.scala index b3839568..ab44244c 100644 --- a/src/main/scala/epic/util/ArabicNormalization.scala +++ b/src/main/scala/epic/util/ArabicNormalization.scala @@ -13,7 +13,7 @@ import scala.annotation.switch object ArabicNormalization extends LazyLogging { def handleTreebankThings(s: String):Option[String] = { - if(!s.startsWith("-")) { + if (!s.startsWith("-")) { None } else { s match { @@ -27,11 +27,10 @@ object ArabicNormalization extends LazyLogging { case "-MINUS-" => Some(s) case _ => None } - } } - def buckwalterToUnicode(buckwalter: String):String = { + def buckwalterToUnicode(buckwalter: String): String = { handleTreebankThings(buckwalter) match { case Some(x) => x case None => @@ -89,11 +88,10 @@ object ArabicNormalization extends LazyLogging { case '{' => '\u0671' case '.' | '?' | '!' | ',' | '"' | '%' | '-' | '/' | ':' | ';' | '=' => buckwalter(i) case x => - if(!x.isDigit) + if (!x.isDigit) logger.warn("Unknown buckwalter character: " + x) x }} - i += 1 } out.result() diff --git a/src/main/scala/epic/util/BinarySearch.scala b/src/main/scala/epic/util/BinarySearch.scala index 64adf664..131535fa 100644 --- a/src/main/scala/epic/util/BinarySearch.scala +++ b/src/main/scala/epic/util/BinarySearch.scala @@ -7,8 +7,8 @@ package epic.util **/ object BinarySearch { - def interpolationSearch[T](objs: IndexedSeq[T], proj: T=>Int, toFind: Int):Int = { - if(objs.isEmpty) return ~0 + def interpolationSearch[T](objs: IndexedSeq[T], proj: T=>Int, toFind: Int): Int = { + if (objs.isEmpty) return ~0 // Returns index of toFind in sortedArray, or -1 if not found var low = 0 @@ -17,7 +17,7 @@ object BinarySearch { var highV = proj(objs(high)) while (lowV <= toFind && highV >= toFind) { - val mid = (if(highV == lowV) low else low + ((toFind - lowV.toLong) * (high - low)) / (highV.toLong - lowV.toLong)).toInt + val mid = (if (highV == lowV) low else low + ((toFind - lowV.toLong) * (high - low)) / (highV.toLong - lowV.toLong)).toInt val midV = proj(objs(mid)) if (midV < toFind){ @@ -31,16 +31,13 @@ object BinarySearch { } } - if (lowV == toFind) { low - } else if(lowV > toFind) { + } else if (lowV > toFind) { ~low } else { ~(high + 1) } } - - } diff --git a/src/main/scala/epic/util/Cache.scala b/src/main/scala/epic/util/Cache.scala index 552eaecd..56603de9 100644 --- a/src/main/scala/epic/util/Cache.scala +++ b/src/main/scala/epic/util/Cache.scala @@ -16,17 +16,17 @@ case class CacheBroker(path: File = null, copyFrom: File = null, clearCaches: St @transient private var _actualCache:CacheBroker.ActualCache = null private def actualCache = synchronized { - lazy val dbMaker = if(path eq null) { + lazy val dbMaker = if (path eq null) { DBMaker.newMemoryDB() } else { DBMaker.newFileDB(path) }.closeOnJvmShutdown().cacheSoftRefEnable() - if(_actualCache eq null) { + if (_actualCache eq null) { _actualCache = CacheBroker.getCacheBroker(path, dbMaker, autocommit, copyFrom) } - if(disableWriteAheadLog) _actualCache.dbMaker.writeAheadLogDisable() - if(clearCaches != null && clearCaches.nonEmpty) + if (disableWriteAheadLog) _actualCache.dbMaker.writeAheadLogDisable() + if (clearCaches != null && clearCaches.nonEmpty) for(toDisable <- clearCaches.split(",")) { _actualCache.db.getHashMap(toDisable).clear() } @@ -34,11 +34,9 @@ case class CacheBroker(path: File = null, copyFrom: File = null, clearCaches: St _actualCache } - def dbMaker = actualCache.dbMaker def db = actualCache.db - def commit() { db.commit()} def close() {actualCache.close()} @@ -50,22 +48,22 @@ object CacheBroker extends LazyLogging { private class ActualCache private[CacheBroker] (val path: File, val dbMaker: DBMaker, val autocommit: Boolean, copyFrom: File = null) { lazy val db = { val db = dbMaker.make() - if(copyFrom != null) { + if (copyFrom != null) { logger.info(s"Copying database from $copyFrom to ${if (path ne null) path else "in memory database"}") val from = DBMaker.newFileDB(copyFrom).make() Pump.copy(from, db) from.close() } - if(autocommit) cacheThread.start() + if (autocommit) cacheThread.start() db } private lazy val cacheThread: Thread = new Thread(new Runnable { def run() { try { - while(!db.isClosed && !Thread.interrupted()) { + while (!db.isClosed && !Thread.interrupted()) { Thread.sleep(1000 * 60) - if(!db.isClosed) + if (!db.isClosed) db.commit() } } catch { @@ -88,18 +86,17 @@ object CacheBroker extends LazyLogging { private val cacheCache = Collections.synchronizedMap(new util.HashMap[File, ActualCache]()).asScala private def getCacheBroker(path: File, dbMaker: =>DBMaker, autocommit: Boolean, copyFrom: File) = { - if(path eq null) new ActualCache(path, dbMaker, autocommit) + if (path eq null) new ActualCache(path, dbMaker, autocommit) else cacheCache.getOrElseUpdate(path, new ActualCache(path, dbMaker, autocommit, copyFrom)) } - @SerialVersionUID(1L) private class CacheMap[K, V](name: String, cache: CacheBroker)(implicit kser: Serializer[K], vser: Serializer[V]) extends Map[K, V] with Serializable { import cache._ @transient private var _theMap : Map[K, V] = null def theMap = synchronized { - if(_theMap eq null) { + if (_theMap eq null) { _theMap = try { // this throws if the hash map exists, and there's no "does it exist" method // that takes the serializers... @@ -111,7 +108,6 @@ object CacheBroker extends LazyLogging { _theMap } - def +=(kv: (K, V)): this.type = {theMap += kv; this} def -=(key: K): this.type = {theMap -= key; this} diff --git a/src/main/scala/epic/util/FIFOWorkQueue.scala b/src/main/scala/epic/util/FIFOWorkQueue.scala index 7600f8ce..bf57bd9d 100644 --- a/src/main/scala/epic/util/FIFOWorkQueue.scala +++ b/src/main/scala/epic/util/FIFOWorkQueue.scala @@ -35,9 +35,9 @@ class FIFOWorkQueue[-In, Out](f: In=>Out)(implicit context: ExecutionContext) ex def next() = {waitUntilReady(); Await.result(queue.poll(), Duration.Inf)} - private def waitUntilReady():Boolean = { + private def waitUntilReady(): Boolean = { synchronized { - while(!done && queue.isEmpty) { + while (!done && queue.isEmpty) { wait() } } diff --git a/src/main/scala/epic/util/Has.scala b/src/main/scala/epic/util/Has.scala index 3193c238..c8813d3d 100644 --- a/src/main/scala/epic/util/Has.scala +++ b/src/main/scala/epic/util/Has.scala @@ -2,7 +2,6 @@ package epic.util import epic.framework.{Example, Observation} - /** * * @author dlwh @@ -16,7 +15,6 @@ trait Has2[Haver, +WhatIHave] { def get(h: Haver):WhatIHave } - object Has2 { implicit def identityHas2[H]:Has2[H, H] = new Has2[H, H] with Serializable { def get(h: H): H = h diff --git a/src/main/scala/epic/util/LRUCache.scala b/src/main/scala/epic/util/LRUCache.scala index dab2d283..c7ee95ff 100644 --- a/src/main/scala/epic/util/LRUCache.scala +++ b/src/main/scala/epic/util/LRUCache.scala @@ -1,6 +1,5 @@ package epic.util - import scala.reflect.ClassTag import scala.util.hashing.MurmurHash3 @@ -32,7 +31,6 @@ final class LRUCache[@specialized(Int, Long) K:ClassTag, V:ClassTag](size: Int, occupied(pos) = -1 onEvict(keys(pos), values(pos)) } - } def iterator: Iterator[(K, V)] = { @@ -64,7 +62,7 @@ final class LRUCache[@specialized(Int, Long) K:ClassTag, V:ClassTag](size: Int, values(pos) = v } - private def lookup(k: K):Int = { + private def lookup(k: K): Int = { val hc : Int = k.## val hh = MurmurHash3.mixLast(10891, hc).abs % keys.length val hh2 = MurmurHash3.mixLast(10909, hc).abs % keys.length diff --git a/src/main/scala/epic/util/LockableSeenSet.scala b/src/main/scala/epic/util/LockableSeenSet.scala index 8bc83725..795993ca 100644 --- a/src/main/scala/epic/util/LockableSeenSet.scala +++ b/src/main/scala/epic/util/LockableSeenSet.scala @@ -9,7 +9,7 @@ import java.io.ObjectStreamException * @author dlwh **/ trait LockableSeenSet[@specialized(Int, Long) -T] extends Serializable { - def addOrSeen(x: T):Boolean + def addOrSeen(x: T): Boolean def lock: LockableSeenSet[T] } @@ -18,7 +18,6 @@ class BloomFilterSeenSet[@specialized(Int, Long) T](bf: BloomFilter[T]) extends override def addOrSeen(x: T): Boolean = { bf(x) } - override def lock: LockableSeenSet[T] = this } @@ -26,13 +25,10 @@ object LockableSeenSet { def always[T]:LockableSeenSet[T] = AlwaysSeenSet } - @SerialVersionUID(1L) object AlwaysSeenSet extends LockableSeenSet[Any] { override def addOrSeen(x: Any): Boolean = true - override def lock: LockableSeenSet[Any] = this - @throws[ObjectStreamException] private def readResolve() = { AlwaysSeenSet diff --git a/src/main/scala/epic/util/Optional.scala b/src/main/scala/epic/util/Optional.scala index 806431c7..8f4fd244 100644 --- a/src/main/scala/epic/util/Optional.scala +++ b/src/main/scala/epic/util/Optional.scala @@ -18,11 +18,7 @@ case class Provided[A](value: A) extends Optional[A] case object NotProvided extends Optional[Nothing] object Optional { - implicit def anyToOptional[A](x: A): Optional[A] = if (x == null) NotProvided else Provided(x) - implicit def optionToOptional[A](x: Option[A]): Optional[A] = x.fold(NotProvided:Optional[A])(Provided(_)) - implicit def optionalToOption[A](x: Optional[A]): Option[A] = x.toOption - } diff --git a/src/main/scala/epic/util/ProcessTextMain.scala b/src/main/scala/epic/util/ProcessTextMain.scala index d326a0ce..83dfe52a 100644 --- a/src/main/scala/epic/util/ProcessTextMain.scala +++ b/src/main/scala/epic/util/ProcessTextMain.scala @@ -16,7 +16,7 @@ import java.util.concurrent.{LinkedBlockingDeque, TimeUnit, ThreadPoolExecutor} trait ProcessTextMain[Model, AnnotatedType] { import ProcessTextMain._ - def render(model: Model, ann: AnnotatedType, tokens: IndexedSeq[String]):String + def render(model: Model, ann: AnnotatedType, tokens: IndexedSeq[String]): String def renderFailed(model: Model, tokens: IndexedSeq[String], reason: Throwable): String = { s"### Could not tag $tokens, because ${reason.getMessage}... ${reason.getStackTrace.take(2).mkString(";")}".replaceAll("\n", " ") @@ -60,7 +60,7 @@ trait ProcessTextMain[Model, AnnotatedType] { case "none" | "whitespace" => new WhitespaceTokenizer } - implicit val context = if(params.threads > 0) { + implicit val context = if (params.threads > 0) { scala.concurrent.ExecutionContext.fromExecutor(new ThreadPoolExecutor(1, params.threads, 1, TimeUnit.SECONDS, new LinkedBlockingDeque[Runnable]())) } else { scala.concurrent.ExecutionContext.global @@ -72,7 +72,7 @@ trait ProcessTextMain[Model, AnnotatedType] { val queue = FIFOWorkQueue(sentenceSegmenter.sentences(src)){sent => val tokens = tokenizer(sent).toIndexedSeq try { - if(tokens.length > params.maxLength) { + if (tokens.length > params.maxLength) { throw new SentenceTooLongException(tokens.length) } val tree = annotate(model, tokens) diff --git a/src/main/scala/epic/util/ProgressLog.scala b/src/main/scala/epic/util/ProgressLog.scala index e930aec8..9a82ba45 100644 --- a/src/main/scala/epic/util/ProgressLog.scala +++ b/src/main/scala/epic/util/ProgressLog.scala @@ -12,17 +12,16 @@ class ProgressLog(log: Logger, items: Int, frequency: Int = 100, name: String = val initialTime = System.currentTimeMillis() val item = new AtomicInteger() - def reportProgress() = { val x = item.incrementAndGet() - if(x % frequency == 0 || x == items) { + if (x % frequency == 0 || x == items) { log.info(s"$name $x/$items (${(System.currentTimeMillis() - initialTime)/1000.0}s elapsed.)") } } def info(msg: =>String) = { val x = item.incrementAndGet() - if(x % frequency == 0 || x == items) { + if (x % frequency == 0 || x == items) { val m = msg log.info(s"$name $x/$items: $m (${(System.currentTimeMillis() - initialTime)/1000.0}s elapsed.)") } @@ -30,7 +29,7 @@ class ProgressLog(log: Logger, items: Int, frequency: Int = 100, name: String = def debug(msg: =>String) = { val x = item.incrementAndGet() - if(x % frequency == 0 || x == items) { + if (x % frequency == 0 || x == items) { val m = msg log.debug(s"$name $x/$items: $m (${(System.currentTimeMillis() - initialTime)/1000.0}s elapsed.)") } diff --git a/src/main/scala/epic/util/SafeLogging.scala b/src/main/scala/epic/util/SafeLogging.scala index 9d3b9b92..c8072c68 100644 --- a/src/main/scala/epic/util/SafeLogging.scala +++ b/src/main/scala/epic/util/SafeLogging.scala @@ -14,10 +14,10 @@ trait SafeLogging { def logger: Logger = { var logger = _the_logger - if(logger eq null) { + if (logger eq null) { synchronized { logger = _the_logger - if(logger eq null) { + if (logger eq null) { val ll = Logger(LoggerFactory.getLogger(this.getClass)) _the_logger = ll logger = ll @@ -27,5 +27,4 @@ trait SafeLogging { logger } - } diff --git a/src/main/scala/epic/util/ThreadLocalBloomFilter.scala b/src/main/scala/epic/util/ThreadLocalBloomFilter.scala index e55ed49f..40ade11e 100644 --- a/src/main/scala/epic/util/ThreadLocalBloomFilter.scala +++ b/src/main/scala/epic/util/ThreadLocalBloomFilter.scala @@ -20,7 +20,6 @@ class ThreadLocalBloomFilter[@specialized(Int, Long) T](numBuckets: Int, numHash } } - override def addOrSeen(x: T): Boolean = {tl.get() += x; true} private val queue = new ConcurrentLinkedDeque[BloomFilter[T]]() @@ -29,7 +28,7 @@ class ThreadLocalBloomFilter[@specialized(Int, Long) T](numBuckets: Int, numHash val bf = tl.get() var i = 0 val len = queue.size - while(!queue.isEmpty && i < len) { + while (!queue.isEmpty && i < len) { bf |= queue.pop() i += 1 } @@ -41,7 +40,6 @@ class ThreadLocalBloomFilter[@specialized(Int, Long) T](numBuckets: Int, numHash val u = union val load = u.load val size = - u.numBuckets * math.log1p(-load)/u.numHashFunctions - logger.info(f"Bloom filter has load of ${u.load}%.3f and approx size $size. Queue is ${queue.size()} elements long.") new BloomFilterSeenSet[T](u) } diff --git a/src/main/scala/epic/util/Unicode.scala b/src/main/scala/epic/util/Unicode.scala index ffa79e78..2bd0b652 100644 --- a/src/main/scala/epic/util/Unicode.scala +++ b/src/main/scala/epic/util/Unicode.scala @@ -26,7 +26,6 @@ object Unicode { if (cp < 0) { return false } - var i = 0 while (i < rangeStarts.length && cp < rangeStarts(i)) { i += 1 diff --git a/src/main/scala/epic/util/WeightsCache.scala b/src/main/scala/epic/util/WeightsCache.scala index ef8e54a8..1914f9d8 100644 --- a/src/main/scala/epic/util/WeightsCache.scala +++ b/src/main/scala/epic/util/WeightsCache.scala @@ -17,8 +17,8 @@ object WeightsCache { require(weights.length == index.size) val out = new PrintStream(new GZIPOutputStream(new FileOutputStream(file), 1024)) var i = 0 - while(i < index.size) { - if(weights(i).abs > threshold) + while (i < index.size) { + if (weights(i).abs > threshold) out.println(index.get(i) + "\t" + weights(i)) i += 1 } diff --git a/src/test/scala/epic/constraints/LabeledSpanConstraintsTest.scala b/src/test/scala/epic/constraints/LabeledSpanConstraintsTest.scala index 9aa3495d..e42a7b7a 100644 --- a/src/test/scala/epic/constraints/LabeledSpanConstraintsTest.scala +++ b/src/test/scala/epic/constraints/LabeledSpanConstraintsTest.scala @@ -14,9 +14,9 @@ class LabeledSpanConstraintsTest extends FunSuite { /* test("serialization") { val x = LabeledSpanConstraints[Int](TriangularArray.tabulate(10) { (i,j) => - if(i == j || i > 5) null + if (i == j || i > 5) null else { - if(i < j - 1) BitSet(1,2,3,4) + if (i < j - 1) BitSet(1,2,3,4) else BitSet(i,j) } @@ -37,15 +37,15 @@ class LabeledSpanConstraintsTest extends FunSuite { test("containsAll") { val x = LabeledSpanConstraints[Int](TriangularArray.tabulate(10) { (i,j) => - if(i == j || i > 5) null + if (i == j || i > 5) null else { - if(i < j) BitSet(1,2,3,4) + if (i < j) BitSet(1,2,3,4) else BitSet(1) } }) val z = LabeledSpanConstraints[Int](TriangularArray.tabulate(10) { (i,j) => - if(i < j) BitSet(1,2,3,4) + if (i < j) BitSet(1,2,3,4) else BitSet(1) }) diff --git a/src/test/scala/epic/features/CrossProductIndexTest.scala b/src/test/scala/epic/features/CrossProductIndexTest.scala index 966cd126..da348707 100644 --- a/src/test/scala/epic/features/CrossProductIndexTest.scala +++ b/src/test/scala/epic/features/CrossProductIndexTest.scala @@ -32,7 +32,7 @@ class CrossProductIndexTest extends FunSuite { for(i <- 0 until index1.size; j <- 0 until index2.size) { val mapped = res.mapped(i, j) - if(mapped >= 0) + if (mapped >= 0) assert(csc(i, j) === weights(mapped)) } @@ -62,7 +62,7 @@ class CrossProductIndexTest extends FunSuite { for(i <- 0 until index1.size; j <- 0 until index2.size) { val mapped = res.mapped(i, j) - if(mapped >= 0) + if (mapped >= 0) assert(csc(i, j) === weights(mapped)) } diff --git a/src/test/scala/epic/features/DistanceBinnerTest.scala b/src/test/scala/epic/features/DistanceBinnerTest.scala index 8d83f4bd..a6cd5fc9 100644 --- a/src/test/scala/epic/features/DistanceBinnerTest.scala +++ b/src/test/scala/epic/features/DistanceBinnerTest.scala @@ -11,17 +11,17 @@ class DistanceBinnerTest extends FunSuite { val binner = new DistanceBinner(preserveDirection = true) val dists = Array.tabulate(20,20) { (i, j) => val dist = binner.binnedDistance(i, j) - if(i < j) + if (i < j) assert(dist > 0, (dist, i, j)) - else if(i == j) + else if (i == j) assert(dist === 0) - else if(i > j) assert(dist < 0, (dist, i, j)) + else if (i > j) assert(dist < 0, (dist, i, j)) dist } assert(dists.flatten.toSet.size == binner.numBins * 2 - 1, dists.flatten.toSet -> binner.numBins) for(i <- 1 until 19; j <- 1 until 19) { - if(i != j) { + if (i != j) { assert(dists(i)(j) >= dists(i +1)(j), (i,j,dists(i)(j),dists(i+1)(j))) assert(dists(i)(j) >= dists(i)(j-1)) assert(dists(i)(j) <= dists(i -1)(j), (i,j,dists(i)(j),dists(i+1)(j))) @@ -34,16 +34,16 @@ class DistanceBinnerTest extends FunSuite { val binner = new DistanceBinner(preserveDirection = false) val dists = Array.tabulate(20,20) { (i, j) => val dist = binner.binnedDistance(i, j) - if(i != j) + if (i != j) assert(dist > 0, (dist, i, j)) - else if(i == j) + else if (i == j) assert(dist === 0) dist } assert(dists.flatten.toSet.size === binner.numBins) for(i <- 1 until 19; j <- i until 19) { - if(i < j) { + if (i < j) { assert(dists(i)(j) >= dists(i +1)(j), (i,j,dists(i)(j),dists(i+1)(j))) assert(dists(i)(j) >= dists(i)(j-1)) assert(dists(i)(j) <= dists(i -1)(j), (i,j,dists(i)(j),dists(i+1)(j))) diff --git a/src/test/scala/epic/parser/InsideOutsideTest.scala b/src/test/scala/epic/parser/InsideOutsideTest.scala index ad4995e4..7a8415ea 100644 --- a/src/test/scala/epic/parser/InsideOutsideTest.scala +++ b/src/test/scala/epic/parser/InsideOutsideTest.scala @@ -29,7 +29,7 @@ import repl.DSLGrammar class InsideOutsideTest extends FunSuite { implicit def near(x: Double) = new { - def near(y: Double) = if( (x-y).abs < 1E-4 * math.max(x+y,1E-4)/2) None else Some(x + " not near " + y) + def near(y: Double) = if ( (x-y).abs < 1E-4 * math.max(x+y,1E-4)/2) None else Some(x + " not near " + y) } test("Simple test from iobasics") { diff --git a/src/test/scala/epic/preprocess/TreebankTokenizerTest.scala b/src/test/scala/epic/preprocess/TreebankTokenizerTest.scala index 530d61a5..e58b40a7 100644 --- a/src/test/scala/epic/preprocess/TreebankTokenizerTest.scala +++ b/src/test/scala/epic/preprocess/TreebankTokenizerTest.scala @@ -5,7 +5,7 @@ import org.scalatest.FunSuite class TreebankTokenizerTest extends FunSuite { private def isOneToken(w: String) = - if(w === TreebankTokenizer(w).head) None else Some(w + " " + TreebankTokenizer(w)) + if (w === TreebankTokenizer(w).head) None else Some(w + " " + TreebankTokenizer(w)) test("simple words") { val words = List("Hi","there","pilgrim","happy","Thanksgiving","there") diff --git a/src/test/scala/epic/sequences/SegmentationTest.scala b/src/test/scala/epic/sequences/SegmentationTest.scala index 35daa020..188802bb 100644 --- a/src/test/scala/epic/sequences/SegmentationTest.scala +++ b/src/test/scala/epic/sequences/SegmentationTest.scala @@ -17,7 +17,7 @@ class SegmentationTest extends FunSuite with Checkers { } yield { val segments = segs.foldLeft((Vector((0,Span(0,0))))) { (cur, sl) => val (segId, len) = sl - if(segId == 0) cur :+ (segId -> Span(cur.last._2.end, cur.last._2.end + 1)) + if (segId == 0) cur :+ (segId -> Span(cur.last._2.end, cur.last._2.end + 1)) else cur :+ (segId -> Span(cur.last._2.end, cur.last._2.end + len)) } Segmentation(segments.drop(1), 0 until segments.last._2.end) diff --git a/src/test/scala/epic/trees/annotations/KMAnnotatorTest.scala b/src/test/scala/epic/trees/annotations/KMAnnotatorTest.scala index 36b0a4ae..ba3d03ee 100644 --- a/src/test/scala/epic/trees/annotations/KMAnnotatorTest.scala +++ b/src/test/scala/epic/trees/annotations/KMAnnotatorTest.scala @@ -43,7 +43,7 @@ class KMAnnotatorTest extends FunSuite { // make sure the S dominates a V assert(pipelined.allChildren.exists(t => t.label.label == "S" && t.label.hasAnnotation(Dom("V"))), "DomV2") // make sure the @S dominates a V and has an NP to its left - if(pipelined.allChildren.exists(t => t.label.label == "@S" && t.label.hasAnnotation(Dom("V")) && t.label.siblings.nonEmpty && t.label.siblings(0) == Right("."))) { + if (pipelined.allChildren.exists(t => t.label.label == "@S" && t.label.hasAnnotation(Dom("V")) && t.label.siblings.nonEmpty && t.label.siblings(0) == Right("."))) { } else { fail(pipelined.toString + " " + pipelined.map(label => label -> ( label.label == "@S" , label.hasAnnotation(Dom("V")) , label.siblings.map(_ == Right(".")))))