diff --git a/src/main/java/epic/util/Arrays.scala b/src/main/java/epic/util/Arrays.scala index f1890a31..7fba7390 100644 --- a/src/main/java/epic/util/Arrays.scala +++ b/src/main/java/epic/util/Arrays.scala @@ -48,9 +48,9 @@ object Arrays { val ret = new Array[C](arr1.length * arr2.length) var off = 0 var i = 0 - while(i < arr1.length) { + while (i < arr1.length) { var j = 0 - while(j < arr2.length) { + while (j < arr2.length) { ret(off) = f(arr1(i), arr2(j)) off += 1 j += 1 @@ -65,9 +65,9 @@ object Arrays { val ret = new Array[Int](arr1.length * arr2.length) var off = 0 var i = 0 - while(i < arr1.length) { + while (i < arr1.length) { var j = 0 - while(j < arr2.length) { + while (j < arr2.length) { ret(off) = arr1(i) + arr2(j) * secondScale off += 1 j += 1 diff --git a/src/main/scala/epic/constraints/CachedLabeledSpanConstraintsFactory.scala b/src/main/scala/epic/constraints/CachedLabeledSpanConstraintsFactory.scala index 195b4e61..b9d11b9a 100644 --- a/src/main/scala/epic/constraints/CachedLabeledSpanConstraintsFactory.scala +++ b/src/main/scala/epic/constraints/CachedLabeledSpanConstraintsFactory.scala @@ -2,7 +2,6 @@ package epic.constraints import epic.util.CacheBroker - /** * A cached version of [[epic.constraints.LabeledSpanConstraints.Factory]]. * Uses the [[epic.util.CacheBroker]] infrastructure diff --git a/src/main/scala/epic/constraints/ChartConstraints.scala b/src/main/scala/epic/constraints/ChartConstraints.scala index fcc68cbb..19a6990e 100644 --- a/src/main/scala/epic/constraints/ChartConstraints.scala +++ b/src/main/scala/epic/constraints/ChartConstraints.scala @@ -19,29 +19,26 @@ import java.io.{DataOutput, DataInput} case class ChartConstraints[L](top: LabeledSpanConstraints[L], bot: LabeledSpanConstraints[L]) extends SpanConstraints with Serializable { - - def isAllowedSpan(begin: Int, end: Int):Boolean = top.isAllowedSpan(begin, end) || bot.isAllowedSpan(begin, end) + def isAllowedSpan(begin: Int, end: Int): Boolean = top.isAllowedSpan(begin, end) || bot.isAllowedSpan(begin, end) /** TODO */ // TODO - def hasMaximalLabel(begin: Int, end: Int):Boolean = ??? - + def hasMaximalLabel(begin: Int, end: Int): Boolean = ??? 
def maxSpanLengthStartingAt(begin: Int): Int = top.maxSpanLengthStartingAt(begin) max bot.maxSpanLengthStartingAt(begin) def flatten = top | bot - def &(other: ChartConstraints[L]) = if(this eq other) this else ChartConstraints(top & other.top, bot & other.bot) + def &(other: ChartConstraints[L]) = if (this eq other) this else ChartConstraints(top & other.top, bot & other.bot) def |(other: ChartConstraints[L]) = ChartConstraints(top | other.top, bot | other.bot) - } object ChartConstraints { + def noSparsity[L]: ChartConstraints[L] = ChartConstraints[L](LabeledSpanConstraints.noConstraints[L], LabeledSpanConstraints.noConstraints[L]) def apply[L](top: TriangularArray[_ <: BitSet], bot: TriangularArray[_ <: BitSet]): ChartConstraints[L] = ChartConstraints(LabeledSpanConstraints(top), LabeledSpanConstraints(bot)) trait Factory[L, W] extends SpanConstraints.Factory[W] { def constraints(w: IndexedSeq[W]): ChartConstraints[L] - def |(cf: Factory[L, W]) = new OrFactory(this, cf) } @@ -82,11 +79,9 @@ object ChartConstraints { case _ => bot(t.begin,t.end) = BitSet(labelIndex(t.label)) } - ChartConstraints(LabeledSpanConstraints(top), LabeledSpanConstraints(bot)) } - implicit def serializerChartConstraints[L]:Serializer[ChartConstraints[L]] = new Serializer[ChartConstraints[L]] with Serializable { def serialize(out: DataOutput, value: ChartConstraints[L]) { implicitly[Serializer[LabeledSpanConstraints[L]]].serialize(out, value.top) @@ -98,6 +93,6 @@ object ChartConstraints { val bot = implicitly[Serializer[LabeledSpanConstraints[L]]].deserialize(in, available) ChartConstraints(top, bot) } - } + } diff --git a/src/main/scala/epic/constraints/LabeledSpanConstraints.scala b/src/main/scala/epic/constraints/LabeledSpanConstraints.scala index 4d794fbe..b74473e1 100644 --- a/src/main/scala/epic/constraints/LabeledSpanConstraints.scala +++ b/src/main/scala/epic/constraints/LabeledSpanConstraints.scala @@ -26,9 +26,9 @@ sealed trait LabeledSpanConstraints[-L] extends SpanConstraints { def isAllowedLabeledSpan(begin: Int, end: Int, label: Int): Boolean def isAllowedSpan(begin: Int, end: Int): Boolean /** How long can a span be if it starts at begin*/ - def maxSpanLengthStartingAt(begin: Int):Int + def maxSpanLengthStartingAt(begin: Int): Int /** How long can a span be if it has label label in this sentence? 
*/ - def maxSpanLengthForLabel(label: Int):Int + def maxSpanLengthForLabel(label: Int): Int /** * Computes the intersection of the constraints @@ -36,7 +36,7 @@ sealed trait LabeledSpanConstraints[-L] extends SpanConstraints { * @return */ def &(other: LabeledSpanConstraints[L @uncheckedVariance ]): LabeledSpanConstraints[L] = { - if(this eq other) this + if (this eq other) this else this match { case NoConstraints => other case PromotedSpanConstraints(inner) => other match { @@ -44,7 +44,7 @@ sealed trait LabeledSpanConstraints[-L] extends SpanConstraints { case PromotedSpanConstraints(otherinner) => PromotedSpanConstraints(inner & otherinner) case SimpleConstraints(maxPosX, maxLx, x) => SimpleConstraints(maxPosX, maxLx, TriangularArray.tabulate(x.dimension){(b, e) => - if(x(b,e) == null || !inner.isAllowedSpan(b,e)) null + if (x(b,e) == null || !inner.isAllowedSpan(b,e)) null else x(b,e) }) } @@ -55,14 +55,14 @@ sealed trait LabeledSpanConstraints[-L] extends SpanConstraints { require(x.dimension == y.dimension, "Dimensions of constrained spans must match!") SimpleConstraints( elementwiseMin(maxPosX, maxPosY), elementwiseMin(maxLx, maxLy), TriangularArray.tabulate(x.dimension) { (b,e) => - if(x(b,e) == null || y(b,e) == null) null + if (x(b,e) == null || y(b,e) == null) null else x(b,e) & y(b,e) }) } } } - def containsAll(other: LabeledSpanConstraints[L @uncheckedVariance]):Boolean = this match { + def containsAll(other: LabeledSpanConstraints[L @uncheckedVariance]): Boolean = this match { case NoConstraints => true case SimpleConstraints(maxPosX, maxLx, x) => other match { case NoConstraints => throw new UnsupportedOperationException("Can't check Simple.containsAll(noconstraints)") @@ -75,7 +75,6 @@ sealed trait LabeledSpanConstraints[-L] extends SpanConstraints { yield (y(i,j) eq null) || ((x(i,j) ne null) && (y(i,j) &~ x(i,j )).isEmpty) }.forall(identity)) } - } /** @@ -89,7 +88,6 @@ sealed trait LabeledSpanConstraints[-L] extends SpanConstraints { case NoConstraints => this case PromotedSpanConstraints(otherinner) => PromotedSpanConstraints(inner | otherinner) case SimpleConstraints(maxPosX, maxLx, x) => ??? 
- } case SimpleConstraints(maxPosX, maxLx, x) => other match { case NoConstraints => this @@ -98,15 +96,14 @@ sealed trait LabeledSpanConstraints[-L] extends SpanConstraints { require(x.dimension == y.dimension, "Dimensions of constrained spans must match!") SimpleConstraints( elementwiseMax(maxPosX, maxPosY), elementwiseMax(maxLx, maxLy), TriangularArray.tabulate(x.dimension) { (b,e) => - if(x(b,e) == null) y(b,e) + if (x(b,e) == null) y(b,e) else if (y(b,e) == null) x(b, e) else x(b,e) | y(b,e) }) } } - def decode(labelIndex: Index[L@uncheckedVariance ]):String - + def decode(labelIndex: Index[L@uncheckedVariance ]): String } @@ -122,24 +119,20 @@ object LabeledSpanConstraints { out.writeBoolean(true) val length: Int = maxLengthsForPosition.length out.writeInt(length) - if(length < Byte.MaxValue) { - for(i <- 0 until length) { - out.writeByte( (maxLengthsForPosition(i) min length).toByte) - } + if (length < Byte.MaxValue) { + maxLengthsForPosition.foreach(maxLengthForPosition => + out.writeByte((maxLengthForPosition min length).toByte) + ) } else { - for(i <- 0 until length) { - out.writeInt(maxLengthsForPosition(i)) - } + maxLengthsForPosition.foreach(out.writeInt) } out.writeInt(maxLengthsForLabel.length) - for(i <- 0 until maxLengthsForLabel.length) { - out.writeInt(maxLengthsForLabel(i)) - } + maxLengthsForLabel.foreach(out.writeInt) for(i <- 0 until length; j <- (i+1) to length if value.isAllowedSpan(i, j)) { val cardinality: Int = spans(i, j).cardinality - if(cardinality != 0) { + if (cardinality != 0) { out.writeInt(TriangularArray.index(i, j)) - if(cardinality == 1) { + if (cardinality == 1) { // have to deal with 0 length mask out.writeInt(~(spans(i, j).nextSetBit(0))) } else { @@ -159,30 +152,21 @@ object LabeledSpanConstraints { case true => import in._ val length = readInt() - val maxLengthsForPosition = new Array[Int](length) - if(length < Byte.MaxValue) { - for(i <- 0 until length) { - maxLengthsForPosition(i) = readByte() - } - } else { - for(i <- 0 until length) { - maxLengthsForPosition(i) = readInt() - } - } + val maxLengthsForPosition: Array[Int] = if (length < Byte.MaxValue) + Array.fill(length)(readByte()) + else + Array.fill(length)(readInt()) val labelLen = in.readInt() - val maxLengthsForLabel = new Array[Int](labelLen) - for(i <- 0 until maxLengthsForLabel.length) { - maxLengthsForLabel(i) = in.readInt() - } + val maxLengthsForLabel = Array.fill(labelLen)(in.readInt()) val spans = new TriangularArray[util.BitSet](length+1) var ok = true - while(ok) { + while (ok) { ok = false val ti = readInt() - if(ti >= 0) { + if (ti >= 0) { ok = true val bitmaskSize = readInt() - if(bitmaskSize < 0) { + if (bitmaskSize < 0) { val index = ~bitmaskSize spans.data(ti) = new util.BitSet() spans.data(ti).set(index) @@ -195,7 +179,6 @@ object LabeledSpanConstraints { } new SimpleConstraints[L](maxLengthsForPosition, maxLengthsForLabel, spans) } - } } @@ -213,9 +196,9 @@ object LabeledSpanConstraints { } val maxLengthLabel = ArrayBuffer[Int]() for(begin <- 0 until spans.dimension; end <- (begin+1) until spans.dimension) { - if(spans(begin, end) ne null) { + if (spans(begin, end) ne null) { for(l <- spans(begin, end)) { - if(l >= maxLengthLabel.length) { + if (l >= maxLengthLabel.length) { maxLengthLabel ++= new Array[Int](l - maxLengthLabel.length + 1) } maxLengthLabel(l) = maxLengthLabel(l) max (end-begin) @@ -223,17 +206,16 @@ object LabeledSpanConstraints { } } - apply(maxLengthPos, maxLengthLabel.toArray, spans) } def apply[L](maxLengthPos: Array[Int], maxLengthLabel: Array[Int], 
spans: TriangularArray[_ <: BitSet]):LabeledSpanConstraints[L] = { - SimpleConstraints(maxLengthPos, maxLengthLabel, spans.map(bs => if(bs eq null) null else java.util.BitSet.valueOf(bs.toBitMask))) + SimpleConstraints(maxLengthPos, maxLengthLabel, spans.map(bs => if (bs eq null) null else java.util.BitSet.valueOf(bs.toBitMask))) } def fromTagConstraints[L](constraints: TagConstraints[L]): LabeledSpanConstraints[L] = { val arr = TriangularArray.tabulate(constraints.length+1) { (b,e) => - if(b +1 == e) { + if (b +1 == e) { ensureBitSet(constraints.allowedTags(b)) } else { null @@ -242,7 +224,6 @@ object LabeledSpanConstraints { apply(arr) } - private def ensureBitSet[L](tags: Set[Int]): BitSet = { tags match { case x: BitSet => x @@ -260,21 +241,19 @@ object LabeledSpanConstraints { val arr = new TriangularArray[BitSet](localization.length + 1) val maxMaxLength = maxLengthForLabel.max min localization.length for(i <- 0 until localization.length) { - arr(i, i+1) = ensureBitSet(localization.allowedTags(i)) + arr(i, i+1) = ensureBitSet(localization.allowedTags(i)) } - val maxLengthPos = Array.fill(localization.length)(1) val maxLengthLabel = maxLengthForLabel.clone() - - var acceptableTags = BitSet.empty ++ (0 until maxLengthForLabel.length) + var acceptableTags = BitSet.empty ++ maxLengthForLabel.indices for(length <- 2 to maxMaxLength if acceptableTags.nonEmpty) { acceptableTags = acceptableTags.filter(i => maxLengthForLabel(i) >= length) - if(acceptableTags.nonEmpty) + if (acceptableTags.nonEmpty) for (begin <- 0 to (localization.length - length) ) { val end = begin + length - if(arr(begin,begin+1) != null && arr(begin+1,end) != null) { + if (arr(begin,begin+1) != null && arr(begin+1,end) != null) { arr(begin, end) = (arr(begin, begin+1) & arr(begin+1, end)) & acceptableTags - if(arr(begin,end).isEmpty) { + if (arr(begin,end).isEmpty) { arr(begin, end) = null } else { maxLengthPos(begin) = length @@ -287,50 +266,34 @@ object LabeledSpanConstraints { apply(maxLengthPos, maxLengthLabel, arr) } - @SerialVersionUID(1L) object NoConstraints extends LabeledSpanConstraints[Any] with Serializable { - def maxSpanLengthStartingAt(begin: Int): Int = Int.MaxValue/2 // /2 because i get worried about wrap around. - def isAllowedSpan(begin: Int, end: Int): Boolean = true def isAllowedLabeledSpan(begin: Int, end: Int, label: Int): Boolean = true - - def maxSpanLengthForLabel(label: Int):Int = Int.MaxValue / 2 - - - def decode(labelIndex: Index[Any]):String = toString + def maxSpanLengthForLabel(label: Int): Int = Int.MaxValue / 2 + def decode(labelIndex: Index[Any]): String = toString } - @SerialVersionUID(1L) case class PromotedSpanConstraints(inner: SpanConstraints) extends LabeledSpanConstraints[Any] with Serializable { - def maxSpanLengthStartingAt(begin: Int): Int = Int.MaxValue/2 // /2 because i get worried about wrap around. - def isAllowedSpan(begin: Int, end: Int): Boolean = inner.isAllowedSpan(begin, end) def isAllowedLabeledSpan(begin: Int, end: Int, label: Int): Boolean = isAllowedSpan(begin, end) - - def maxSpanLengthForLabel(label: Int):Int = Int.MaxValue / 2 - - def decode(labelIndex: Index[Any]):String = inner.toString + def maxSpanLengthForLabel(label: Int): Int = Int.MaxValue / 2 + def decode(labelIndex: Index[Any]): String = inner.toString } - // private vars for serialization. 
@SerialVersionUID(2L) case class SimpleConstraints[L](private var maxLengthsForPosition: Array[Int], // maximum length for position private var maxLengthsForLabel: Array[Int], private var spans: TriangularArray[java.util.BitSet]) extends LabeledSpanConstraints[L] with Serializable { def isAllowedSpan(begin: Int, end: Int): Boolean = (spans(begin,end) ne null) && spans(begin,end).cardinality() > 0 - def isAllowedLabeledSpan(begin: Int, end: Int, label: Int): Boolean = (spans(begin,end) ne null) && spans(begin, end).get(label) - def maxSpanLengthStartingAt(begin: Int): Int = maxLengthsForPosition(begin) - - def maxSpanLengthForLabel(label: Int) = if(maxLengthsForLabel.length <= label) 0 else maxLengthsForLabel(label) - - def decode(labelIndex: Index[L]):String = { + def maxSpanLengthForLabel(label: Int) = if (maxLengthsForLabel.length <= label) 0 else maxLengthsForLabel(label) + def decode(labelIndex: Index[L]): String = { val ret = new StringBuilder() val enc = Encoder.fromIndex(labelIndex) ret ++= "SimpleConstraints(positionMaxLengths=" @@ -340,7 +303,7 @@ object LabeledSpanConstraints { ret ++= ")\n" for(i <- 0 until maxLengthsForPosition.length; j <- (i+1) to maxLengthsForPosition.length) { val s = spans(i, j) - if(s ne null) { + if (s ne null) { ret ++= s" ($i,$j) " + enc.decode(Array.tabulate(labelIndex.size)(x => spans(i, j).get(x))).toString + "\n" } } @@ -365,32 +328,16 @@ private def elementwiseMax(a: Array[Int], b: Array[Int]):Array[Int] = { // could avoid the allocation, but whatever. - if(a.length < b.length) elementwiseMax(util.Arrays.copyOf(a, b.length), b) - else if(b.length < a.length) elementwiseMax(a, util.Arrays.copyOf(b, a.length)) - else { - val result = new Array[Int](a.length) - var i = 0 - while(i < result.length) { - result(i) = math.max(a(i), b(i)) - i += 1 - } - result - } + if (a.length < b.length) elementwiseMax(util.Arrays.copyOf(a, b.length), b) + else if (b.length < a.length) elementwiseMax(a, util.Arrays.copyOf(b, a.length)) + else Array.tabulate[Int](a.length)(i => math.max(a(i), b(i))) } private def elementwiseMin(a: Array[Int], b: Array[Int]):Array[Int] = { // could avoid the allocation, but whatever. 
- if(a.length < b.length) elementwiseMin(util.Arrays.copyOf(a, b.length), b) - else if(b.length < a.length) elementwiseMin(a, util.Arrays.copyOf(b, a.length)) - else { - val result = new Array[Int](a.length) - var i = 0 - while(i < result.length) { - result(i) = math.min(a(i), b(i)) - i += 1 - } - result - } + if (a.length < b.length) elementwiseMin(util.Arrays.copyOf(a, b.length), b) + else if (b.length < a.length) elementwiseMin(a, util.Arrays.copyOf(b, a.length)) + else Array.tabulate[Int](a.length)(i => math.min(a(i), b(i))) } } diff --git a/src/main/scala/epic/constraints/LongSpanConstraints.scala b/src/main/scala/epic/constraints/LongSpanConstraints.scala index 9e4264f1..ccdda7e5 100644 --- a/src/main/scala/epic/constraints/LongSpanConstraints.scala +++ b/src/main/scala/epic/constraints/LongSpanConstraints.scala @@ -8,26 +8,15 @@ import epic.constraints.LabeledSpanConstraints.PromotedSpanConstraints * * @author dlwh */ -class LongSpanConstraints { - -} - object LongSpanConstraints { case class Factory[L](maxSimpleLength: Int = 30, okWords: Set[String]) extends ChartConstraints.Factory[L, String] { def constraints(w: IndexedSeq[String]): ChartConstraints[L] = { val spans = new SpanConstraints { val oks = w.map(ww => okWords(ww) || !ww.head.isLetterOrDigit) def maxSpanLengthStartingAt(begin: Int): Int = w.length - begin - - def maxSpanLengthForLabel(label: Int): Int = w.length - - def decode(labelIndex: Index[L]): String = "..." - - def isAllowedLabeledSpan(begin: Int, end: Int, label: Int): Boolean = isAllowedSpan(begin, end) - def isAllowedSpan(begin: Int, end: Int): Boolean = ( end - begin <= maxSimpleLength || end == w.length @@ -40,6 +29,5 @@ object LongSpanConstraints { } new ChartConstraints(PromotedSpanConstraints(spans), PromotedSpanConstraints(spans)) } - } } diff --git a/src/main/scala/epic/constraints/SpanConstraints.scala b/src/main/scala/epic/constraints/SpanConstraints.scala index 51f9d89e..1dc4abfa 100644 --- a/src/main/scala/epic/constraints/SpanConstraints.scala +++ b/src/main/scala/epic/constraints/SpanConstraints.scala @@ -12,19 +12,19 @@ import epic.constraints.LabeledSpanConstraints.PromotedSpanConstraints * @author dlwh */ trait SpanConstraints { outer => - def apply(begin: Int, end: Int):Boolean = isAllowedSpan(begin, end) + def apply(begin: Int, end: Int): Boolean = isAllowedSpan(begin, end) def isAllowedSpan(begin: Int, end: Int): Boolean - def maxSpanLengthStartingAt(begin: Int):Int + def maxSpanLengthStartingAt(begin: Int): Int def |(other: SpanConstraints):SpanConstraints = new SpanConstraints { def isAllowedSpan(begin: Int, end: Int): Boolean = outer.isAllowedSpan(begin, end) || other.isAllowedSpan(begin, end) - def maxSpanLengthStartingAt(begin: Int):Int = outer.maxSpanLengthStartingAt(begin) max other.maxSpanLengthStartingAt(begin) + def maxSpanLengthStartingAt(begin: Int): Int = outer.maxSpanLengthStartingAt(begin) max other.maxSpanLengthStartingAt(begin) } def &(other: SpanConstraints):SpanConstraints = new SpanConstraints { def isAllowedSpan(begin: Int, end: Int): Boolean = outer.isAllowedSpan(begin, end) && other.isAllowedSpan(begin, end) - def maxSpanLengthStartingAt(begin: Int):Int = outer.maxSpanLengthStartingAt(begin) min other.maxSpanLengthStartingAt(begin) + def maxSpanLengthStartingAt(begin: Int): Int = outer.maxSpanLengthStartingAt(begin) min other.maxSpanLengthStartingAt(begin) } } diff --git a/src/main/scala/epic/corpora/CONLLSequenceReader.scala b/src/main/scala/epic/corpora/CONLLSequenceReader.scala index 6e7e7d79..d4e54a58 100644 ---
a/src/main/scala/epic/corpora/CONLLSequenceReader.scala +++ b/src/main/scala/epic/corpora/CONLLSequenceReader.scala @@ -21,10 +21,9 @@ object CONLLSequenceReader { val outputs = new ArrayBuffer[String] import scala.util.control.Breaks._ breakable { - while(source.hasNext) { + while (source.hasNext) { val line = source.next() - if(line.trim().isEmpty) break - + if (line.trim().isEmpty) break val split = line.split(splitToken) inputs += split.take(split.length -1).toIndexedSeq outputs += split.last @@ -53,9 +52,9 @@ object CONLLSequenceReader { val inputs = new ArrayBuffer[IndexedSeq[String]]() import scala.util.control.Breaks._ breakable { - while(source.hasNext) { + while (source.hasNext) { val line = source.next() - if(line.trim().isEmpty) break + if (line.trim().isEmpty) break val split = line.split(splitToken) inputs += split diff --git a/src/main/scala/epic/corpora/MascUtil.scala b/src/main/scala/epic/corpora/MascUtil.scala index baada651..06378275 100644 --- a/src/main/scala/epic/corpora/MascUtil.scala +++ b/src/main/scala/epic/corpora/MascUtil.scala @@ -10,7 +10,6 @@ import epic.trees.Span import MascTransform._ - /** * Convert native MASC xml into CONLL format for named entity recognition. * @@ -18,13 +17,11 @@ import MascTransform._ */ object MascTransform { - case class MNode(id: String, targets: Seq[String]) case class MAnnotation(id: String, label: String, ref: String, features: Map[String,String]) case class MEdge(id: String, from: String, to: String) case class MRegion(id: String, start: Int, end: Int) extends Ordered[MRegion] { def span = Span(start, end) - def compare(that: MRegion) = this.start - that.start } @@ -32,9 +29,7 @@ object MascTransform { val mascDir = args(0) val outputDir = new File(if (args.length > 1) args(1) else "/tmp") outputDir.mkdirs - val targets = collectTargets(new File(mascDir)) - // Get 3/5 for train, 1/5 for dev, and 1/5 for test val targetsAndIndices = targets.zipWithIndex val trainSet = targetsAndIndices.filter(_._2 % 5 < 3).unzip._1 @@ -57,11 +52,9 @@ object MascTransform { System.err.println("Creating " + outputName) val outputDir = new File(parentDir, outputName) outputDir.mkdirs - val outputSentences = new FileWriter(new File(outputDir,outputName+"-sent.txt")) val outputTokens = new FileWriter(new File(outputDir,outputName+"-tok.txt")) val outputNer = new FileWriter(new File(outputDir,outputName+"-ner.txt")) - for (mfile <- MascFile(targets)) { for (sentence <- mfile.sentences) { val tokenizedSentence = new StringBuffer @@ -102,7 +95,6 @@ case class MascSentence ( bioLabels: Seq[String], orderedRegions: Seq[MRegion] ) { - lazy val numTokens = orderedTokens.length } @@ -112,9 +104,7 @@ class MascFile ( val rawtext: String, val sentences: Seq[MascSentence] ) { - lazy val numSentences = sentences.length - } object MascFile { @@ -125,7 +115,7 @@ object MascFile { lazy val outsideNe = MAnnotation("outside", "outside", "none", Map[String,String]()) def apply(targets: Seq[(File, String)]): Iterator[MascFile] = { - targets.toIterator.flatMap { case(file, prefix) => { + targets.toIterator.flatMap { case(file, prefix) => try { val mfile = MascFile(file,prefix) System.err.println("Success: " + file + "," + prefix) @@ -135,7 +125,7 @@ object MascFile { System.err.println("Failure: " + file + "," + prefix) None } - }} + } } def apply(dir: File, prefix: String): MascFile = { @@ -152,34 +142,33 @@ object MascFile { val sentenceXml = loadXML(dirFile(prefix+"-s.xml")) val sentenceRegions = getRegions(sentenceXml).sorted - // Basic segment information val 
segmentXml = loadXML(dirFile(prefix+"-seg.xml")) - val segmentRegions = getRegions(segmentXml).map(r => (r.id -> r)).toMap + val segmentRegions = getRegions(segmentXml).map(r => r.id -> r).toMap // POS information val pennXml = loadXML(dirFile(prefix+"-penn.xml")) val tokenRegions = getNodes(pennXml).map { n => val regions = n.targets.map(segmentRegions).sorted - (n.id -> MRegion(n.id, regions.head.start, regions.last.end)) + n.id -> MRegion(n.id, regions.head.start, regions.last.end) }.toMap val tokens = tokenRegions.mapValues(region => rawtext.slice(region.start, region.end)) - val posAnnotations = getAnnotations(pennXml).map(anno => (anno.ref -> anno)).toMap + val posAnnotations = getAnnotations(pennXml).map(anno => anno.ref -> anno).toMap // NER information val neXml = loadXML(dirFile(prefix+"-ne.xml")) val neAnnotations = - getAnnotations(neXml).map(anno => (anno.ref -> anno)).toMap.withDefault(x=>outsideNe) + getAnnotations(neXml).map(anno => anno.ref -> anno).toMap.withDefault(x=>outsideNe) val neEdges = - getEdges(neXml).map(edge => (edge.to -> edge.from)).toMap.withDefault(x=>"outside") + getEdges(neXml).map(edge => edge.to -> edge.from).toMap.withDefault(x=>"outside") // A helper function for pulling out the information associated with a // subsequence of the tokens in the document. def orderedTokPosNer(orderedRegions: Seq[MRegion]) = { - if (orderedRegions.length == 0) None + if (orderedRegions.isEmpty) None else { val orderedTokens = orderedRegions.map(reg=>tokens(reg.id)) @@ -202,7 +191,6 @@ object MascFile { } } - // Insert the "missing" sentences. (Content not marked as a sentence, // but containing tokens.) @@ -223,7 +211,7 @@ object MascFile { // Pull out the sequence of token, pos, and NE for each sentence. val allOrderedTokRegions = tokenRegions.values.toIndexedSeq.sorted var index = 0 - val allDataBySentence = paddedSentenceRegions.flatMap { region => { + val allDataBySentence = paddedSentenceRegions.flatMap { region => //val startIndex = math.max(index, region.start) val startIndex = math.max(index, allOrderedTokRegions.indexWhere(t=>t.start>=region.start,index)) //val startIndex = index @@ -236,7 +224,7 @@ object MascFile { index = endIndex orderedTokPosNer(sentence) } - }} + } new MascFile(dir, prefix, rawtext, allDataBySentence) } @@ -260,15 +248,14 @@ object MascUtil { "date" -> "MISC" ).withDefault(x=>"O") - def getRegions(doc: Elem) = (doc \\ "region").toSeq.map { rxml => val Array(start, end) = (rxml \ "@anchors").toString.split(" ") MRegion(xmlId(rxml), start.toInt, end.toInt) } def getNodes(doc: Elem) = (doc \\ "node").toSeq.flatMap { nxml => - val link = (nxml \ "link") - if (!link.isEmpty) { + val link = nxml \ "link" + if (link.nonEmpty) { val targets = (link.head \ "@targets").toString.split(" ").toSeq Some(MNode(xmlId(nxml), targets)) } else throw new Exception("Missing link element.") //None OK? 
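Reviewer note on the MascUtil hunks above: the arrow-sugar cleanups (anno.ref -> anno instead of (anno.ref -> anno)) are behavior-preserving. For anyone unfamiliar with MASC's stand-off XML, here is a minimal, self-contained sketch of the extraction that getRegions performs. It is illustrative only: MRegion is a stand-in for epic's case class, the sample document uses a plain id attribute where the real files use xml:id (resolved via the xmlId helper), and .text replaces the .toString call on the attribute node.

    import scala.xml.{Elem, XML}

    object RegionSketch {
      // Stand-in for epic.corpora's MRegion: a span of character offsets into the raw text.
      case class MRegion(id: String, start: Int, end: Int)

      // Same shape as MascUtil.getRegions: each <region> carries an "anchors"
      // attribute holding space-separated start/end offsets.
      def getRegions(doc: Elem): Seq[MRegion] =
        (doc \\ "region").map { rxml =>
          val Array(start, end) = (rxml \ "@anchors").text.split(" ")
          MRegion((rxml \ "@id").text, start.toInt, end.toInt)
        }

      def main(args: Array[String]): Unit = {
        val doc = XML.loadString(
          """<regions><region id="r0" anchors="0 5"/><region id="r1" anchors="6 11"/></regions>""")
        getRegions(doc).sortBy(_.start).foreach(println) // MRegion(r0,0,5), MRegion(r1,6,11)
      }
    }
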
@@ -386,7 +373,7 @@ object MascSlab { val neXml = XML.load(source.url.toString().replaceAll("[.]txt$", "-ne.xml")) val idToPos = (for ((span, p) <- slab.iterator[PartOfSpeech]; id <- p.id.iterator) yield id -> (span, p)).toMap - val neIdPosIdTuples = MascUtil.getEdges(neXml).map(e => (e.from -> e.to)) + val neIdPosIdTuples = MascUtil.getEdges(neXml).map(e => e.from -> e.to) val neIdToPosIds = neIdPosIdTuples.groupBy(_._1).mapValues(_.map(_._2)) val entityMentions = for (annotation <- MascUtil.getAnnotations(neXml)) yield { diff --git a/src/main/scala/epic/dense/AdadeltaGradientDescentDVD.scala b/src/main/scala/epic/dense/AdadeltaGradientDescentDVD.scala index 92386765..622d53ba 100644 --- a/src/main/scala/epic/dense/AdadeltaGradientDescentDVD.scala +++ b/src/main/scala/epic/dense/AdadeltaGradientDescentDVD.scala @@ -28,7 +28,7 @@ class AdadeltaGradientDescentDVD(maxIter: Int, val newG = (oldState.grad :* oldState.grad) * (1 - rho) axpy(rho, oldHistory.squaredGradientsHistory, newG) val deltaX = newX - oldState.x - val newU = deltaX :* deltaX * (1 - rho); + val newU = deltaX :* deltaX * (1 - rho) axpy(rho, oldHistory.squaredUpdatesHistory, newU) new History(newG, newU) } diff --git a/src/main/scala/epic/dense/AffineOutputTransform.scala b/src/main/scala/epic/dense/AffineOutputTransform.scala index 167aa4df..39bc1707 100644 --- a/src/main/scala/epic/dense/AffineOutputTransform.scala +++ b/src/main/scala/epic/dense/AffineOutputTransform.scala @@ -16,12 +16,11 @@ import scala.util.Random */ case class AffineOutputTransform[FV](numOutputs: Int, numInputs: Int, innerTransform: Transform[FV, DenseVector[Double]], includeBias: Boolean = true) extends OutputTransform[FV, DenseVector[Double]] { - val index = SegmentedIndex(new AffineTransform.Index(numOutputs, numInputs, includeBias), innerTransform.index) def extractLayerAndPenultimateLayer(weights: DenseVector[Double], forTrain: Boolean) = { val mat = weights(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) - val bias = if(includeBias) { + val bias = if (includeBias) { weights(numOutputs * numInputs until index.componentOffset(1)) } else { DenseVector.zeros[Double](numOutputs) @@ -51,8 +50,7 @@ case class AffineOutputTransform[FV](numOutputs: Int, numInputs: Int, innerTrans override val index = AffineOutputTransform.this.index val weightst = weights.t -// val weightst = weights.t.copy - + //val weightst = weights.t.copy def activations(fv: FV) = { val out = weights * innerLayer.activations(fv) += bias @@ -74,7 +72,7 @@ case class AffineOutputTransform[FV](numOutputs: Int, numInputs: Int, innerTrans def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: FV) = { val scale = _scale val matDeriv = deriv(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) - val biasDeriv = if(includeBias) { + val biasDeriv = if (includeBias) { deriv(numOutputs * numInputs until index.componentOffset(1)) } else { DenseVector.zeros[Double](numOutputs) @@ -86,7 +84,7 @@ case class AffineOutputTransform[FV](numOutputs: Int, numInputs: Int, innerTrans // d/d(weights(::, i)) == scale(i) * innerAct for (i <- 0 until weights.rows) { val a: Double = scale(i) - if(a != 0.0) { + if (a != 0.0) { axpy(a, innerAct, matDeriv.t(::, i)) // so d/dbias(i) = scale(i) biasDeriv(i) += a diff --git a/src/main/scala/epic/dense/AffineTransform.scala b/src/main/scala/epic/dense/AffineTransform.scala index 5e6be860..1abeb952 100644 --- 
a/src/main/scala/epic/dense/AffineTransform.scala +++ b/src/main/scala/epic/dense/AffineTransform.scala @@ -12,7 +12,6 @@ case class AffineTransform[FV, Mid](numOutputs: Int, numInputs: Int, innerTransf (implicit mult: OpMulMatrix.Impl2[DenseMatrix[Double], Mid, DenseVector[Double]], canaxpy: scaleAdd.InPlaceImpl3[DenseVector[Double], Double, Mid]) extends Transform[FV, DenseVector[Double]] { - val index = SegmentedIndex(new AffineTransform.Index(numOutputs, numInputs, includeBias), innerTransform.index) def extractLayer(weights: DenseVector[Double], forTrain: Boolean) = { @@ -21,7 +20,7 @@ case class AffineTransform[FV, Mid](numOutputs: Int, numInputs: Int, innerTransf def extractLayerAndPenultimateLayer(weights: DenseVector[Double], forTrain: Boolean) = { val mat = weights(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) - val bias = if(includeBias) { + val bias = if (includeBias) { weights(numOutputs * numInputs until index.componentOffset(1)) } else { DenseVector.zeros[Double](numOutputs) @@ -58,8 +57,7 @@ case class AffineTransform[FV, Mid](numOutputs: Int, numInputs: Int, innerTransf override val index = AffineTransform.this.index val weightst = weights.t -// val weightst = weights.t.copy - + // val weightst = weights.t.copy def activations(fv: FV) = { val out = weights * innerLayer.activations(fv) += bias @@ -67,10 +65,10 @@ case class AffineTransform[FV, Mid](numOutputs: Int, numInputs: Int, innerTransf } def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: FV) = { -// println("SCALE: " + _scale) + // println("SCALE: " + _scale) val scale = _scale val matDeriv = deriv(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) - val biasDeriv = if(includeBias) { + val biasDeriv = if (includeBias) { deriv(numOutputs * numInputs until index.componentOffset(1)) } else { DenseVector.zeros[Double](numOutputs) @@ -82,18 +80,18 @@ case class AffineTransform[FV, Mid](numOutputs: Int, numInputs: Int, innerTransf // d/d(weights(::, i)) == scale(i) * innerAct for (i <- 0 until weights.rows) { val a: Double = scale(i) - if(a != 0.0) { + if (a != 0.0) { axpy(a, innerAct, matDeriv.t(::, i)) // so d/dbias(i) = scale(i) biasDeriv(i) += a } } -// biasDeriv += scale + // biasDeriv += scale // scale is f'(mat * inner(v) + bias) // d/dv is mat.t * f'(mat * inner(v) + bias) -// println("Intermediate scale: " + weightst * scale) + // println("Intermediate scale: " + weightst * scale) innerLayer.tallyDerivative(deriv(index.componentOffset(1) to -1), weightst * scale, fv) } @@ -108,8 +106,7 @@ object AffineTransform { canAxpy: scaleAdd.InPlaceImpl3[DenseVector[Double], Double, FV]) = new AffineTransform(numOutputs, numInputs, new IdentityTransform[FV], includeBias) def apply(numOutputs: Int, numInputs: Int, includeBias: Boolean):AffineTransform[DenseVector[Double], DenseVector[Double]] = apply(numOutputs, numInputs, new IdentityTransform[DenseVector[Double]], includeBias) def apply(numOutputs: Int, numInputs: Int):AffineTransform[DenseVector[Double], DenseVector[Double]] = apply(numOutputs, numInputs, true) - - + def getUniformAffineWeights(numWeights: Int, initWeightsScale: Double, rng: Random) = { DenseVector(Array.tabulate(numWeights)(i => rng.nextGaussian * initWeightsScale)) } @@ -160,7 +157,7 @@ object AffineTransform { def iterator: Iterator[Feature] = Iterator.range(0, size) map unapply map (_.get) - override val size: Int = if(includeBias) numOutputs * numInputs + numOutputs else 
numOutputs * numInputs + override val size: Int = if (includeBias) numOutputs * numInputs + numOutputs else numOutputs * numInputs override def toString() = ScalaRunTime._toString(this) } diff --git a/src/main/scala/epic/dense/BatchNormalizationTransform.scala b/src/main/scala/epic/dense/BatchNormalizationTransform.scala index 04abb51f..e51aa283 100644 --- a/src/main/scala/epic/dense/BatchNormalizationTransform.scala +++ b/src/main/scala/epic/dense/BatchNormalizationTransform.scala @@ -56,11 +56,11 @@ case class BatchNormalizationTransform[FV](size: Int, useBias: Boolean, inner: T val myIndex = Index[Feature] - def index = myIndex; + def index = myIndex def activations(fv: FV): DenseVector[Double] = { val act = innerLayer.activations(fv) - var i = 0; + var i = 0 while (i < act.size) { act(i) = fcn.fcn(i, act(i)) + bias(i) i += 1 @@ -71,7 +71,7 @@ case class BatchNormalizationTransform[FV](size: Int, useBias: Boolean, inner: T def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: FV) = { val biasDeriv = if (useBias) deriv(0 until size) else DenseVector[Double]() val scale = _scale - var i = 0; + var i = 0 while (i < scale.size) { if (useBias) { biasDeriv(i) += scale(i) @@ -87,8 +87,8 @@ case class BatchNormalizationTransform[FV](size: Int, useBias: Boolean, inner: T val mean = allActivations.reduce(_ + _) * (1.0/inputs.size) val variances = allActivations.map(act => (act - mean) :* (act - mean)).reduce(_ + _) * (1.0/inputs.size) val invStdDevs = variances.data.map(variance => 1.0/Math.sqrt(variance + 1e-6)) -// println(mean.data.toSeq) -// println(invStdDevs.toSeq) + // println(mean.data.toSeq) + // println(invStdDevs.toSeq) fcn = new NonlinearTransform.ShiftAndScaleEach(mean.data, invStdDevs) innerLayer.applyBatchNormalization(inputs) } diff --git a/src/main/scala/epic/dense/CachingLookupAndAffineTransformDense.scala b/src/main/scala/epic/dense/CachingLookupAndAffineTransformDense.scala index 417627d8..5e1ee59f 100644 --- a/src/main/scala/epic/dense/CachingLookupAndAffineTransformDense.scala +++ b/src/main/scala/epic/dense/CachingLookupAndAffineTransformDense.scala @@ -18,12 +18,11 @@ case class CachingLookupAndAffineTransformDense[FV](numOutputs: Int, word2vecIndexed: Word2VecIndexed[String], includeBias: Boolean = true) extends Transform[Array[Int], DenseVector[Double]] { - val index = new AffineTransform.Index(numOutputs, numInputs, includeBias) def extractLayer(weights: DenseVector[Double], forTrain: Boolean) = { val mat = weights(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) - val bias = if(includeBias) { + val bias = if (includeBias) { weights(numOutputs * numInputs until index.size) } else { DenseVector.zeros[Double](numOutputs) @@ -49,7 +48,7 @@ case class CachingLookupAndAffineTransformDense[FV](numOutputs: Int, } def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] = { - (offset until offset + Math.min(10, index.size)) + offset until offset + Math.min(10, index.size) } case class Layer(weights: DenseMatrix[Double], bias: DenseVector[Double]) extends Transform.Layer[Array[Int],DenseVector[Double]] { @@ -66,8 +65,8 @@ case class CachingLookupAndAffineTransformDense[FV](numOutputs: Int, def activations(fv: Array[Int]) = { val finalVector = DenseVector.zeros[Double](numOutputs) - for (i <- 0 until fv.size) { -// val wordPosn = fv(i) -> i + fv.indices.foreach { i => + // val wordPosn = fv(i) -> i if (fv(i) != -1) { caches(i).synchronized { if (!caches(i).contains(fv(i))) { @@ -84,7 +83,7 @@ 
case class CachingLookupAndAffineTransformDense[FV](numOutputs: Int, def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: Array[Int]) = { val scale = _scale val matDeriv = deriv(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) - val biasDeriv = if(includeBias) { + val biasDeriv = if (includeBias) { deriv(numOutputs * numInputs until index.size) } else { DenseVector.zeros[Double](numOutputs) @@ -92,12 +91,12 @@ case class CachingLookupAndAffineTransformDense[FV](numOutputs: Int, // whole function is f(mat * inner(fv) + bias) // scale(i) pushes in (f'(mat * inner(v) + bias))(i) - val innerAct = DenseVector(word2vecIndexed.convertToVector(fv)); + val innerAct = DenseVector(word2vecIndexed.convertToVector(fv)) // d/d(weights(::, i)) == scale(i) * innerAct for (i <- 0 until weights.rows) { val a: Double = scale(i) - if(a != 0.0) { + if (a != 0.0) { axpy(a, innerAct, matDeriv.t(::, i)) // so d/dbias(i) = scale(i) biasDeriv(i) += a diff --git a/src/main/scala/epic/dense/CachingLookupTransform.scala b/src/main/scala/epic/dense/CachingLookupTransform.scala index e934d2f8..65e9f26b 100644 --- a/src/main/scala/epic/dense/CachingLookupTransform.scala +++ b/src/main/scala/epic/dense/CachingLookupTransform.scala @@ -29,7 +29,7 @@ case class CachingLookupTransform(word2vecIndexed: Word2VecIndexed[String]) exte def activations(fv: Array[Int]) = { var finalVector = DenseVector.zeros[Double](0) - for (i <- 0 until fv.size) { + fv.indices.foreach { i => val vec: DenseVector[Double] = if (fv(i) != -1) DenseVector(word2vecIndexed.convertIndexToVector(fv(i))) else DenseVector(word2vecIndexed.zeroVector) finalVector = DenseVector.vertcat(finalVector, vec) } diff --git a/src/main/scala/epic/dense/EmbeddingsTransform.scala b/src/main/scala/epic/dense/EmbeddingsTransform.scala index bd0fbad2..0fc82710 100644 --- a/src/main/scala/epic/dense/EmbeddingsTransform.scala +++ b/src/main/scala/epic/dense/EmbeddingsTransform.scala @@ -23,7 +23,7 @@ case class EmbeddingsTransform[FV](numOutputs: Int, def extractLayer(weights: DenseVector[Double], forTrain: Boolean) = { val mat = weights(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) - val bias = if(includeBias) { + val bias = if (includeBias) { weights(numOutputs * numInputs until index.indices(0).size) } else { DenseVector.zeros[Double](numOutputs) @@ -69,8 +69,8 @@ case class EmbeddingsTransform[FV](numOutputs: Int, def activations(fv: Array[Int]) = { val finalVector = DenseVector.zeros[Double](numOutputs) - for (i <- 0 until fv.size) { -// val wordPosn = fv(i) -> i + fv.indices.foreach { i => + // val wordPosn = fv(i) -> i if (fv(i) != -1) { caches(i).synchronized { if (!caches(i).contains(fv(i))) { @@ -88,25 +88,23 @@ case class EmbeddingsTransform[FV](numOutputs: Int, def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: Array[Int]) = { val scale = _scale val matDeriv = deriv(0 until (numOutputs * numInputs)).asDenseMatrix.reshape(numOutputs, numInputs, view = View.Require) - val biasDeriv = if(includeBias) { + val biasDeriv = if (includeBias) { deriv(numOutputs * numInputs until index.size) } else { DenseVector.zeros[Double](numOutputs) } - // whole function is f(mat * inner(fv) + bias) // scale(i) pushes in (f'(mat * inner(v) + bias))(i) - val innerAct = DenseVector(word2vecIndexed.convertToVector(fv)) + Word2VecSurfaceFeaturizerIndexed.makeVectFromParams(fv, wordWeights); - + val innerAct = 
DenseVector(word2vecIndexed.convertToVector(fv)) + Word2VecSurfaceFeaturizerIndexed.makeVectFromParams(fv, wordWeights) val wordsDeriv = deriv(index.indices(0).size until index.indices(0).size + index.indices(1).size).asDenseMatrix.reshape(word2vecIndexed.vocSize, word2vecIndexed.wordRepSize, view = View.Require) - val wordsDerivs = Array.tabulate(fv.size)(wordPosnIdx => wordsDeriv(fv(wordPosnIdx), ::).t) + val wordsDerivs = Array.tabulate(fv.length)(wordPosnIdx => wordsDeriv(fv(wordPosnIdx), ::).t) // d/d(weights(::, i)) == scale(i) * innerAct for (i <- 0 until weights.rows) { val a: Double = scale(i) - if(a != 0.0) { + if (a != 0.0) { axpy(a, innerAct, matDeriv.t(::, i)) - var wordPosnIdx = 0; - while (wordPosnIdx < fv.size) { + var wordPosnIdx = 0 + while (wordPosnIdx < fv.length) { val relevantWeights = weights(i, wordPosnIdx * word2vecIndexed.wordRepSize until (wordPosnIdx + 1) * word2vecIndexed.wordRepSize).t axpy(a, relevantWeights, wordsDerivs(wordPosnIdx)) wordPosnIdx += 1 @@ -119,8 +117,7 @@ case class EmbeddingsTransform[FV](numOutputs: Int, // scale is f'(mat * inner(v) + bias) // d/dv is mat.t * f'(mat * inner(v) + bias) } - - + def applyBatchNormalization(inputs: scala.collection.GenTraversable[Array[Int]]) = {} } } \ No newline at end of file diff --git a/src/main/scala/epic/dense/IdentityTransform.scala b/src/main/scala/epic/dense/IdentityTransform.scala index f7410f36..97fc44eb 100644 --- a/src/main/scala/epic/dense/IdentityTransform.scala +++ b/src/main/scala/epic/dense/IdentityTransform.scala @@ -21,7 +21,7 @@ class IdentityTransform[T] extends Transform[T, T] { val myIndex = Index[Feature] - def index = myIndex; + def index = myIndex def activations(fv: T) = fv diff --git a/src/main/scala/epic/dense/LowRankQuadraticTransform.scala b/src/main/scala/epic/dense/LowRankQuadraticTransform.scala index 0dad562a..e6622e6e 100644 --- a/src/main/scala/epic/dense/LowRankQuadraticTransform.scala +++ b/src/main/scala/epic/dense/LowRankQuadraticTransform.scala @@ -16,15 +16,15 @@ case class LowRankQuadraticTransform[FV](numOutputs: Int, numRanks: Int, numLeft val index = SegmentedIndex(neuronIndex, innerTransform.index) def extractLayerAndPenultimateLayer(weights: DenseVector[Double], forTrain: Boolean) = { - val subTransforms = (0 until neurons.size).map(i => neurons(i).extractLayer(weights(neuronIndex.componentOffset(i) until neuronIndex.componentOffset(i) + neuronIndex.indices(i).size))) - val innerLayer = innerTransform.extractLayer(weights(index.componentOffset(1) to -1), forTrain); + val subTransforms = neurons.indices.map(i => neurons(i).extractLayer(weights(neuronIndex.componentOffset(i) until neuronIndex.componentOffset(i) + neuronIndex.indices(i).size))) + val innerLayer = innerTransform.extractLayer(weights(index.componentOffset(1) to -1), forTrain) new OutputLayer(subTransforms, innerLayer) -> innerLayer } -// def extractLayer(weights: DenseVector[Double]) = { -// val subTransforms = (0 until neurons.size).map(i => neurons(i).extractLayer(weights(neuronIndex.componentOffset(i) until neuronIndex.componentOffset(i) + neuronIndex.indices(i).size))) -// new Layer(subTransforms, innerTransform.extractLayer(weights(index.componentOffset(1) to -1))) -// } + // def extractLayer(weights: DenseVector[Double]) = { + // val subTransforms = (0 until neurons.size).map(i => neurons(i).extractLayer(weights(neuronIndex.componentOffset(i) until neuronIndex.componentOffset(i) + neuronIndex.indices(i).size))) + // new Layer(subTransforms, 
innerTransform.extractLayer(weights(index.componentOffset(1) to -1))) + // } def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = { val subVects = DenseVector.vertcat(neurons.map(_.initialWeightVector(initWeightsScale, rng, outputLayer, spec)):_*) @@ -32,7 +32,7 @@ case class LowRankQuadraticTransform[FV](numOutputs: Int, numRanks: Int, numLeft } def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) { - innerTransform.clipHiddenWeightVectors(weights(index.componentOffset(1) to -1), norm, outputLayer); + innerTransform.clipHiddenWeightVectors(weights(index.componentOffset(1) to -1), norm, outputLayer) } def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] = { @@ -55,7 +55,7 @@ case class LowRankQuadraticTransform[FV](numOutputs: Int, numRanks: Int, numLeft def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: FV) = { val innerActivations = innerLayer.activations(fv) - for (i <- 0 until sublayers.size) { + sublayers.indices.foreach { i => sublayers(i).tallyDerivative(deriv(neuronIndex.componentOffset(i) until neuronIndex.componentOffset(i) + neuronIndex.indices(i).size), _scale(i), innerActivations) } } diff --git a/src/main/scala/epic/dense/NonlinearTransform.scala b/src/main/scala/epic/dense/NonlinearTransform.scala index 31ff8cf1..7b2ea282 100644 --- a/src/main/scala/epic/dense/NonlinearTransform.scala +++ b/src/main/scala/epic/dense/NonlinearTransform.scala @@ -26,7 +26,7 @@ case class NonlinearTransform[FV](nonLinType: String, size: Int, inner: Transfor } new Layer(fcn, inner.extractLayer(dv, forTrain)) } else { - val nonlinearFcn = NonlinearTransform.getNonlinearFcn(nonLinType); + val nonlinearFcn = NonlinearTransform.getNonlinearFcn(nonLinType) new Layer(nonlinearFcn, inner.extractLayer(dv, forTrain)) } } @@ -41,11 +41,11 @@ case class NonlinearTransform[FV](nonLinType: String, size: Int, inner: Transfor val myIndex = Index[Feature] - def index = myIndex; + def index = myIndex def activations(fv: FV): DenseVector[Double] = { val act = innerLayer.activations(fv) - var i = 0; + var i = 0 while (i < act.size) { act(i) = nonlinearFcn.fcn(i, act(i)) i += 1 @@ -56,7 +56,7 @@ case class NonlinearTransform[FV](nonLinType: String, size: Int, inner: Transfor def tallyDerivative(deriv: DenseVector[Double], _scale: =>Vector[Double], fv: FV) = { val scale = _scale val act = innerLayer.activations(fv) - var i = 0; + var i = 0 while (i < act.size) { act(i) = nonlinearFcn.deriv(i, act(i)) i += 1 @@ -94,8 +94,8 @@ object NonlinearTransform { trait NonlinearFcn { // idx is the position of the unit; this basically only applies to dropout // where we want to zero out particular units - def fcn(idx: Int, x: Double): Double; - def deriv(idx: Int, x: Double): Double; + def fcn(idx: Int, x: Double): Double + def deriv(idx: Int, x: Double): Double } case class Constant() extends NonlinearFcn { @@ -103,17 +103,17 @@ object NonlinearTransform { def deriv(idx: Int, x: Double) = 0 } - case class Mask(val mask: Array[Boolean]) extends NonlinearFcn { + case class Mask(mask: Array[Boolean]) extends NonlinearFcn { def fcn(idx: Int, x: Double) = if (mask(idx)) x else 0 def deriv(idx: Int, x: Double) = if (mask(idx)) 1 else 0 } - case class ShiftAndScaleEach(val shifts: Array[Double], val factors: Array[Double]) extends NonlinearFcn { + case class ShiftAndScaleEach(shifts: Array[Double], factors: Array[Double]) extends NonlinearFcn { def fcn(idx: Int, x: Double) = factors(idx) * (x - 
shifts(idx)) def deriv(idx: Int, x: Double) = factors(idx) } - case class Scale(val factor: Double) extends NonlinearFcn { + case class Scale(factor: Double) extends NonlinearFcn { def fcn(idx: Int, x: Double) = factor * x def deriv(idx: Int, x: Double) = factor } diff --git a/src/main/scala/epic/dense/OutputEmbeddingTransform.scala b/src/main/scala/epic/dense/OutputEmbeddingTransform.scala index d9d494e8..a5f7e01d 100644 --- a/src/main/scala/epic/dense/OutputEmbeddingTransform.scala +++ b/src/main/scala/epic/dense/OutputEmbeddingTransform.scala @@ -17,7 +17,6 @@ import scala.util.Random */ case class OutputEmbeddingTransform[FV](numOutputs: Int, outputDim: Int, innerTransform: Transform[FV, DenseVector[Double]], coarsenerForInitialization: Option[Int => Int] = None) extends OutputTransform[FV, DenseVector[Double]] { - val index = SegmentedIndex(new AffineTransform.Index(numOutputs, outputDim, true), innerTransform.index) @@ -30,12 +29,12 @@ case class OutputEmbeddingTransform[FV](numOutputs: Int, outputDim: Int, innerTr def clipEmbeddingNorms(weights: DenseVector[Double]) { val embeddings = weights(index.componentOffset(1) until index.componentOffset(1) + (numOutputs * outputDim)).asDenseMatrix.reshape(numOutputs, outputDim, view = View.Require) - OutputEmbeddingTransform.clipEmbeddingNorms(embeddings); + OutputEmbeddingTransform.clipEmbeddingNorms(embeddings) } def displayEmbeddingNorms(weights: DenseVector[Double]) { val embeddings = weights(index.componentOffset(1) until index.componentOffset(1) + (numOutputs * outputDim)).asDenseMatrix.reshape(numOutputs, outputDim, view = View.Require) - OutputEmbeddingTransform.displayEmbeddingNorms(embeddings); + OutputEmbeddingTransform.displayEmbeddingNorms(embeddings) } def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String) = { diff --git a/src/main/scala/epic/dense/OutputTransform.scala b/src/main/scala/epic/dense/OutputTransform.scala index fe64cb4c..e910020a 100644 --- a/src/main/scala/epic/dense/OutputTransform.scala +++ b/src/main/scala/epic/dense/OutputTransform.scala @@ -7,40 +7,28 @@ import scala.util.Random trait OutputTransform[In, +Out] extends Serializable { val index: Index[Feature] - def extractLayer(dv: DenseVector[Double], forTrain: Boolean):OutputLayer = extractLayerAndPenultimateLayer(dv, forTrain)._1 - - def extractLayerAndPenultimateLayer(dv: DenseVector[Double], forTrain: Boolean): (OutputLayer, Transform.Layer[In,Out]); - + def extractLayerAndPenultimateLayer(dv: DenseVector[Double], forTrain: Boolean): (OutputLayer, Transform.Layer[In,Out]) def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String): DenseVector[Double] - def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) - - def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int]; - + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] type OutputLayer <: OutputTransform.OutputLayer[In,Out] } object OutputTransform { trait OutputLayer[In, +Out] extends Transform.Layer[In,Out] { - - def index: Index[Feature]; - + def index: Index[Feature] def activations(fv: In):Out - - def activationsFromPenultimateDot(innerLayerActivations: DenseVector[Double], sparseIdx: Int): Double; - + def activationsFromPenultimateDot(innerLayerActivations: DenseVector[Double], sparseIdx: Int): Double def activationsFromPenultimateDot(innerLayerActivations: DenseVector[Double], sparseFeatures: Array[Int]): Double = { - var value = 0.0; + var 
value = 0.0 for (sparseFeature <- sparseFeatures) { value += activationsFromPenultimateDot(innerLayerActivations, sparseFeature) } value } - def tallyDerivative(deriv: DenseVector[Double], scale: =>Vector[Double], fv: In) - def applyBatchNormalization(inputs: scala.collection.GenTraversable[In]) } diff --git a/src/main/scala/epic/dense/TanhTransform.scala b/src/main/scala/epic/dense/TanhTransform.scala index 92a052d5..7594298d 100644 --- a/src/main/scala/epic/dense/TanhTransform.scala +++ b/src/main/scala/epic/dense/TanhTransform.scala @@ -27,7 +27,7 @@ case class TanhTransform[FV](inner: Transform[FV, DenseVector[Double]]) extends val myIndex = Index[Feature] - def index = myIndex; + def index = myIndex def activations(fv: FV): DenseVector[Double] = { val act = innerLayer.activations(fv) * 2.0 diff --git a/src/main/scala/epic/dense/Transform.scala b/src/main/scala/epic/dense/Transform.scala index 10fd55ee..885ccc62 100644 --- a/src/main/scala/epic/dense/Transform.scala +++ b/src/main/scala/epic/dense/Transform.scala @@ -12,29 +12,19 @@ import scala.util.Random */ trait Transform[In, +Out] extends Serializable { val index: Index[Feature] - - def extractLayer(dv: DenseVector[Double], forTrain: Boolean):Layer - def initialWeightVector(initWeightsScale: Double, rng: Random, outputLayer: Boolean, spec: String): DenseVector[Double] - def clipHiddenWeightVectors(weights: DenseVector[Double], norm: Double, outputLayer: Boolean) - - def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int]; - + def getInterestingWeightIndicesForGradientCheck(offset: Int): Seq[Int] type Layer <: Transform.Layer[In,Out] } object Transform { trait Layer[In, +Out] { - - def index: Index[Feature]; - + def index: Index[Feature] def activations(fv: In):Out - def tallyDerivative(deriv: DenseVector[Double], scale: =>Vector[Double], fv: In) - def applyBatchNormalization(inputs: scala.collection.GenTraversable[In]) } diff --git a/src/main/scala/epic/dense/Word2Vec.scala b/src/main/scala/epic/dense/Word2Vec.scala index fe79023b..b45b634f 100644 --- a/src/main/scala/epic/dense/Word2Vec.scala +++ b/src/main/scala/epic/dense/Word2Vec.scala @@ -28,8 +28,8 @@ object Word2Vec { throw new RuntimeException("Unrecognized vectors: " + word2vecPath) } } - val dimsEachSource = vectorsEachSource.map(_.values.head.size) - val finalVectorDim = Math.min(maxVectorLen, dimsEachSource.reduce(_ + _) + (if (inputVectorBias) 1 else 0)) + val dimsEachSource = vectorsEachSource.map(_.values.head.length) + val finalVectorDim = Math.min(maxVectorLen, dimsEachSource.sum + (if (inputVectorBias) 1 else 0)) val finalVectors = new HashMap[String,Array[Float]] val rng = new Random(0) val mostCommonMisses = Counter[String,Double] @@ -37,7 +37,7 @@ object Word2Vec { for (word <- voc) { val containedInSome = vectorsEachSource.map(_.keySet.contains(word)).reduce(_ || _) val vector = if (containedInSome) { - var finalVector = (0 until vectorsEachSource.size).map(i => vectorsEachSource(i).getOrElse(word, { Array.tabulate(dimsEachSource(i))(j => 0.0F) })).reduce(_ ++ _) + var finalVector = vectorsEachSource.indices.map(i => vectorsEachSource(i).getOrElse(word, { Array.tabulate(dimsEachSource(i))(j => 0.0F) })).reduce(_ ++ _) if (inputVectorBias) { finalVector = finalVector ++ Array(1.0F) } @@ -51,8 +51,8 @@ object Word2Vec { Array.tabulate(finalVectorDim)(i => if (i == finalVectorDim - 1 && inputVectorBias) 1.0F else 0.0F) } } - val vectorTrimmed = if (vector.size > finalVectorDim) vector.slice(0, finalVectorDim) else vector - 
require(vectorTrimmed.size == finalVectorDim, "Mismatched sizes, expected dimension " + finalVectorDim + " but got " + vector.size + " clipped to " + vectorTrimmed.size) + val vectorTrimmed = if (vector.length > finalVectorDim) vector.slice(0, finalVectorDim) else vector + require(vectorTrimmed.length == finalVectorDim, "Mismatched sizes, expected dimension " + finalVectorDim + " but got " + vector.length + " clipped to " + vectorTrimmed.length) finalVectors.put(word, vectorTrimmed) } println("Read embeddings for " + voc.size + " words from " + word2vecPaths.size + " sources, " + @@ -78,7 +78,7 @@ object Word2Vec { * generated for them. */ def loadVectorsForVocabulary(word2vecPath: String, voc: Set[String], inputVectorBias: Boolean) = { - val word2vecMap = readWord2Vec(word2vecPath, voc, inputVectorBias); + val word2vecMap = readWord2Vec(word2vecPath, voc, inputVectorBias) if (word2vecMap.isEmpty) { throw new RuntimeException("No word2vec vectors loaded") } @@ -86,7 +86,7 @@ object Word2Vec { } def loadBansalVectorsForVocabulary(word2vecPath: String, voc: Set[String], inputVectorBias: Boolean) = { - val word2vecMap = readBansalEmbeddings(word2vecPath, voc, inputVectorBias); + val word2vecMap = readBansalEmbeddings(word2vecPath, voc, inputVectorBias) if (word2vecMap.isEmpty) { throw new RuntimeException("No Bansal vectors loaded") } @@ -94,7 +94,7 @@ object Word2Vec { } private def augmentVectorsToCompleteVocabulary(word2vecMap: HashMap[String,Array[Float]], voc: Set[String], inputVectorBias: Boolean) = { - val word2vecDim = word2vecMap.values.head.size + val word2vecDim = word2vecMap.values.head.length val rng = new Random(0) for (unkWord <- voc -- word2vecMap.keySet) { // Set to random noise except for the bias feature, if it's there @@ -109,40 +109,40 @@ object Word2Vec { * file. 
*/ def readWord2Vec(word2VecPath: String, words: Set[String], inputVectorBias: Boolean) = { - val bis = new BufferedInputStream(new FileInputStream(word2VecPath)); - val dis = new DataInputStream(bis); - val word2Vec = new HashMap[String,Array[Float]]; + val bis = new BufferedInputStream(new FileInputStream(word2VecPath)) + val dis = new DataInputStream(bis) + val word2Vec = new HashMap[String,Array[Float]] // First two entries are vocabulary size and dimension of vectors - val vocSize = Word2VecUtils.readString(dis).toInt; - val dim = Word2VecUtils.readString(dis).toInt; + val vocSize = Word2VecUtils.readString(dis).toInt + val dim = Word2VecUtils.readString(dis).toInt // Now read vectors, augmented with 1s for bias for (i <- 0 until vocSize) { if (i % 1000000 == 0) { println("On line " + i) } - val word = Word2VecUtils.readString(dis); - val vector = new Array[Float](if (inputVectorBias) dim + 1 else dim); - val len = 0; - var j = 0; + val word = Word2VecUtils.readString(dis) + val vector = new Array[Float](if (inputVectorBias) dim + 1 else dim) + val len = 0 + var j = 0 while (j < dim) { - vector(j) = Word2VecUtils.readFloat(dis); - j += 1; + vector(j) = Word2VecUtils.readFloat(dis) + j += 1 } if (inputVectorBias) { vector(j) = 1.0F } if (words.isEmpty || words.contains(word)) { - word2Vec.put(word, vector); + word2Vec.put(word, vector) } } - println("Loaded " + word2Vec.size + " word2vec representations out of " + words.size + " attempted words"); - word2Vec; + println("Loaded " + word2Vec.size + " word2vec representations out of " + words.size + " attempted words") + word2Vec } - val hyphenPattern = Pattern.compile("(\\w+-)+(\\w+)"); + val hyphenPattern = Pattern.compile("(\\w+-)+(\\w+)") def convertWord(str: String, lowercase: Boolean = false) = { - var strRep = str; + var strRep = str strRep = strRep.replace("-LRB-", "(") strRep = strRep.replace("-RRB-", ")") strRep = strRep.replace("-LSB-", "[") @@ -166,10 +166,10 @@ object Word2Vec { def readBansalEmbeddings(embeddingsPath: String, words: Set[String], inputVectorBias: Boolean) = { val inFile = scala.io.Source.fromFile(new File(embeddingsPath)).getLines() - val word2Vec = new HashMap[String,Array[Float]]; + val word2Vec = new HashMap[String,Array[Float]] var firstLine = true while (inFile.hasNext) { - val line = inFile.next; + val line = inFile.next if (firstLine) { if (line.split("\\s+").size == 2) { println("Skipping first line: " + line) @@ -177,14 +177,14 @@ object Word2Vec { // skip over it by leaving firstLine set to true } else { println("Not skipping first line: " + line) - firstLine = false; + firstLine = false } } if (!firstLine) { // If the line contains a tab, then that's the delimiter between the word and // the vectors if (line.contains("\t")) { - val word = line.substring(0, line.indexOf("\t")); + val word = line.substring(0, line.indexOf("\t")) if (words.isEmpty || words.contains(word)) { val entries = line.substring(line.indexOf("\t") + 1).split(" ") val arr = Array.tabulate(if (inputVectorBias) entries.size + 1 else entries.size)(i => { @@ -198,9 +198,9 @@ object Word2Vec { } } else { // Otherwise, a space is the first delimiter - val word = line.substring(0, line.indexOf(" ")); + val word = line.substring(0, line.indexOf(" ")) if (words.isEmpty || words.contains(word)) { - val entries = line.substring(line.indexOf(" ") + 1).split(" "); + val entries = line.substring(line.indexOf(" ") + 1).split(" ") val arr = Array.tabulate(if (inputVectorBias) entries.size + 1 else entries.size)(i => { if (inputVectorBias && i 
== entries.size) { 1.0F @@ -212,9 +212,9 @@ object Word2Vec { } } } - firstLine = false; + firstLine = false } - println("Loaded " + word2Vec.size + " Bansal representations out of " + words.size + " attempted words"); - word2Vec; + println("Loaded " + word2Vec.size + " Bansal representations out of " + words.size + " attempted words") + word2Vec } } \ No newline at end of file diff --git a/src/main/scala/epic/dense/Word2VecSurfaceFeaturizer.scala b/src/main/scala/epic/dense/Word2VecSurfaceFeaturizer.scala index eef2abaf..0b2634dd 100644 --- a/src/main/scala/epic/dense/Word2VecSurfaceFeaturizer.scala +++ b/src/main/scala/epic/dense/Word2VecSurfaceFeaturizer.scala @@ -21,7 +21,7 @@ class Word2VecIndexed[W](private val wordIndex: Index[W], private val word2vec: Array[Array[Double]], private val converter: W => W) extends Serializable { - def wordRepSize = word2vec.head.size + def wordRepSize = word2vec.head.length def vocSize = wordIndex.size val zeroVector = Array.tabulate(wordRepSize)(i => 0.0) @@ -39,7 +39,7 @@ class Word2VecIndexed[W](private val wordIndex: Index[W], } def augment(numSparseFeats: Int, featurizer: W => Array[Int]): Word2VecIndexed[W] = { - val newWord2Vec = Array.tabulate(word2vec.size)(i => { + val newWord2Vec = Array.tabulate(word2vec.length)(i => { val word = wordIndex.get(i) val feats = featurizer(word) word2vec(i) ++ Array.tabulate(numSparseFeats)(j => if (feats.contains(j)) 1.0 else 0.0) @@ -62,20 +62,20 @@ object Word2VecIndexed { } trait WordVectorAnchoringIndexed[String] { - def reducedFeaturesForSpan(start: Int, end: Int): Array[Int]; - def featuresForSpan(start: Int, end: Int): Array[Int]; - def featuresForSplit(start: Int, split: Int, end: Int): Array[Int]; + def reducedFeaturesForSpan(start: Int, end: Int): Array[Int] + def featuresForSpan(start: Int, end: Int): Array[Int] + def featuresForSplit(start: Int, split: Int, end: Int): Array[Int] } class Word2VecSurfaceFeaturizerIndexed[W](val word2vecIndexed: Word2VecIndexed[W], val featureSpec: String) extends Serializable { def reducedInputSize = { - anchor(IndexedSeq[W]()).reducedFeaturesForSpan(0, 0).size * word2vecIndexed.wordRepSize + anchor(IndexedSeq[W]()).reducedFeaturesForSpan(0, 0).length * word2vecIndexed.wordRepSize } def splitInputSize = { - anchor(IndexedSeq[W]()).featuresForSplit(0, 0, 0).size * word2vecIndexed.wordRepSize + anchor(IndexedSeq[W]()).featuresForSplit(0, 0, 0).length * word2vecIndexed.wordRepSize } def anchor(words: IndexedSeq[W]): WordVectorAnchoringIndexed[W] = { @@ -154,8 +154,8 @@ object Word2VecSurfaceFeaturizerIndexed { trait WordVectorDepAnchoringIndexed[String] { - def getHeadDepPair(begin: Int, split: Int, end: Int, rule: Int): (Int, Int); - def featuresForHeadPair(head: Int, dep: Int): Array[Int]; + def getHeadDepPair(begin: Int, split: Int, end: Int, rule: Int): (Int, Int) + def featuresForHeadPair(head: Int, dep: Int): Array[Int] } class Word2VecDepFeaturizerIndexed[W](val word2VecIndexed: Word2VecIndexed[W], @@ -165,21 +165,21 @@ class Word2VecDepFeaturizerIndexed[W](val word2VecIndexed: Word2VecIndexed[W], val hackyHeadFinder: HackyHeadFinder[String,String] = new RuleBasedHackyHeadFinder def anchor(words: IndexedSeq[W]): WordVectorDepAnchoringIndexed[W] = { - val indexedWords = words.map(word2VecIndexed.indexWord(_)) + val indexedWords = words.map(word2VecIndexed.indexWord) new WordVectorDepAnchoringIndexed[W] { - val preterminals = new Array[String](words.size); - for (i <- 0 until words.size) { - preterminals(i) = tagger.tag(words(i)); + val preterminals = new 
Array[String](words.size) + words.indices.foreach { i => + preterminals(i) = tagger.tag(words(i)) } def getHeadDepPair(begin: Int, split: Int, end: Int, rule: Int): (Int, Int) = { - val lc = topology.labelIndex.get(topology.leftChild(rule)).baseLabel; - val rc = topology.labelIndex.get(topology.rightChild(rule)).baseLabel; - val parent = topology.labelIndex.get(topology.parent(rule)).baseLabel; + val lc = topology.labelIndex.get(topology.leftChild(rule)).baseLabel + val rc = topology.labelIndex.get(topology.rightChild(rule)).baseLabel + val parent = topology.labelIndex.get(topology.parent(rule)).baseLabel - val lcHeadIdx = begin + hackyHeadFinder.findHead(lc, preterminals.slice(begin, split)); - val rcHeadIdx = split + hackyHeadFinder.findHead(rc, preterminals.slice(split, end)); + val lcHeadIdx = begin + hackyHeadFinder.findHead(lc, preterminals.slice(begin, split)) + val rcHeadIdx = split + hackyHeadFinder.findHead(rc, preterminals.slice(split, end)) val overallHeadIdx = begin + hackyHeadFinder.findHead(parent, preterminals.slice(begin, end)) if (overallHeadIdx == rcHeadIdx) { (rcHeadIdx, lcHeadIdx) @@ -205,28 +205,28 @@ trait Tagger[W] { class FrequencyTagger[W](wordTagCounts: Counter2[String, W, Double]) extends Tagger[W] with Serializable { - private val wordCounts = Counter[W,Double]; - private val wordToTagMap = new HashMap[W,String]; + private val wordCounts = Counter[W,Double] + private val wordToTagMap = new HashMap[W,String] for (word <- wordTagCounts.keysIterator.map(_._2).toSeq.distinct) { - wordCounts(word) = sum(wordTagCounts(::, word)); + wordCounts(word) = sum(wordTagCounts(::, word)) if (!wordToTagMap.contains(word)) { - val tagCounts = wordTagCounts(::, word).iterator; - var bestTag = HackyLexicalProductionFeaturizer.UnkTag; - var bestTagCount = 0.0; + val tagCounts = wordTagCounts(::, word).iterator + var bestTag = HackyLexicalProductionFeaturizer.UnkTag + var bestTagCount = 0.0 for ((tag, count) <- tagCounts) { if (count > bestTagCount) { - bestTag = tag; - bestTagCount = count; + bestTag = tag + bestTagCount = count } } - wordToTagMap.put(word, bestTag); + wordToTagMap.put(word, bestTag) } } val tagTypesIdx = Index[String] wordToTagMap.values.toSet[String].foreach(tagType => tagTypesIdx.index(tagType)) tagTypesIdx.index(HackyLexicalProductionFeaturizer.UnkTag) - def tag(word: W) = if (wordToTagMap.contains(word)) wordToTagMap(word) else HackyLexicalProductionFeaturizer.UnkTag; + def tag(word: W) = if (wordToTagMap.contains(word)) wordToTagMap(word) else HackyLexicalProductionFeaturizer.UnkTag def convertToFeaturizer: W => Array[Int] = (word: W) => Array(tagTypesIdx.index(tag(word))) } diff --git a/src/main/scala/epic/features/BilexicalFeaturizer.scala b/src/main/scala/epic/features/BilexicalFeaturizer.scala index a51b722f..95225a44 100644 --- a/src/main/scala/epic/features/BilexicalFeaturizer.scala +++ b/src/main/scala/epic/features/BilexicalFeaturizer.scala @@ -63,12 +63,11 @@ object BilexicalFeaturizer { } } - case class AdaptedSurfaceFeaturizer[W](base: SurfaceFeaturizer[W]) extends BilexicalFeaturizer[W] { def anchor(w: IndexedSeq[W]): BilexicalFeatureAnchoring[W] = new BilexicalFeatureAnchoring[W] { val ba = base.anchor(w) def featuresForAttachment(head: Int, dep: Int): Array[Feature] = { - if(head < dep) ba.featuresForSpan(head, dep) + if (head < dep) ba.featuresForSpan(head, dep) else ba.featuresForSpan(dep, head) } } @@ -91,7 +90,7 @@ object BilexicalFeaturizer { case class BinomialFeaturizer[W](headBase: BilexicalFeaturizer[W], depBase: 
BilexicalFeaturizer[W]) extends BilexicalFeaturizer[W] { def anchor(w: IndexedSeq[W]): BilexicalFeatureAnchoring[W] = new BilexicalFeatureAnchoring[W] { val hb = headBase.anchor(w) - val db = if(headBase eq depBase) hb else depBase.anchor(w) + val db = if (headBase eq depBase) hb else depBase.anchor(w) def featuresForAttachment(head: Int, dep: Int): Array[Feature] = { val hf = hb.featuresForAttachment(head, dep) val df = db.featuresForAttachment(head, dep) @@ -104,7 +103,7 @@ object BilexicalFeaturizer { case class HeadDepFeaturizer[W](headBase: WordFeaturizer[W], depBase: WordFeaturizer[W]) extends BilexicalFeaturizer[W] { def anchor(w: IndexedSeq[W]): BilexicalFeatureAnchoring[W] = new BilexicalFeatureAnchoring[W] { val hb = headBase.anchor(w) - val db = if(headBase eq depBase) hb else depBase.anchor(w) + val db = if (headBase eq depBase) hb else depBase.anchor(w) def featuresForAttachment(head: Int, dep: Int): Array[Feature] = { Arrays.crossProduct(hb.featuresForWord(head), db.featuresForWord(dep))((a, b) => HeadDepFeature(a,b):Feature) } @@ -133,7 +132,6 @@ trait BilexicalFeatureAnchoring[W] { def featuresForAttachment(head: Int, dep: Int):Array[Feature] } - @SerialVersionUID(1L) class ProductIndexedBilexicalFeaturizer[W](headFeaturizer: IndexedWordFeaturizer[W], depFeaturizer: IndexedWordFeaturizer[W], @@ -153,7 +151,6 @@ class ProductIndexedBilexicalFeaturizer[W](headFeaturizer: IndexedWordFeaturizer ret = f1 cache(head)(dep) = f1 } - ret } } @@ -208,8 +205,8 @@ object IndexedBilexicalFeaturizer { for( (head, dep) <- tree.arcs if head < tree.words.length) { builder.add(hanch.featuresForWord(head), danch.featuresForWord(dep)) - // builder.add(danch.featuresForWord(head), - // hanch.featuresForWord(dep)) + // builder.add(danch.featuresForWord(head), + // hanch.featuresForWord(dep)) } } diff --git a/src/main/scala/epic/features/BrownClusters.scala b/src/main/scala/epic/features/BrownClusters.scala index c39796a2..2c383559 100644 --- a/src/main/scala/epic/features/BrownClusters.scala +++ b/src/main/scala/epic/features/BrownClusters.scala @@ -22,17 +22,14 @@ object BrownClusters { } yield { word -> cluster.intern } - val map = pairs.toMap in.close() - map } lazy val clusterIds = theClusters.values.toSet - def clusterFor(w: String, default:String = "00"):String = theClusters.getOrElse(w, default) - + def clusterFor(w: String, default:String = "00"): String = theClusters.getOrElse(w, default) trait DSL { // Tkachenko and Simanovsky liked these values @@ -41,14 +38,13 @@ object BrownClusters { } } - case class BrownClusterFeature(f: String) extends Feature case class BrownClusterFeaturizer(lengths: Array[Int]) extends WordFeaturizer[String] with Serializable { def anchor(w: IndexedSeq[String]): WordFeatureAnchoring[String] = new WordFeatureAnchoring[String] { def featuresForWord(pos: Int): Array[Feature] = { - if(pos < 0 || pos >= words.length) + if (pos < 0 || pos >= words.length) Array(BoundaryFeature) else features(pos) @@ -68,7 +64,7 @@ case class BrownClusterFeaturizer(lengths: Array[Int]) extends WordFeaturizer[St private val clusterFeatures = { BrownClusters.clusterIds .iterator - .map(k => k -> lengths.map(l => if(l > k.length) BrownClusterFeature(k) else BrownClusterFeature(k.substring(0, l))).toSet[Feature].toArray[Feature]) + .map(k => k -> lengths.map(l => if (l > k.length) BrownClusterFeature(k) else BrownClusterFeature(k.substring(0, l))).toSet[Feature].toArray[Feature]) .toMap } } diff --git a/src/main/scala/epic/features/ContextFeaturizer.scala 
b/src/main/scala/epic/features/ContextFeaturizer.scala index b67600e9..1302ead1 100644 --- a/src/main/scala/epic/features/ContextFeaturizer.scala +++ b/src/main/scala/epic/features/ContextFeaturizer.scala @@ -22,7 +22,7 @@ case class ContextFeaturizer[W](featurizer: WordFeaturizer[W], window: Int) exte } def featuresForWord(pos: Int): Array[Feature] = { - if(pos < 0 || pos >= words.length) + if (pos < 0 || pos >= words.length) Array.empty else settedFeatures(pos) diff --git a/src/main/scala/epic/features/ContextWordFeaturizer.scala b/src/main/scala/epic/features/ContextWordFeaturizer.scala index 6737fc71..8c716f92 100644 --- a/src/main/scala/epic/features/ContextWordFeaturizer.scala +++ b/src/main/scala/epic/features/ContextWordFeaturizer.scala @@ -4,7 +4,6 @@ import epic.framework.Feature import scala.collection.mutable.ArrayBuffer import epic.util.Arrays - /** * * @author dlwh @@ -19,7 +18,6 @@ class ContextWordFeaturizer[W](offsetFeaturizer: WordFeaturizer[W], wordOffsetOr for(off <- -wordOffsetOrder to wordOffsetOrder if off != 0) { result ++= offsetAnchoring.featuresForWord(pos + off).map(f => OffsetFeature(off, f):Feature) } - /* val myFeats = offsetAnchoring.featuresForWord(pos) result ++= Arrays.crossProduct(Array(myFeats.head), offsetAnchoring.featuresForWord(pos+1)){BigramFeature(0, _, _)} @@ -30,7 +28,6 @@ class ContextWordFeaturizer[W](offsetFeaturizer: WordFeaturizer[W], wordOffsetOr def words: IndexedSeq[W] = w - } } diff --git a/src/main/scala/epic/features/CrossProductIndex.scala b/src/main/scala/epic/features/CrossProductIndex.scala index 510ec2f9..9d659f89 100644 --- a/src/main/scala/epic/features/CrossProductIndex.scala +++ b/src/main/scala/epic/features/CrossProductIndex.scala @@ -12,7 +12,7 @@ import scala.util.hashing.MurmurHash3 @SerialVersionUID(1743448091752596096L) case class CrossProductFeature[A, B](labelPart: A, surfacePart: B, id: String = "") extends Feature { - override def toString = s"${if(id.nonEmpty) id else "CrossProduct"}Feature($labelPart, $surfacePart)" + override def toString = s"${if (id.nonEmpty) id else "CrossProduct"}Feature($labelPart, $surfacePart)" } /** @@ -32,7 +32,6 @@ class CrossProductIndex[A, B] private (val firstIndex: Index[A], def surfacePart(i: Int) = surfacePartOfFeature(i - labelOnlySize) def labelPart(i: Int) = labelPartOfFeature(i - labelOnlySize) - def lock = { val lockedFirst: Index[A] = firstIndex match { case x: HashExtendingIndex[A] => x.lock @@ -58,39 +57,36 @@ class CrossProductIndex[A, B] private (val firstIndex: Index[A], case _ => -1 } - def mapped(labelFeature: Int, surfaceFeature: Int):Int = { - if(labelFeature < 0 || surfaceFeature < 0) { + def mapped(labelFeature: Int, surfaceFeature: Int): Int = { + if (labelFeature < 0 || surfaceFeature < 0) { -1 } else { val arr = mapping(labelFeature) - val f = if(arr ne null) { + val f = if (arr ne null) { arr(surfaceFeature) } else { -1 } - - if(f != -1 || numHashFeatures == 0) { + if (f != -1 || numHashFeatures == 0) { f } else if (f < -1) { // really not present -1 } else { val hf = MurmurHash3.mixLast(MurmurHash3.mix(10891, labelFeature.##), surfaceFeature.##).abs - if(!seenSet.addOrSeen(hf)) { + if (!seenSet.addOrSeen(hf)) { -1 } else { (hf % numHashFeatures) + trueSize } } } - } - - private val labelOnlySize: Int = if(includePlainLabelFeatures) firstIndex.size else 0 + private val labelOnlySize: Int = if (includePlainLabelFeatures) firstIndex.size else 0 private val trueSize = labelOnlySize + labelPartOfFeature.length override def size: Int = trueSize + numHashFeatures - 
def unapply(i: Int): Option[Feature] = if(i >= size || i < 0) None else Some(get(i)) + def unapply(i: Int): Option[Feature] = if (i >= size || i < 0) None else Some(get(i)) override def get(i: Int): Feature = { if (i >= size || i < 0) { @@ -110,15 +106,15 @@ class CrossProductIndex[A, B] private (val firstIndex: Index[A], def crossProduct(lFeatures: Array[Int], sFeatures: Array[Int], offset: Int = 0, usePlainLabelFeatures: Boolean = true):Array[Int] = { val builder = new mutable.ArrayBuilder.ofInt - builder.sizeHint(lFeatures.length * (sFeatures.length + {if(includePlainLabelFeatures) 1 else 0})) + builder.sizeHint(lFeatures.length * (sFeatures.length + {if (includePlainLabelFeatures) 1 else 0})) var i = 0 - while(i < lFeatures.length) { - if(usePlainLabelFeatures && includePlainLabelFeatures && lFeatures(i) >= 0) + while (i < lFeatures.length) { + if (usePlainLabelFeatures && includePlainLabelFeatures && lFeatures(i) >= 0) builder += (lFeatures(i) + offset) var j = 0 - while(j < sFeatures.length) { + while (j < sFeatures.length) { val m = mapped(lFeatures(i),sFeatures(j)) + offset - if(m != -1) + if (m != -1) builder += m j += 1 } @@ -133,32 +129,31 @@ class CrossProductIndex[A, B] private (val firstIndex: Index[A], val builder = new CSCMatrix.Builder[Double](firstIndex.size, secondIndex.size) val vbuilder = new VectorBuilder[Double](firstIndex.size) - if(includePlainLabelFeatures) { + if (includePlainLabelFeatures) { for(i <- 0 until firstIndex.size) { val w = weights(i) - if(w != 0.0) + if (w != 0.0) vbuilder.add(i, w) } } - if(numHashFeatures == 0) { + if (numHashFeatures == 0) { // if no hash features, we can just iterate over the enumerated part of the index for(((l, s), i) <- (labelPartOfFeature zip surfacePartOfFeature).zipWithIndex) { val w = weights(i + labelOnlySize) - if(w != 0.0) + if (w != 0.0) builder.add(l, s, w) } } else { // otherwise, check everything for(l <- 0 until firstIndex.size; s <- 0 until secondIndex.size) { val i = mapped(l, s) - if(i >= 0 && weights(i) != 0) { + if (i >= 0 && weights(i) != 0) { builder.add(l, s, weights(i)) } } } - (builder.result(), vbuilder.toSparseVector(true, true)) } @@ -175,7 +170,7 @@ class CrossProductIndex[A, B] private (val firstIndex: Index[A], */ def prune(shouldPrune: Int=>Boolean, rebuildSurfaceIndex: Boolean = true):CrossProductIndex[A, B] = { val newSecondIndex = Index[B]() - def newIndexOf(b: Int) = if(rebuildSurfaceIndex) newSecondIndex.index(secondIndex.get(b)) else b + def newIndexOf(b: Int) = if (rebuildSurfaceIndex) newSecondIndex.index(secondIndex.get(b)) else b def alreadyInNewIndex(b: Int) = !rebuildSurfaceIndex || newSecondIndex.contains(secondIndex.get(b)) val mapping = Array.fill(firstIndex.size)(new OpenAddressHashArray[Int](secondIndex.size max 1, -1, 4)) val newLabelPart, newSurfacePart = new ArrayBuffer[Int]() @@ -211,7 +206,7 @@ class CrossProductIndex[A, B] private (val firstIndex: Index[A], } new CrossProductIndex(firstIndex, - if(rebuildSurfaceIndex) newSecondIndex else secondIndex, + if (rebuildSurfaceIndex) newSecondIndex else secondIndex, mapping, newLabelPart.toArray, newSurfacePart.toArray, id, includePlainLabelFeatures, @@ -240,12 +235,12 @@ object CrossProductIndex { val includeLabelOnlyFeatures: Boolean = true, minCount: Int = 1, seenSet: LockableSeenSet[Long] = LockableSeenSet.always) extends SafeLogging { - def add(a: A, b: B):Int = add(firstIndex(a), secondIndex(b)) + def add(a: A, b: B): Int = add(firstIndex(a), secondIndex(b)) private val counts = Array.fill(firstIndex.size)(new 
OpenAddressHashArray[Int](secondIndex.size max 1, 0, 4)) private val mapping = Array.fill(firstIndex.size)(new OpenAddressHashArray[Int](secondIndex.size max 1, -1, 4)) private val labelPart, surfacePart = new ArrayBuffer[Int]() - private val labelOnlySize: Int = if(includeLabelOnlyFeatures) firstIndex.size else 0 + private val labelOnlySize: Int = if (includeLabelOnlyFeatures) firstIndex.size else 0 def size = labelPart.size + labelOnlySize @@ -257,14 +252,14 @@ object CrossProductIndex { secondArray.map(add(first, _)) } - def add(first: Int, second: Int):Int = { - if(first < 0 || second < 0) { + def add(first: Int, second: Int): Int = { + if (first < 0 || second < 0) { -1 } else { val currentIndex: Int = mapping(first)(second) - if(currentIndex == -1) { + if (currentIndex == -1) { val currentCount = counts(first)(second) - if(minCount <= 1 || currentCount + 1 >= minCount) { + if (minCount <= 1 || currentCount + 1 >= minCount) { val x = size mapping(first)(second) = x labelPart += first diff --git a/src/main/scala/epic/features/DistanceBinner.scala b/src/main/scala/epic/features/DistanceBinner.scala index 2f968cb3..4a29f4e7 100644 --- a/src/main/scala/epic/features/DistanceBinner.scala +++ b/src/main/scala/epic/features/DistanceBinner.scala @@ -27,13 +27,13 @@ class DistanceBinner private (val binThresholds: Array[Int], preserveDirection: else bin + 1 } - def distanceBin(a: Int, b: Int):Int = { + def distanceBin(a: Int, b: Int): Int = { val dist: Int = b - a distanceBin(dist) } def distanceBin(dist: Int): Int = { - val array = if(dist < 0) negativeBins else bins + val array = if (dist < 0) negativeBins else bins val adist = math.min(math.abs(dist), array.length - 1) array(adist) } @@ -44,13 +44,13 @@ class DistanceBinner private (val binThresholds: Array[Int], preserveDirection: def binnedDistance(dist: Int): Int = { val bin = distanceBin(dist) - if(dist == 0) 0 - else if(bin < 0) { - if(-bin-1 >= binThresholds.length) + if (dist == 0) 0 + else if (bin < 0) { + if (-bin-1 >= binThresholds.length) -(binThresholds.last + 1) else -binThresholds(-bin-1) - } else if(bin >= binThresholds.length) { - (binThresholds.last + 1) + } else if (bin >= binThresholds.length) { + binThresholds.last + 1 } else binThresholds(bin-1) } @@ -58,10 +58,9 @@ class DistanceBinner private (val binThresholds: Array[Int], preserveDirection: } - object DistanceBinner { def mkBinArray(numBins: Int, numExactBins: Int): Array[Int] = { - if(numBins <= 1) Array(1) + if (numBins <= 1) Array(1) else { val exact = Array.range(1, numExactBins+1) exact ++ Array.iterate(exact.last, (numBins - numExactBins) max 1)(exact => exact * 2).drop(1) diff --git a/src/main/scala/epic/features/EnglishWordClassGenerator.scala b/src/main/scala/epic/features/EnglishWordClassGenerator.scala index 663c6066..c48400ca 100644 --- a/src/main/scala/epic/features/EnglishWordClassGenerator.scala +++ b/src/main/scala/epic/features/EnglishWordClassGenerator.scala @@ -1,6 +1,5 @@ package epic.features - /** * Converts a string into another string with properties of that string * Useful for rare or 0 count words @@ -11,60 +10,59 @@ object EnglishWordClassGenerator extends (String=>String) with Serializable { def apply(x: String) = signatureFor(x) def signatureFor(word: String) = { - val sb = new StringBuilder; - val wlen = word.length(); - val numCaps = (word: Seq[Char]).count(_.isUpper); - val hasDigit = word.exists(_.isDigit); - val hasDash = word.contains('-'); - val hasLower = numCaps < wlen; - val ch0 = word.charAt(0); - val lowered = 
word.toLowerCase(); + val sb = new StringBuilder + val wlen = word.length() + val numCaps = (word: Seq[Char]).count(_.isUpper) + val hasDigit = word.exists(_.isDigit) + val hasDash = word.contains('-') + val hasLower = numCaps < wlen + val ch0 = word.charAt(0) + val lowered = word.toLowerCase() if (Character.isUpperCase(ch0) || Character.isTitleCase(ch0)) { if (numCaps == 1) { - sb.append("-INITC"); + sb.append("-INITC") } else { - sb.append("-CAPS"); + sb.append("-CAPS") } } else if (!Character.isLetter(ch0) && numCaps > 0) { - sb.append("-CAPS"); + sb.append("-CAPS") } else if (hasLower) { - sb.append("-LC"); + sb.append("-LC") } - if (hasDigit) { - sb.append("-NUM"); + sb.append("-NUM") } if (hasDash) { - sb.append("-DASH"); + sb.append("-DASH") } if (lowered.endsWith("s") && wlen >= 3) { // here length 3, so you don't miss out on ones like 80s - val ch2 = lowered.charAt(wlen - 2); + val ch2 = lowered.charAt(wlen - 2) // not -ess suffixes or greek/latin -us, -is if (ch2 != 's' && ch2 != 'i' && ch2 != 'u') { - sb.append("-s"); + sb.append("-s") } } else if (word.length() >= 5 && !hasDash && !(hasDigit && numCaps > 0)) { if (lowered.endsWith("ed")) { - sb.append("-ed"); + sb.append("-ed") } else if (lowered.endsWith("ing")) { - sb.append("-ing"); + sb.append("-ing") } else if (lowered.endsWith("ion")) { - sb.append("-ion"); + sb.append("-ion") } else if (lowered.endsWith("er")) { - sb.append("-er"); + sb.append("-er") } else if (lowered.endsWith("est")) { - sb.append("-est"); + sb.append("-est") } else if (lowered.endsWith("ly")) { - sb.append("-ly"); + sb.append("-ly") } else if (lowered.endsWith("ity")) { - sb.append("-ity"); + sb.append("-ity") } else if (lowered.endsWith("y")) { - sb.append("-y"); + sb.append("-y") } else if (lowered.endsWith("al")) { - sb.append("-al"); + sb.append("-al") } } - sb.toString; + sb.toString } } diff --git a/src/main/scala/epic/features/HackyHeadFinder.scala b/src/main/scala/epic/features/HackyHeadFinder.scala index 3310ecd9..5e6f8e99 100644 --- a/src/main/scala/epic/features/HackyHeadFinder.scala +++ b/src/main/scala/epic/features/HackyHeadFinder.scala @@ -2,7 +2,6 @@ package epic.features import scala.collection.mutable.HashMap - /** * HackyHeadFinders find "heads" in a span using only preterminal labels. * It doesn't use the syntactic structure of the sentence. @@ -12,7 +11,7 @@ import scala.collection.mutable.HashMap * @tparam T */ trait HackyHeadFinder[L,T] extends Serializable { - def findHead(label: L, preterminals: Seq[T]): Int; + def findHead(label: L, preterminals: Seq[T]): Int } case class RuleBasedHackyHeadFinder() extends HackyHeadFinder[String,String] { @@ -21,80 +20,80 @@ case class RuleBasedHackyHeadFinder() extends HackyHeadFinder[String,String] { if (!RuleBasedHackyHeadFinder.headRules.contains(label)) { 0 } else { - val result = RuleBasedHackyHeadFinder.headRules(label)(preterminals); + val result = RuleBasedHackyHeadFinder.headRules(label)(preterminals) if (result == -1) { - println("-1 for " + label + ": " + preterminals); + println("-1 for " + label + ": " + preterminals) } - result; + result } } } object RuleBasedHackyHeadFinder { - val L2R = true; - val R2L = false; + val L2R = true + val R2L = false - val headRules = new HashMap[String,(Seq[String] => Int)]; + val headRules = new HashMap[String,(Seq[String] => Int)] // Ss: lots of problems are due to fronted PPs, NPs with sentential complements, etc. 
// NPs: lots of stuff due to CD / $, weird NPs // SBAR: I can't figure out why 0 tends to work the best but it does - headRules.put("ADJP", (preterminals) => searchFindFirst(preterminals, L2R, Set("NNS", "NN", "$", "JJ", "VBN", "VBG", "JJR", "JJS"))); - headRules.put("ADVP", (preterminals) => searchFindFirst(preterminals, R2L, Set("RB", "RBR", "RBS", "FW"))); + headRules.put("ADJP", (preterminals) => searchFindFirst(preterminals, L2R, Set("NNS", "NN", "$", "JJ", "VBN", "VBG", "JJR", "JJS"))) + headRules.put("ADVP", (preterminals) => searchFindFirst(preterminals, R2L, Set("RB", "RBR", "RBS", "FW"))) headRules.put("NP", (preterminals) => searchFindLastBefore(preterminals, L2R, Set("NN", "NNP", "NNPS", "NNS", "NX", "POS", "JJR", "$", "PRN"), Set(",", "WDT", "TO", "IN", "-LRB-", ":", "CC", "(" ))); // block appositives, complementizers, prepositions, parentheticals, conjunctions - headRules.put("QP", (preterminals) => searchFindFirst(preterminals, L2R, Set("$", "IN", "CD"))); - headRules.put("PP", (preterminals) => searchFindFirst(preterminals, L2R, Set("IN", "TO", "VBG", "VBN", "RP", "FW"))); - headRules.put("PRN", (preterminals) => if (preterminals.size > 1) 1 else 0); - headRules.put("S", (preterminals) => searchFindFirst(preterminals, L2R, Set("TO", "VBD", "VBN", "MD", "VBZ", "VB", "VBG", "VBP"))); - headRules.put("VP", (preterminals) => searchFindFirst(preterminals, L2R, Set("TO", "VBD", "VBN", "MD", "VBZ", "VB", "VBG", "VBP"))); -// headRules.put("SBAR", (preterminals) => searchFindFirst(preterminals, L2R, Set("WP", "WP$", "WDT", "WRB", "IN", "PRP", "PRP$"))); + headRules.put("QP", (preterminals) => searchFindFirst(preterminals, L2R, Set("$", "IN", "CD"))) + headRules.put("PP", (preterminals) => searchFindFirst(preterminals, L2R, Set("IN", "TO", "VBG", "VBN", "RP", "FW"))) + headRules.put("PRN", (preterminals) => if (preterminals.size > 1) 1 else 0) + headRules.put("S", (preterminals) => searchFindFirst(preterminals, L2R, Set("TO", "VBD", "VBN", "MD", "VBZ", "VB", "VBG", "VBP"))) + headRules.put("VP", (preterminals) => searchFindFirst(preterminals, L2R, Set("TO", "VBD", "VBN", "MD", "VBZ", "VB", "VBG", "VBP"))) + // headRules.put("SBAR", (preterminals) => searchFindFirst(preterminals, L2R, Set("WP", "WP$", "WDT", "WRB", "IN", "PRP", "PRP$"))); def searchFindFirst(preterminals: Seq[String], leftToRight: Boolean, goodOnes: Set[String]): Int = { - val start = if (leftToRight) 0 else preterminals.size - 1; - val end = if (leftToRight) preterminals.size else -1; - var headIdx = -1; - var i = start; + val start = if (leftToRight) 0 else preterminals.size - 1 + val end = if (leftToRight) preterminals.size else -1 + var headIdx = -1 + var i = start while (i != end && headIdx == -1) { if (goodOnes.contains(preterminals(i))) { - headIdx = i; + headIdx = i } - i += (if (leftToRight) 1 else -1); + i += (if (leftToRight) 1 else -1) } if (headIdx < 0 || headIdx >= preterminals.size) { - headIdx = start; + headIdx = start } - headIdx; + headIdx } def searchFindLastBefore(preterminals: Seq[String], leftToRight: Boolean, goodOnes: Set[String], blockers: Set[String]) = { - val start = if (leftToRight) 0 else preterminals.size - 1; - val end = if (leftToRight) preterminals.size else -1; - var headIdx = -1; - var i = start; - var blocked = false; + val start = if (leftToRight) 0 else preterminals.size - 1 + val end = if (leftToRight) preterminals.size else -1 + var headIdx = -1 + var i = start + var blocked = false while (i != end && !blocked) { if (goodOnes.contains(preterminals(i))) { - headIdx = i; + 
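// Unlike searchFindFirst, keep overwriting headIdx here: the scan only stops at a blocker tag or the end of the span, so the last matching tag before any blocker wins.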
headIdx = i } if (blockers.contains(preterminals(i))) { - blocked = true; + blocked = true } else { - i += (if (leftToRight) 1 else -1); + i += (if (leftToRight) 1 else -1) } } if (headIdx == -1) { -// headIdx = if (leftToRight) preterminals.size - 1 else 0; - headIdx = if (leftToRight) Math.max(0, i - 1) else Math.min(i+1, preterminals.size); + // headIdx = if (leftToRight) preterminals.size - 1 else 0; + headIdx = if (leftToRight) Math.max(0, i - 1) else Math.min(i+1, preterminals.size) } if (headIdx < 0 || headIdx >= preterminals.size) { - headIdx = 0; + headIdx = 0 } - headIdx; + headIdx } } diff --git a/src/main/scala/epic/features/HackyHeadFinderTest.scala b/src/main/scala/epic/features/HackyHeadFinderTest.scala index aa832014..2d1d967e 100644 --- a/src/main/scala/epic/features/HackyHeadFinderTest.scala +++ b/src/main/scala/epic/features/HackyHeadFinderTest.scala @@ -13,86 +13,83 @@ import scala.collection.mutable.HashMap object HackyHeadFinderTest { def main(args: Array[String]) { -// val treebank = new SimpleTreebank(new File(ptbPath), new File(ptbPath), new File(ptbPath)); - val treebank = Treebank.fromPennTreebankDir(new File("data/wsj")); + // val treebank = new SimpleTreebank(new File(ptbPath), new File(ptbPath), new File(ptbPath)) + val treebank = Treebank.fromPennTreebankDir(new File("data/wsj")) - val process = PartialTreeProcessor(); - val treesWords = treebank.train.trees.toSeq; - val processedTreesWords = treesWords.map(treeWordsPair => (process(treeWordsPair._1), treeWordsPair._2)); + val process = PartialTreeProcessor() + val treesWords = treebank.train.trees.toSeq + val processedTreesWords = treesWords.map(treeWordsPair => (process(treeWordsPair._1), treeWordsPair._2)) - println("Training lexicon"); - var sentIdx = 0; - val trainWordTagCounts = Counter2[String,String,Double]; + println("Training lexicon") + var sentIdx = 0 + val trainWordTagCounts = Counter2[String,String,Double] for ((tree, words) <- processedTreesWords) { if (sentIdx % 1000 == 0) { - println("Sentence: " + sentIdx); + println("Sentence: " + sentIdx) } - val treeLeaves = tree.leaves.toSeq; - for (i <- 0 until treeLeaves.size) { - trainWordTagCounts(treeLeaves(i).label, words(i)) += 1.0; + (tree.leaves zip words).foreach { case (treeLeaf, word) => + trainWordTagCounts(treeLeaf.label, word) += 1.0 } - sentIdx += 1; + sentIdx += 1 } - val wordToTagMap = new HashMap[String,String]; + val wordToTagMap = new HashMap[String,String] for (word <- trainWordTagCounts.keysIterator.map(_._2)) { - var bestTag = ""; - var bestTagCount = 0.0; - val tagCounts = trainWordTagCounts(::, word).iterator; + var bestTag = "" + var bestTagCount = 0.0 + val tagCounts = trainWordTagCounts(::, word).iterator for ((tag, count) <- tagCounts) { if (count > bestTagCount) { - bestTag = tag; - bestTagCount = count; + bestTag = tag + bestTagCount = count } } - wordToTagMap.put(word, bestTag); + wordToTagMap.put(word, bestTag) } - println("Done training lexicon"); - - - val hf = HeadFinder.collins; - val hackyHeadFinder = new RuleBasedHackyHeadFinder; + println("Done training lexicon") + + val hf = HeadFinder.collins + val hackyHeadFinder = new RuleBasedHackyHeadFinder - var correct = Counter[String,Int]; - var correctPredTags = Counter[String,Int]; - var total = Counter[String,Int]; + var correct = Counter[String,Int] + var correctPredTags = Counter[String,Int] + var total = Counter[String,Int] def rec(tree: Tree[(String,Int)], words: Seq[String]): Unit = { if (!tree.isLeaf && !tree.label._1.isEmpty) { - val headIdx = tree.label._2 -
tree.begin; - val hhfHead = hackyHeadFinder.findHead(tree.label._1, tree.leaves.map(_.label._1).toSeq); - val predTags = words.slice(tree.begin, tree.end).map(word => if (wordToTagMap.contains(word)) wordToTagMap(word) else "NN"); - val hhfHeadPredTags = hackyHeadFinder.findHead(tree.label._1, predTags); + val headIdx = tree.label._2 - tree.begin + val hhfHead = hackyHeadFinder.findHead(tree.label._1, tree.leaves.map(_.label._1).toSeq) + val predTags = words.slice(tree.begin, tree.end).map(word => if (wordToTagMap.contains(word)) wordToTagMap(word) else "NN") + val hhfHeadPredTags = hackyHeadFinder.findHead(tree.label._1, predTags) if (hhfHead == headIdx) { - correct(tree.label._1) += 1; + correct(tree.label._1) += 1 } if (hhfHeadPredTags == headIdx) { - correctPredTags(tree.label._1) += 1; + correctPredTags(tree.label._1) += 1 } else { - println(tree.label + " => " + tree.leaves.map(_.label._1).toIndexedSeq + "\n " + predTags + "; gold = " + headIdx + ", pred (gold) = " + hhfHead + ", pred (pred) = " + hhfHeadPredTags); + println(tree.label + " => " + tree.leaves.map(_.label._1).toIndexedSeq + "\n " + predTags + "; gold = " + headIdx + ", pred (gold) = " + hhfHead + ", pred (pred) = " + hhfHeadPredTags) } - total(tree.label._1) += 1; + total(tree.label._1) += 1 } if (!tree.isLeaf) { - tree.children.foreach(rec(_, words)); + tree.children.foreach(rec(_, words)) } - }; + } - val devTreesWords = treebank.dev.trees.toSeq.map(treeWordsPair => (hf.annotateHeadIndices(process(treeWordsPair._1)), treeWordsPair._2)); + val devTreesWords = treebank.dev.trees.toSeq.map(treeWordsPair => (hf.annotateHeadIndices(process(treeWordsPair._1)), treeWordsPair._2)) for (i <- 0 until 100) { - val tree = devTreesWords(i)._1; - val words = devTreesWords(i)._2; - rec(tree, words); - -// println(tree.render(devTreesWords(i)._2, false)); -// println(processedTrees(i).render(treesWords(i)._2, false)); -// println(processedTreesWithIndices(i).render(treesWords(i)._2, false)); + val tree = devTreesWords(i)._1 + val words = devTreesWords(i)._2 + rec(tree, words) + // println(tree.render(devTreesWords(i)._2, false)) + // println(processedTrees(i).render(treesWords(i)._2, false)) + // println(processedTreesWithIndices(i).render(treesWords(i)._2, false)) } - var totalAcc = 0; - var totalCount = 0; + var totalAcc = 0 + var totalCount = 0 for (key <- total.keySet) { - println(key + ": " + correctPredTags(key) + " / " + total(key)); - totalAcc += correctPredTags(key); - totalCount += total(key); + println(key + ": " + correctPredTags(key) + " / " + total(key)) + totalAcc += correctPredTags(key) + totalCount += total(key) } - println(totalAcc + " / " + totalCount); + println(totalAcc + " / " + totalCount) } } \ No newline at end of file diff --git a/src/main/scala/epic/features/HackyLexicalProductionFeaturizer.scala b/src/main/scala/epic/features/HackyLexicalProductionFeaturizer.scala index 039b8f8d..10404238 100644 --- a/src/main/scala/epic/features/HackyLexicalProductionFeaturizer.scala +++ b/src/main/scala/epic/features/HackyLexicalProductionFeaturizer.scala @@ -21,88 +21,85 @@ class HackyLexicalProductionFeaturizer(wordTagCounts: Counter2[String, String, D wordThreshold: Int = 5, commonWordThreshold: Int = 100) extends RuleAndSpansFeaturizer[String] { - private val wordCounts = Counter[String,Double]; - private val wordToTagMap = new HashMap[String,String]; + private val wordCounts = Counter[String,Double] + private val wordToTagMap = new HashMap[String,String] for (word <- 
wordTagCounts.keysIterator.map(_._2).toSeq.distinct) { - wordCounts(word) = sum(wordTagCounts(::, word)); + wordCounts(word) = sum(wordTagCounts(::, word)) if (!wordToTagMap.contains(word)) { - val tagCounts = wordTagCounts(::, word).iterator; - var bestTag = HackyLexicalProductionFeaturizer.UnkTag; - var bestTagCount = 0.0; + val tagCounts = wordTagCounts(::, word).iterator + var bestTag = HackyLexicalProductionFeaturizer.UnkTag + var bestTagCount = 0.0 for ((tag, count) <- tagCounts) { if (count > bestTagCount) { - bestTag = tag; - bestTagCount = count; + bestTag = tag + bestTagCount = count } } - wordToTagMap.put(word, bestTag); + wordToTagMap.put(word, bestTag) } } - def tag(word: String) = if (wordToTagMap.contains(word)) wordToTagMap(word) else HackyLexicalProductionFeaturizer.UnkTag; + def tag(word: String) = if (wordToTagMap.contains(word)) wordToTagMap(word) else HackyLexicalProductionFeaturizer.UnkTag - val emptyArray = Array[Feature](); + val emptyArray = Array[Feature]() def anchor(w: IndexedSeq[String]) = new Anchoring { - def words: IndexedSeq[String] = w; + def words: IndexedSeq[String] = w def featuresForBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int):Array[Feature] = { - val preterminals = new Array[String](end - begin); + val preterminals = new Array[String](end - begin) for (i <- begin until end) { - preterminals(i - begin) = tag(words(i)); + preterminals(i - begin) = tag(words(i)) } - val lc = topology.labelIndex.get(topology.leftChild(rule)).baseLabel; - val rc = topology.labelIndex.get(topology.rightChild(rule)).baseLabel; + val lc = topology.labelIndex.get(topology.leftChild(rule)).baseLabel + val rc = topology.labelIndex.get(topology.rightChild(rule)).baseLabel - val lcHeadIdx = begin + hackyHeadFinder.findHead(lc, preterminals.slice(0, split - begin)); - val rcHeadIdx = split + hackyHeadFinder.findHead(rc, preterminals.slice(split - begin, end - begin)); - val lcHeadWord = words(lcHeadIdx); - val lcHeadTag = tag(words(lcHeadIdx)); - val rcHeadWord = words(rcHeadIdx); - val rcHeadTag = tag(words(rcHeadIdx)); + val lcHeadIdx = begin + hackyHeadFinder.findHead(lc, preterminals.slice(0, split - begin)) + val rcHeadIdx = split + hackyHeadFinder.findHead(rc, preterminals.slice(split - begin, end - begin)) + val lcHeadWord = words(lcHeadIdx) + val lcHeadTag = tag(words(lcHeadIdx)) + val rcHeadWord = words(rcHeadIdx) + val rcHeadTag = tag(words(rcHeadIdx)) - val distance = db.binnedDistance(lcHeadIdx, rcHeadIdx); + val distance = db.binnedDistance(lcHeadIdx, rcHeadIdx) // It doesn't really make sense to back off to tag features here since the tags // will fail when the words are rare... 
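// Illustration with hypothetical words: under the default commonWordThreshold = 100, a head pair ("acquired", "shares") where only "shares" is frequent yields HeadPairDistanceRuleFeature(rule, lcHeadTag, "shares", distance), and the variant that would lexicalize the rare "acquired" backs off to lcHeadTag.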
val otherFeats: Array[Feature] = if (featsDesc.contains("lexical")) { Array(HeadPairDistanceRuleFeature(rule, lcHeadTag, rcHeadTag, distance), HeadPairDistanceRuleFeature(rule, lcHeadTag, if (wordCounts(rcHeadWord) >= commonWordThreshold) rcHeadWord else rcHeadTag, distance), - HeadPairDistanceRuleFeature(rule, if (wordCounts(lcHeadWord) >= commonWordThreshold) lcHeadWord else lcHeadTag, rcHeadTag, distance)); + HeadPairDistanceRuleFeature(rule, if (wordCounts(lcHeadWord) >= commonWordThreshold) lcHeadWord else lcHeadTag, rcHeadTag, distance)) } else if (featsDesc.contains("ultralexical")) { Array(HeadPairDistanceRuleFeature(rule, lcHeadTag, rcHeadTag, distance), HeadPairDistanceRuleFeature(rule, lcHeadTag, if (wordCounts(rcHeadWord) >= commonWordThreshold) rcHeadWord else rcHeadTag, distance), HeadPairDistanceRuleFeature(rule, if (wordCounts(lcHeadWord) >= commonWordThreshold) lcHeadWord else lcHeadTag, rcHeadTag, distance), - HeadPairDistanceRuleFeature(rule, if (wordCounts(lcHeadWord) >= commonWordThreshold) lcHeadWord else lcHeadTag, if (wordCounts(rcHeadWord) >= commonWordThreshold) rcHeadWord else rcHeadTag, distance)); + HeadPairDistanceRuleFeature(rule, if (wordCounts(lcHeadWord) >= commonWordThreshold) lcHeadWord else lcHeadTag, if (wordCounts(rcHeadWord) >= commonWordThreshold) rcHeadWord else rcHeadTag, distance)) } else { - Array[Feature](); + Array[Feature]() } Arrays.concatenate(otherFeats, Array(LeftTagDistanceRuleFeature(rule, lcHeadTag, distance), LeftHeadDistanceRuleFeature(rule, if (wordCounts(lcHeadWord) >= wordThreshold) lcHeadWord else HackyLexicalProductionFeaturizer.RareToken, distance), RightTagDistanceRuleFeature(rule, rcHeadTag, distance), - RightHeadDistanceRuleFeature(rule, if (wordCounts(rcHeadWord) >= wordThreshold) rcHeadWord else HackyLexicalProductionFeaturizer.RareToken, distance))); + RightHeadDistanceRuleFeature(rule, if (wordCounts(rcHeadWord) >= wordThreshold) rcHeadWord else HackyLexicalProductionFeaturizer.RareToken, distance))) } - - - def featuresForUnaryRule(begin: Int, end: Int, rule: Int, ref: Int):Array[Feature] = emptyArray; - def featuresForSpan(begin: Int, end: Int, tag: Int, ref: Int):Array[Feature] = emptyArray; - } + def featuresForUnaryRule(begin: Int, end: Int, rule: Int, ref: Int):Array[Feature] = emptyArray + def featuresForSpan(begin: Int, end: Int, tag: Int, ref: Int):Array[Feature] = emptyArray + } } -case class LeftTagDistanceRuleFeature(rule: Int, ltag: String, distance: Int) extends Feature; -case class LeftHeadDistanceRuleFeature(rule: Int, lsuff: String, distance: Int) extends Feature; -case class RightTagDistanceRuleFeature(rule: Int, rtag: String, distance: Int) extends Feature; -case class RightHeadDistanceRuleFeature(rule: Int, rsuff: String, distance: Int) extends Feature; -case class HeadPairDistanceRuleFeature(rule: Int, lsuff: String, rsuff: String, distance: Int) extends Feature; - +case class LeftTagDistanceRuleFeature(rule: Int, ltag: String, distance: Int) extends Feature +case class LeftHeadDistanceRuleFeature(rule: Int, lsuff: String, distance: Int) extends Feature +case class RightTagDistanceRuleFeature(rule: Int, rtag: String, distance: Int) extends Feature +case class RightHeadDistanceRuleFeature(rule: Int, rsuff: String, distance: Int) extends Feature +case class HeadPairDistanceRuleFeature(rule: Int, lsuff: String, rsuff: String, distance: Int) extends Feature object HackyLexicalProductionFeaturizer { - val UnkTag = "NN"; - val RareToken = ""; + val UnkTag = "NN" + val RareToken = "" } diff --git 
a/src/main/scala/epic/features/HackyLexicalSplitFeaturizer.scala b/src/main/scala/epic/features/HackyLexicalSplitFeaturizer.scala index 69979ca1..0bc0e980 100644 --- a/src/main/scala/epic/features/HackyLexicalSplitFeaturizer.scala +++ b/src/main/scala/epic/features/HackyLexicalSplitFeaturizer.scala @@ -8,8 +8,8 @@ class HackyLexicalSplitFeaturizer[W]() extends SplitSpanFeaturizer[W] { private val theSplitNeedingAnchoring = new SplitSpanFeatureAnchoring[W] with Serializable { def featuresForSplit(begin: Int, split: Int, end: Int): Array[Feature] = { - emptyArray; -// Array(DistanceFeature(db.binnedDistance((end-split) - (split-begin)), label)) + emptyArray + // Array(DistanceFeature(db.binnedDistance((end-split) - (split-begin)), label)) } def featuresForSpan(begin: Int, end: Int): Array[Feature] = emptyArray diff --git a/src/main/scala/epic/features/HashExtendingIndex.scala b/src/main/scala/epic/features/HashExtendingIndex.scala index 4231b260..89ec2788 100644 --- a/src/main/scala/epic/features/HashExtendingIndex.scala +++ b/src/main/scala/epic/features/HashExtendingIndex.scala @@ -23,7 +23,7 @@ class HashExtendingIndex[T](val baseIndex: Index[T], def apply(t: T): Int = baseIndex(t) match { case -1 => val code = t.##.abs - if(!cache.addOrSeen(code)) + if (!cache.addOrSeen(code)) -1 else t.##.abs % numHashFeatures + baseIndex.size @@ -31,8 +31,8 @@ class HashExtendingIndex[T](val baseIndex: Index[T], } def unapply(i: Int): Option[T] = { - if(i < baseIndex.size) baseIndex.unapply(i) - else if(i < size) Some(hashWrapper(i - baseIndex.size)) + if (i < baseIndex.size) baseIndex.unapply(i) + else if (i < size) Some(hashWrapper(i - baseIndex.size)) else None } diff --git a/src/main/scala/epic/features/HashFeature.scala b/src/main/scala/epic/features/HashFeature.scala index 0bef6c78..cc159763 100644 --- a/src/main/scala/epic/features/HashFeature.scala +++ b/src/main/scala/epic/features/HashFeature.scala @@ -10,7 +10,7 @@ case class HashFeature(hashBucket: Int) extends Feature object HashFeature { sealed trait Scale { - def numFeatures(nonHashFeatures: Int):Int + def numFeatures(nonHashFeatures: Int): Int } case class Absolute(numHashFeatures: Int) extends Scale { diff --git a/src/main/scala/epic/features/IdentityWordFeaturizer.scala b/src/main/scala/epic/features/IdentityWordFeaturizer.scala index b5fbdb08..0d964023 100644 --- a/src/main/scala/epic/features/IdentityWordFeaturizer.scala +++ b/src/main/scala/epic/features/IdentityWordFeaturizer.scala @@ -19,16 +19,16 @@ class IdentityWordFeaturizer[W](wordCounts: Counter[W, Double], unknownWordThres def words = w def featuresForWord(pos: Int): Array[Feature] = { - if(pos < 0 || pos >= words.length) { + if (pos < 0 || pos >= words.length) { boundaryFeatures } else { _minimalFeatures(pos) } } - private val _minimalFeatures: immutable.IndexedSeq[Array[Feature]] = (0 until words.length) map { i => + private val _minimalFeatures: immutable.IndexedSeq[Array[Feature]] = words.indices.map { i => val index = indices(i) - if(index >= 0) { + if (index >= 0) { IdentityWordFeaturizer.this.minimalFeatures(index) } else { Array[Feature](Unk) @@ -42,14 +42,13 @@ class IdentityWordFeaturizer[W](wordCounts: Counter[W, Double], unknownWordThres private val wordIndex = Index(wordCounts.keySet) private val Unk = WordFeature("#UNK#", 'LowCount) private val boundaryFeatures = Array[Feature](BoundaryFeature) - - private val wordFeatures = Encoder.fromIndex(wordIndex).tabulateArray(s => if(wordCounts(s) > unknownWordThreshold) IndicatorFeature(s) else Unk) + private val 
wordFeatures = Encoder.fromIndex(wordIndex).tabulateArray(s => if (wordCounts(s) > unknownWordThreshold) IndicatorFeature(s) else Unk) // caches private val minimalFeatures = Array.tabulate[Array[Feature]](wordIndex.size){ i => val wc = wordCounts(wordIndex.get(i)) val w = wordFeatures(i) - if(wc > unknownWordThreshold) { + if (wc > unknownWordThreshold) { Array(w) } else { Array(Unk) diff --git a/src/main/scala/epic/features/IndexedSurfaceFeaturizer.scala b/src/main/scala/epic/features/IndexedSurfaceFeaturizer.scala index 4285f31c..66171402 100644 --- a/src/main/scala/epic/features/IndexedSurfaceFeaturizer.scala +++ b/src/main/scala/epic/features/IndexedSurfaceFeaturizer.scala @@ -29,28 +29,25 @@ object IndexedSurfaceFeaturizer { constraintFactory: SpanConstraints.Factory[W], deduplicateFeatures: Boolean = false) : IndexedSurfaceFeaturizer[W] = { - val index = if(deduplicateFeatures) new NonRedundantIndexBuilder[Feature] else new NormalIndexBuilder[Feature]() + val index = if (deduplicateFeatures) new NonRedundantIndexBuilder[Feature] else new NormalIndexBuilder[Feature]() for(words <- data) { val cons = constraintFactory.get(words) val anch = feat.anchor(words) - for(i <- 0 until words.length) { - for(j <- (i+1) to math.min(words.length, (i + cons.maxSpanLengthStartingAt(i))) if cons(i, j)) { + words.indices.foreach { i => + for(j <- (i+1) to math.min(words.length, i + cons.maxSpanLengthStartingAt(i)) if cons(i, j)) { index.add(anch.featuresForSpan(i, j) ) } } } - new MySurfaceFeaturizer[W](feat, constraintFactory, index.result()) } @SerialVersionUID(1L) class CachedFeaturizer[W](val base: IndexedSurfaceFeaturizer[W], cache: collection.mutable.Map[IndexedSeq[W], IndexedSurfaceAnchoring[W]]) extends IndexedSurfaceFeaturizer[W] with Serializable { def featurizer: SurfaceFeaturizer[W] = base.featurizer - def featureIndex: Index[Feature] = base.featureIndex - def anchor(datum: IndexedSeq[W]): IndexedSurfaceAnchoring[W] = cache.getOrElseUpdate(datum, base.anchor(datum)) } @@ -62,7 +59,7 @@ object IndexedSurfaceFeaturizer { val cons = constraintsFactory.constraints(words) val anch = featurizer.anchor(words) val spanFeatures = TriangularArray.tabulate(words.length+1){ (i, j) => - if(cons(i,j) && i < j) { + if (cons(i,j) && i < j) { stripEncode(featureIndex, anch.featuresForSpan(i, j)) } else { null @@ -78,9 +75,9 @@ object IndexedSurfaceFeaturizer { val result = mutable.ArrayBuilder.make[Int]() result.sizeHint(features) var i = 0 - while(i < features.length) { + while (i < features.length) { val fi = ind(features(i)) - if(fi >= 0) + if (fi >= 0) result += fi i += 1 } diff --git a/src/main/scala/epic/features/IndexedWordFeaturizer.scala b/src/main/scala/epic/features/IndexedWordFeaturizer.scala index 1239ab27..08800475 100644 --- a/src/main/scala/epic/features/IndexedWordFeaturizer.scala +++ b/src/main/scala/epic/features/IndexedWordFeaturizer.scala @@ -20,15 +20,14 @@ object IndexedWordFeaturizer { data: IndexedSeq[IndexedSeq[W]], wordHashFeatures: Int = 0, deduplicateFeatures: Boolean = true): IndexedWordFeaturizer[W] = { - val wordIndex = if(deduplicateFeatures) new NonRedundantIndexBuilder[Feature] else new NormalIndexBuilder[Feature]() + val wordIndex = if (deduplicateFeatures) new NonRedundantIndexBuilder[Feature] else new NormalIndexBuilder[Feature]() for(words <- data) { val anch = feat.anchor(words) - for(i <- 0 until words.length) { + words.indices.foreach { i => wordIndex.add(anch.featuresForWord(i) ) } } - new MyWordFeaturizer[W](feat, wordIndex.result()) } @@ -47,9 +46,9 @@ object 
IndexedWordFeaturizer { val result = mutable.ArrayBuilder.make[Int]() result.sizeHint(features) var i = 0 - while(i < features.length) { + while (i < features.length) { val fi = ind(features(i)) - if(fi >= 0) + if (fi >= 0) result += fi i += 1 } @@ -59,7 +58,6 @@ object IndexedWordFeaturizer { } } - @SerialVersionUID(1L) class TabulatedIndexedWordAnchoring[W](val words: IndexedSeq[W], spanFeatures: Array[Array[Int]]) extends IndexedWordAnchoring[W] with Serializable { diff --git a/src/main/scala/epic/features/LongestFrequentSuffixFeaturizer.scala b/src/main/scala/epic/features/LongestFrequentSuffixFeaturizer.scala index b6a0ae82..82918161 100644 --- a/src/main/scala/epic/features/LongestFrequentSuffixFeaturizer.scala +++ b/src/main/scala/epic/features/LongestFrequentSuffixFeaturizer.scala @@ -15,13 +15,11 @@ class LongestFrequentSuffixFeaturizer private (fixedMap: Map[String, Feature], def anchor(w: IndexedSeq[String]): WordFeatureAnchoring[String] = new WordFeatureAnchoring[String] { val feats = words.map(w => Array(fixedMap.getOrElse(w, LongestFrequentSuffix(lookup(w))))) - def featuresForWord(pos: Int): Array[Feature] = if(pos < 0 || pos >= w.length) Array(BeginSentFeature) else feats(pos) + def featuresForWord(pos: Int): Array[Feature] = if (pos < 0 || pos >= w.length) Array(BeginSentFeature) else feats(pos) def words: IndexedSeq[String] = w } - - def lookupSentence(sent: IndexedSeq[String]) = { sent.map(w => fixedMap.getOrElse(w, LongestFrequentSuffix(lookup(w))) match { case LongestFrequentSuffix(s) => "-" + s @@ -29,8 +27,8 @@ class LongestFrequentSuffixFeaturizer private (fixedMap: Map[String, Feature], }) } - private def lookup(x: String):String = { - (x).tails.find(suffixCounts(_) >= commonWordThreshold).getOrElse("-UNK-") + private def lookup(x: String): String = { + x.tails.find(suffixCounts(_) >= commonWordThreshold).getOrElse("-UNK-") } } @@ -43,12 +41,12 @@ object LongestFrequentSuffixFeaturizer { suffixCounts = suffixCounts.mapValues(v => v * I(v >= commonWordThreshold)) - def lookup(x: String):String = { - (x).tails.find(suffixCounts(_) >= commonWordThreshold).getOrElse("-UNK-") + def lookup(x: String): String = { + x.tails.find(suffixCounts(_) >= commonWordThreshold).getOrElse("-UNK-") } val map = Map.empty ++ (for( (w,v) <- counts.iterator) yield { - if(v > commonWordThreshold) + if (v > commonWordThreshold) w -> IndicatorFeature(w) else w -> LongestFrequentSuffix(lookup(w)) diff --git a/src/main/scala/epic/features/MinimalWordFeaturizer.scala b/src/main/scala/epic/features/MinimalWordFeaturizer.scala index d24d3549..1f0ed3a3 100644 --- a/src/main/scala/epic/features/MinimalWordFeaturizer.scala +++ b/src/main/scala/epic/features/MinimalWordFeaturizer.scala @@ -21,22 +21,22 @@ class MinimalWordFeaturizer(wordCounts: Counter[String, Double], includeWordShap def words = w def featuresForWord(pos: Int): Array[Feature] = { - if(pos < 0 || pos >= words.length) { + if (pos < 0 || pos >= words.length) { boundaryFeatures } else { _minimalFeatures(pos) } } - private val _minimalFeatures: immutable.IndexedSeq[Array[Feature]] = (0 until words.length) map { i => + private val _minimalFeatures: immutable.IndexedSeq[Array[Feature]] = words.indices.map { i => val index = indices(i) - if(index >= 0) { + if (index >= 0) { MinimalWordFeaturizer.this.minimalFeatures(index) } else { val ww = words(i) val classe = interner(WordFeature(EnglishWordClassGenerator(ww), 'Class)) val shape = interner(WordFeature(WordShapeGenerator(ww), 'Shape)) - if(includeWordShapeFeatures) { + if 
(includeWordShapeFeatures) { Array(shape, classe, Unk) } else{ Array(classe, Unk) @@ -54,21 +54,21 @@ class MinimalWordFeaturizer(wordCounts: Counter[String, Double], includeWordShap private val Unk = WordFeature("#UNK#", 'LowCount) private val boundaryFeatures = Array[Feature](BoundaryFeature) - private val wordFeatures = Encoder.fromIndex(wordIndex).tabulateArray(s => if(wordCounts(s) > unknownWordThreshold) interner(IndicatorFeature(s)) else Unk) - private val classes = Encoder.fromIndex(wordIndex).tabulateArray(w => if(wordCounts(w) > functionWordThreshold) wordFeatures(wordIndex(w)) else interner(WordFeature(EnglishWordClassGenerator(w), 'Class))) - private val shapes = if(includeWordShapeFeatures) Encoder.fromIndex(wordIndex).tabulateArray(w => if(wordCounts(w) > functionWordThreshold) wordFeatures(wordIndex(w)) else interner(WordFeature(WordShapeGenerator(w), 'Shape))) else null + private val wordFeatures = Encoder.fromIndex(wordIndex).tabulateArray(s => if (wordCounts(s) > unknownWordThreshold) interner(IndicatorFeature(s)) else Unk) + private val classes = Encoder.fromIndex(wordIndex).tabulateArray(w => if (wordCounts(w) > functionWordThreshold) wordFeatures(wordIndex(w)) else interner(WordFeature(EnglishWordClassGenerator(w), 'Class))) + private val shapes = if (includeWordShapeFeatures) Encoder.fromIndex(wordIndex).tabulateArray(w => if (wordCounts(w) > functionWordThreshold) wordFeatures(wordIndex(w)) else interner(WordFeature(WordShapeGenerator(w), 'Shape))) else null // caches private val minimalFeatures = Array.tabulate(wordIndex.size){ i => val wc = wordCounts(wordIndex.get(i)) val w = wordFeatures(i) val classe = classes(i) - if(wc > functionWordThreshold) Array(w) + if (wc > functionWordThreshold) Array(w) else if (includeWordShapeFeatures) { val shape = shapes(i) - if(wc > unknownWordThreshold) Array(w, shape, classe) + if (wc > unknownWordThreshold) Array(w, shape, classe) else Array(shape, classe, Unk) - } else if(wc > unknownWordThreshold) { + } else if (wc > unknownWordThreshold) { Array(w, classe) } else { Array(classe, Unk) diff --git a/src/main/scala/epic/features/MorphFeaturizer.scala b/src/main/scala/epic/features/MorphFeaturizer.scala index c1e448e3..d86f5462 100644 --- a/src/main/scala/epic/features/MorphFeaturizer.scala +++ b/src/main/scala/epic/features/MorphFeaturizer.scala @@ -15,16 +15,16 @@ import java.io.InputStreamReader class MorphFeaturizer private (morphLookupTable: MorphFeaturizer.MorphLookupTable) extends WordFeaturizer[String] with Serializable { def anchor(w: IndexedSeq[String]): WordFeatureAnchoring[String] = new WordFeatureAnchoring[String] { val morphFeats = if (!morphLookupTable.contains(w)) { - println("Sentence wasn't found in lookup table: " + w); - (0 until w.size).map(i => Array[MorphFeat]()); + println("Sentence wasn't found in lookup table: " + w) + w.indices.map(i => Array[MorphFeat]()) } else { - morphLookupTable(w); + morphLookupTable(w) } - val feats = (0 until w.size).map(i => morphFeats(i).filter(feat => feat.label == "lem").map(feat => IndicatorFeature(feat): Feature)) -// logger.info("Feats for sentence: " + w); -// (0 until w.size).foreach(i => logger.info(w(i) + ": " + feats(i).toSeq)); + val feats = w.indices.map(i => morphFeats(i).filter(feat => feat.label == "lem").map(feat => IndicatorFeature(feat): Feature)) + // logger.info("Feats for sentence: " + w) + // (0 until w.size).foreach(i => logger.info(w(i) + ": " + feats(i).toSeq)) - def featuresForWord(pos: Int): Array[Feature] = if(pos < 0 || pos >= w.length) 
Array(BeginSentFeature) else feats(pos) + def featuresForWord(pos: Int): Array[Feature] = if (pos < 0 || pos >= w.length) Array(BeginSentFeature) else feats(pos) def words: IndexedSeq[String] = w } @@ -33,40 +33,40 @@ object MorphFeaturizer { // Stores each sentence's associated vector of per-token MorphFeat arrays - type MorphLookupTable = HashMap[IndexedSeq[String],IndexedSeq[Array[MorphFeat]]]; + type MorphLookupTable = HashMap[IndexedSeq[String],IndexedSeq[Array[MorphFeat]]] def makeLookupTable(pathToTaggedSentences: String): MorphLookupTable = { val in = breeze.io.FileStreams.input(new File(pathToTaggedSentences)) - val br = new BufferedReader(new InputStreamReader(in, "UTF-8")); + val br = new BufferedReader(new InputStreamReader(in, "UTF-8")) val lookupTable = new HashMap[IndexedSeq[String],IndexedSeq[Array[MorphFeat]]] - val morphFeatArr = new ArrayBuffer[IndexedSeq[Array[MorphFeat]]]; - var thisSent = new ArrayBuffer[String]; - var thisSentFeats = new ArrayBuffer[Array[MorphFeat]]; + val morphFeatArr = new ArrayBuffer[IndexedSeq[Array[MorphFeat]]] + var thisSent = new ArrayBuffer[String] + var thisSentFeats = new ArrayBuffer[Array[MorphFeat]] while (br.ready()) { - val line = br.readLine(); + val line = br.readLine() if (line.trim.isEmpty) { - lookupTable.put(thisSent, thisSentFeats); - morphFeatArr += thisSentFeats; - thisSent = new ArrayBuffer[String]; - thisSentFeats = new ArrayBuffer[Array[MorphFeat]]; + lookupTable.put(thisSent, thisSentFeats) + morphFeatArr += thisSentFeats + thisSent = new ArrayBuffer[String] + thisSentFeats = new ArrayBuffer[Array[MorphFeat]] } else { - val splitLine = line.split("\\s+"); + val splitLine = line.split("\\s+") if (splitLine.size != 3) { - println("WARNING: Bad line, split into more than three parts on whitespace: " + splitLine); + println("WARNING: Bad line, did not split into exactly three parts on whitespace: " + splitLine.mkString(" ")) } thisSent += splitLine(0) - thisSentFeats += MorphFeat.readMorphFeatsFromBit(splitLine(2)).toArray; + thisSentFeats += MorphFeat.readMorphFeatsFromBit(splitLine(2)).toArray } } - if (!thisSent.isEmpty) { - lookupTable.put(thisSent, thisSentFeats); + if (thisSent.nonEmpty) { + lookupTable.put(thisSent, thisSentFeats) } - println("Loaded " + lookupTable.size + " entries from " + pathToTaggedSentences); - lookupTable; + println("Loaded " + lookupTable.size + " entries from " + pathToTaggedSentences) + lookupTable } def apply(pathsToTaggedSentences: Seq[String]) = { - val lookupTable = pathsToTaggedSentences.map(makeLookupTable(_)).reduce(_ ++ _); - new MorphFeaturizer(lookupTable); + val lookupTable = pathsToTaggedSentences.map(makeLookupTable(_)).reduce(_ ++ _) + new MorphFeaturizer(lookupTable) } } diff --git a/src/main/scala/epic/features/MultiSurfaceFeaturizer.scala b/src/main/scala/epic/features/MultiSurfaceFeaturizer.scala index f17ffc88..82d3e965 100644 --- a/src/main/scala/epic/features/MultiSurfaceFeaturizer.scala +++ b/src/main/scala/epic/features/MultiSurfaceFeaturizer.scala @@ -13,7 +13,6 @@ case class MultiSurfaceFeaturizer[W](feats: IndexedSeq[SurfaceFeaturizer[W]]) ex def anchor(w: IndexedSeq[W]): SurfaceFeatureAnchoring[W] = new SurfaceFeatureAnchoring[W] { val anchs = feats.map(_.anchor(w)).toArray def words: IndexedSeq[W] = w - def featuresForSpan(beg: Int, end: Int): Array[Feature] = anchs.flatMap(_.featuresForSpan(beg, end)) } } diff --git a/src/main/scala/epic/features/MultiWordFeaturizer.scala b/src/main/scala/epic/features/MultiWordFeaturizer.scala
index 04ddd76d..f3908bb8 100644 --- a/src/main/scala/epic/features/MultiWordFeaturizer.scala +++ b/src/main/scala/epic/features/MultiWordFeaturizer.scala @@ -13,7 +13,6 @@ case class MultiWordFeaturizer[W](featurizers: IndexedSeq[WordFeaturizer[W]]) ex def anchor(w: IndexedSeq[W]): WordFeatureAnchoring[W] = new WordFeatureAnchoring[W] { val anchs = featurizers.map(_.anchor(w)).toArray def words: IndexedSeq[W] = w - def featuresForWord(pos: Int): Array[Feature] = anchs.flatMap(_.featuresForWord(pos)) } } diff --git a/src/main/scala/epic/features/NGramSpanFeaturizer.scala b/src/main/scala/epic/features/NGramSpanFeaturizer.scala index 91a9c3ac..0a412a4f 100644 --- a/src/main/scala/epic/features/NGramSpanFeaturizer.scala +++ b/src/main/scala/epic/features/NGramSpanFeaturizer.scala @@ -19,54 +19,54 @@ class NGramSpanFeaturizer(wordCounts: Counter[String,Double], ngramCountThreshold: Int, maxOrder: Int, useNot: Boolean) extends SurfaceFeaturizer[String] with Serializable { - private val higherOrderCounts = (3 to maxOrder).map(n => NGramSpanFeaturizer.countNgrams(allSents, n)); + private val higherOrderCounts = (3 to maxOrder).map(n => NGramSpanFeaturizer.countNgrams(allSents, n)) private val wordIndex = Index(wordCounts.keysIterator) private val bigramIndex = Index(bigramCounts.keysIterator) - private val higherOrderIndices = (3 to maxOrder).map(n => Index(higherOrderCounts(n-3).keysIterator)); - println(wordIndex.size + " unigrams, " + bigramIndex.size + " bigrams, " + (higherOrderIndices.map(_.size)) + " higher-order n-grams"); + private val higherOrderIndices = (3 to maxOrder).map(n => Index(higherOrderCounts(n-3).keysIterator)) + println(wordIndex.size + " unigrams, " + bigramIndex.size + " bigrams, " + higherOrderIndices.map(_.size) + " higher-order n-grams") def anchor(words: IndexedSeq[String]): SurfaceFeatureAnchoring[String] = { new SurfaceFeatureAnchoring[String] { def featuresForSpan(begin: Int, end: Int): Array[Feature] = { -// println("Span: " + words.slice(begin, end)); + // println("Span: " + words.slice(begin, end)) val unigramFeats = for (i <- begin until end) yield { -// println(words(i) + ": " + wordCounts(words(i))); - NGramUnigramFeature(if (wordCounts(words(i)) < ngramCountThreshold) -1 else wordIndex(words(i))); + // println(words(i) + ": " + wordCounts(words(i))) + NGramUnigramFeature(if (wordCounts(words(i)) < ngramCountThreshold) -1 else wordIndex(words(i))) } val bigramFeats = for (i <- begin until end - 1) yield { - val pair = (words(i), words(i+1)); -// println(pair + ": " + bigramCounts(pair)); - NGramBigramFeature(if (bigramCounts(pair) < ngramCountThreshold) -1 else bigramIndex(pair)); + val pair = (words(i), words(i+1)) + // println(pair + ": " + bigramCounts(pair)) + NGramBigramFeature(if (bigramCounts(pair) < ngramCountThreshold) -1 else bigramIndex(pair)) } val notFeats = if (useNot) { - val notFeats = new ArrayBuffer[Feature]; - var inNotSpan = false; + val notFeats = new ArrayBuffer[Feature] + var inNotSpan = false for (i <- begin until end) { if (NGramSpanFeaturizer.NotWords.contains(words(i))) { - inNotSpan = true; + inNotSpan = true } else if (NGramSpanFeaturizer.NotEndingPunc.contains(words(i))) { - inNotSpan = false; + inNotSpan = false } else if (inNotSpan) { -// println(words.slice(begin, end) + " (not span): " + words(i)); - notFeats += NotFeature(if (wordCounts(words(i)) < ngramCountThreshold) -1 else wordIndex(words(i))); + // println(words.slice(begin, end) + " (not span): " + words(i)) + notFeats += NotFeature(if (wordCounts(words(i)) < 
ngramCountThreshold) -1 else wordIndex(words(i))) } } - notFeats.toArray; + notFeats.toArray } else { - Array[Feature](); + Array[Feature]() } if (maxOrder >= 3) { val ngramFeats = (3 to maxOrder).flatMap(n => { for (i <- begin until end - n + 1) yield { - val slice = words.slice(i, i+n); -// println(slice + ": " + higherOrderCounts(n-3)(slice)); - NGramFeature(n, if (higherOrderCounts(n-3)(slice) < ngramCountThreshold) -1 else higherOrderIndices(n-3)(slice)); + val slice = words.slice(i, i+n) + // println(slice + ": " + higherOrderCounts(n-3)(slice)) + NGramFeature(n, if (higherOrderCounts(n-3)(slice) < ngramCountThreshold) -1 else higherOrderIndices(n-3)(slice)) } - }); - (unigramFeats ++ bigramFeats ++ notFeats ++ ngramFeats).toArray; + }) + (unigramFeats ++ bigramFeats ++ notFeats ++ ngramFeats).toArray } else { - (unigramFeats ++ bigramFeats ++ notFeats).toArray; + (unigramFeats ++ bigramFeats ++ notFeats).toArray } } @@ -84,7 +84,7 @@ object NGramSpanFeaturizer { for( ti <- data) { val TreeInstance(_, tree, words) = ti for (i <- 0 until words.size - 1) { - bigrams((words(i), words(i+1))) += 1.0; + bigrams((words(i), words(i+1))) += 1.0 } } bigrams @@ -94,7 +94,7 @@ object NGramSpanFeaturizer { val ngrams = Counter[Seq[String], Double]() for(sent <- allSents) { - for (i <- 0 until sent.size - n) { - ngrams(sent.slice(i, i+n)) += 1.0; + for (i <- 0 to sent.size - n) { + ngrams(sent.slice(i, i+n)) += 1.0 } } ngrams diff --git a/src/main/scala/epic/features/NGramSurfaceFeaturizer.scala b/src/main/scala/epic/features/NGramSurfaceFeaturizer.scala index f0f61a39..17736120 100644 --- a/src/main/scala/epic/features/NGramSurfaceFeaturizer.scala +++ b/src/main/scala/epic/features/NGramSurfaceFeaturizer.scala @@ -6,7 +6,7 @@ import breeze.util.CachedHashCode import scala.runtime.ScalaRunTime case class OrientedNGramFeature(offset: Int, features: IndexedSeq[Feature]) extends Feature with CachedHashCode { - override def equals(other: Any):Boolean = other match { + override def equals(other: Any): Boolean = other match { case x: OrientedNGramFeature => x.hashCode == hashCode && ScalaRunTime._equals(this, x) case _ => false } @@ -22,7 +22,6 @@ class NGramWordFeaturizer[W](base: WordFeaturizer[W], wordNgramOrder: Int = 2) e val baseAnch = base.anchor(w) def words: IndexedSeq[W] = w - def featuresForWord(pos: Int): Array[Feature] = { val result = ArrayBuffer[Feature]() ++= baseAnch.featuresForWord(pos) for(order <- 2 to wordNgramOrder) @@ -39,12 +38,10 @@ class NGramWordFeaturizer[W](base: WordFeaturizer[W], wordNgramOrder: Int = 2) e } } - } private def allConfigurations(seqOfSeqs: TraversableOnce[Array[Feature]]): IndexedSeq[IndexedSeq[Feature]] = { seqOfSeqs.foldLeft(IndexedSeq(IndexedSeq.empty[Feature]))((acc,currentFeatures) => {for(a <- acc; b <- currentFeatures) yield a :+ b}) } - } diff --git a/src/main/scala/epic/features/NonRedundantIndexBuilder.scala b/src/main/scala/epic/features/NonRedundantIndexBuilder.scala index 2414b073..1ed89327 100644 --- a/src/main/scala/epic/features/NonRedundantIndexBuilder.scala +++ b/src/main/scala/epic/features/NonRedundantIndexBuilder.scala @@ -17,7 +17,7 @@ class NonRedundantIndexBuilder[F] extends IndexBuilder[F] { for(f <- 0 until allSeenFeatures.size) { val c = contexts(f) - if(!c.exists(seenContexts)) { + if (!c.exists(seenContexts)) { c.foreach(seenContexts += _) result.index(allSeenFeatures.get(f)) } @@ -37,7 +37,7 @@ class NonRedundantIndexBuilder[F] extends IndexBuilder[F] { for(x <- featuresForContext) { val next = allSeenFeatures.index(x) - if(contexts.length <= next) { + if 
(contexts.length <= next) { contexts += Some(mutable.Set[Int](nextContext)) } else { contexts(next).foreach(_ += nextContext) diff --git a/src/main/scala/epic/features/NormalIndexBuilder.scala b/src/main/scala/epic/features/NormalIndexBuilder.scala index 0871a8d4..61266424 100644 --- a/src/main/scala/epic/features/NormalIndexBuilder.scala +++ b/src/main/scala/epic/features/NormalIndexBuilder.scala @@ -15,7 +15,6 @@ class NormalIndexBuilder[F] extends IndexBuilder[F] { def add(fs: TraversableOnce[F]):Unit = { fs.foreach(_result.index) - } } @@ -23,5 +22,4 @@ class NormalIndexBuilder[F] extends IndexBuilder[F] { trait IndexBuilder[F] { def result():Index[F] def add(fs: TraversableOnce[F]):Unit - } diff --git a/src/main/scala/epic/features/OffsetWordFeaturizer.scala b/src/main/scala/epic/features/OffsetWordFeaturizer.scala index 02c968a3..85d718f7 100644 --- a/src/main/scala/epic/features/OffsetWordFeaturizer.scala +++ b/src/main/scala/epic/features/OffsetWordFeaturizer.scala @@ -13,14 +13,9 @@ case class OffsetFeature(offset: Int, feature: Feature) extends Feature class OffsetWordFeaturizer[W](offsetFeaturizer: WordFeaturizer[W], offset:Int) extends WordFeaturizer[W] with Serializable { def anchor(w: IndexedSeq[W]): WordFeatureAnchoring[W] = new WordFeatureAnchoring[W] { val offsetAnchoring = offsetFeaturizer.anchor(w) - def featuresForWord(pos: Int): Array[Feature] = { offsetAnchoring.featuresForWord(pos + offset).map(OffsetFeature(offset, _)) } - def words: IndexedSeq[W] = w - - } - } diff --git a/src/main/scala/epic/features/PorterStemmer.scala b/src/main/scala/epic/features/PorterStemmer.scala index b0fe32e7..a0b59aeb 100644 --- a/src/main/scala/epic/features/PorterStemmer.scala +++ b/src/main/scala/epic/features/PorterStemmer.scala @@ -24,7 +24,6 @@ package epic.features */ class PorterStemmer() extends (String=>String) { import PorterStemmer._ - def apply(w: String) = { if (w.length < 3) w.toLowerCase else { @@ -46,8 +45,6 @@ object PorterStemmer extends PorterStemmer { def apply() = this - - private def step1(w: String) = step1c(step1b(step1a(w))) // get rid of s's @@ -64,7 +61,7 @@ object PorterStemmer extends PorterStemmer { def extra(w: String) = { if (w.endsWith("at") || w.endsWith("bl") || w.endsWith("iz")) w + 'e' // double consonant: - else if (doublec(w) && !("lsz".contains(w.last))) w.substring(0, w.length - 1); + else if (doublec(w) && !"lsz".contains(w.last)) w.substring(0, w.length - 1) else if (m(w) == 1 && cvc(w)) w + "e" else w } @@ -210,7 +207,6 @@ object PorterStemmer extends PorterStemmer { step5b(step5a(w)) } - private def step5a(w: String) = { if (w.length < 3) w else @@ -238,7 +234,7 @@ object PorterStemmer extends PorterStemmer { var x: Seq[Char] = w.substring(firstV) if (x.isEmpty) m else { - while (!x.isEmpty) { + while (x.nonEmpty) { x = x.dropWhile(isVowel) if (x.isEmpty) return m m += 1 @@ -259,7 +255,7 @@ object PorterStemmer extends PorterStemmer { ) private def doublec(w: String) = { - (w.length > 2 && w.last == w.charAt(w.length - 2) && isConsonant(w.last)) + w.length > 2 && w.last == w.charAt(w.length - 2) && isConsonant(w.last) } def isConsonant(letter: Char) = !isVowel(letter) diff --git a/src/main/scala/epic/features/ProductSurfaceFeaturizer.scala b/src/main/scala/epic/features/ProductSurfaceFeaturizer.scala index 01badbee..3d88910f 100644 --- a/src/main/scala/epic/features/ProductSurfaceFeaturizer.scala +++ b/src/main/scala/epic/features/ProductSurfaceFeaturizer.scala @@ -19,8 +19,6 @@ class ProductSurfaceFeaturizer[W](f1: SurfaceFeaturizer[W], f2: 
SurfaceFeaturize } def words: IndexedSeq[W] = w - - } } diff --git a/src/main/scala/epic/features/ProductWordFeaturizer.scala b/src/main/scala/epic/features/ProductWordFeaturizer.scala index 09ec309e..ae58cca4 100644 --- a/src/main/scala/epic/features/ProductWordFeaturizer.scala +++ b/src/main/scala/epic/features/ProductWordFeaturizer.scala @@ -20,8 +20,6 @@ class ProductWordFeaturizer[W](f1: WordFeaturizer[W], f2: WordFeaturizer[W]) ext } def words: IndexedSeq[W] = w - - } } diff --git a/src/main/scala/epic/features/RuleAndSpansFeaturizer.scala b/src/main/scala/epic/features/RuleAndSpansFeaturizer.scala index f1308053..7433344c 100644 --- a/src/main/scala/epic/features/RuleAndSpansFeaturizer.scala +++ b/src/main/scala/epic/features/RuleAndSpansFeaturizer.scala @@ -1,4 +1,5 @@ -package epic.features +package epic +package features import epic.framework.Feature import breeze.util.Index @@ -17,36 +18,28 @@ trait RuleAndSpansFeaturizer[W] extends Serializable { } } - class ZeroRuleAndSpansFeaturizer[W]() extends RuleAndSpansFeaturizer[W] { - val emptyArray = Array[Feature](); + val emptyArray = Array[Feature]() def anchor(w: IndexedSeq[W]) = new Anchoring { - def words = w; + def words = w - def featuresForBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int) = emptyArray; - def featuresForUnaryRule(begin: Int, end: Int, rule: Int, ref: Int) = emptyArray; - def featuresForSpan(begin: Int, end: Int, tag: Int, ref: Int) = emptyArray; + def featuresForBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int) = emptyArray + def featuresForUnaryRule(begin: Int, end: Int, rule: Int, ref: Int) = emptyArray + def featuresForSpan(begin: Int, end: Int, tag: Int, ref: Int) = emptyArray } } object RuleAndSpansFeaturizer { - def indexAndOffset(index: Index[Feature], feats: Array[Feature], offset: Int): Array[Int] = { - val indexedArr = new Array[Int](feats.size); - var i = 0; - while (i < feats.size) { - indexedArr(i) = index(feats(i)) + offset; - i += 1; - } - indexedArr; - } - + def indexAndOffset(index: Index[Feature], feats: Array[Feature], offset: Int): Array[Int] = + Array.tabulate[Int](feats.length)(i => index(feats(i)) + offset) + def addToIndex(index: MutableIndex[Feature], feats: Array[Feature]) { - var i = 0; - while (i < feats.size) { - index.index(feats(i)); - i += 1; + var i = 0 + while (i < feats.length) { + index.index(feats(i)) + i += 1 } } diff --git a/src/main/scala/epic/features/SegmentedIndex.scala b/src/main/scala/epic/features/SegmentedIndex.scala index 96e2b9fd..0de702ad 100644 --- a/src/main/scala/epic/features/SegmentedIndex.scala +++ b/src/main/scala/epic/features/SegmentedIndex.scala @@ -22,11 +22,11 @@ class SegmentedIndex[T,IndexType](val indices: IndexedSeq[IndexType])(implicit v override def size = offsets.last def unapply(i: Int): Option[Feature] = { - if(i < 0 || i >= size) { + if (i < 0 || i >= size) { None } else { var component = util.Arrays.binarySearch(offsets, i) - if(component < 0) component = ~component - 1 + if (component < 0) component = ~component - 1 indices(component).unapply(i - offsets(component)).map(ComponentFeature(component, _)) } } @@ -38,7 +38,7 @@ class SegmentedIndex[T,IndexType](val indices: IndexedSeq[IndexType])(implicit v def addComponentOffset(component: Int, feature: Int) = feature + offsets(component) def componentOffset(component: Int) = offsets(component) - def shardWeights(dv: DenseVector[Double]): immutable.IndexedSeq[DenseVector[Double]] = (0 until indices.size).map(c => dv(componentOffset(c) until 
componentOffset(c+1))) + def shardWeights(dv: DenseVector[Double]): immutable.IndexedSeq[DenseVector[Double]] = indices.indices.map(c => dv(componentOffset(c) until componentOffset(c+1))) } object SegmentedIndex { diff --git a/src/main/scala/epic/features/SentencePropertiesFeaturizer.scala b/src/main/scala/epic/features/SentencePropertiesFeaturizer.scala index ef26883b..40ad7798 100644 --- a/src/main/scala/epic/features/SentencePropertiesFeaturizer.scala +++ b/src/main/scala/epic/features/SentencePropertiesFeaturizer.scala @@ -18,30 +18,25 @@ class SentencePropertiesFeaturizer(db: DistanceBinner = new DistanceBinner()) ex def words: IndexedSeq[String] = w - def featuresForWord(pos: Int): Array[Feature] = featuresForSpan(pos, pos+1) def featuresForSpan(begin: Int, end: Int): Array[Feature] = { val feats = new ArrayBuffer[Feature]() feats += sentenceLengthFeature - if(wholeSentenceIsUpperCase) + if (wholeSentenceIsUpperCase) feats += WholeSentenceIsUpperCaseFeature - - -// if (begin == 0) -// feats += BeginSentFeature -// if(end == words.length) -// feats += EndSentFeature + // if (begin == 0) + // feats += BeginSentFeature + // if (end == words.length) + // feats += EndSentFeature if (begin == 0 && end == words.length) feats += WholeSentFeature - feats.toArray } } } } - case object BeginSentFeature extends Feature case object EndSentFeature extends Feature case object WholeSentFeature extends Feature diff --git a/src/main/scala/epic/features/SpanShapeGenerator.scala b/src/main/scala/epic/features/SpanShapeGenerator.scala index aa1bf63b..c0c15155 100644 --- a/src/main/scala/epic/features/SpanShapeGenerator.scala +++ b/src/main/scala/epic/features/SpanShapeGenerator.scala @@ -22,9 +22,9 @@ class SpanShapeFeaturizerBetter(numContextWords: Int, useRichContext: Boolean) e new SurfaceFeatureAnchoring[String] { def featuresForSpan(begin: Int, end: Int): Array[Feature] = { val sig = SpanShapeGenerator.signatureAndContextFor(words, begin, end, numContextWords, useRichContext) -// println("Features for span " + words.slice(begin, end) + ": " + sig); -// val sig2 = SpanShapeGenerator.signatureFor(words, begin, end, includeContext = false) -// Array(SpanShapeFeature(sig), SpanShapeFeature(sig2)) + // println("Features for span " + words.slice(begin, end) + ": " + sig); + // val sig2 = SpanShapeGenerator.signatureFor(words, begin, end, includeContext = false) + // Array(SpanShapeFeature(sig), SpanShapeFeature(sig2)) Array(SpanShapeFeature(sig)) } } @@ -36,9 +36,9 @@ class FullWordSpanShapeFeaturizer(commonWords: Set[String], numContextWords: Int new SurfaceFeatureAnchoring[String] { def featuresForSpan(begin: Int, end: Int): Array[Feature] = { val sig = SpanShapeGenerator.signatureAndContextFor(words, begin, end, numContextWords, true, commonWords) - // println("Features for span " + words.slice(begin, end) + ": " + sig); - // val sig2 = SpanShapeGenerator.signatureFor(words, begin, end, includeContext = false) - // Array(SpanShapeFeature(sig), SpanShapeFeature(sig2)) + // println("Features for span " + words.slice(begin, end) + ": " + sig); + // val sig2 = SpanShapeGenerator.signatureFor(words, begin, end, includeContext = false) + // Array(SpanShapeFeature(sig), SpanShapeFeature(sig2)) Array(SpanShapeFeature(sig)) } } @@ -53,7 +53,6 @@ object SpanShapeGenerator extends Serializable { val MAX_LEN = 6 - def apply(v1: IndexedSeq[String], begin: Int, end: Int): String = signatureFor(v1,begin, end) def signatureFor(words: IndexedSeq[String], begin: Int, end: Int, includeContext: Boolean = true) = { @@ 
-66,12 +65,12 @@ object SpanShapeGenerator extends Serializable { } result += '[' } - var i = begin; + var i = begin while (i < math.min(begin + MAX_LEN/2 + 1, end)) { appendWordShape(i, words, result) i += 1 } - if(i < end) { + if (i < end) { //val remainingLength = distanceBinner.binnedDistance(begin, end - MAX_LEN) //result ++= "~" * remainingLength result += '~' @@ -82,9 +81,9 @@ object SpanShapeGenerator extends Serializable { i += 1 } if (includeContext) { - result += ']'; + result += ']' if (end >= words.length) { - result += '#'; + result += '#' } else { result += binCharacter(words(end).head) } @@ -95,12 +94,12 @@ object SpanShapeGenerator extends Serializable { // Similar, but has the capability to use more and richer context def signatureAndContextFor(words: IndexedSeq[String], begin: Int, end: Int, numContextWords: Int = 1, richContext: Boolean = false, commonWords: Set[String] = Set.empty) = { val result = new StringBuilder(end-begin) - var i = begin - numContextWords; + var i = begin - numContextWords while (i < begin) { if (i < 0) { result += '#' } else { - if(commonWords(words(i))) { + if (commonWords(words(i))) { result ++= words(i) } else if (richContext) { appendWordShape(i, words, result) @@ -108,14 +107,14 @@ object SpanShapeGenerator extends Serializable { result += binCharacter(words(i).head) } } - i += 1; + i += 1 } result += '[' while (i < math.min(begin + MAX_LEN/2 + 1, end)) { appendWordShape(i, words, result) i += 1 } - if(i < end) { + if (i < end) { //val remainingLength = distanceBinner.binnedDistance(begin, end - MAX_LEN) //result ++= "~" * remainingLength result += '~' @@ -125,12 +124,12 @@ object SpanShapeGenerator extends Serializable { appendWordShape(i, words, result) i += 1 } - result += ']'; + result += ']' while (i < end + numContextWords) { if (i >= words.length) { - result += '#'; + result += '#' } else { - if(commonWords(words(i))) { + if (commonWords(words(i))) { result ++= words(i) } else if (richContext) { appendWordShape(i, words, result) @@ -138,12 +137,11 @@ object SpanShapeGenerator extends Serializable { result += binCharacter(words(i).head) } } - i += 1; + i += 1 } result.toString } - def appendWordShape(i: Int, words: IndexedSeq[String], result: StringBuilder) { val w = if (i < 0 || i >= words.length) "#" else words(i) if (w.isEmpty) { @@ -151,7 +149,7 @@ object SpanShapeGenerator extends Serializable { result += 'ε' } else { var c = w(0) - if(c == '-') { + if (c == '-') { c = w match { case "-LRB-" => '(' case "-RRB-" => ')' @@ -183,12 +181,12 @@ object SpanShapeGenerator extends Serializable { // Similar, but has the capability to use more and richer context def splitShapeFor(words: IndexedSeq[String], begin: Int, split : Int , end: Int, numContextWords: Int = 1, richContext: Boolean = false, commonWords: Set[String] = Set.empty) = { val result = new StringBuilder(end-begin) - var i = begin - numContextWords; + var i = begin - numContextWords while (i < begin) { if (i < 0) { result += '#' } else { - if(commonWords(words(i))) { + if (commonWords(words(i))) { result ++= words(i) } else if (richContext) { appendWordShape(i, words, result) @@ -196,7 +194,7 @@ object SpanShapeGenerator extends Serializable { result += binCharacter(words(i).head) } } - i += 1; + i += 1 } result += '[' while (i < math.min(begin + MAX_LEN/2 + 1, end)) { @@ -204,8 +202,8 @@ object SpanShapeGenerator extends Serializable { i += 1 } - if(i <= split) { - if(i < split) { + if (i <= split) { + if (i < split) { result += '~' } appendWordShape(split, words, result) @@ 
-213,7 +211,7 @@ object SpanShapeGenerator extends Serializable { i = split + 2 } - if(i < end) { + if (i < end) { //val remainingLength = distanceBinner.binnedDistance(begin, end - MAX_LEN) //result ++= "~" * remainingLength result += '~' @@ -223,12 +221,12 @@ object SpanShapeGenerator extends Serializable { appendWordShape(i, words, result) i += 1 } - result += ']'; + result += ']' while (i < end + numContextWords) { if (i >= words.length) { - result += '#'; + result += '#' } else { - if(commonWords(words(i))) { + if (commonWords(words(i))) { result ++= words(i) } else if (richContext) { appendWordShape(i, words, result) @@ -236,7 +234,7 @@ object SpanShapeGenerator extends Serializable { result += binCharacter(words(i).head) } } - i += 1; + i += 1 } result.toString } diff --git a/src/main/scala/epic/features/SplitSpanFeaturizer.scala b/src/main/scala/epic/features/SplitSpanFeaturizer.scala index 380e49e3..45232578 100644 --- a/src/main/scala/epic/features/SplitSpanFeaturizer.scala +++ b/src/main/scala/epic/features/SplitSpanFeaturizer.scala @@ -133,7 +133,7 @@ object SplitSpanFeaturizer { } def anchor(w: IndexedSeq[W]): SplitSpanFeatureAnchoring[W] = { - if(a.isInstanceOf[SplitPointMarker] || b.isInstanceOf[SplitPointMarker]) + if (a.isInstanceOf[SplitPointMarker] || b.isInstanceOf[SplitPointMarker]) theSplitNeedingAnchoring else theNotSplitNeedingAnchoring @@ -214,11 +214,11 @@ object SplitSpanFeaturizer { val afeats: Array[Feature] = aa.featuresForSpan(begin, end) val bfeats: Array[Feature] = ba.featuresForSpan(begin, end) val cross:Array[Feature] = Arrays.crossProduct(afeats, bfeats)(CrossProductFeature(_, _)) - if(keepJustA && keepJustB) { + if (keepJustA && keepJustB) { Arrays.concatenate[Feature](cross, afeats, bfeats) } else if (keepJustA) { Arrays.concatenate[Feature](cross, afeats) - } else if(keepJustB) { + } else if (keepJustB) { Arrays.concatenate[Feature](cross, bfeats) } else { cross @@ -237,10 +237,10 @@ object SplitSpanFeaturizer { Arrays.crossProduct(aSpan, bSplit)(CrossProductFeature(_, _, "Split")) ) - if(keepJustA) { + if (keepJustA) { results += aSplit } - if(keepJustB) { + if (keepJustB) { results += bSplit } @@ -256,7 +256,6 @@ trait SplitSpanFeatureAnchoring[W] extends SurfaceFeatureAnchoring[W] { def featuresForSplit(begin: Int, split: Int, end: Int):Array[Feature] } - trait IndexedSplitSpanFeaturizer[W] { def anchor(w: IndexedSeq[W]):IndexedSplitSpanFeatureAnchoring[W] def featureIndex: Index[Feature] @@ -272,8 +271,8 @@ object IndexedSplitSpanFeaturizer { hashFeatures: HashFeature.Scale = HashFeature.Relative(1.0), bloomFilter: Boolean = false, deduplicateFeatures: Boolean = false):IndexedSplitSpanFeaturizer[W] = { - def seenSet = if(bloomFilter) new ThreadLocalBloomFilter[Long](8 * 1024 * 1024 * 50, 3) else AlwaysSeenSet - val builder = if(deduplicateFeatures) new NonRedundantIndexBuilder[Feature] else new NormalIndexBuilder[Feature] + def seenSet = if (bloomFilter) new ThreadLocalBloomFilter[Long](8 * 1024 * 1024 * 50, 3) else AlwaysSeenSet + val builder = if (deduplicateFeatures) new NonRedundantIndexBuilder[Feature] else new NormalIndexBuilder[Feature] for (ti <- trees) { val wspec = f.anchor(ti.words) ti.tree.allChildren.foreach { @@ -286,7 +285,7 @@ object IndexedSplitSpanFeaturizer { val index = builder.result() - new BasicIndexedSplitSpanFeaturizer(f, if(hashFeatures.numFeatures(index.size) != 0) new HashExtendingIndex(index, HashFeature(_), hashFeatures, seenSet) else index) + new BasicIndexedSplitSpanFeaturizer(f, if 
(hashFeatures.numFeatures(index.size) != 0) new HashExtendingIndex(index, HashFeature(_), hashFeatures, seenSet) else index) } class BasicIndexedSplitSpanFeaturizer[W](f: SplitSpanFeaturizer[W], val featureIndex: Index[Feature]) extends IndexedSplitSpanFeaturizer[W] with Serializable { diff --git a/src/main/scala/epic/features/StandardSurfaceFeaturizer.scala b/src/main/scala/epic/features/StandardSurfaceFeaturizer.scala index 1d706160..41205a74 100644 --- a/src/main/scala/epic/features/StandardSurfaceFeaturizer.scala +++ b/src/main/scala/epic/features/StandardSurfaceFeaturizer.scala @@ -6,8 +6,6 @@ import scala.Array import scala.collection.mutable.ArrayBuffer import StandardSpanFeatures._ - - case class FirstWordCapsAnd(f: Feature) extends Feature case class NthWordCapsAnd(f: Feature) extends Feature case class SentenceLengthFeature(length: Int) extends Feature diff --git a/src/main/scala/epic/features/SurfaceFeaturizer.scala b/src/main/scala/epic/features/SurfaceFeaturizer.scala index 1e17fe98..bde68ae8 100644 --- a/src/main/scala/epic/features/SurfaceFeaturizer.scala +++ b/src/main/scala/epic/features/SurfaceFeaturizer.scala @@ -26,13 +26,11 @@ object SurfaceFeaturizer { def apply[W](f: (IndexedSeq[W], Span)=>Array[Feature]):SurfaceFeaturizer[W] = new TabulatedSurfaceFeaturizer[W](f) - /** begin of span */ object begin extends MarkerPos(0) /** end of span */ object end extends MarkerPos(0, false) - trait DSL { def whenLength[W](filt: Int=>Boolean)(f: SurfaceFeaturizer[W])= new LengthFilteredSurfaceFeaturizer(f, filt) @@ -70,7 +68,7 @@ object SurfaceFeaturizer { case class SpanEdgesFeaturizer[W](f1: MarkedWordFeaturizer[W], f2: MarkedWordFeaturizer[W]) extends SurfaceFeaturizer[W] { def anchor(w: IndexedSeq[W]): SurfaceFeatureAnchoring[W] = { val loc1 = f1.wf.anchor(w) - val loc2 = if(f1.wf eq f2.wf) loc1 else f2.wf.anchor(w) + val loc2 = if (f1.wf eq f2.wf) loc1 else f2.wf.anchor(w) new SurfaceFeatureAnchoring[W] { def featuresForSpan(begin: Int, end: Int): Array[Feature] = { val ffs1 = loc1.featuresForWord(f1.mp.toPos(begin, end)) @@ -91,7 +89,6 @@ object SurfaceFeaturizer { } } - case class SingleWordSpanFeaturizer[W](feat: WordFeaturizer[W]) extends SurfaceFeaturizer[W] with Serializable { override def anchor(words: IndexedSeq[W]): SurfaceFeatureAnchoring[W] = new SurfaceFeatureAnchoring[W] { val anch = feat.anchor(words) @@ -126,9 +123,9 @@ object SurfaceFeaturizer { def +(i: Int) = apply(i) def -(i: Int) = apply(-i) - def toPos(begin: Int, end: Int) = if(relativeToBegin) begin + offset else end + offset + def toPos(begin: Int, end: Int) = if (relativeToBegin) begin + offset else end + offset - override def toString = s"(${if(relativeToBegin) "b" else "e"}${if(offset == 0) "" else if(offset > 0) "+" + offset else offset})" + override def toString = s"(${if (relativeToBegin) "b" else "e"}${if (offset == 0) "" else if (offset > 0) "+" + offset else offset})" } class TabulatedSurfaceFeaturizer[W](f: (IndexedSeq[W], Span)=>Array[Feature]) extends SurfaceFeaturizer[W] { diff --git a/src/main/scala/epic/features/TagDictionaryFeaturizer.scala b/src/main/scala/epic/features/TagDictionaryFeaturizer.scala index 5284c4a5..060bb94f 100644 --- a/src/main/scala/epic/features/TagDictionaryFeaturizer.scala +++ b/src/main/scala/epic/features/TagDictionaryFeaturizer.scala @@ -18,7 +18,7 @@ class TagDictionaryFeaturizer[L](counts: Counter2[L, String, Double], commonWord private val emptyArray = Array.empty[Feature] private val argmaxes = Encoder.fromIndex(wordIndex).tabulateArray{w => val totalCount = 
sum(counts(::, w)) - if(totalCount >= commonWordThreshold) { + if (totalCount >= commonWordThreshold) { emptyArray } else if (totalCount <= 2) { emptyArray @@ -29,16 +29,16 @@ class TagDictionaryFeaturizer[L](counts: Counter2[L, String, Double], commonWord } private val variants = Encoder.fromIndex(wordIndex).tabulateArray{w => val totalCount = sum(counts(::, w)) - if(totalCount < commonWordThreshold) { + if (totalCount < commonWordThreshold) { variantFeatures(w) } else emptyArray } private def variantFeatures(w: String) = { val arr = mutable.ArrayBuilder.make[Feature] - if(w(0).isUpper) { + if (w(0).isUpper) { val lowerCount = sum(counts(::, w.toLowerCase)) - if(lowerCount != 0.0) { + if (lowerCount != 0.0) { arr += HasKnownLowerCaseVariant(counts(::, w.toLowerCase).argmax) } } @@ -47,24 +47,23 @@ class TagDictionaryFeaturizer[L](counts: Counter2[L, String, Double], commonWord if (dashIndex >= 0) { val afterDash = w.substring(dashIndex) val undashedCount = sum(counts(::, afterDash)) - if(undashedCount != 0.0) { + if (undashedCount != 0.0) { arr += HasKnownAfterDashSuffix(counts(::, afterDash).argmax) } } arr.result() } - def anchor(w: IndexedSeq[String]): WordFeatureAnchoring[String] = new WordFeatureAnchoring[String] { val indices = w.map(wordIndex) val myArgmaxes = indices.map{i => - if(i < 0) { + if (i < 0) { emptyArray } else argmaxes(i) } val variants: IndexedSeq[Array[Feature]] = indices.zipWithIndex.map{ case(i, pos) => - if(i < 0) { + if (i < 0) { variantFeatures(w(pos)) } else { TagDictionaryFeaturizer.this.variants(i) @@ -72,11 +71,11 @@ class TagDictionaryFeaturizer[L](counts: Counter2[L, String, Double], commonWord } def featuresForWord(pos: Int): Array[Feature] = { - if(pos < 0 || pos >= w.length) { + if (pos < 0 || pos >= w.length) { Array(IndicatorWSFeature('OutOfBounds)) } else { val am = myArgmaxes(pos) - if(variants(pos).length != 0) { + if (variants(pos).length != 0) { am ++ variants(pos) } else { am diff --git a/src/main/scala/epic/features/TagSpanShapeFeaturizer.scala b/src/main/scala/epic/features/TagSpanShapeFeaturizer.scala index d5faf896..f66bde0b 100644 --- a/src/main/scala/epic/features/TagSpanShapeFeaturizer.scala +++ b/src/main/scala/epic/features/TagSpanShapeFeaturizer.scala @@ -12,76 +12,76 @@ import scala.collection.mutable.HashMap @SerialVersionUID(1L) class TagSpanShapeFeaturizer[L](wordTagCounts: Counter2[L, String, Double], commonWordThreshold: Int = 3) extends SurfaceFeaturizer[String] with Serializable { - private val tagIndex = Index(wordTagCounts.keysIterator.map(_._1)); - private val wordToTagIndexMap = new HashMap[String,Int]; + private val tagIndex = Index(wordTagCounts.keysIterator.map(_._1)) + private val wordToTagIndexMap = new HashMap[String,Int] for (word <- wordTagCounts.keysIterator.map(_._2)) { if (!wordToTagIndexMap.contains(word)) { - wordToTagIndexMap.put(word, TagSpanShapeGenerator.labelIndexFor(word, wordTagCounts, commonWordThreshold, tagIndex)); + wordToTagIndexMap.put(word, TagSpanShapeGenerator.labelIndexFor(word, wordTagCounts, commonWordThreshold, tagIndex)) } } - private val wordTagger = (w: String) => TagSpanShapeGenerator.labelIndexFor(w, wordToTagIndexMap); + private val wordTagger = (w: String) => TagSpanShapeGenerator.labelIndexFor(w, wordToTagIndexMap) def anchor(words: IndexedSeq[String]): SurfaceFeatureAnchoring[String] = { new SurfaceFeatureAnchoring[String] { def featuresForSpan(begin: Int, end: Int): Array[Feature] = { - val b11 = TagSpanShapeGenerator.featureFor(words, begin, end, wordTagger, commonWordThreshold, 
tagIndex, 1, 1, 0, 0); - val b21 = TagSpanShapeGenerator.featureFor(words, begin, end, wordTagger, commonWordThreshold, tagIndex, 2, 1, 0, 0); - val b12 = TagSpanShapeGenerator.featureFor(words, begin, end, wordTagger, commonWordThreshold, tagIndex, 1, 2, 0, 0); - val e11 = TagSpanShapeGenerator.featureFor(words, begin, end, wordTagger, commonWordThreshold, tagIndex, 0, 0, 1, 1); - val e12 = TagSpanShapeGenerator.featureFor(words, begin, end, wordTagger, commonWordThreshold, tagIndex, 0, 0, 1, 2); - val e21 = TagSpanShapeGenerator.featureFor(words, begin, end, wordTagger, commonWordThreshold, tagIndex, 0, 0, 2, 1); - Array(b11, b21, b12, e11, e12, e21); + val b11 = TagSpanShapeGenerator.featureFor(words, begin, end, wordTagger, commonWordThreshold, tagIndex, 1, 1, 0, 0) + val b21 = TagSpanShapeGenerator.featureFor(words, begin, end, wordTagger, commonWordThreshold, tagIndex, 2, 1, 0, 0) + val b12 = TagSpanShapeGenerator.featureFor(words, begin, end, wordTagger, commonWordThreshold, tagIndex, 1, 2, 0, 0) + val e11 = TagSpanShapeGenerator.featureFor(words, begin, end, wordTagger, commonWordThreshold, tagIndex, 0, 0, 1, 1) + val e12 = TagSpanShapeGenerator.featureFor(words, begin, end, wordTagger, commonWordThreshold, tagIndex, 0, 0, 1, 2) + val e21 = TagSpanShapeGenerator.featureFor(words, begin, end, wordTagger, commonWordThreshold, tagIndex, 0, 0, 2, 1) + Array(b11, b21, b12, e11, e12, e21) } } } -/** +/** * * @author dlwh */ object TagSpanShapeGenerator extends Serializable { def makeFeatType(beginContextAmount: Int, beginContentAmount: Int, endContentAmount: Int, endContextAmount: Int) = { - beginContextAmount * 1000 + beginContentAmount * 100 + endContentAmount * 10 + endContextAmount; + beginContextAmount * 1000 + beginContentAmount * 100 + endContentAmount * 10 + endContextAmount } - def canonicalize(annLabel: AnnotatedLabel) = annLabel.baseLabel.substring(0, 1); + def canonicalize(annLabel: AnnotatedLabel) = annLabel.baseLabel.substring(0, 1) - def makeBaseLexicon(trees: IndexedSeq[TreeInstance[AnnotatedLabel, String]]) = makeLexicon(trees, canonicalize); + def makeBaseLexicon(trees: IndexedSeq[TreeInstance[AnnotatedLabel, String]]) = makeLexicon(trees, canonicalize) - def makeStandardLexicon(trees: IndexedSeq[TreeInstance[AnnotatedLabel, String]]) = makeLexicon(trees, _.baseLabel); + def makeStandardLexicon(trees: IndexedSeq[TreeInstance[AnnotatedLabel, String]]) = makeLexicon(trees, _.baseLabel) def makeLexicon(trees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], collapser: AnnotatedLabel => String) = { - val strLexicon = Counter2[String,String,Double]; + val strLexicon = Counter2[String,String,Double] for(ti <- trees) { - val TreeInstance(_, tree, words) = ti; + val TreeInstance(_, tree, words) = ti for ((l, w) <- tree.leaves.map(leaf => (leaf, words(leaf.span.begin)))) { - strLexicon(collapser(l.label), w) += 1.0; + strLexicon(collapser(l.label), w) += 1.0 } } - strLexicon; + strLexicon } def labelIndexFor[L](word: String, wordTagCounts: Counter2[L, String, Double], commonWordThreshold: Int, tagIndex: Index[L]) = { val totalCount = sum(wordTagCounts(::, word)) - if (totalCount < commonWordThreshold) { - -1; - } - val tagCounts = wordTagCounts(::, word).iterator; - var bestTagIdx = -1; - var bestTagCount = 0.0; + // rare words (below commonWordThreshold) get no reliable tag, so bestTagIdx stays -1 + val tagCounts = wordTagCounts(::, word).iterator + var bestTagIdx = -1 + var bestTagCount = 0.0 for ((tag, count) <- tagCounts) { - if (count > bestTagCount) { + if (totalCount >= commonWordThreshold && count > bestTagCount) { - bestTagIdx = tagIndex(tag); - bestTagCount = count; + bestTagIdx = tagIndex(tag) + bestTagCount = count }
} bestTagIdx } def labelIndexFor[L](word: String, wordToTagIndexMap: HashMap[String,Int]) = { - if (wordToTagIndexMap.contains(word)) wordToTagIndexMap(word) else -1; + if (wordToTagIndexMap.contains(word)) wordToTagIndexMap(word) else -1 } def featureFor[L](words: IndexedSeq[String], @@ -94,19 +94,19 @@ object TagSpanShapeGenerator extends Serializable { beginContentAmount: Int, endContentAmount: Int, endContextAmount: Int) = { - var result = new StringBuilder(); + var result = new StringBuilder() for (i <- begin - beginContextAmount until end + endContextAmount) { if (i < begin + beginContentAmount || i >= end - endContentAmount) { if (i < 0 || i >= words.size) { - result = result.append("-1,"); + result = result.append("-1,") } else { - result = result.append(wordTagger(words(i))).append(","); + result = result.append(wordTagger(words(i))).append(",") } } } - val featType = makeFeatType(beginContextAmount, beginContentAmount, endContentAmount, endContextAmount); -// println(words.slice(begin, end) + " => " + featType + ": " + result.toString); - TagSpanShapeFeature(featType, result.toString); + val featType = makeFeatType(beginContextAmount, beginContentAmount, endContentAmount, endContextAmount) +// println(words.slice(begin, end) + " => " + featType + ": " + result.toString) + TagSpanShapeFeature(featType, result.toString) } } diff --git a/src/main/scala/epic/features/TransformedWordFeaturizer.scala b/src/main/scala/epic/features/TransformedWordFeaturizer.scala index d6c2f73d..17e1993a 100644 --- a/src/main/scala/epic/features/TransformedWordFeaturizer.scala +++ b/src/main/scala/epic/features/TransformedWordFeaturizer.scala @@ -31,16 +31,16 @@ class TransformedWordFeaturizer[W](initCounts: Counter[W, Double], def words = w def featuresForWord(pos: Int): Array[Feature] = { - if(pos < 0 || pos >= words.length) { + if (pos < 0 || pos >= words.length) { boundaryFeatures } else { _minimalFeatures(pos) } } - private val _minimalFeatures: immutable.IndexedSeq[Array[Feature]] = (0 until words.length) map { i => + private val _minimalFeatures: immutable.IndexedSeq[Array[Feature]] = words.indices.map { i => val index = indices(i) - if(index >= 0) { + if (index >= 0) { minimalFeatures(index) } else { Array[Feature](Unk) @@ -55,13 +55,13 @@ class TransformedWordFeaturizer[W](initCounts: Counter[W, Double], private val Unk = WordFeature("#UNK#", 'LowCount) private val boundaryFeatures = Array[Feature](BoundaryFeature) - private val wordFeatures = Encoder.fromIndex(wordIndex).tabulateArray(s => if(wordCounts(s) > unknownWordThreshold) TransformedFeature(transform(s)) else Unk) + private val wordFeatures = Encoder.fromIndex(wordIndex).tabulateArray(s => if (wordCounts(s) > unknownWordThreshold) TransformedFeature(transform(s)) else Unk) // caches private val minimalFeatures = Array.tabulate[Array[Feature]](wordIndex.size){ i => val wc = wordCounts(transform(wordIndex.get(i))) val w = wordFeatures(i) - if(wc > unknownWordThreshold) { + if (wc > unknownWordThreshold) { Array(w) } else { Array(Unk) diff --git a/src/main/scala/epic/features/WordClassFeaturizer.scala b/src/main/scala/epic/features/WordClassFeaturizer.scala index 9ede8c1b..69a4603b 100644 --- a/src/main/scala/epic/features/WordClassFeaturizer.scala +++ b/src/main/scala/epic/features/WordClassFeaturizer.scala @@ -20,16 +20,16 @@ class WordClassFeaturizer(wordCounts: Counter[String, Double], def words = w def featuresForWord(pos: Int): Array[Feature] = { - if(pos < 0 || pos >= words.length) { + if (pos < 0 || pos >= words.length) { 
boundaryFeatures } else { _minimalFeatures(pos) } } - private val _minimalFeatures: immutable.IndexedSeq[Array[Feature]] = (0 until words.length) map { i => + private val _minimalFeatures: immutable.IndexedSeq[Array[Feature]] = words.indices.map { i => val index = indices(i) - if(index >= 0) { + if (index >= 0) { WordClassFeaturizer.this.minimalFeatures(index) } else { val ww = words(i) @@ -47,7 +47,7 @@ class WordClassFeaturizer(wordCounts: Counter[String, Double], private val boundaryFeatures = Array[Feature](BoundaryFeature) - private val classes = Encoder.fromIndex(wordIndex).tabulateArray(w => if(wordCounts(w) > functionWordThreshold) interner(IndicatorFeature(w)) else interner(WordFeature(EnglishWordClassGenerator(w), 'Class))) + private val classes = Encoder.fromIndex(wordIndex).tabulateArray(w => if (wordCounts(w) > functionWordThreshold) interner(IndicatorFeature(w)) else interner(WordFeature(EnglishWordClassGenerator(w), 'Class))) // caches private val minimalFeatures = Array.tabulate(wordIndex.size){ i => diff --git a/src/main/scala/epic/features/WordFeaturizer.scala b/src/main/scala/epic/features/WordFeaturizer.scala index 1c489d9b..f138a575 100644 --- a/src/main/scala/epic/features/WordFeaturizer.scala +++ b/src/main/scala/epic/features/WordFeaturizer.scala @@ -63,9 +63,6 @@ object WordFeaturizer { val props = new WordPropertyFeaturizer(summedCounts) val lfsuf = LongestFrequentSuffixFeaturizer(summedCounts, commonWordThreshold) - - - def suffixes(order: Int = 5) = new WordSuffixFeaturizer(summedCounts, suffixOrder = order, commonWordThreshold = commonWordThreshold) def prefixes(order: Int = 5) = new WordPrefixFeaturizer(summedCounts, prefixOrder = order, commonWordThreshold = commonWordThreshold) @@ -82,7 +79,7 @@ object WordFeaturizer { def unigrams(f: WordFeaturizer[String], offsetOrder:Int = 1) = new MultiWordFeaturizer[String]({ for(i <- -offsetOrder to offsetOrder) yield { - if(i == 0) f else f(i) + if (i == 0) f else f(i) } }) @@ -114,7 +111,7 @@ object WordFeaturizer { val feats = words.map(f) - override def featuresForWord(pos: Int): Array[Feature] = if(pos < 0 || pos >= words.length) Array() else feats(pos) + override def featuresForWord(pos: Int): Array[Feature] = if (pos < 0 || pos >= words.length) Array() else feats(pos) } } } @@ -133,25 +130,23 @@ class ZeroFeaturizer[W] extends WordFeaturizer[W] with SurfaceFeaturizer[W] with } } - - -class NextActualWordFeaturizer(f: WordFeaturizer[String], lookRight: Boolean, isPunct: (String=>Boolean) = (_.forall(!_.isLetterOrDigit))) extends WordFeaturizer[String] with Serializable { - val dir = if(lookRight) 'Right else 'Left +class NextActualWordFeaturizer(f: WordFeaturizer[String], lookRight: Boolean, isPunct: (String=>Boolean) = _.forall(!_.isLetterOrDigit)) extends WordFeaturizer[String] with Serializable { + val dir = if (lookRight) 'Right else 'Left def anchor(words: IndexedSeq[String]): WordFeatureAnchoring[String] = { val w = words new WordFeatureAnchoring[String] { val base = f.anchor(w) // one for each position - val features: immutable.IndexedSeq[Array[Feature]] = (0 until w.length).map { _pos => + val features: immutable.IndexedSeq[Array[Feature]] = w.indices.map { _pos => var pos = _pos - val delta = if(lookRight) 1 else -1 + val delta = if (lookRight) 1 else -1 val feats = new ArrayBuffer[Feature]() var done = false - while(!done && pos >= 0 && pos < w.length) { - if(isPunct(w(pos))) { + while (!done && pos >= 0 && pos < w.length) { + if (isPunct(w(pos))) { feats ++= 
base.featuresForWord(pos).map(PunctuationFeature(_, dir)) } else { feats ++= base.featuresForWord(pos).map(ActualWordFeature(_, dir)) @@ -160,20 +155,19 @@ class NextActualWordFeaturizer(f: WordFeaturizer[String], lookRight: Boolean, is pos += delta } - if(pos < 0 || pos >= w.length) feats ++= base.featuresForWord(pos) + if (pos < 0 || pos >= w.length) feats ++= base.featuresForWord(pos) feats.toArray } def words: IndexedSeq[String] = w def featuresForWord(pos: Int): Array[Feature] = { - if(pos < 0 || pos >= w.length) base.featuresForWord(pos) + if (pos < 0 || pos >= w.length) base.featuresForWord(pos) else features(pos) } } } - } case class PunctuationFeature(f: Feature, dir: Symbol) extends Feature diff --git a/src/main/scala/epic/features/WordPrefixFeaturizer.scala b/src/main/scala/epic/features/WordPrefixFeaturizer.scala index 8332f6a3..c33c494e 100644 --- a/src/main/scala/epic/features/WordPrefixFeaturizer.scala +++ b/src/main/scala/epic/features/WordPrefixFeaturizer.scala @@ -20,7 +20,6 @@ import breeze.linalg._ import collection.mutable.ArrayBuffer import breeze.util.{Encoder, Index} - class WordPrefixFeaturizer(wordCounts: Counter[String, Double], prefixOrder: Int = 5, commonWordThreshold: Int = 100) extends WordFeaturizer[String] with Serializable { private val wordIndex = Index(wordCounts.keysIterator) @@ -29,21 +28,20 @@ class WordPrefixFeaturizer(wordCounts: Counter[String, Double], prefixOrder: Int def anchor(w: IndexedSeq[String]): WordFeatureAnchoring[String] = new WordFeatureAnchoring[String] { def words: IndexedSeq[String] = w val indices = words.map(wordIndex) - val myFeatures = (0 until words.length).map(i => if (indices(i) < 0) featuresFor(words(i)).toArray else knownWordFeatures(indices(i))) + val myFeatures = words.indices.map(i => if (indices(i) < 0) featuresFor(words(i)).toArray else knownWordFeatures(indices(i))) def featuresForWord(pos: Int): Array[Feature] = { myFeatures(pos) } - } def featuresFor(w: String): Array[Feature] = { val wc = wordCounts(w) - if(wc > commonWordThreshold) { + if (wc > commonWordThreshold) { Array.empty } else { val features = new ArrayBuffer[Feature] val wlen = w.length - if(wlen >= 4) { + if (wlen >= 4) { for(i <- 1 to ((wlen - 1) min prefixOrder)) { features += PrefixFeature(w.substring(0,i)) } @@ -55,7 +53,6 @@ class WordPrefixFeaturizer(wordCounts: Counter[String, Double], prefixOrder: Int def apply(w: String) = featuresFor(w) - } diff --git a/src/main/scala/epic/features/WordPropertyFeaturizer.scala b/src/main/scala/epic/features/WordPropertyFeaturizer.scala index b35082fb..bb2c1fbd 100644 --- a/src/main/scala/epic/features/WordPropertyFeaturizer.scala +++ b/src/main/scala/epic/features/WordPropertyFeaturizer.scala @@ -30,8 +30,6 @@ final case class SeenWithTagFeature(str: Any) extends Feature final case class LeftWordFeature(str: Any) extends Feature final case class RightWordFeature(str: Any) extends Feature - - class WordPropertyFeaturizer(wordCounts: Counter[String, Double], commonWordThreshold: Int = 20) extends WordFeaturizer[String] with Serializable { import epic.features.WordPropertyFeaturizer._ @@ -42,16 +40,16 @@ class WordPropertyFeaturizer(wordCounts: Counter[String, Double], def anchor(w: IndexedSeq[String]): WordFeatureAnchoring[String] = new WordFeatureAnchoring[String] { def words: IndexedSeq[String] = w val indices = words.map(wordIndex) - val myFeatures = (0 until words.length).map(i => if (indices(i) < 0) featuresFor(words(i)).toArray else knownWordFeatures(indices(i))) + val myFeatures = words.indices.map(i => 
if (indices(i) < 0) featuresFor(words(i)).toArray else knownWordFeatures(indices(i))) def featuresForWord(pos: Int): Array[Feature] = { - if(pos < 0) Array(BeginSentFeature) - else if(pos >= words.length) Array(EndSentFeature) + if (pos < 0) Array(BeginSentFeature) + else if (pos >= words.length) Array(EndSentFeature) else { val base = myFeatures(pos) // initial words need special treatment - if( (words(pos).charAt(0).isUpper || words(pos).charAt(0).isTitleCase) && base.length > 1) { - val isInitialWord = (pos == 0 || words(pos -1) == "``") - if(isInitialWord) { + if ((words(pos).charAt(0).isUpper || words(pos).charAt(0).isTitleCase) && base.length > 1) { + val isInitialWord = pos == 0 || words(pos - 1) == "``" + if (isInitialWord) { base ++ base.map(FirstWordCapsAnd) } else { base ++ base.map(NthWordCapsAnd) @@ -61,14 +59,13 @@ class WordPropertyFeaturizer(wordCounts: Counter[String, Double], } } } - } // val signatureGenerator = EnglishWordClassGenerator def featuresFor(w: String): IndexedSeq[Feature] = { val wc = wordCounts(w) val features = ArrayBuffer[Feature]() - if(wc <= commonWordThreshold) { + if (wc <= commonWordThreshold) { val wlen = w.length val numCaps = (w:Seq[Char]).count{_.isUpper} val hasLetter = w.exists(_.isLetter) @@ -80,16 +77,16 @@ class WordPropertyFeaturizer(wordCounts: Counter[String, Double], val numPeriods = w.count('.' ==) val hasPeriod = numPeriods > 0 - if(numCaps > 0) features += hasCapFeature - if(numCaps > 1) features += hasManyCapFeature + if (numCaps > 0) features += hasCapFeature + if (numCaps > 1) features += hasManyCapFeature val isAllCaps = numCaps > 1 && !hasLower && !hasNotLetter - if(isAllCaps) features += isAllCapsFeature + if (isAllCaps) features += isAllCapsFeature - if(w.length == 2 && w(0).isLetter && w(0).isUpper && w(1) == '.') { + if (w.length == 2 && w(0).isLetter && w(0).isUpper && w(1) == '.') { features += isAnInitialFeature } - if(w.length > 1 && w.last == ('.')) { + if (w.length > 1 && w.last == '.') { features += endsWithPeriodFeature } @@ -98,9 +95,9 @@ class WordPropertyFeaturizer(wordCounts: Counter[String, Double], var hasTitleCaseVariant = false val hasInitialUpper: Boolean = w(0).isUpper || w(0).isTitleCase - if(hasInitialUpper) { + if (hasInitialUpper) { features += hasInitCapFeature - if(wordCounts(w.toLowerCase) > 0) { + if (wordCounts(w.toLowerCase) > 0) { features += hasKnownLCFeature knownLowerCase = true } else { @@ -111,16 +108,14 @@ class WordPropertyFeaturizer(wordCounts: Counter[String, Double], } } - - - if(!hasLower && hasLetter) features += hasNoLower - if(hasDash) features += hasDashFeature - if(hasDigit) { + if (!hasLower && hasLetter) features += hasNoLower + if (hasDash) features += hasDashFeature + if (hasDigit) { features += hasDigitFeature features += DigitNormalizedFeature(w.replaceAll("\\d", "0")) } - if(!hasLetter) features += hasNoLetterFeature - if(hasNotLetter) features += hasNotLetterFeature + if (!hasLetter) features += hasNoLetterFeature + if (hasNotLetter) features += hasNotLetterFeature // acronyms are all upper case with maybe some periods interspersed val hasAcronymShape = ( @@ -128,48 +123,45 @@ class WordPropertyFeaturizer(wordCounts: Counter[String, Double], || wlen >= 2 && hasPeriod && !hasLower && numCaps > 0 && !hasDigit && w.forall(c => c.isLetter || c == '.') ) // make sure it doesn't have a lower case or title case variant, common for titles and place names... 
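// Illustration (assuming the token "U.S." is below commonWordThreshold and "u.s." is unseen in
// wordCounts): featuresFor("U.S.") fires, among others, hasCapFeature, hasManyCapFeature,
// endsWithPeriodFeature, and hasInitCapFeature above, and the shape test below then adds
// isProbablyAcronymFeature; a title-cased "Nasa" with a known lower-case variant instead picks up
// hasKnownLCFeature and fails the acronym shape test.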
- if(hasAcronymShape && !knownLowerCase && !hasTitleCaseVariant) { + if (hasAcronymShape && !knownLowerCase && !hasTitleCaseVariant) { features += isProbablyAcronymFeature } // year! - if(wlen == 4 && !hasNonDigit) { + if (wlen == 4 && !hasNonDigit) { val year = try{w.toInt} catch {case e: NumberFormatException => 0} - if(year >= 1400 && year < 2300) { + if (year >= 1400 && year < 2300) { features += isProbablyYearFeature } } - if(hasDigit && !hasLetter) { + if (hasDigit && !hasLetter) { try { val n = w.replaceAll(",","").toDouble - if(!hasPeriod) + if (!hasPeriod) features += integerFeature else features += floatFeature } catch {case e: NumberFormatException =>} } - if(wlen > 3 && w.endsWith("s") && !w.endsWith("ss") && !w.endsWith("us") && !w.endsWith("is")) { + if (wlen > 3 && w.endsWith("s") && !w.endsWith("ss") && !w.endsWith("us") && !w.endsWith("is")) { features += endsWithSFeature - if(hasInitialUpper) + if (hasInitialUpper) features += hasInitialCapsAndEndsWithSFeature // we mess up NNP and NNPS } - if(wlen > 10) { + if (wlen > 10) { features += longWordFeature - } else if(wlen < 5) { + } else if (wlen < 5) { features += shortWordFeature } } features } - - def apply(w: String) = featuresFor(w) - } object WordPropertyFeaturizer { diff --git a/src/main/scala/epic/features/WordShapeFeaturizer.scala b/src/main/scala/epic/features/WordShapeFeaturizer.scala index c0be698b..a12b5654 100644 --- a/src/main/scala/epic/features/WordShapeFeaturizer.scala +++ b/src/main/scala/epic/features/WordShapeFeaturizer.scala @@ -19,16 +19,16 @@ class WordShapeFeaturizer(wordCounts: Counter[String, Double], def words = w def featuresForWord(pos: Int): Array[Feature] = { - if(pos < 0 || pos >= words.length) { + if (pos < 0 || pos >= words.length) { boundaryFeatures } else { _minimalFeatures(pos) } } - private val _minimalFeatures: IndexedSeq[Array[Feature]] = (0 until words.length) map { i => + private val _minimalFeatures: IndexedSeq[Array[Feature]] = words.indices.map { i => val index = indices(i) - if(index >= 0) { + if (index >= 0) { WordShapeFeaturizer.this.minimalFeatures(index) } else { val ww = words(i) @@ -46,7 +46,7 @@ class WordShapeFeaturizer(wordCounts: Counter[String, Double], private val boundaryFeatures = Array[Feature](BoundaryFeature) - private val shapes = Encoder.fromIndex(wordIndex).tabulateArray(w => if(wordCounts(w) > functionWordThreshold) interner(IndicatorFeature(w)) else interner(WordFeature(WordShapeGenerator(w), 'Shape))) + private val shapes = Encoder.fromIndex(wordIndex).tabulateArray(w => if (wordCounts(w) > functionWordThreshold) interner(IndicatorFeature(w)) else interner(WordFeature(WordShapeGenerator(w), 'Shape))) // caches private val minimalFeatures = Array.tabulate(wordIndex.size){ i => diff --git a/src/main/scala/epic/features/WordShapeGenerator.scala b/src/main/scala/epic/features/WordShapeGenerator.scala index d1d4c592..48d50f7a 100644 --- a/src/main/scala/epic/features/WordShapeGenerator.scala +++ b/src/main/scala/epic/features/WordShapeGenerator.scala @@ -10,19 +10,19 @@ object WordShapeGenerator extends (String=>String) with Serializable { def apply(v1: String) = signatureFor(v1) def signatureFor(word: String) = { - val result = new StringBuilder(word.length); - var i = 0; + val result = new StringBuilder(word.length) + var i = 0 while (i < word.length) { - val c = word(i); - val x = if (c.isLetter && c.isUpper) 'X' else if (c.isLetter) 'x' else if (c.isDigit) 'd' else c; + val c = word(i) + val x = if (c.isLetter && c.isUpper) 'X' else if (c.isLetter) 'x' else 
if (c.isDigit) 'd' else c if (result.length > 1 && (result.last == x) && result(result.length - 2) == x) { result += 'e' } else if (result.length > 1 && result.last == 'e' && result(result.length - 2) == x) { () // nothing } else { - result += x; + result += x } - i += 1; + i += 1 } result.toString } diff --git a/src/main/scala/epic/features/WordSuffixFeaturizer.scala b/src/main/scala/epic/features/WordSuffixFeaturizer.scala index 872b9ba3..39523897 100644 --- a/src/main/scala/epic/features/WordSuffixFeaturizer.scala +++ b/src/main/scala/epic/features/WordSuffixFeaturizer.scala @@ -20,7 +20,6 @@ import breeze.linalg._ import collection.mutable.ArrayBuffer import breeze.util.{Encoder, Index} - class WordSuffixFeaturizer(wordCounts: Counter[String, Double], suffixOrder: Int = 5, commonWordThreshold: Int = 100) extends WordFeaturizer[String] with Serializable { import WordPropertyFeaturizer._ @@ -30,37 +29,33 @@ class WordSuffixFeaturizer(wordCounts: Counter[String, Double], suffixOrder: Int def anchor(w: IndexedSeq[String]): WordFeatureAnchoring[String] = new WordFeatureAnchoring[String] { def words: IndexedSeq[String] = w val indices = words.map(wordIndex) - val myFeatures = (0 until words.length).map(i => if (indices(i) < 0) featuresFor(words(i)).toArray else knownWordFeatures(indices(i))) + val myFeatures = words.indices.map(i => if (indices(i) < 0) featuresFor(words(i)).toArray else knownWordFeatures(indices(i))) def featuresForWord(pos: Int): Array[Feature] = { myFeatures(pos) } - } def featuresFor(w: String): Array[Feature] = { val wc = wordCounts(w) - if(wc > commonWordThreshold) { + if (wc > commonWordThreshold) { Array.empty } else { val features = new ArrayBuffer[Feature] val wlen = w.length - if(wlen >= 5) { + if (wlen >= 5) { for(i <- 1 to ((wlen-1) min suffixOrder)) { features += SuffixFeature(w.substring(wlen - i)) } - - // for(i <- 1 to ((wlen - 1) min prefixOrder)) { - // features += PrefixFeature(w.substring(0,i)) - // } + // for(i <- 1 to ((wlen - 1) min prefixOrder)) { + // features += PrefixFeature(w.substring(0,i)) + // } } - features.toArray } } def apply(w: String) = featuresFor(w) - } diff --git a/src/main/scala/epic/features/package.scala b/src/main/scala/epic/features/package.scala index 3bb12a53..713d49a5 100644 --- a/src/main/scala/epic/features/package.scala +++ b/src/main/scala/epic/features/package.scala @@ -14,7 +14,6 @@ package object features { for(x <- it) { builder.add(gen(x)) } - builder.result() } diff --git a/src/main/scala/epic/framework/EPInference.scala b/src/main/scala/epic/framework/EPInference.scala index 32dc365b..48804423 100644 --- a/src/main/scala/epic/framework/EPInference.scala +++ b/src/main/scala/epic/framework/EPInference.scala @@ -36,16 +36,14 @@ class EPInference[Datum, Augment <: AnyRef](val inferences: IndexedSeq[Projectab def scorer(v: Datum): Scorer = EPScorer(inferences.map(_.scorer(v))) - override def forTesting = new EPInference(inferences.map(_.forTesting), maxEPIter, epInGold) - // ugh code duplication... 
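// Control-flow sketch (inferred from the code below, not spelled out in the diff): when epInGold
// is false, goldMarginal computes each non-null sub-inference's gold marginal independently
// (entries can be null under model dropout) and combines them without EP message passing;
// marginal() instead runs the full EP fixed-point via EPInference.doInference.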
   def goldMarginal(scorer: Scorer, datum: Datum, augment: Augment): Marginal = {
-    if(!epInGold) {
-      val marginals = for(i <- 0 until inferences.length) yield {
+    if (!epInGold) {
+      val marginals = inferences.indices.map { i =>
         val inf = inferences(i)
-        if(inf eq null)
+        if (inf eq null)
           null.asInstanceOf[ProjectableInference[Datum, Augment]#Marginal]
         else
           inf.goldMarginal(scorer.scorers(i).asInstanceOf[inf.Scorer], datum)
@@ -61,14 +59,10 @@ class EPInference[Datum, Augment <: AnyRef](val inferences: IndexedSeq[Projectab
     EPInference.doInference(datum, augment, inferences, scorer, (inf:ProjectableInference[Datum, Augment], scorer: ProjectableInference[Datum, Augment]#Scorer, q: Augment) => inf.marginal(scorer.asInstanceOf[inf.Scorer], datum, q), maxEPIter)
   }
-
-
 }
-
 case class EPMarginal[Augment, Marginal](logPartition: Double, q: Augment, marginals: IndexedSeq[Marginal]) extends epic.framework.Marginal
-
 object EPInference extends SafeLogging {
   val iters, calls = new AtomicLong(0)
@@ -103,11 +97,11 @@ object EPInference extends SafeLogging {
       }
       val newAugment = inf.project(datum, iScorer.asInstanceOf[inf.Scorer], marg.asInstanceOf[inf.Marginal], q)
       marginals(i) = marg
-//      println("Leaving " + i)
+      // println("Leaving " + i)
       newAugment -> contributionToLikelihood
     }
     val ep = new ExpectationPropagation(project _, convergenceThreshold)
-    val inferencesToUse = (0 until inferences.length).filter(inferences(_) ne null)
+    val inferencesToUse = inferences.indices.filter(inferences(_) ne null)
     var state: ep.State = null
     val iterates = ep.inference(augment, inferencesToUse, inferencesToUse.map(i => inferences(i).baseAugment(datum)))
@@ -117,7 +111,7 @@ object EPInference extends SafeLogging {
       state = s
     }
     EPInference.iters.addAndGet(iter)
-    if(EPInference.calls.incrementAndGet % 1000 == 0) {
+    if (EPInference.calls.incrementAndGet % 1000 == 0) {
       val calls = EPInference.calls.get()
       val iters = EPInference.iters.get()
       logger.info(s"EP Stats $iters $calls ${iters * 1.0 / calls} $maxEPIter")
diff --git a/src/main/scala/epic/framework/EPModel.scala b/src/main/scala/epic/framework/EPModel.scala
index bd0c6f2b..0ee62a60 100644
--- a/src/main/scala/epic/framework/EPModel.scala
+++ b/src/main/scala/epic/framework/EPModel.scala
@@ -37,22 +37,21 @@ class EPModel[Datum, Augment <: AnyRef](maxEPIter: Int, initFeatureValue: Featur
   private val offsets = models.map(_.numFeatures).unfold(0)(_ + _)
   for(i <- 0 until models.length) { println(models(i) + " " + models(i).featureIndex.size)}
 
-
   def emptyCounts = {
     val counts = for (m <- models) yield m.emptyCounts
     EPExpectedCounts(0.0, counts.toIndexedSeq)
   }
 
-
   def accumulateCounts(inf: Inference, s: Scorer, datum: Datum, marg: Marginal, accum: ExpectedCounts, scale: Double):Unit = {
     import marg._
     for ( (model, i) <- models.zipWithIndex) {
       val marg = marginals(i)
-      if(marg != null)
+      if (marg != null)
         model.accumulateCounts(inf.inferences(i).asInstanceOf[model.Inference], s.scorers(i).asInstanceOf[model.Scorer], datum, marg.asInstanceOf[model.Marginal], accum.counts(i).asInstanceOf[model.ExpectedCounts], scale)
     }
     accum.loss += scale * marg.logPartition
   }
+
   def numModels = models.length
 
   val featureIndex: Index[Feature] = {
@@ -68,7 +67,6 @@ class EPModel[Datum, Augment <: AnyRef](maxEPIter: Int, initFeatureValue: Featur
     }
   }
 
-
   /**
    * just saves feature weights to disk as a serialized counter. The file is prefix.ser.gz
    */
@@ -81,12 +79,12 @@ class EPModel[Datum, Augment <: AnyRef](maxEPIter: Int, initFeatureValue: Featur
     }
     for(i <- 0 until numModels) {
       val mySlice = initWeights.slice(offsets(i), offsets(i+1))
-      if(mySlice.valuesIterator.exists(_ == 0)) {
+      if (mySlice.valuesIterator.exists(_ == 0)) {
         for(cw <- models(i).readCachedFeatureWeights(suffix+"-"+i)) {
           any = true
           var j = 0
-          while(j < cw.length) {
-            if(mySlice(j) == 0.0) {
+          while (j < cw.length) {
+            if (mySlice(j) == 0.0) {
               mySlice(j) = cw(j)
             }
             j += 1
@@ -94,14 +92,13 @@ class EPModel[Datum, Augment <: AnyRef](maxEPIter: Int, initFeatureValue: Featur
         }
       }
     }
-    if(any)
+    if (any)
       Some(initWeights)
     else
       None
   }
 
-
   /**
    * Caches the weights using the cache broker.
    */
@@ -122,7 +119,7 @@ class EPModel[Datum, Augment <: AnyRef](maxEPIter: Int, initFeatureValue: Featur
     val toUse = new ArrayBuffer[Int]()
     var inferences = ArrayBuffer.tabulate(models.length) { i =>
       // hack, for now.
-      if(dropOutFraction > 0 && Rand.uniform.get < dropOutFraction)
+      if (dropOutFraction > 0 && Rand.uniform.get < dropOutFraction)
         null:ProjectableInference[Datum, Augment]
       else {
         toUse += i
@@ -130,7 +127,7 @@ class EPModel[Datum, Augment <: AnyRef](maxEPIter: Int, initFeatureValue: Featur
       }
     }
 
-    if(!inferences.exists(_ ne null)) {
+    if (!inferences.exists(_ ne null)) {
       toUse.clear()
       inferences = ArrayBuffer.tabulate(models.length) { i =>
         toUse += i
@@ -140,8 +137,6 @@ class EPModel[Datum, Augment <: AnyRef](maxEPIter: Int, initFeatureValue: Featur
     if (dropOutFraction != 0.0)
       logger.info("Using inferences for models " + toUse.mkString(", "))
 
-
-
     new EPInference(inferences, maxEPIter, epInGold = epInGold)
   }
diff --git a/src/main/scala/epic/framework/EvaluableModel.scala b/src/main/scala/epic/framework/EvaluableModel.scala
index 7fd2d8c6..dc83f93d 100644
--- a/src/main/scala/epic/framework/EvaluableModel.scala
+++ b/src/main/scala/epic/framework/EvaluableModel.scala
@@ -18,6 +18,6 @@ trait EvaluableModel[Datum] extends Model[Datum] { self =>
     data.par.aggregate(None:Option[EvaluationResult])({(res, datum) =>
       val result = evaluate(inf.annotate(datum, inf.marginal(datum)), datum, logResults)
       Some(res.foldLeft(result)(_ + _))
-    }, {(a,b) => if(a.isEmpty) b else if(b.isEmpty) a else Some(a.get + b.get)}).get
+    }, {(a,b) => if (a.isEmpty) b else if (b.isEmpty) a else Some(a.get + b.get)}).get
   }
 }
diff --git a/src/main/scala/epic/framework/EvaluationResult.scala b/src/main/scala/epic/framework/EvaluationResult.scala
index fc8f1909..0dc02d22 100644
--- a/src/main/scala/epic/framework/EvaluationResult.scala
+++ b/src/main/scala/epic/framework/EvaluationResult.scala
@@ -1,6 +1,5 @@
 package epic.framework
 
-
 /**
 * Marker for the output of an evaluation routine.
 * @tparam R self type
diff --git a/src/main/scala/epic/framework/Example.scala b/src/main/scala/epic/framework/Example.scala
index 25dd0953..bd3367a0 100644
--- a/src/main/scala/epic/framework/Example.scala
+++ b/src/main/scala/epic/framework/Example.scala
@@ -24,7 +24,6 @@ trait Example[+L,+T] extends Observation[T] with Labeled[L] with Serializable {o
     val features = outer.features
   }
 
-
   override def toString = {
     "Example { id =" + id + ", label = " + label + ", features = " + features + "}"
   }
@@ -77,7 +76,7 @@ trait Observation[+T] extends Serializable { outer=>
   }
 
   override def toString = {
-    "Observation { ids =" + id + ", features = " + features + "}";
+    "Observation { ids =" + id + ", features = " + features + "}"
   }
 }
diff --git a/src/main/scala/epic/framework/Inference.scala b/src/main/scala/epic/framework/Inference.scala
index fe323670..51de925d 100644
--- a/src/main/scala/epic/framework/Inference.scala
+++ b/src/main/scala/epic/framework/Inference.scala
@@ -61,8 +61,6 @@ trait Inference[Datum] extends Serializable {
   def forTesting: Inference[Datum] = this
 }
 
-
-
 /**
 * AugmentableInference is an [[epic.framework.Inference]] that can support injecting
 * additional information into the structure computation. This can include
diff --git a/src/main/scala/epic/framework/LossAugmentation.scala b/src/main/scala/epic/framework/LossAugmentation.scala
index f6b8269d..fcfabfda 100644
--- a/src/main/scala/epic/framework/LossAugmentation.scala
+++ b/src/main/scala/epic/framework/LossAugmentation.scala
@@ -8,6 +8,5 @@ package epic.framework
 **/
 trait LossAugmentation[Datum, Augment] extends (Datum=>Augment) {
   def lossAugmentation(datum: Datum):Augment
-
   def apply(datum: Datum): Augment = lossAugmentation(datum)
 }
diff --git a/src/main/scala/epic/framework/Model.scala b/src/main/scala/epic/framework/Model.scala
index c304f883..59c83dfe 100644
--- a/src/main/scala/epic/framework/Model.scala
+++ b/src/main/scala/epic/framework/Model.scala
@@ -21,7 +21,6 @@ import breeze.linalg._
 import breeze.util.Index
 import epic.util.{SafeLogging, WeightsCache}
 
-
 /**
 * A Model represents a class for turning weight vectors into [[epic.framework.Inference]]s.
 * It's main job is to hook up with a [[epic.framework.ModelObjective]] and mediate
@@ -43,7 +42,6 @@ trait Model[Datum] extends SafeLogging { self =>
   def emptyCounts: ExpectedCounts
   def accumulateCounts(inf: Inference, s: Scorer, d: Datum, m: Marginal, accum: ExpectedCounts, scale: Double):Unit
 
-
   final def expectedCounts(inf: Inference, d: Datum, scale: Double = 1.0):ExpectedCounts = {
     val ec = emptyCounts
     accumulateCounts(inf, d, ec, scale)
@@ -82,7 +80,7 @@ trait Model[Datum] extends SafeLogging { self =>
   def readCachedFeatureWeights(suffix:String=""):Option[DenseVector[Double]] = {
     val file = new File(weightsCacheName+suffix+".txt.gz")
     logger.info(s"Reading old weights from $file")
-    if(file.exists) {
+    if (file.exists) {
       Some(WeightsCache.read(file, featureIndex))
     } else {
       None
diff --git a/src/main/scala/epic/framework/ModelFactory.scala b/src/main/scala/epic/framework/ModelFactory.scala
index 79b9818c..ca179c65 100644
--- a/src/main/scala/epic/framework/ModelFactory.scala
+++ b/src/main/scala/epic/framework/ModelFactory.scala
@@ -23,8 +23,4 @@ import breeze.util._
 * Interface for producing Models from training data.
 * @author dlwh
 */
-trait ModelFactory[Datum] {
-
-
-
-}
+trait ModelFactory[Datum]
\ No newline at end of file
diff --git a/src/main/scala/epic/framework/ModelObjective.scala b/src/main/scala/epic/framework/ModelObjective.scala
index a3f4cc68..b6e14f35 100644
--- a/src/main/scala/epic/framework/ModelObjective.scala
+++ b/src/main/scala/epic/framework/ModelObjective.scala
@@ -22,7 +22,7 @@ import epic.trees.TreeInstance
 class ModelObjective[Datum](val model: Model[Datum],
                             batchSelector: IndexedSeq[Int]=>GenTraversable[Datum],
                             val fullRange: IndexedSeq[Int]) extends BatchDiffFunction[DenseVector[Double]] with SafeLogging {
-  def this(model: Model[Datum], data: IndexedSeq[Datum], numThreads: Int = -1) = this(model,ModelObjective.makePar(data, numThreads)(_), 0 until data.length)
+  def this(model: Model[Datum], data: IndexedSeq[Datum], numThreads: Int = -1) = this(model,ModelObjective.makePar(data, numThreads)(_), data.indices)
   import model.{ExpectedCounts => _, _}
@@ -40,7 +40,7 @@ class ModelObjective[Datum](val model: Model[Datum],
       case Some(vector) => vector
       case None => Encoder.fromIndex(featureIndex).tabulateDenseVector(f => model.initialValueForFeature(f))
     }
-    if(randomize) {
+    if (randomize) {
       // Control the seed of the RNG for the weights
       val rng = new scala.util.Random(0)
       v += DenseVector(Array.tabulate(numFeatures)(i => rng.nextDouble * 2.0 * scale - scale))
@@ -51,7 +51,7 @@ class ModelObjective[Datum](val model: Model[Datum],
   var timeSinceLastWrite = 0L
   var nextSave = 5L * 20 * 1000
   def calculate(x: DenseVector[Double], batch: IndexedSeq[Int]) = {
-    if(timeSinceLastWrite > nextSave) {
+    if (timeSinceLastWrite > nextSave) {
       logger.info("Saving feature weights...")
       val timeIn = System.currentTimeMillis()
       model.cacheFeatureWeights(x)
@@ -73,10 +73,10 @@ class ModelObjective[Datum](val model: Model[Datum],
       } catch {
         case e: Exception =>
           e.printStackTrace()
-//          new Exception("While processing " + datum, e).printStackTrace()
+          // new Exception("While processing " + datum, e).printStackTrace()
           _countsSoFar
       }
-    },{ (a,b) => if(a eq null) b else if (b eq null) a else b += a})
+    },{ (a,b) => if (a eq null) b else if (b eq null) a else b += a})
     val timeOut = System.currentTimeMillis()
     timeSinceLastWrite += timeOut - timeIn
     logger.info(f"Inference took: ${(timeOut - timeIn) * 1.0/1000}%.3fs" )
diff --git a/src/main/scala/epic/framework/OneBestInferenceAdaptor.scala b/src/main/scala/epic/framework/OneBestInferenceAdaptor.scala
index b059fd1c..a46dbfd6 100644
--- a/src/main/scala/epic/framework/OneBestInferenceAdaptor.scala
+++ b/src/main/scala/epic/framework/OneBestInferenceAdaptor.scala
@@ -12,13 +12,10 @@ class OneBestInferenceAdaptor[Datum](val inference: AnnotatingInference[Datum])
   type Marginal = inference.Marginal
   type Scorer = inference.Scorer
 
-
-
   def scorer(v: Datum): Scorer = inference.scorer(v)
 
   def goldMarginal(scorer: Scorer, v: Datum): Marginal = inference.goldMarginal(scorer, v)
 
-
   /**
    * Produces the "guess marginal" which is the marginal conditioned on only the input data
    * @param v the example
@@ -29,10 +26,8 @@ class OneBestInferenceAdaptor[Datum](val inference: AnnotatingInference[Datum])
     goldMarginal(scorer, inference.annotate(v, m))
   }
 
-
 }
 
-
 class OneBestModelAdaptor[Datum](val model: Model[Datum] { type Inference <: AnnotatingInference[Datum]}) extends Model[Datum] {
   type ExpectedCounts = model.ExpectedCounts
   type Marginal = model.Marginal
@@ -40,7 +35,6 @@ class OneBestModelAdaptor[Datum](val model: Model[Datum] { type Inference <: Ann
   type Inference = OneBestInferenceAdaptor[Datum] { type Marginal = model.Marginal; type Scorer = model.Scorer}
   def emptyCounts: ExpectedCounts = model.emptyCounts
 
-
   def accumulateCounts(inf: Inference, s: Scorer, d: Datum, m: Marginal, accum: ExpectedCounts, scale: Double) {
     model.accumulateCounts(inf.inference.asInstanceOf[model.Inference], s, d, m, accum, scale)
   }
diff --git a/src/main/scala/epic/framework/StructSVM.scala b/src/main/scala/epic/framework/StructSVM.scala
index 6b7b2896..d8c802f0 100644
--- a/src/main/scala/epic/framework/StructSVM.scala
+++ b/src/main/scala/epic/framework/StructSVM.scala
@@ -25,7 +25,7 @@ class StructSVM[Datum](val model: Model[Datum],
     for(i <- 0 until maxIter if !converged) {
       val newWeights = weights.copy
       for(i <- 0 until numBatches) {
-        val smoTol = if(i < 5) math.pow(10, -(i + 1)) else 1E-6
+        val smoTol = if (i < 5) math.pow(10, -(i + 1)) else 1E-6
         val inf = model.inferenceFromWeights(newWeights)
         val batch = Rand.subsetsOfSize(data, batchSize).draw()
         constraints ++= findNewConstraints(inf, batch)
@@ -88,10 +88,10 @@ class StructSVM[Datum](val model: Model[Datum],
     val newAlphas = Array.newBuilder[Double]
     val newConstraints = new ArrayBuffer[Constraint]()
     for( i <- 0 until alphas.length) {
-      if(alphas(i).abs < 1E-5) constraints(i).age += 1 else constraints(i).age = 0
-      if(constraints(i).age < MAX_CONSTRAINT_AGE) {
+      if (alphas(i).abs < 1E-5) constraints(i).age += 1 else constraints(i).age = 0
+      if (constraints(i).age < MAX_CONSTRAINT_AGE) {
        newConstraints += constraints(i)
        newAlphas += alphas(i)
      }
@@ -107,11 +107,11 @@ class StructSVM[Datum](val model: Model[Datum],
                          alphas: DenseVector[Double],
                          constraints: IndexedSeq[Constraint],
                          smoTol: Double): Unit = {
-    if(alphas.sum < C) {
+    if (alphas.sum < C) {
       alphas += (C-alphas.sum)/alphas.length
     }
     for(i <- 0 until alphas.length) {
-      if(alphas(i) != 0.0) {
+      if (alphas(i) != 0.0) {
         constraints(i).axpy(alphas(i), weights)
       }
     }
@@ -124,11 +124,11 @@ class StructSVM[Datum](val model: Model[Datum],
       val oldA1 = alphas(i)
       val j = perm(i)
       val oldA2 = alphas(j)
-      if( (oldA1 != 0 && oldA2 != 0)) {
+      if ( (oldA1 != 0 && oldA2 != 0)) {
         val con2 = constraints(j)
         var t = ((con1.loss - con2.loss) - ( (con2.dot(weights)) - (con1.dot(weights))))/(con1.ftf + con2.ftf)
         val tt = t
-        if(!t.isNaN && t != 0.0) {
+        if (!t.isNaN && t != 0.0) {
           t = t max (-oldA1)
           val newA1 = (oldA1 + t) min (oldA1 + oldA2)
           val newA2 = (oldA2 - t) max 0
diff --git a/src/main/scala/epic/framework/StructuredPerceptron.scala b/src/main/scala/epic/framework/StructuredPerceptron.scala
index 5166276e..30708e5a 100644
--- a/src/main/scala/epic/framework/StructuredPerceptron.scala
+++ b/src/main/scala/epic/framework/StructuredPerceptron.scala
@@ -45,7 +45,7 @@ class StructuredPerceptron[Datum](model: Model[Datum], maxPasses: Int = 100, bat
       logger.info(f"this instance ${ec.loss}%.2f loss, ${numBad.get}/${batch.size} instances were not right!")
     }
 
-    if(totalCounts.isEmpty)
+    if (totalCounts.isEmpty)
       logger.info(f"this instance everything was fine!")
   }
diff --git a/src/main/scala/epic/inference/ExpectationPropagation.scala b/src/main/scala/epic/inference/ExpectationPropagation.scala
index f475e5fb..457c8b13 100644
--- a/src/main/scala/epic/inference/ExpectationPropagation.scala
+++ b/src/main/scala/epic/inference/ExpectationPropagation.scala
@@ -24,7 +24,7 @@ class ExpectationPropagation[F,Q <: AnyRef](project: (Q,F)=>(Q,Double), criterio
     var consumed = true
     def hasNext = !consumed || {
-      val next = (0 until f.length).iterator.foldLeft(cur) { (state,i) =>
+      val next = f.indices.iterator.foldLeft(cur) { (state,i) =>
         val State(f_~, q, _, partitions) = state
         val fi = f(i)
         val fi_~ = f_~(i)
@@ -40,7 +40,7 @@ class ExpectationPropagation[F,Q <: AnyRef](project: (Q,F)=>(Q,Double), criterio
     }
 
     def next() = {
-      if(consumed) hasNext
+      if (consumed) hasNext
       consumed = true
       cur
     }
@@ -49,8 +49,6 @@ class ExpectationPropagation[F,Q <: AnyRef](project: (Q,F)=>(Q,Double), criterio
     it
   }
 
-
-
 }
 
 object ExpectationPropagation extends App {
@@ -115,7 +113,5 @@ object ExpectationPropagation extends App {
     assert(!state.logPartition.isNaN, state.q.s + " " + state.q.b)
   }
 
-
-
 }
diff --git a/src/main/scala/epic/inference/Factor.scala b/src/main/scala/epic/inference/Factor.scala
index e2d7635d..15172b91 100644
--- a/src/main/scala/epic/inference/Factor.scala
+++ b/src/main/scala/epic/inference/Factor.scala
@@ -1,19 +1,15 @@
 package epic.inference
 
 trait Factor[F] { this: F =>
-
   /** Pointwise multiplication */
   def *(f: F):F
   /** Pointwise division */
   def /(f: F):F
-
   /** May be infinite */
   def logPartition: Double
 
-
-  def isConvergedTo(f: F, diff: Double=1E-4):Boolean
+  def isConvergedTo(f: F, diff: Double=1E-4): Boolean
 }
 
-
 trait ExpFactor[F] extends Factor[F] { this: F =>
   /** Exponentiation */
   def **(f: Double):F
diff --git a/src/main/scala/epic/lexicon/SignatureLexicon.scala b/src/main/scala/epic/lexicon/SignatureLexicon.scala
index b8ff8cb0..17e8abae 100644
--- a/src/main/scala/epic/lexicon/SignatureLexicon.scala
+++ b/src/main/scala/epic/lexicon/SignatureLexicon.scala
@@ -7,8 +7,7 @@ import epic.util.SafeLogging
 * A simple lexicon that thresholds to decide when to open up the rare word to all (open) tags
 */
 @SerialVersionUID(1L)
-class SignatureLexicon[L, W](val labelIndex: Index[L], allowed: Map[W, Set[Int]], signature: W=>W) extends Lexicon[L, W] with Serializable with SafeLogging {
-
+class SignatureLexicon[L, W](val labelIndex: Index[L], allowed: Map[W, Set[Int]], signature: W => W) extends Lexicon[L, W] with Serializable with SafeLogging {
   override def morePermissive: Lexicon[L, W] = {
     new SignatureLexicon(labelIndex, Map.empty[W, Set[Int]].withDefaultValue(allTags), signature)
@@ -28,7 +27,6 @@ class SignatureLexicon[L, W](val labelIndex: Index[L], allowed: Map[W, Set[Int]]
     def length: Int = words.length
   }
 
-
 }
diff --git a/src/main/scala/epic/lexicon/SignatureTagScorer.scala b/src/main/scala/epic/lexicon/SignatureTagScorer.scala
index 650526ef..4345a70a 100644
--- a/src/main/scala/epic/lexicon/SignatureTagScorer.scala
+++ b/src/main/scala/epic/lexicon/SignatureTagScorer.scala
@@ -18,7 +18,6 @@ package epic.lexicon
 import math.log
 import breeze.linalg._
 
-
 /**
 * @param counts
 * @tparam L
@@ -26,9 +25,7 @@ import breeze.linalg._
 class SignatureTagScorer[L, String](counts: Counter2[L, String, Double], signature: String=>String) extends TagScorer[L, String] {
   def anchor(w: IndexedSeq[String]):Anchoring = new Anchoring {
     def words: IndexedSeq[String] = w
-
-    val sigs = w.map(x => if(counts(::, x).valuesIterator.nonEmpty) x else signature(x))
-
+    val sigs = w.map(x => if (counts(::, x).valuesIterator.nonEmpty) x else signature(x))
     def scoreTag(pos: Int, l: L) = {
       counts(l, sigs(pos))
     }
diff --git a/src/main/scala/epic/lexicon/TagScorer.scala b/src/main/scala/epic/lexicon/TagScorer.scala
index 0e186711..89b62d8f 100644
--- a/src/main/scala/epic/lexicon/TagScorer.scala
+++ b/src/main/scala/epic/lexicon/TagScorer.scala
@@ -82,14 +82,14 @@ class SimpleTagScorer[L, W](counts: Counter2[L, W, Double]) extends TagScorer[L,
       var cWord = wordCounts(w)
       var cTagWord = counts(l, w)
       var pTag = labelCounts(l) / totalCount
-      if(pTag == 0.0) {
+      if (pTag == 0.0) {
         pTag = 1.0
       }
 
       assert(cWord >= cTagWord)
-      if(cWord < 10 || cTagWord == 0.0) {
+      if (cWord < 10 || cTagWord == 0.0) {
         cWord += 1.0
         cTagWord += counts(l, ::).size.toDouble / wordCounts.size
-        if(cTagWord == 0.0) {
+        if (cTagWord == 0.0) {
           cTagWord = 1.0
         }
       }
diff --git a/src/main/scala/epic/models/LanguageSpecific.scala b/src/main/scala/epic/models/LanguageSpecific.scala
index e82b54e8..15d66cc6 100644
--- a/src/main/scala/epic/models/LanguageSpecific.scala
+++ b/src/main/scala/epic/models/LanguageSpecific.scala
@@ -1,13 +1,10 @@
 package epic.models
 
-trait LanguageSpecific { this:ModelLoader[_] =>
-
+trait LanguageSpecific { this: ModelLoader[_] =>
   def language: String
 
-
-  def capabilities():Array[String] = Array(s"language:$language")
+  def capabilities(): Array[String] = Array(s"language:$language")
 }
 
-
-trait EnglishModel extends LanguageSpecific { this:ModelLoader[_] =>
+trait EnglishModel extends LanguageSpecific { this: ModelLoader[_] =>
   def language = "en"
 }
\ No newline at end of file
diff --git a/src/main/scala/epic/models/ModelLoader.scala b/src/main/scala/epic/models/ModelLoader.scala
index 5f3754d3..fc9926c7 100644
--- a/src/main/scala/epic/models/ModelLoader.scala
+++ b/src/main/scala/epic/models/ModelLoader.scala
@@ -9,10 +9,8 @@ import java.util.zip.GZIPInputStream
 * @author dlwh
 **/
 trait ModelLoader[+T] { outer =>
-  def load():T
-
-  def capabilities:Array[String]
-
+  def load(): T
+  def capabilities: Array[String]
 }
 
 abstract class ClassPathModelLoader[+T](modelPath: String = "model.ser.gz") extends ModelLoader[T] {
@@ -27,10 +25,8 @@ abstract class ClassPathModelLoader[+T](modelPath: String = "model.ser.gz") exte
   }
 }
 
-
 /* this class exists as a hack to get around limitations in service loader*/
 class DelegatingLoader[+T](outer: ModelLoader[T]) extends ModelLoader[T] {
   def load() = outer.load()
 
   def capabilities = outer.capabilities
-
 }
diff --git a/src/main/scala/epic/models/ModelSelector.scala b/src/main/scala/epic/models/ModelSelector.scala
index f1e78cd2..ac1b4bfe 100644
--- a/src/main/scala/epic/models/ModelSelector.scala
+++ b/src/main/scala/epic/models/ModelSelector.scala
@@ -16,15 +16,14 @@ trait ModelSelector[+T, Loader <: ModelLoader[T]] {
   private lazy val serviceLoader = ServiceLoader.load(manifest.runtimeClass.asInstanceOf[Class[Loader]], classLoader)
 
-  def findModel(features: String*):Option[Loader] = {
-    findModel{x => lazy val a = x.capabilities.toSet; features.forall(a)}
+  def findModel(features: String*): Option[Loader] = {
+    findModel{ x => lazy val a = x.capabilities.toSet; features.forall(a) }
   }
 
-  def findModel(filter: Loader=>Boolean) = serviceLoader.synchronized {
+  def findModel(filter: Loader => Boolean) = serviceLoader.synchronized {
     serviceLoader.asScala.find(filter)
   }
 
-
 }
diff --git a/src/main/scala/epic/models/NerModelLoader.scala b/src/main/scala/epic/models/NerModelLoader.scala
index 8fd052f3..a7e03f5c 100644
--- a/src/main/scala/epic/models/NerModelLoader.scala
+++ b/src/main/scala/epic/models/NerModelLoader.scala
@@ -3,12 +3,10 @@ package epic.models
 import scala.reflect.ClassTag
 import epic.sequences.SemiCRF
 
-
 trait NerModelLoader extends ModelLoader[SemiCRF[Any, String]]
 
 object NerSelector extends ModelSelector[SemiCRF[Any, String], NerModelLoader] {
   override protected def manifest: ClassTag[NerModelLoader] = scala.reflect.classTag[NerModelLoader]
-
   def loadNer(language: String = "en"): Option[SemiCRF[Any, String]] = this.findModel(s"language:$language").map(_.load())
 }
diff --git a/src/main/scala/epic/models/ParserSelector.scala b/src/main/scala/epic/models/ParserSelector.scala
index b428ea74..9ce8d5cb 100644
--- a/src/main/scala/epic/models/ParserSelector.scala
+++ b/src/main/scala/epic/models/ParserSelector.scala
@@ -11,8 +11,7 @@ import scala.reflect.ClassTag
 **/
 object ParserSelector extends ModelSelector[Parser[AnnotatedLabel, String], ParserModelLoader] {
   override protected def manifest: ClassTag[ParserModelLoader] = scala.reflect.classTag[ParserModelLoader]
-
   def loadParser(language: String = "en"): Option[Parser[AnnotatedLabel, String]] = this.findModel(s"language:$language").map(_.load())
 }
 
-trait ParserModelLoader extends ModelLoader[Parser[AnnotatedLabel,String]];
+trait ParserModelLoader extends ModelLoader[Parser[AnnotatedLabel,String]]
diff --git a/src/main/scala/epic/models/PosTagModelLoader.scala b/src/main/scala/epic/models/PosTagModelLoader.scala
index 838a93de..8d8edd07 100644
--- a/src/main/scala/epic/models/PosTagModelLoader.scala
+++ b/src/main/scala/epic/models/PosTagModelLoader.scala
@@ -9,7 +9,6 @@ trait PosTagModelLoader extends ModelLoader[CRF[AnnotatedLabel, String]]
 object PosTagSelector extends ModelSelector[CRF[AnnotatedLabel, String], PosTagModelLoader] {
   override protected def manifest: ClassTag[PosTagModelLoader] = scala.reflect.classTag[PosTagModelLoader]
-
   def loadTagger(language: String = "en"): Option[CRF[AnnotatedLabel, String]] = this.findModel(s"language:$language").map(_.load())
 }
diff --git a/src/main/scala/epic/models/package.scala b/src/main/scala/epic/models/package.scala
index ab8716a6..6ba8b7af 100644
--- a/src/main/scala/epic/models/package.scala
+++ b/src/main/scala/epic/models/package.scala
@@ -12,7 +12,7 @@ import scala.util.{Success, Try}
 **/
 package object models {
 
-  def deserialize[T](model: String):T = deserialize[T](model, new File(System.getProperty("user.dir")))
+  def deserialize[T](model: String): T = deserialize[T](model, new File(System.getProperty("user.dir")))
 
   def readFromJar[T](model: String, file: File): T = {
     val zip = new ZipFile(file)
@@ -20,9 +20,7 @@ package object models {
       case e if e.getName == model || e.getName.endsWith("model.ser.gz") =>
         breeze.util.nonstupidObjectInputStream(new GZIPInputStream(zip.getInputStream(e))).readObject().asInstanceOf[T]
     }
-
     obj.getOrElse(throw new RuntimeException(s"Could not find model $model in jar $file"))
-
   }
 
   /**
@@ -33,10 +31,10 @@ package object models {
   * @tparam T
   * @return
   */
-  def deserialize[T](model: String, path: File):T = {
-    if(!path.exists()) {
+  def deserialize[T](model: String, path: File): T = {
+    if (!path.exists()) {
       throw new FileNotFoundException(path.toString)
-    } else if(!path.isDirectory) {
+    } else if (!path.isDirectory) {
       try {
         readFromJar(model, path)
       } catch {
@@ -60,7 +58,6 @@ package object models {
           case ex: Exception =>
             throw new RuntimeException(s"Could not find model $model in path $path", ex)
         }
-
       }
       case None =>
         // look for jar files, try to read from there
@@ -71,12 +68,8 @@ package object models {
         }.collectFirst { case Success(r) => r }.getOrElse {
           throw new RuntimeException(s"Could not find model $model in path $path")
         }
-
     }
-
   }
-
 
 }
diff --git a/src/main/scala/epic/ontonotes/ConllOntoReader.scala b/src/main/scala/epic/ontonotes/ConllOntoReader.scala
index 21242c5f..392005d3 100644
--- a/src/main/scala/epic/ontonotes/ConllOntoReader.scala
+++ b/src/main/scala/epic/ontonotes/ConllOntoReader.scala
@@ -31,21 +31,21 @@ import scala.io.Source
 
 object ConllOntoReader {
 
-  def readDocuments(file: File):IndexedSeq[Document] = try {
+  def readDocuments(file: File): IndexedSeq[Document] = try {
     val docIterator = new RawDocumentIterator(Source.fromFile(file).getLines())
-    for ( (rawSentences_ :IndexedSeq[IndexedSeq[String]], docIndex: Int) <- docIterator.zipWithIndex.toIndexedSeq) yield {
+    for ((rawSentences_ :IndexedSeq[IndexedSeq[String]], docIndex: Int) <- docIterator.zipWithIndex.toIndexedSeq) yield {
       val rawSentences = rawSentences_.collect { case seq if seq.nonEmpty =>
         seq.map(_.split("\\s+").toIndexedSeq)
       }
 
-      val sentences = for( (s,sentenceIndex) <- rawSentences.zipWithIndex) yield {
+      val sentences = for( (s,sentenceIndex) <- rawSentences.zipWithIndex) yield {
         val words = s.map(_(3))
         val tags = s.map(_(4))
 
         val stringTree = {
           val parseBits = s.map(_(5))
           val b = new StringBuilder()
-          for(i <- 0 until parseBits.length) {
+          parseBits.indices.foreach { i =>
             b ++= parseBits(i).replace("*","( "+ tags(i) + " " + words(i) + " )")
           }
           Tree.fromString(b.toString)._1
@@ -54,15 +54,15 @@ object ConllOntoReader {
         val entities = collection.mutable.Map[(Int,Int), NerType.Value]()
         var currentChunkStart = -1
         var currentChunkType = NerType.OutsideSentence
-        for(i <- 0 until s.length) {
+        s.indices.foreach { i =>
           val chunk = s(i)(10)
-          if(chunk.startsWith("(")) {
+          if (chunk.startsWith("(")) {
             assert(currentChunkStart < 0)
             currentChunkStart = i
             currentChunkType = NerType.fromString(chunk.replaceAll("[()*]",""))
           }
-          if(chunk.endsWith(")")) {
+          if (chunk.endsWith(")")) {
             assert(currentChunkStart >= 0)
             entities += ((currentChunkStart -> (i+1)) -> currentChunkType)
             currentChunkStart = -1
@@ -76,18 +76,17 @@ object ConllOntoReader {
         val lastValue = collection.mutable.Stack[(String, Int)]()
         val arguments = ArrayBuffer[Argument]()
         var verb = -1
-        for (i <- 0 until s.length) {
+        s.indices.foreach { i =>
           if (s(i)(column).startsWith("(")) {
             val trimmed = s(i)(column).substring(1, s(i)(column).lastIndexOf("*"))
             for(name <- trimmed.split("[(]")) lastValue.push(name.trim -> i)
           }
-
           if (s(i)(column).endsWith(")")) {
             for(close <- 0 until s(i)(column).count(_ == ')')) {
               assert(lastValue.nonEmpty, s.map(_(column)).mkString(",") + " " + i)
               val (name, start) = lastValue.pop()
-              if(name == "V") {
+              if (name == "V") {
                 assert(start == i)
                 verb = i
               } else {
@@ -95,7 +94,6 @@ object ConllOntoReader {
               }
             }
           }
-
         }
 
         assert(verb != -1, s.map(_(column)).mkString(",") )
@@ -109,15 +107,15 @@ object ConllOntoReader {
         val stack = new collection.mutable.HashMap[Int, Stack[Int]]() {
           override def default(key: Int) = getOrElseUpdate(key,new Stack())
         }
-        for(i <- 0 until s.length) {
+        s.indices.foreach { i =>
           val chunk = s(i).last
-          if(chunk != "-")
+          if (chunk != "-")
             for( id <- chunk.split("\\|")) {
               val tid = id.replaceAll("[()*]","").toInt
-              if(id.startsWith("(")) {
+              if (id.startsWith("(")) {
                 stack(tid).push(i)
               }
-              if(id.endsWith(")")) {
+              if (id.endsWith(")")) {
                 val start = stack(tid).pop()
                 mentions(start -> (i+1)) = mention(tid)
               }
@@ -131,16 +129,12 @@ object ConllOntoReader {
         val speaker = s.map(_(9)).find(_ != "-")
 
         val annotations = OntoAnnotations(tree, ner, coref, srl, speaker)
-
-
-
         Sentence(docId, sentenceIndex,words, annotations)
       }
 
       Document(s"${file.toString}-$docIndex",sentences.toIndexedSeq)
     }
-
   } catch {
     case ex: MalformedInputException =>
       throw new RuntimeException("Error while processing " + file, ex)
@@ -148,28 +142,28 @@ object ConllOntoReader {
 
   private val mentionCache = Array.tabulate(100)(i => Mention(i))
 
-  private def mention(id: Int) = if(id < mentionCache.length) mentionCache(id) else Mention(id)
+  private def mention(id: Int) = if (id < mentionCache.length) mentionCache(id) else Mention(id)
 
   private class RawDocumentIterator(it: Iterator[String]) extends Iterator[IndexedSeq[IndexedSeq[String]]] {
     def hasNext = it.hasNext
 
-    def next():IndexedSeq[IndexedSeq[String]] = {
+    def next(): IndexedSeq[IndexedSeq[String]] = {
      var doneOuter = false
      val outBuf = new ArrayBuffer[IndexedSeq[String]]
-      while(it.hasNext && !doneOuter) {
+      while (it.hasNext && !doneOuter) {
        val buf = new ArrayBuffer[String]
        var done = false
        var seenSomethingNotBlank = false
-        while(it.hasNext && !done) {
+        while (it.hasNext && !done) {
          val next = it.next()
-          if(next.startsWith("#begin")) {
+          if (next.startsWith("#begin")) {
            // pass
-          } else if(next.startsWith("#end")) {
+          } else if (next.startsWith("#end")) {
            doneOuter = true
-          } else if(next.trim != "") {
+          } else if (next.trim != "") {
            seenSomethingNotBlank = true
            buf += next.trim
-          } else if(seenSomethingNotBlank) {
+          } else if (seenSomethingNotBlank) {
            done = true
          }
        }
diff --git a/src/main/scala/epic/ontonotes/DSpan.scala b/src/main/scala/epic/ontonotes/DSpan.scala
index c50ff74c..f5c6a08e 100644
--- a/src/main/scala/epic/ontonotes/DSpan.scala
+++ b/src/main/scala/epic/ontonotes/DSpan.scala
@@ -15,8 +15,8 @@ case class DSpan(doc: String, sentence: Int, begin: Int, end: Int) {
   * @param doc
   * @return
   */
-  def render(doc: Document):String = render(doc.sentences.map(_.words))
-  def render(doc: IndexedSeq[IndexedSeq[String]]):String = getYield(doc).mkString("[",", ", "]")
+  def render(doc: Document): String = render(doc.sentences.map(_.words))
+  def render(doc: IndexedSeq[IndexedSeq[String]]): String = getYield(doc).mkString("[",", ", "]")
 
   /**
   * Gets the words associated with this document.
@@ -34,9 +34,9 @@ object DSpan {
     def compare(x: DSpan, y: DSpan): Int = {
       x.doc.compare(y.doc) match {
         case 0 =>
-          if(x.sentence < y.sentence) -1
-          else if(x.sentence > y.sentence) 1
-          else if(x.begin < y.begin) -1
+          if (x.sentence < y.sentence) -1
+          else if (x.sentence > y.sentence) 1
+          else if (x.begin < y.begin) -1
           else if (x.begin > y.begin) 1
           else x.end - y.end
         case z => z
@@ -53,14 +53,13 @@ case class DPos(doc: String, sentence: Int, pos: Int) {
   def asDSpan = DSpan(doc, sentence, pos, pos + 1)
 }
 
-
 object DPos {
   implicit val ordering: Ordering[DPos] = new Ordering[DPos] {
     def compare(x: DPos, y: DPos): Int = {
       x.doc.compare(y.doc) match {
         case 0 =>
-          if(x.sentence < y.sentence) -1
-          else if(x.sentence > y.sentence) 1
+          if (x.sentence < y.sentence) -1
+          else if (x.sentence > y.sentence) 1
           else x.pos - y.pos
         case z => z
       }
diff --git a/src/main/scala/epic/ontonotes/Document.scala b/src/main/scala/epic/ontonotes/Document.scala
index e9b57408..520a2a66 100644
--- a/src/main/scala/epic/ontonotes/Document.scala
+++ b/src/main/scala/epic/ontonotes/Document.scala
@@ -25,17 +25,11 @@ import epic.trees.{AnnotatedLabel, Tree}
 */
 case class Document(id: String, sentences: IndexedSeq[Sentence]) extends Example[IndexedSeq[OntoAnnotations], IndexedSeq[IndexedSeq[String]]] {
   def dspans = sentences.flatMap(_.dspans)
-
   def words: IndexedSeq[IndexedSeq[String]] = sentences.map(_.words)
-
   def features = words
-
   lazy val label: IndexedSeq[OntoAnnotations] = sentences.map(_.label)
-
   lazy val trees: IndexedSeq[Tree[AnnotatedLabel]] = sentences.map(_.tree)
-
   lazy val ner: Map[DSpan, NerType.Value] = sentences.map(_.ner).reduceLeft(_ ++ _)
-
   lazy val coref: Map[DSpan, Mention] = sentences.map(_.coref).reduceLeft(_ ++ _)
 }
diff --git a/src/main/scala/epic/ontonotes/Sentence.scala b/src/main/scala/epic/ontonotes/Sentence.scala
index c5426127..022a685b 100644
--- a/src/main/scala/epic/ontonotes/Sentence.scala
+++ b/src/main/scala/epic/ontonotes/Sentence.scala
@@ -83,8 +83,8 @@ case class Frame(lemma: String, pos: Int, sense: Int, args: IndexedSeq[Argument]
     val newArgs = mutable.Stack[Argument]()
     val sorted = args.sortBy(a => (a.span.begin, -a.span.length))(Ordering.Tuple2)
     for(arg <- sorted) {
-      if(newArgs.isEmpty || !newArgs.top.span.contains(arg.span)) { // don't overlap at all
-        while(newArgs.nonEmpty && arg.span.contains(newArgs.top.span)) {
+      if (newArgs.isEmpty || !newArgs.top.span.contains(arg.span)) { // don't overlap at all
+        while (newArgs.nonEmpty && arg.span.contains(newArgs.top.span)) {
           newArgs.pop()
         }
         assert(newArgs.isEmpty || !arg.span.crosses(newArgs.top.span))
@@ -102,14 +102,14 @@ case class Frame(lemma: String, pos: Int, sense: Int, args: IndexedSeq[Argument]
     var last = 0
     for( arg <- sorted ) {
       assert(last <= arg.span.begin)
-      while(arg.span.begin != last) {
+      while (arg.span.begin != last) {
         out += (Some(outside) -> Span(last,last+1))
         last += 1
       }
       out += (Some(arg.arg) -> Span(arg.span.begin, arg.span.end))
       last = arg.span.end
     }
-    while(words.length != last) {
+    while (words.length != last) {
       out += (Some(outside) -> Span(last,last+1))
       last += 1
     }
diff --git a/src/main/scala/epic/package.scala b/src/main/scala/epic/package.scala
index 5b5cb7b5..0799b3af 100644
--- a/src/main/scala/epic/package.scala
+++ b/src/main/scala/epic/package.scala
@@ -9,7 +9,6 @@ import scala.collection.mutable
 */
 package object epic {
 
-
   implicit class AwesomeBitSet(val bs: java.util.BitSet) extends AnyVal {
     def apply(r: Int) = bs.get(r)
 
@@ -27,7 +26,7 @@ package object epic {
 
     def foreach[U](f: Int=>U) {
       var i = bs.nextSetBit(0)
-      while(i != -1) {
+      while (i != -1) {
         f(i)
         i = bs.nextSetBit(i+1)
       }
@@ -115,4 +114,15 @@ package object epic {
     }
   }
 
+  implicit class AwesomeArrayOps(val xs: Array.type) {
+    def fillWith[T : Manifest](size: Int)(f: Int => T): Array[T] = {
+      var i = 0
+      Array.fill[T](size) {
+        val x = f(i)
+        i += 1
+        x
+      }
+    }
+  }
+
 }
diff --git a/src/main/scala/epic/parser/ChartDecoder.scala b/src/main/scala/epic/parser/ChartDecoder.scala
index 7be32eb7..d6ca6f92 100644
--- a/src/main/scala/epic/parser/ChartDecoder.scala
+++ b/src/main/scala/epic/parser/ChartDecoder.scala
@@ -76,7 +76,7 @@ case class ViterbiDecoder[L, W]() extends ChartDecoder[L, W] with Serializable w
         val b = topology.child(r)
         val refB = refined.childRefinement(r, refR)
         val score = ruleScore + insideBotScore(begin, end, b, refB)
-        if(score > maxScore) {
+        if (score > maxScore) {
           maxScore = score
           maxChild = b
           maxChildRef = refB
@@ -84,7 +84,7 @@ case class ViterbiDecoder[L, W]() extends ChartDecoder[L, W] with Serializable w
         }
       }
 
-      if(maxScore == Double.NegativeInfinity) {
+      if (maxScore == Double.NegativeInfinity) {
        throw new ParseExtractionException(s"Couldn't find a tree! [$begin,$end) ${topology.labelIndex.get(root)}", words)
      }
 
@@ -101,7 +101,7 @@ case class ViterbiDecoder[L, W]() extends ChartDecoder[L, W] with Serializable w
       var maxSplit = -1
       var maxRule = -1
 
-      if(begin + 1 == end) {
+      if (begin + 1 == end) {
         return NullaryTree(labelIndex.get(root) -> rootRef, Span(begin, end))
       }
 
@@ -122,7 +122,7 @@ case class ViterbiDecoder[L, W]() extends ChartDecoder[L, W] with Serializable w
           + marginal.insideTopScore(split, end, c, refC)
           + spanScore
         )
-        if(score > maxScore) {
+        if (score > maxScore) {
           maxScore = score
           maxLeft = b
           maxLeftRef = refB
@@ -133,15 +133,13 @@ case class ViterbiDecoder[L, W]() extends ChartDecoder[L, W] with Serializable w
         }
       }
 
-      if(maxScore == Double.NegativeInfinity) {
+      if (maxScore == Double.NegativeInfinity) {
         throw new ParseExtractionException(s"Couldn't find a tree! [$begin,$end) ${topology.labelIndex.get(root)}\n", marginal.words)
       } else {
         val lchild = buildTreeUnary(begin, maxSplit, maxLeft, maxLeftRef)
         val rchild = buildTreeUnary(maxSplit, end, maxRight, maxRightRef)
         BinaryTree(labelIndex.get(root) -> rootRef, lchild, rchild, Span(begin, end))
       }
-
-
     }
 
     val maxRootRef = refined.validLabelRefinements(0, length, rootIndex).maxBy(ref => insideTopScore(0, length, rootIndex, ref))
@@ -187,12 +185,9 @@ case class MaxVariationalDecoder[L, W]() extends ProjectingChartDecoder[L, W](ne
 class MaxConstituentDecoder[L, W] extends ChartDecoder[L, W] {
 
   def extractBestParse(marginal: ParseMarginal[L, W]): BinarizedTree[L] = {
-
-
     val length = marginal.length
 
     import marginal.topology
 
-
     val spanMarginals = new AnchoredSpanProjector().projectSpanPosteriors(marginal)
 
     val maxSplit = TriangularArray.fill[Int](length+1)(0)
     val maxBotLabel = TriangularArray.fill[Int](length+1)(-1)
@@ -202,7 +197,6 @@ class MaxConstituentDecoder[L, W] extends ChartDecoder[L, W] {
 
     val numLabels = topology.labelIndex.size
 
-
     for {
       span <- 1 to length
       begin <- 0 to (length - span)
@@ -214,7 +208,7 @@ class MaxConstituentDecoder[L, W] extends ChartDecoder[L, W] {
       maxTopLabel(begin, end) = argmax(spanMarginals.topType(begin, end).slice(0, numLabels))
       maxTopScore(begin, end) = spanMarginals.botType(begin, end)(maxBotLabel(begin, end)) + maxBotScore(begin, end)
 
-      if(end - begin > 1) {
+      if (end - begin > 1) {
         val (split, splitScore) = (for (split <- begin + 1 until end) yield {
           val score = maxTopScore(begin, split) + maxTopScore(split, end)
           (split, score)
@@ -260,9 +254,9 @@ class MaxConstituentDecoder[L, W] extends ChartDecoder[L, W] {
     def extract(begin: Int, end: Int):BinarizedTree[L] = {
       val bestBot = maxBotLabel(begin, end)
-      val lower = if(begin + 1== end) {
-//        if(maxBotScore(begin, end) == Double.NegativeInfinity)
-//          throw new RuntimeException(s"Couldn't make a good score for ${(begin, end)}. InsideIndices: ${inside.bot.enteredLabelIndexes(begin, end).toIndexedSeq}\noutside: ${outside.bot.enteredLabelIndexes(begin, end).toIndexedSeq} logPartition: $logPartition")
+      val lower = if (begin + 1== end) {
+        // if (maxBotScore(begin, end) == Double.NegativeInfinity)
+        //   throw new RuntimeException(s"Couldn't make a good score for ${(begin, end)}. InsideIndices: ${inside.bot.enteredLabelIndexes(begin, end).toIndexedSeq}\noutside: ${outside.bot.enteredLabelIndexes(begin, end).toIndexedSeq} logPartition: $logPartition")
         NullaryTree(topology.labelIndex.get(bestBot), Span(begin, end))
       } else {
         val split = maxSplit(begin, end)
diff --git a/src/main/scala/epic/parser/GenerativeParser.scala b/src/main/scala/epic/parser/GenerativeParser.scala
index 7abede9c..477c94a9 100644
--- a/src/main/scala/epic/parser/GenerativeParser.scala
+++ b/src/main/scala/epic/parser/GenerativeParser.scala
@@ -173,7 +173,7 @@ object GenerativeTrainer extends ParserPipeline {
     val refinedGrammar = Grammar.generative(xbar, xbarLexicon, indexedRefinements, binaryCounts, initUnaries, scorer)
-    if(params.grammarDumpPath != null) {
+    if (params.grammarDumpPath != null) {
       val out = new BufferedWriter(new FileWriter(params.grammarDumpPath))
       refinedGrammar.prettyPrint(out)
       out.close()
diff --git a/src/main/scala/epic/parser/Grammar.scala b/src/main/scala/epic/parser/Grammar.scala
index 0a9896bb..754548d7 100644
--- a/src/main/scala/epic/parser/Grammar.scala
+++ b/src/main/scala/epic/parser/Grammar.scala
@@ -48,7 +48,6 @@ object Grammar {
       def topology = f1.topology
       def lexicon = f1.lexicon
 
-
       override def withPermissiveLexicon: Grammar[L, W] = product(f1.withPermissiveLexicon, f2.withPermissiveLexicon)
 
       def anchor(words: IndexedSeq[W],
@@ -63,7 +62,6 @@ object Grammar {
 
       def lexicon = l
 
-
       override def withPermissiveLexicon: Grammar[L, W] = identity(ruleTopology, l.morePermissive)
 
       override def anchor(words: IndexedSeq[W], constraints: ChartConstraints[L]): GrammarAnchoring[L, W] = {
diff --git a/src/main/scala/epic/parser/GrammarAnchoring.scala b/src/main/scala/epic/parser/GrammarAnchoring.scala
index c7e0be62..cff71edc 100644
--- a/src/main/scala/epic/parser/GrammarAnchoring.scala
+++ b/src/main/scala/epic/parser/GrammarAnchoring.scala
@@ -37,7 +37,6 @@ trait GrammarAnchoring[L, W] {
 
   def logPartition: Double = marginal.logPartition
 
-
   private lazy val lexLoc = lexicon.anchor(words)
 
   def tagConstraints: TagConstraints[L] = lexLoc
@@ -85,8 +84,6 @@ trait GrammarAnchoring[L, W] {
     else new ProductGrammarAnchoring(this,other)
   }
 
-
-
   /**
    * Computes the pointwise division of two grammars, augmenting
    * their refinement space to reflect this. If they share the same annotationTag,
@@ -104,7 +101,7 @@ trait GrammarAnchoring[L, W] {
   def maxMarginal = RefinedChartMarginal(this, maxMarginal = true)
 
   def marginal = RefinedChartMarginal(this, maxMarginal = false)
 
-  def isConvergedTo(f: GrammarAnchoring[L, W], diff: Double):Boolean = {
+  def isConvergedTo(f: GrammarAnchoring[L, W], diff: Double): Boolean = {
     import scala.util.control.Breaks._
     var converged = true
     breakable {
@@ -112,9 +109,6 @@ trait GrammarAnchoring[L, W] {
       def visitBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int, score: Double) {
         val myScore = scoreBinaryRule(begin, split, end, rule, ref)
         val theirScore = f.scoreBinaryRule(begin, split, end, rule, ref)
-
-
-
         if (myScore != theirScore) {
           if (theirScore.isInfinite || myScore.isInfinite) {
             converged = false
@@ -126,15 +120,12 @@ trait GrammarAnchoring[L, W] {
             break()
           }
         }
-
       }
 
       def visitUnaryRule(begin: Int, end: Int, rule: Int, ref: Int, score: Double) {
         val myScore = scoreUnaryRule(begin, end, rule, ref)
         val theirScore = f.scoreUnaryRule(begin, end, rule, ref)
         assert(!myScore.isInfinite)
-
-
         if (myScore != theirScore) {
           if (theirScore.isInfinite || myScore.isInfinite) {
             converged = false
@@ -146,14 +137,11 @@ trait GrammarAnchoring[L, W] {
             break()
           }
         }
-
       }
 
       def visitSpan(begin: Int, end: Int, tag: Int, ref: Int, score: Double) = {
         val myScore = scoreSpan(begin, end, tag, ref)
         val theirScore = f.scoreSpan(begin, end, tag, ref)
-
-
         if (myScore != theirScore) {
           if (theirScore.isInfinite || myScore.isInfinite) {
             converged = true
@@ -165,16 +153,14 @@ trait GrammarAnchoring[L, W] {
             break()
           }
         }
-
       }
     }
 
-//    println(converged)
+    // println(converged)
 
     converged
   }
 
-
   /**
    * The annotationTag controls if two grammars are over the same refinements.
    * If they are, then * and / can be much faster.
@@ -200,11 +186,9 @@ trait GrammarAnchoring[L, W] {
   def maxLabelRefinements: Int = (0 until topology.labelIndex.size).map(numValidRefinements _).max
 
-  def numValidRefinements(label: Int):Int
-
-  def numValidRuleRefinements(rule: Int):Int
-
+  def numValidRefinements(label: Int): Int
+  def numValidRuleRefinements(rule: Int): Int
 
   /**
    * For a given span and the parent's refinement, what refinements to the rule are allowed?
@@ -221,10 +205,10 @@ trait GrammarAnchoring[L, W] {
   def validRuleRefinementsGivenRightChild(completionBegin: Int, completionEnd: Int, split: Int, end: Int, rule: Int, childRef: Int):Array[Int]
 
   def validUnaryRuleRefinementsGivenChild(begin: Int, end: Int, rule: Int, childRef: Int):Array[Int]
 
-  def leftChildRefinement(rule: Int, ruleRef: Int):Int
-  def rightChildRefinement(rule: Int, ruleRef: Int):Int
-  def parentRefinement(rule: Int, ruleRef: Int):Int
-  def childRefinement(rule: Int, ruleRef: Int):Int
+  def leftChildRefinement(rule: Int, ruleRef: Int): Int
+  def rightChildRefinement(rule: Int, ruleRef: Int): Int
+  def parentRefinement(rule: Int, ruleRef: Int): Int
+  def childRefinement(rule: Int, ruleRef: Int): Int
 
   /**
    * Returns the refined rule given parent and child refinements for a unary rule.
@@ -234,7 +218,7 @@ trait GrammarAnchoring[L, W] {
   * @param refB child index
   * @return rule refinement id, or -1 if rule is not allowed with those refinements
   */
-  def ruleRefinementFromRefinements(r: Int, refA: Int, refB: Int):Int
+  def ruleRefinementFromRefinements(r: Int, refA: Int, refB: Int): Int
 
   /**
    * Returns the refined rule given parent and child refinements for a unary rule.
@@ -245,7 +229,7 @@ trait GrammarAnchoring[L, W] {
   * @param refC right child index
   * @return rule refinement id, or -1 if rule is not allowed with those refinements
   */
-  def ruleRefinementFromRefinements(r: Int, refA: Int, refB: Int, refC: Int):Int
+  def ruleRefinementFromRefinements(r: Int, refA: Int, refB: Int, refC: Int): Int
 
   def validCoarseRulesGivenParentRefinement(a: Int, refA: Int): Array[Int]
 
@@ -262,8 +246,6 @@ object GrammarAnchoring {
     UnrefinedGrammarAnchoring.identity[L, W](topology, lexicon, words, constraints)
   }
 
-
-
   trait StructureDelegatingAnchoring[L, W] extends GrammarAnchoring[L, W] {
     protected def baseAnchoring: GrammarAnchoring[L, W]
 
@@ -272,11 +254,11 @@ object GrammarAnchoring {
 
     def words: IndexedSeq[W] = baseAnchoring.words
 
-//    def scoreSpan(begin: Int, end: Int, label: Int, ref: Int): Double = baseAnchoring.scoreSpan(begin: Int, end: Int, label: Int, ref: Int)
+    // def scoreSpan(begin: Int, end: Int, label: Int, ref: Int): Double = baseAnchoring.scoreSpan(begin: Int, end: Int, label: Int, ref: Int)
 
-//    def scoreBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int): Double = baseAnchoring.scoreBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int)
+    // def scoreBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int): Double = baseAnchoring.scoreBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int)
 
-//    def scoreUnaryRule(begin: Int, end: Int, rule: Int, ref: Int): Double = baseAnchoring.scoreUnaryRule(begin: Int, end: Int, rule: Int, ref: Int)
+    // def scoreUnaryRule(begin: Int, end: Int, rule: Int, ref: Int): Double = baseAnchoring.scoreUnaryRule(begin: Int, end: Int, rule: Int, ref: Int)
 
     def validLabelRefinements(begin: Int, end: Int, label: Int): Array[Int] = baseAnchoring.validLabelRefinements(begin: Int, end: Int, label: Int)
 
@@ -300,7 +282,6 @@ object GrammarAnchoring {
 
     def childRefinement(rule: Int, ruleRef: Int): Int = baseAnchoring.childRefinement(rule: Int, ruleRef: Int)
 
-
     def ruleRefinementFromRefinements(r: Int, refA: Int, refB: Int): Int = baseAnchoring.ruleRefinementFromRefinements(r: Int, refA: Int, refB: Int)
 
     def ruleRefinementFromRefinements(r: Int, refA: Int, refB: Int, refC: Int): Int = baseAnchoring.ruleRefinementFromRefinements(r: Int, refA: Int, refB: Int, refC: Int)
diff --git a/src/main/scala/epic/parser/LatentTreeMarginal.scala b/src/main/scala/epic/parser/LatentTreeMarginal.scala
index 03c7864e..7e0b0a1c 100644
--- a/src/main/scala/epic/parser/LatentTreeMarginal.scala
+++ b/src/main/scala/epic/parser/LatentTreeMarginal.scala
@@ -35,7 +35,6 @@ case class LatentTreeMarginal[L, W](anchoring: GrammarAnchoring[L, W],
   private val stree = insideScores()
   outsideScores(stree)
 
-
   def isMaxMarginal: Boolean = false
 
   private val z = stree.label.inside.sum
@@ -49,7 +48,7 @@ case class LatentTreeMarginal[L, W](anchoring: GrammarAnchoring[L, W],
     stree.postorder foreach {
       case t@NullaryTree(Beliefs(labels, iScores, iScale, oScores, oScale), span) =>
-        for( i <- 0 until labels.length) {
+        for( i <- labels.indices) {
           val (l, ref) = labels(i)
           val iS = iScores(i)
           val oS = oScores(i)
@@ -60,18 +59,18 @@ case class LatentTreeMarginal[L, W](anchoring: GrammarAnchoring[L, W],
         }
       case t@UnaryTree(Beliefs(aLabels, _, _, aScores, aScale), Tree(Beliefs(cLabels, cScores,cScale, _, _), _, _), chain, span) =>
         var pi = 0
-        while(pi < aLabels.size) {
+        while (pi < aLabels.size) {
           val (a, aRef) = aLabels(pi)
           val opScore = aScores(pi)
           pi += 1
           var ci = 0
-          while(ci < cLabels.size) {
+          while (ci < cLabels.size) {
             val (c, cRef) = cLabels(ci)
             val icScore = cScores(ci)
             ci += 1
             val rule = topology.index(UnaryRule(topology.labelIndex.get(a), topology.labelIndex.get(c), chain))
             val ruleRef = anchoring.ruleRefinementFromRefinements(rule, aRef, cRef)
-            if(ruleRef != -1 ) {
+            if (ruleRef != -1 ) {
               val rs = math.exp(anchoring.scoreUnaryRule(t.span.begin, t.span.end, rule, ruleRef)) // exp!
               val ruleScore = Scaling.unscaleValue(opScore / z * rs * icScore, aScale + cScale - rootScale)
               assert(!ruleScore.isNaN)
@@ -101,7 +100,6 @@ case class LatentTreeMarginal[L, W](anchoring: GrammarAnchoring[L, W],
       }
     }
 
-
   // private stuff to do the computation
   private def insideScores() = {
@@ -113,7 +111,7 @@ case class LatentTreeMarginal[L, W](anchoring: GrammarAnchoring[L, W],
         assert(t.span.length == 1)
         var foundOne = false
         for {
-          i <- 0 until scores.length
+          i <- scores.indices
           (label, ref) = labels(i)
           wScore = anchoring.scoreSpan(t.span.begin, t.span.end, label, ref)
           if !wScore.isInfinite
@@ -123,46 +121,44 @@ case class LatentTreeMarginal[L, W](anchoring: GrammarAnchoring[L, W],
           assert(!wScore.isNaN)
           foundOne = true
         }
-        if(!foundOne) {
+        if (!foundOne) {
           sys.error(s"Trouble with lexical $words(t.span.begin)")
         }
         t.label.scaleInside(0)
       case t@UnaryTree(Beliefs(aLabels, aScores, _, _, _), Tree(Beliefs(cLabels, cScores, cScale, _, _), _, _), chain, span) =>
         var foundOne = false
         var ai = 0
-        while(ai < aLabels.length) {
+        while (ai < aLabels.length) {
           val (a, aRef) = aLabels(ai)
-
           var sum = 0.0
           var ci = 0
-          while(ci < cLabels.length) {
+          while (ci < cLabels.length) {
             val (c, cRef) = cLabels(ci)
             val rule = topology.index(UnaryRule(topology.labelIndex.get(a), topology.labelIndex.get(c), chain))
-            if(rule != -1) {
+            if (rule != -1) {
               val ruleRef = anchoring.ruleRefinementFromRefinements(rule, aRef, cRef)
               if (ruleRef != -1) {
                 val score = anchoring.scoreUnaryRule(t.span.begin, t.span.end, rule, ruleRef)
                 val ruleScore = cScores(ci) * math.exp(score) // exp!
                sum += ruleScore
                assert(!ruleScore.isNaN)
-                if(score != Double.NegativeInfinity && math.exp(score) == 0.0) {
+                if (score != Double.NegativeInfinity && math.exp(score) == 0.0) {
                  println("Underflow!!!")
                }
-                if(ruleScore != 0.0) {
+                if (ruleScore != 0.0) {
                  foundOne = true
                }
              }
            }
            ci += 1
          }
-
          aScores(ai) = sum
          ai += 1
        }
-        if(!foundOne) {
+        if (!foundOne) {
          sys.error("unary problems")
-//          sys.error(s"Trouble with unary $t.render(words)} ${grammar.labelIndex.get(a)} ${grammar.labelIndex.get(c)} $rule ${anchoring.scoreUnaryRule(t.span.begin, t.span.end, rule, 0)}")
+          // sys.error(s"Trouble with unary $t.render(words)} ${grammar.labelIndex.get(a)} ${grammar.labelIndex.get(c)} $rule ${anchoring.scoreUnaryRule(t.span.begin, t.span.end, rule, 0)}")
        }
        t.label.scaleInside(cScale)
      case t@BinaryTree(Beliefs(aLabels, aScores, _, _, _),
@@ -173,21 +169,21 @@ case class LatentTreeMarginal[L, W](anchoring: GrammarAnchoring[L, W],
        val split = t.leftChild.span.end
        val end = span.end
        var ai = 0
-        while(ai < aScores.length) {
+        while (ai < aScores.length) {
          var sum = 0.0
          val (a, aRef) = aLabels(ai)
          var bi = 0
-          while(bi < bLabels.length) {
+          while (bi < bLabels.length) {
            val (b, bRef) = bLabels(bi)
            var ci = 0
-            while(ci < cLabels.length) {
+            while (ci < cLabels.length) {
              val (c, cRef) = cLabels(ci)
              val rule = topology.index(BinaryRule(topology.labelIndex.get(a), topology.labelIndex.get(b), topology.labelIndex.get(c)))
-              if(rule != -1) {
+              if (rule != -1) {
                val ruleRef = anchoring.ruleRefinementFromRefinements(rule, aRef, bRef, cRef)
-                if(ruleRef != -1) {
+                if (ruleRef != -1) {
                  val spanScore = anchoring.scoreSpan(begin, end, a, aRef)
                  sum += ( bScores(bi)
                    * cScores(ci)
@@ -201,15 +197,15 @@ case class LatentTreeMarginal[L, W](anchoring: GrammarAnchoring[L, W],
            bi += 1
          }
          aScores(ai) = sum
-          if(aScores(ai) != 0) foundOne = true
+          if (aScores(ai) != 0) foundOne = true
          ai += 1
        }
-        if(!foundOne) {
-//          val r = (BinaryRule(grammar.labelIndex.get(a),
-//            grammar.labelIndex.get(b),
-//            grammar.labelIndex.get(c)))
-//          sys.error(s"Trouble with binary ${t.render(words)}\n\n$r $rule $ai")
+        if (!foundOne) {
+          // val r = (BinaryRule(grammar.labelIndex.get(a),
+          //   grammar.labelIndex.get(b),
+          //   grammar.labelIndex.get(c)))
+          // sys.error(s"Trouble with binary ${t.render(words)}\n\n$r $rule $ai")
        }
        t.label.scaleInside(cScale + bScale)
      case _ => sys.error("bad tree!")
@@ -227,10 +223,10 @@ case class LatentTreeMarginal[L, W](anchoring: GrammarAnchoring[L, W],
      case t @ BinaryTree(parent, lchild, rchild, span) =>
        for {
          ((a, aRef), aScore) <- t.label.labels zip t.label.outside
-          bi <- 0 until lchild.label.labels.length
+          bi <- lchild.label.labels.indices
          (b, bRef) = lchild.label.labels(bi)
          bScore = lchild.label.inside(bi)
-          ci <- 0 until rchild.label.labels.length
+          ci <- rchild.label.labels.indices
          (c, cRef) = rchild.label.labels(ci)
          cScore = rchild.label.inside(ci)
        } {
@@ -257,7 +253,7 @@ case class LatentTreeMarginal[L, W](anchoring: GrammarAnchoring[L, W],
        } {
          val rule = topology.index(UnaryRule(topology.labelIndex.get(a), topology.labelIndex.get(c), chain))
          val ruleRef = anchoring.ruleRefinementFromRefinements(rule, aRef, cRef)
-          if(ruleRef != -1) {
+          if (ruleRef != -1) {
            val ruleScore = anchoring.scoreUnaryRule(span.begin, span.end, rule, ruleRef)
            sum += aScore * math.exp(ruleScore) // exp!
          }
@@ -265,9 +261,6 @@ case class LatentTreeMarginal[L, W](anchoring: GrammarAnchoring[L, W],
          child.label.outside(ci) = sum
        }
        child.label.scaleOutside(t.label.oscale)
-
-
-
    }
  }

@@ -302,7 +295,6 @@ case class LatentTreeMarginal[L, W](anchoring: GrammarAnchoring[L, W],
      }
      case _ => Double.NegativeInfinity
    }
-
  }

  // override def marginalAt(begin: Int, end: Int): Counter2[L, Int, Double] = {
@@ -357,8 +349,8 @@ object LatentTreeMarginal {
  private object Beliefs {
    private[LatentTreeMarginal] def apply[L](labels: IndexedSeq[(Int, Int)]):Beliefs[L] = {
      val r = new Beliefs[L](labels, new Array[Double](labels.length), 0, new Array[Double](labels.length), 0)
-//      Arrays.fill(r.inside, Double.NegativeInfinity)
-//      Arrays.fill(r.outside, Double.NegativeInfinity)
+      // Arrays.fill(r.inside, Double.NegativeInfinity)
+      // Arrays.fill(r.outside, Double.NegativeInfinity)
      r
    }
  }
diff --git a/src/main/scala/epic/parser/ParseEval.scala b/src/main/scala/epic/parser/ParseEval.scala
index 8c143e75..96a6f5e7 100644
--- a/src/main/scala/epic/parser/ParseEval.scala
+++ b/src/main/scala/epic/parser/ParseEval.scala
@@ -15,7 +15,6 @@ package epic.parser
 limitations under the License.
*/

-
import epic.trees._
import java.io.BufferedOutputStream
import java.io.File
@@ -29,7 +28,6 @@ import com.typesafe.scalalogging.slf4j.LazyLogging
import java.text.DecimalFormat
import epic.util.ProgressLog

-
/**
 * Hack approximation to true parse eval. Gives Labeled Precision
 * and Labeled Recall.
@@ -43,25 +41,22 @@ class ParseEval[L](ignoredLabels: Set[L]) {
  * guess/gold pair of trees.
  */
  def apply(guessgold: Iterator[(Tree[L],Tree[L])]):Statistics = {
-    val allStats = for( (guess,gold) <- guessgold) yield { apply(guess,gold) }
-
+    val allStats = for((guess,gold) <- guessgold) yield { apply(guess,gold) }
    val stats = allStats.reduceLeft(_ + _)
-
    stats
  }

  def apply(guess: Tree[L], gold: Tree[L]): Statistics = {
    val guessSet = labeledConstituents(guess)
    val goldSet = labeledConstituents(gold)
-    val inter = (guessSet intersect goldSet)
-    val exact = if(goldSet.size == inter.size && guessSet.size == inter.size) 1 else 0
+    val inter = guessSet intersect goldSet
+    val exact = if (goldSet.size == inter.size && guessSet.size == inter.size) 1 else 0
    val guessLeaves = guess.leaves
    val goldLeaves = gold.leaves
-    val numRight = goldLeaves.zip(guessLeaves).foldLeft(0) { (acc,gg) => if(gg._1.label == gg._2.label) acc + 1 else acc}
+    val numRight = goldLeaves.zip(guessLeaves).foldLeft(0) { (acc,gg) => if (gg._1.label == gg._2.label) acc + 1 else acc}
    Statistics(guessSet.size, goldSet.size, inter.size, exact, numRight, guess.span.end, 1)
  }

-
  private def labeledConstituents(tree: Tree[L]) = Set() ++ {
    for(child <- tree.preorder
        if !ignoredLabels.contains(child.label) && !child.isLeaf)
@@ -84,9 +79,9 @@ object ParseEval extends LazyLogging {
                 numParses + stats.numParses)
    }

-    def precision = if(guess == 0) 1.0 else (right * 1.0 / guess)
-    def recall = if(guess == 0) 1.0 else (right * 1.0 / gold)
-    def exact = (numExact * 1.0 / numParses)
+    def precision = if (guess == 0) 1.0 else right * 1.0 / guess
+    def recall = if (guess == 0) 1.0 else right * 1.0 / gold
+    def exact = numExact * 1.0 / numParses
    def tagAccuracy = tagsRight * 1.0 / numWords

    def f1 = (2 * precision * recall)/(precision + recall)
@@ -143,7 +138,7 @@ object ParseEval extends LazyLogging {
             nthreads: Int = -1)(implicit deb: Debinarizer[L]) = {

    val parsedir = new File(evalDir)
-    if(!parsedir.exists() && !parsedir.mkdirs()) {
+    if (!parsedir.exists() && !parsedir.mkdirs()) {
      throw new RuntimeException("Couldn't make directory: " + parsedir)
    }
    val goldOut = new PrintStream(new BufferedOutputStream(new FileOutputStream(new File(parsedir,"gold"))))
diff --git a/src/main/scala/epic/parser/ParseMarginal.scala b/src/main/scala/epic/parser/ParseMarginal.scala
index bb77eb8c..00f1c640 100644
--- a/src/main/scala/epic/parser/ParseMarginal.scala
+++ b/src/main/scala/epic/parser/ParseMarginal.scala
@@ -83,7 +83,6 @@ trait ParseMarginal[L, W] extends VisitableMarginal[AnchoredVisitor[L]] {

object ParseMarginal {

-
  trait Factory[L, W] {
    def apply(w: IndexedSeq[W], constraints: ChartConstraints[L]):ParseMarginal[L, W]
  }
@@ -123,11 +122,10 @@ object ParseMarginal {
case class StandardChartFactory[L, W](refinedGrammar: Grammar[L, W], maxMarginal: Boolean = false) extends ParseMarginal.Factory[L, W] {
  def apply(w: IndexedSeq[W], constraints: ChartConstraints[L]):RefinedChartMarginal[L, W] = {
    val marg = RefinedChartMarginal(refinedGrammar.anchor(w, constraints), maxMarginal = maxMarginal)
-    if(!marg.logPartition.isInfinite) {
+    if (!marg.logPartition.isInfinite) {
      marg
    } else {
      RefinedChartMarginal(refinedGrammar.withPermissiveLexicon.anchor(w, constraints), maxMarginal = maxMarginal)
    }
-
  }
}
diff --git a/src/main/scala/epic/parser/ParseText.scala b/src/main/scala/epic/parser/ParseText.scala
index dab50ef6..01ef3d0f 100644
--- a/src/main/scala/epic/parser/ParseText.scala
+++ b/src/main/scala/epic/parser/ParseText.scala
@@ -10,7 +10,6 @@ import epic.models.ParserSelector
*/
object ParseText extends ProcessTextMain[Parser[AnnotatedLabel, String], Tree[AnnotatedLabel]] {

-
  override def render(model: Parser[AnnotatedLabel, String], ann: Tree[AnnotatedLabel], tokens: IndexedSeq[String]): String = {
    ann.render(tokens, newline = false)
  }
diff --git a/src/main/scala/epic/parser/Parser.scala b/src/main/scala/epic/parser/Parser.scala
index ae3f85e2..eb71f984 100644
--- a/src/main/scala/epic/parser/Parser.scala
+++ b/src/main/scala/epic/parser/Parser.scala
@@ -62,17 +62,12 @@ final case class Parser[L,W](topology: RuleTopology[L],
    }
  }

-
-
-
-
}

object Parser {

  def apply[L, W](grammar: Grammar[L, W])(implicit deb: Debinarizer[L]): Parser[L, W]= {
    Parser(grammar.topology, grammar.lexicon, ChartConstraints.Factory.noSparsity, StandardChartFactory(grammar), ChartDecoder())
-
  }

  def apply[L, W](refined: Grammar[L, W], decoder: ChartDecoder[L, W])(implicit deb: Debinarizer[L]): Parser[L, W] = {
@@ -83,7 +78,6 @@ object Parser {
    new Parser(refinedGrammar.topology, refinedGrammar.lexicon, ChartConstraints.Factory.noSparsity[L, W], new SimpleChartMarginal.SimpleChartFactory(refinedGrammar, decoder.wantsMaxMarginal), decoder)
  }

-
  def apply[L, W](core: ChartConstraints.Factory[L, W], grammar: Grammar[L, W], decoder: ChartDecoder[L, W])(implicit deb: Debinarizer[L]): Parser[L, W] = {
    Parser(grammar.topology, grammar.lexicon, core, StandardChartFactory(grammar, decoder.wantsMaxMarginal), decoder)
  }
diff --git a/src/main/scala/epic/parser/ParserAnnotator.scala b/src/main/scala/epic/parser/ParserAnnotator.scala
index cebb9b52..b8a6eecd 100644
--- a/src/main/scala/epic/parser/ParserAnnotator.scala
+++ b/src/main/scala/epic/parser/ParserAnnotator.scala
@@ -11,16 +11,13 @@ import epic.trees.Tree
**/
class ParserAnnotator[L](parser: Parser[L, String]) extends StringAnalysisFunction[Token with Sentence, Tree[L]] {

-
  def apply[In <: Token with Sentence](slab: StringSlab[In]):StringSlab[In with epic.trees.Tree[L]] = {
    val annotatedSentences = for((span, sent) <- slab.iterator[Sentence].toIndexedSeq.par) yield {
      val tokens = slab.covered[Token](span)
      val tree = parser(tokens.map(_._2.token))
      span -> tree
    }
-
    slab.addLayer[Tree[L]](annotatedSentences.seq)
  }
-
}
diff --git a/src/main/scala/epic/parser/ParserPipeline.scala b/src/main/scala/epic/parser/ParserPipeline.scala
index 2d3057c2..9c79093a 100644
--- a/src/main/scala/epic/parser/ParserPipeline.scala
+++ b/src/main/scala/epic/parser/ParserPipeline.scala
@@ -48,7 +48,7 @@ object ParserParams {
      val g = RuleTopology(AnnotatedLabel.TOP,
        xbarBinaries.keysIterator.map(_._2) ++ xbarUnaries.keysIterator.map(_._2))
      val lex = new SimpleLexicon(g.labelIndex, words)
-      if(path ne null)
+      if (path ne null)
        writeObject(path, g -> lex)
      g -> lex
@@ -83,11 +83,8 @@ trait ParserPipeline extends LazyLogging {
                  validate: Parser[AnnotatedLabel, String]=>ParseEval.Statistics,
                  params: Params):Iterator[(String, Parser[AnnotatedLabel, String])]

-
  def trainParser(treebank: ProcessedTreebank, params: Params):Iterator[(String, Parser[AnnotatedLabel, String])] = {
    import treebank._
-
-
    val validateTrees = devTrees.take(100)
    def validate(parser: Parser[AnnotatedLabel, String]) = {
      ParseEval.evaluate[AnnotatedLabel](validateTrees, parser, asString={(l:AnnotatedLabel)=>l.label}, nthreads=params.threads)
@@ -104,7 +101,7 @@ trait ParserPipeline extends LazyLogging {

    val params = CommandLineParser.readIn[JointParams[Params]](args)

-//    logger.info("Command line arguments for recovery:\n" + Configuration.fromObject(params).toCommandLineString)
+    // logger.info("Command line arguments for recovery:\n" + Configuration.fromObject(params).toCommandLineString)
    logger.info("Training Parser...")
    val parsers = trainParser(params.treebank, params.trainer)
@@ -133,7 +130,6 @@ trait ParserPipeline extends LazyLogging {
    }
  }

-
  def evalParser(testTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]],
                 parser: Parser[AnnotatedLabel, String],
                 name: String):ParseEval.Statistics = {
diff --git a/src/main/scala/epic/parser/ProductChartFactory.scala b/src/main/scala/epic/parser/ProductChartFactory.scala
index 317ca2b9..fee1725b 100644
--- a/src/main/scala/epic/parser/ProductChartFactory.scala
+++ b/src/main/scala/epic/parser/ProductChartFactory.scala
@@ -13,11 +13,10 @@ class ProductChartFactory[L, W](grammars: IndexedSeq[Grammar[L, W]], maxIteratio
  def apply(words: IndexedSeq[W], initialCore: ChartConstraints[L]): RefinedChartMarginal[L, W] = {
    val anchorings = grammars.map(_.anchor(words, initialCore))

-    if(anchorings.length == 1) {
+    if (anchorings.length == 1) {
      return RefinedChartMarginal(anchorings.head)
    }

-
    val proj = new AnchoredRuleMarginalProjector[L, W]
    val augments = anchorings.map(_.marginal).map(proj.project(_))
    val marg = augments.reduceLeft[UnrefinedGrammarAnchoring[L, W]](_ * _).marginal
diff --git a/src/main/scala/epic/parser/ProductGrammarAnchoring.scala b/src/main/scala/epic/parser/ProductGrammarAnchoring.scala
index c8183aec..9dcfcb8f 100644
--- a/src/main/scala/epic/parser/ProductGrammarAnchoring.scala
+++ b/src/main/scala/epic/parser/ProductGrammarAnchoring.scala
@@ -44,47 +44,47 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W],
  override val sparsityPattern: ChartConstraints[L] = s1.sparsityPattern & s2.sparsityPattern

  override def annotationTag = {
-    if(refinementController == null) -1
+    if (refinementController == null) -1
    else refinementController.annotationTag
  }

  def scoreSpan(begin: Int, end: Int, label: Int, ref: Int) = {
    val r1 = s1.scoreSpan(begin, end, label, label1Ref(label, ref))
-    if(r1 == Double.NegativeInfinity) r1
+    if (r1 == Double.NegativeInfinity) r1
    else r1 + alpha * s2.scoreSpan(begin, end, label, label2Ref(label, ref))
  }

  def scoreBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int) = {
    val r1 = s1.scoreBinaryRule(begin, split, end, rule, rule1Ref(rule, ref))
-    if(r1 == Double.NegativeInfinity) r1
+    if (r1 == Double.NegativeInfinity) r1
    else r1 + alpha * s2.scoreBinaryRule(begin, split, end, rule, rule2Ref(rule, ref))
  }

  def scoreUnaryRule(begin: Int, end: Int, rule: Int, ref: Int) = {
    val r1 = s1.scoreUnaryRule(begin, end, rule, rule1Ref(rule, ref))
-    if(r1 == Double.NegativeInfinity) r1
+    if (r1 == Double.NegativeInfinity) r1
    else r1 + alpha * s2.scoreUnaryRule(begin, end, rule, rule2Ref(rule, ref))
  }

  def validLabelRefinements(begin: Int, end: Int, label: Int) = {
-    if(refinementController ne null) refinementController.validLabelRefinements(begin, end, label)
+    if (refinementController ne null) refinementController.validLabelRefinements(begin, end, label)
    else for(a <- s1.validLabelRefinements(begin, end, label); b <- s2.validLabelRefinements(begin, end, label)) yield a * s2.numValidRefinements(label) + b
  }

  def numValidRefinements(label: Int) = {
-    if(refinementController ne null) refinementController.numValidRefinements(label)
+    if (refinementController ne null) refinementController.numValidRefinements(label)
    else s1.numValidRefinements(label) * s2.numValidRefinements(label)
  }

  def numValidRuleRefinements(rule: Int) = {
-    if(refinementController ne null) refinementController.numValidRuleRefinements(rule)
+    if (refinementController ne null) refinementController.numValidRuleRefinements(rule)
    else s1.numValidRuleRefinements(rule) * s2.numValidRuleRefinements(rule)
  }

  def validRuleRefinementsGivenParent(begin: Int, end: Int, rule: Int, parentRef: Int) = {
-    if(refinementController ne null) refinementController.validRuleRefinementsGivenParent(begin, end, rule, parentRef)
+    if (refinementController ne null) refinementController.validRuleRefinementsGivenParent(begin, end, rule, parentRef)
    else {
      val parent = topology.parent(rule)
      val bRefinements = s2.validRuleRefinementsGivenParent(begin, end, rule, label2Ref(parent, parentRef))
@@ -95,7 +95,7 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W],
  }

  def validRuleRefinementsGivenLeftChild(begin: Int, split: Int, completionBegin: Int, completionEnd: Int, rule: Int, leftChildRef: Int): Array[Int] = {
-    if(refinementController ne null) refinementController.validRuleRefinementsGivenLeftChild(begin, split, completionBegin, completionEnd, rule, leftChildRef)
+    if (refinementController ne null) refinementController.validRuleRefinementsGivenLeftChild(begin, split, completionBegin, completionEnd, rule, leftChildRef)
    else {
      val leftChild = topology.leftChild(rule)
      val bRefinements = s2.validRuleRefinementsGivenLeftChild(begin, split, completionBegin, completionEnd, rule, label2Ref(leftChild, leftChildRef))
@@ -106,7 +106,7 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W],
  }

  def validRuleRefinementsGivenRightChild(completionBegin: Int, completionEnd: Int, split: Int, end: Int, rule: Int, rightChildRef: Int): Array[Int] = {
-    if(refinementController ne null) refinementController.validRuleRefinementsGivenRightChild(completionBegin, completionEnd, split, end, rule, rightChildRef)
+    if (refinementController ne null) refinementController.validRuleRefinementsGivenRightChild(completionBegin, completionEnd, split, end, rule, rightChildRef)
    else {
      val rightChild = topology.rightChild(rule)
      val bRefinements =
s2.validRuleRefinementsGivenRightChild(completionBegin, completionEnd, split, end, rule, label2Ref(rightChild, rightChildRef)) @@ -116,9 +116,8 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], } } - def validUnaryRuleRefinementsGivenChild(begin: Int, end: Int, rule: Int, childRef: Int) = { - if(refinementController ne null) refinementController.validUnaryRuleRefinementsGivenChild(begin, end, rule, childRef) + if (refinementController ne null) refinementController.validUnaryRuleRefinementsGivenChild(begin, end, rule, childRef) else { val child = topology.child(rule) val bRefinements = s2.validUnaryRuleRefinementsGivenChild(begin, end, rule, label2Ref(child, childRef)) @@ -129,7 +128,7 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], } def leftChildRefinement(rule: Int, ruleRef: Int) = { - if(refinementController ne null) refinementController.leftChildRefinement(rule,ruleRef) + if (refinementController ne null) refinementController.leftChildRefinement(rule,ruleRef) else { val l1 = s1.leftChildRefinement(rule, rule1Ref(rule, ruleRef)) val l2 = s2.leftChildRefinement(rule, rule2Ref(rule, ruleRef)) @@ -139,7 +138,7 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], def rightChildRefinement(rule: Int, ruleRef: Int) = { - if(refinementController ne null) refinementController.rightChildRefinement(rule,ruleRef) + if (refinementController ne null) refinementController.rightChildRefinement(rule,ruleRef) else { val l1 = s1.rightChildRefinement(rule, rule1Ref(rule, ruleRef)) val l2 = s2.rightChildRefinement(rule, rule2Ref(rule, ruleRef)) @@ -149,7 +148,7 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], def parentRefinement(rule: Int, ruleRef: Int) = { - if(refinementController ne null) refinementController.parentRefinement(rule,ruleRef) + if (refinementController ne null) refinementController.parentRefinement(rule,ruleRef) else { val l1 = s1.parentRefinement(rule, rule1Ref(rule, ruleRef)) val l2 = s2.parentRefinement(rule, rule2Ref(rule, ruleRef)) @@ -158,7 +157,7 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], } def childRefinement(rule: Int, ruleRef: Int) = { - if(refinementController ne null) refinementController.childRefinement(rule,ruleRef) + if (refinementController ne null) refinementController.childRefinement(rule,ruleRef) else { val l1 = s1.childRefinement(rule, rule1Ref(rule, ruleRef)) val l2 = s2.childRefinement(rule, rule2Ref(rule, ruleRef)) @@ -167,7 +166,7 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], } def ruleRefinementFromRefinements(r: Int, refA: Int, refB: Int) = { - if(refinementController ne null) refinementController.ruleRefinementFromRefinements(r, refA, refB) + if (refinementController ne null) refinementController.ruleRefinementFromRefinements(r, refA, refB) else { val a1 = label1Ref(topology.parent(r), refA) val a2 = label2Ref(topology.parent(r), refA) @@ -175,13 +174,13 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], val b2 = label2Ref(topology.child(r), refB) val l1 = s1.ruleRefinementFromRefinements(r, a1, b1) val l2 = s2.ruleRefinementFromRefinements(r, a2, b2) - if(l1 < 0 || l2 < 0) -1 + if (l1 < 0 || l2 < 0) -1 else l1 * s2.numValidRuleRefinements(r) + l2 } } def ruleRefinementFromRefinements(r: Int, refA: Int, refB: Int, refC: Int) = { - if(refinementController ne null) refinementController.ruleRefinementFromRefinements(r, refA, refB, refC) + if (refinementController ne 
null) refinementController.ruleRefinementFromRefinements(r, refA, refB, refC) else { val a1 = label1Ref(topology.parent(r), refA) val a2 = label2Ref(topology.parent(r), refA) @@ -191,7 +190,7 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], val c2 = label2Ref(topology.rightChild(r), refC) val l1 = s1.ruleRefinementFromRefinements(r, a1, b1, c1) val l2 = s2.ruleRefinementFromRefinements(r, a2, b2, c2) - if(l1 < 0 || l2 < 0) -1 + if (l1 < 0 || l2 < 0) -1 else l1 * s2.numValidRuleRefinements(r) + l2 } @@ -208,7 +207,7 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], def validParentRefinementsGivenRule(begin: Int, splitBegin: Int, splitEnd: Int, end: Int, rule: Int): Array[Int] = { - if(refinementController ne null) refinementController.validParentRefinementsGivenRule(begin, splitBegin, splitEnd, end, rule) + if (refinementController ne null) refinementController.validParentRefinementsGivenRule(begin, splitBegin, splitEnd, end, rule) else { val r1arr = s1.validParentRefinementsGivenRule(begin, splitBegin, splitEnd, end, rule) val r2arr = s2.validParentRefinementsGivenRule(begin, splitBegin, splitEnd, end, rule) @@ -220,7 +219,7 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], def validLeftChildRefinementsGivenRule(begin: Int, end: Int, completionBegin: Int, completionEnd: Int, rule: Int): Array[Int] = { - if(refinementController ne null) refinementController.validLeftChildRefinementsGivenRule(begin, end, completionBegin, completionEnd, rule) + if (refinementController ne null) refinementController.validLeftChildRefinementsGivenRule(begin, end, completionBegin, completionEnd, rule) else { val r1arr = s1.validLeftChildRefinementsGivenRule(begin, end, completionBegin, completionEnd, rule) val r2arr = s2.validLeftChildRefinementsGivenRule(begin, end, completionBegin, completionEnd, rule) @@ -230,7 +229,7 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], } def validRightChildRefinementsGivenRule(completionBegin: Int, completionEnd: Int, begin: Int, end: Int, rule: Int): Array[Int] = { - if(refinementController ne null) refinementController.validRightChildRefinementsGivenRule(completionBegin, completionEnd, begin, end, rule) + if (refinementController ne null) refinementController.validRightChildRefinementsGivenRule(completionBegin, completionEnd, begin, end, rule) else { val r1arr = s1.validRightChildRefinementsGivenRule(completionBegin, completionEnd, begin, end, rule) val r2arr = s2.validRightChildRefinementsGivenRule(completionBegin, completionEnd, begin, end, rule) @@ -242,16 +241,16 @@ final case class ProductGrammarAnchoring[L,W](s1: GrammarAnchoring[L, W], abstract class ProductRefinementsHandler[L, W](s1: GrammarAnchoring[L, W], s2: GrammarAnchoring[L, W]) { protected final val refinementController: GrammarAnchoring[L, W] = { - if(s1.annotationTag == 0) s2 - else if(s2.annotationTag == 0) s1 + if (s1.annotationTag == 0) s2 + else if (s2.annotationTag == 0) s1 else if (s1.annotationTag < 0 || s2.annotationTag < 0) null - else if(s1.annotationTag == s2.annotationTag) s1 + else if (s1.annotationTag == s2.annotationTag) s1 else null } @inline protected final def label1Ref(label: Int, ref: Int): Int = { - if(refinementController != null) ref + if (refinementController != null) ref else { val num = s1.numValidRefinements(label) ref / num @@ -260,7 +259,7 @@ abstract class ProductRefinementsHandler[L, W](s1: GrammarAnchoring[L, W], s2: G @inline protected final def label2Ref(label: Int, 
ref: Int): Int = { - if(refinementController != null) ref + if (refinementController != null) ref else { val num = s1.numValidRefinements(label) ref % num @@ -269,7 +268,7 @@ abstract class ProductRefinementsHandler[L, W](s1: GrammarAnchoring[L, W], s2: G @inline protected final def rule1Ref(rule: Int, ref: Int): Int = { - if(refinementController != null) ref + if (refinementController != null) ref else { val num = s1.numValidRuleRefinements(rule) ref / num @@ -278,7 +277,7 @@ abstract class ProductRefinementsHandler[L, W](s1: GrammarAnchoring[L, W], s2: G @inline protected final def rule2Ref(rule: Int, ref: Int): Int = { - if(refinementController != null) ref + if (refinementController != null) ref else { val num = s1.numValidRuleRefinements(rule) ref % num diff --git a/src/main/scala/epic/parser/ProductRefinedFeaturizer.scala b/src/main/scala/epic/parser/ProductRefinedFeaturizer.scala index 39fad371..4e57f2ac 100644 --- a/src/main/scala/epic/parser/ProductRefinedFeaturizer.scala +++ b/src/main/scala/epic/parser/ProductRefinedFeaturizer.scala @@ -27,7 +27,6 @@ class ProductRefinedFeaturizer[L, W, Feat1, Feat2](sf1: Grammar[L, W], feat2: RefinedFeaturizer[L, W, Feat2]) extends RefinedFeaturizer[L, W, Either[Feat1, Feat2]] { def index: EitherIndex[Feat1, Feat2] = feat1.index | feat2.index - override def lock = new ProductRefinedFeaturizer(sf1, sf2, feat1.lock, feat2.lock) def anchor(w: IndexedSeq[W]):Anchoring = { diff --git a/src/main/scala/epic/parser/ProductUnrefinedGrammarAnchoring.scala b/src/main/scala/epic/parser/ProductUnrefinedGrammarAnchoring.scala index d8a832e8..d17c2324 100644 --- a/src/main/scala/epic/parser/ProductUnrefinedGrammarAnchoring.scala +++ b/src/main/scala/epic/parser/ProductUnrefinedGrammarAnchoring.scala @@ -27,7 +27,6 @@ final case class ProductUnrefinedGrammarAnchoring[L, W](s1: UnrefinedGrammarAnch s2: UnrefinedGrammarAnchoring[L, W], alpha: Double = 1.0) extends UnrefinedGrammarAnchoring[L, W] { - // def sparsityPattern = ChartConstraints.noSparsity[L] override def addConstraints(cs: ChartConstraints[L]): UnrefinedGrammarAnchoring[L, W] = copy(s1.addConstraints(cs)) @@ -39,9 +38,8 @@ final case class ProductUnrefinedGrammarAnchoring[L, W](s1: UnrefinedGrammarAnch def words = s1.words - -// override val sparsityPattern: ChartConstraints[L] = s1.sparsityPattern & s2.sparsityPattern -// def addConstraints(cs: ChartConstraints[L]): CoreAnchoring[L, W] = new ProductCoreAnchoring(s1.addConstraints(cs), s2, alpha) + // override val sparsityPattern: ChartConstraints[L] = s1.sparsityPattern & s2.sparsityPattern + // def addConstraints(cs: ChartConstraints[L]): CoreAnchoring[L, W] = new ProductCoreAnchoring(s1.addConstraints(cs), s2, alpha) def scoreSpan(begin: Int, end: Int, label: Int) = { val r1 = s1.scoreSpan(begin, end, label) diff --git a/src/main/scala/epic/parser/ProductionFeaturizer.scala b/src/main/scala/epic/parser/ProductionFeaturizer.scala index 22afdd11..8022db83 100644 --- a/src/main/scala/epic/parser/ProductionFeaturizer.scala +++ b/src/main/scala/epic/parser/ProductionFeaturizer.scala @@ -28,12 +28,12 @@ import epic.features.IndicatorFeature */ @SerialVersionUID(1L) class ProductionFeaturizer[L, L2, W](val topology: RuleTopology[L], refinements: GrammarRefinements[L, L2], - lGen: L2=>Seq[Feature] = {(x:L2)=>if(x.isInstanceOf[Feature]) Seq(x.asInstanceOf[Feature]) else Seq(IndicatorFeature(x))}, + lGen: L2=>Seq[Feature] = {(x:L2)=>if (x.isInstanceOf[Feature]) Seq(x.asInstanceOf[Feature]) else Seq(IndicatorFeature(x))}, rGen: Rule[L2] => Seq[Feature] = 
{(x: Rule[L2]) => Seq(x)}, filterRedundantFeatures: Boolean = false) extends RefinedFeaturizer[L, W, Feature] with Serializable { private val (index_ :Index[Feature], ruleFeatures: Array[Array[Int]], labelFeatures: Array[Array[Int]]) = { - if(filterRedundantFeatures) { + if (filterRedundantFeatures) { val index = epic.features.buildNonRedundantFeatureIndex[Either[Rule[L2], L2], Feature](refinements.rules.fineIndex.iterator.map(Left(_)) ++ refinements.labels.fineIndex.iterator.map(Right(_)), { case Left(r) => rGen(r) case Right(l) => lGen(l) @@ -63,7 +63,6 @@ class ProductionFeaturizer[L, L2, W](val topology: RuleTopology[L], refinements: def featuresForLabel(l: Int): Array[Int] = labelFeatures(l) - override def lock: RefinedFeaturizer[L, W, Feature] = this def anchor(w: IndexedSeq[W]) = new Anchoring { diff --git a/src/main/scala/epic/parser/ProjectionsGrammarAnchoring.scala b/src/main/scala/epic/parser/ProjectionsGrammarAnchoring.scala index 3f97b710..5b81b8ea 100644 --- a/src/main/scala/epic/parser/ProjectionsGrammarAnchoring.scala +++ b/src/main/scala/epic/parser/ProjectionsGrammarAnchoring.scala @@ -12,7 +12,6 @@ trait ProjectionsGrammarAnchoring[L, L2, W] extends GrammarAnchoring[L, W] { def refinements: GrammarRefinements[L, L2] def refinedTopology: RuleTopology[L2] - final def validLabelRefinements(begin: Int, end: Int, label: Int) = { refinements.labels.localRefinements(label) } @@ -60,7 +59,7 @@ trait ProjectionsGrammarAnchoring[L, L2, W] extends GrammarAnchoring[L, W] { val b2 = refinements.labels.globalize(b, refB) val rule = UnaryRule(refinements.labels.fineIndex.get(a2), refinements.labels.fineIndex.get(b2), topology.chain(r)) val refinedRuleIndex = refinements.rules.fineIndex(rule) - if(refinedRuleIndex < 0) { + if (refinedRuleIndex < 0) { -1 } else { refinements.rules.localize(refinedRuleIndex) @@ -79,7 +78,7 @@ trait ProjectionsGrammarAnchoring[L, L2, W] extends GrammarAnchoring[L, W] { refinements.labels.fineIndex.get(c2) ) val fi = refinements.rules.fineIndex(rule) - if(fi < 0) throw new RuntimeException(s"No such rule: $rule") + if (fi < 0) throw new RuntimeException(s"No such rule: $rule") refinements.rules.localize(fi) } @@ -100,6 +99,4 @@ trait ProjectionsGrammarAnchoring[L, L2, W] extends GrammarAnchoring[L, W] { refinements.rightChildRefinementsCompatibleWithRule(rule) } - - } diff --git a/src/main/scala/epic/parser/RefinedChartMarginal.scala b/src/main/scala/epic/parser/RefinedChartMarginal.scala index 6e6ab7f7..5e8ae9e8 100644 --- a/src/main/scala/epic/parser/RefinedChartMarginal.scala +++ b/src/main/scala/epic/parser/RefinedChartMarginal.scala @@ -41,7 +41,6 @@ final case class RefinedChartMarginal[L, W](anchoring: GrammarAnchoring[L, W], logPartition: Double, override val isMaxMarginal: Boolean) extends ParseMarginal[L, W] with SafeLogging { - override def insideTopScore(begin: Int, end: Int, sym: Int, ref: Int): Double = inside.top(begin, end, sym, ref) override def insideBotScore(begin: Int, end: Int, sym: Int, ref: Int): Double = inside.bot(begin, end, sym, ref) @@ -49,11 +48,9 @@ final case class RefinedChartMarginal[L, W](anchoring: GrammarAnchoring[L, W], val in = inside.bot.decodedLabelScores(begin, end) in += outside.bot.decodedLabelScores(begin, end) in -= logPartition - breeze.numerics.exp(in) } - def feasibleSplitPoints(begin: Int, end: Int, leftChild: Int, leftChildRef: Int, rightChild: Int, rightChildRef: Int):IndexedSeq[Int] = { inside.top.feasibleSplitPoints(begin, end, leftChild, leftChildRef, rightChild, rightChildRef).toIndexedSeq } @@ -62,15 
+59,15 @@ final case class RefinedChartMarginal[L, W](anchoring: GrammarAnchoring[L, W], * Forest traversal that visits spans in a "bottom up" order. */ def visitPostorder(spanVisitor: AnchoredVisitor[L], spanThreshold: Double = Double.NegativeInfinity):Unit = { - if(logPartition.isInfinite) throw new RuntimeException("No parse for " + words) - if(logPartition.isNaN) throw new RuntimeException("NaN prob!") + if (logPartition.isInfinite) throw new RuntimeException("No parse for " + words) + if (logPartition.isNaN) throw new RuntimeException("NaN prob!") val itop = inside.top val lexLoc = anchoring.tagConstraints // handle lexical - for (i <- 0 until words.length) { + for (i <- words.indices) { var visitedSomething = false for { a <- lexLoc.allowedTags(i) if anchoring.sparsityPattern.bot.isAllowedLabeledSpan(i, i+1, a) @@ -98,12 +95,12 @@ final case class RefinedChartMarginal[L, W](anchoring: GrammarAnchoring[L, W], val aOutside = outside.bot.labelScore(begin, end, a, refA) val labelMarginal = aOutside + inside.bot.labelScore(begin, end, a, refA) - logPartition val aScore = aOutside + anchoring.scoreSpan(begin, end, a, refA) - if(labelMarginal > spanThreshold) { + if (labelMarginal > spanThreshold) { spanVisitor.visitSpan(begin, end, a, refA, math.exp(labelMarginal)) - if(!spanVisitor.skipBinaryRules) { + if (!spanVisitor.skipBinaryRules) { val rules = anchoring.validCoarseRulesGivenParentRefinement(a, refA) - while(i < rules.length) { + while (i < rules.length) { val r = rules(i) val b = topology.leftChild(r) val c = topology.rightChild(r) @@ -111,10 +108,10 @@ final case class RefinedChartMarginal[L, W](anchoring: GrammarAnchoring[L, W], val feasibleCoarseRange = inside.top.feasibleSplitPoints(begin, end, b, c) - if(feasibleCoarseRange.nonEmpty) { + if (feasibleCoarseRange.nonEmpty) { val refinements = anchoring.validRuleRefinementsGivenParent(begin, end, r, refA) var ruleRefIndex = 0 - while(ruleRefIndex < refinements.length) { + while (ruleRefIndex < refinements.length) { val refR = refinements(ruleRefIndex) ruleRefIndex += 1 val refB = anchoring.leftChildRefinement(r, refR) @@ -125,7 +122,7 @@ final case class RefinedChartMarginal[L, W](anchoring: GrammarAnchoring[L, W], var split = feasibleSplitRange.begin val endSplit = feasibleSplitRange.end - while(split < endSplit) { + while (split < endSplit) { val bInside = itop.labelScore(begin, split, b, refB) val cInside = itop.labelScore(split, end, c, refC) val withoutRefined = bInside + cInside @@ -147,7 +144,7 @@ final case class RefinedChartMarginal[L, W](anchoring: GrammarAnchoring[L, W], } // Unaries - if(!spanVisitor.skipUnaryRules) + if (!spanVisitor.skipUnaryRules) for { span <- 1 to words.length begin <- 0 to (words.length - span) @@ -190,7 +187,7 @@ final case class RefinedChartMarginal[L, W](anchoring: GrammarAnchoring[L, W], case UnaryTree( label, _, _, span) => val labelScore = breeze.linalg.softmax(inside.top.decodedLabelScores(span.begin, span.end, anchoring.topology.labelIndex(label))) if (labelScore.isInfinite) { - logger.warn("problem with unary: " + (label) + " " + span) + logger.warn("problem with unary: " + label + " " + span) } case tree => val labelScore = breeze.linalg.softmax(inside.bot.decodedLabelScores(tree.begin, tree.end, anchoring.topology.labelIndex(tree.label))) @@ -201,7 +198,6 @@ final case class RefinedChartMarginal[L, W](anchoring: GrammarAnchoring[L, W], this } - def checkForTreeOutside(tree: BinarizedTree[(L, Int)]) { for (t <- tree.allChildren) t match { case tree@UnaryTree( (label, ref), _, _, span) 
=> @@ -243,14 +239,13 @@ object RefinedChartMarginal { apply(grammar.anchor(sent)) } - def apply[L, W](anchoring: GrammarAnchoring[L, W]): RefinedChartMarginal[L, W] = { apply(anchoring, false) } def apply[L, W](anchoring: GrammarAnchoring[L, W], maxMarginal: Boolean): RefinedChartMarginal[L, W] = { val sent = anchoring.words - val sum = if(maxMarginal) MaxSummer else LogSummer + val sum = if (maxMarginal) MaxSummer else LogSummer val inside = buildInsideChart(anchoring, sent, sum) val logPartition = rootScore(anchoring, inside, sum) val outside = buildOutsideChart(anchoring, inside, sum) @@ -269,7 +264,7 @@ object RefinedChartMarginal { private[parser] object MaxSummer extends Summer { def apply(a: Double, b: Double): Double = math.max(a,b) - def apply(a: Array[Double], length: Int): Double = if(length == 0) Double.NegativeInfinity else max.array(a, length) + def apply(a: Array[Double], length: Int): Double = if (length == 0) Double.NegativeInfinity else max.array(a, length) } private def rootScore[L, W](anchoring: GrammarAnchoring[L, W], inside: RefinedParseChart[L], sum: Summer): Double = { @@ -278,13 +273,13 @@ object RefinedChartMarginal { var offset = 0 for(ref <- inside.top.enteredLabelRefinements(0, inside.length, rootIndex)) { val score = inside.top.labelScore(0, inside.length, rootIndex, ref) - if(score != Double.NegativeInfinity) { + if (score != Double.NegativeInfinity) { rootScores(offset) = score offset += 1 } } val score = sum(rootScores, offset) -// assert(score != 0.0, rootScores.mkString(", ") + anchoring.words) + // assert(score != 0.0, rootScores.mkString(", ") + anchoring.words) assert(!score.isNaN, rootScores.mkString(", ")) score } @@ -300,7 +295,7 @@ object RefinedChartMarginal { val tagConstraints = anchoring.tagConstraints // handle lexical - for{i <- 0 until anchoring.words.length} { + for{i <- anchoring.words.indices} { assert(anchoring.sparsityPattern.isAllowedSpan(i,i+1), "a pos tag isn't allowed? 
" + anchoring.sparsityPattern) assert(anchoring.sparsityPattern.bot.isAllowedSpan(i,i+1), "a top of a length 1 span isn't allowed?") var foundSomething = false @@ -329,7 +324,6 @@ object RefinedChartMarginal { } { val end = begin + span - for ( a <- 0 until anchoring.topology.labelIndex.size if anchoring.sparsityPattern.bot.isAllowedLabeledSpan(begin, end, a)) { val numValidLabelRefs = anchoring.numValidRefinements(a) java.util.Arrays.fill(offsets, 0) @@ -337,27 +331,26 @@ object RefinedChartMarginal { val rules = anchoring.topology.indexedBinaryRulesWithParent(a) var ruleIndex = 0 // into rules - while(ruleIndex < rules.length) { + while (ruleIndex < rules.length) { val r = rules(ruleIndex) val b = anchoring.topology.leftChild(r) val c = anchoring.topology.rightChild(r) ruleIndex += 1 - val feasibleCoarseRange = inside.top.feasibleSplitPoints(begin, end, b, c) - if(feasibleCoarseRange.nonEmpty) { + if (feasibleCoarseRange.nonEmpty) { val validA = anchoring.validParentRefinementsGivenRule(begin, feasibleCoarseRange.begin, feasibleCoarseRange.end, end, r) var ai = 0 - while(ai < validA.length) { + while (ai < validA.length) { val refA = validA(ai) ai += 1 val spanScore = anchoring.scoreSpan(begin, end, a, refA) - if(!spanScore.isInfinite) { + if (!spanScore.isInfinite) { val refinements = anchoring.validRuleRefinementsGivenParent(begin, end, r, refA) var ruleRefIndex = 0 - while(ruleRefIndex < refinements.length) { + while (ruleRefIndex < refinements.length) { val refR = refinements(ruleRefIndex) ruleRefIndex += 1 val refB = anchoring.leftChildRefinement(r, refR) @@ -368,21 +361,21 @@ object RefinedChartMarginal { var split = feasibleSplitRange.begin val endSplit = feasibleSplitRange.end - while(split < endSplit) { + while (split < endSplit) { val bScore = inside.top.labelScore(begin, split, b, refB) val cScore = inside.top.labelScore(split, end, c, refC) val withoutRule = bScore + cScore + spanScore - if(withoutRule != Double.NegativeInfinity) { + if (withoutRule != Double.NegativeInfinity) { val prob = withoutRule + anchoring.scoreBinaryRule(begin, split, end, r, refR) assert(!prob.isNaN, s"$withoutRule ${anchoring.scoreBinaryRule(begin, split, end, r, refR)} $bScore $cScore $spanScore") - if(prob != Double.NegativeInfinity) { + if (prob != Double.NegativeInfinity) { scoreArray(refA)(offsets(refA)) = prob offsets(refA) += 1 // buffer full - if(offsets(refA) == scoreArray(refA).length) { + if (offsets(refA) == scoreArray(refA).length) { scoreArray(refA)(0) = sum(scoreArray(refA), offsets(refA)) offsets(refA) = 1 } @@ -395,14 +388,12 @@ object RefinedChartMarginal { } // end a refinement - - } // end canBuildThisRule } // end rules enterScoresForLabelRefinements(sum, scoreArray, offsets, inside.bot, begin, end, a, numValidLabelRefs) // assert(rootScore(anchoring, inside, sum) != 0.0, (begin, end, a)) -// if(!foundSomething && refined.sparsityPattern != ChartConstraints.noSparsity) { +// if (!foundSomething && refined.sparsityPattern != ChartConstraints.noSparsity) { // logger.warn(s"Failed to replicate a span in ($begin, $end) of ${anchoring.words}. 
Label is ${anchoring.grammar.labelIndex.get(a)}") // // } @@ -413,7 +404,6 @@ object RefinedChartMarginal { inside } - private def enterScoresForLabelRefinements[L](sum: Summer, scoreArray: Array[Array[Double]], offsets: Array[Int], bot: RefinedParseChart[L]#ChartScores, begin: Int, end: Int, parent: Int, numValidLabelRefs: Int) { var foundSomething = false var ai = 0 @@ -436,7 +426,6 @@ object RefinedChartMarginal { val grammar = anchoring.topology val rootIndex = grammar.labelIndex(grammar.root) - val length = inside.length val outside = RefinedParseChart(grammar.labelIndex, Array.tabulate(grammar.labelIndex.size)(refined.numValidRefinements), @@ -458,7 +447,7 @@ object RefinedChartMarginal { val enteredTop = inside.top.enteredLabelIndexes(begin, end) var a = 0 - while(a < grammar.labelIndex.size) { + while (a < grammar.labelIndex.size) { // we're going to populate a by looking at rules p -> a rc, p -> lc a if (enteredTop.contains(a)) { java.util.Arrays.fill(offsets, 0) @@ -476,7 +465,6 @@ object RefinedChartMarginal { outside } - private def doOutsideLeftCompletionUpdates[W, L](inside: RefinedParseChart[L], outside: RefinedParseChart[L], anchoring: GrammarAnchoring[L, W], begin: Int, end: Int, @@ -486,7 +474,6 @@ object RefinedChartMarginal { val grammar = refined.topology val rules = anchoring.topology.indexedBinaryRulesWithLeftChild(label) - var br = 0 while (br < rules.length) { val r = rules(br) @@ -528,7 +515,7 @@ object RefinedChartMarginal { if (cInside != Double.NegativeInfinity && pOutside != Double.NegativeInfinity) { val ruleScore = refined.scoreBinaryRule(begin, end, completion, r, refR) val score = cInside + ruleScore + pOutside - if(score != Double.NegativeInfinity) { + if (score != Double.NegativeInfinity) { scoreArray(refA)(offsets(refA)) = score offsets(refA) += 1 // buffer full @@ -557,7 +544,6 @@ object RefinedChartMarginal { val rcMaxCompletion = inside.top.rightMostEndForBegin(end)(rc)(refC) val completionBegin = math.max(math.max(parentMinCompletion, rcMinCompletion), end + 1) val completionEnd = math.min(parentMaxCompletion, rcMaxCompletion) - Span(completionBegin, completionEnd) } @@ -568,7 +554,6 @@ object RefinedChartMarginal { val rcMaxCompletion = inside.top.coarseRightMostEndForBegin(end)(rc) val completionBegin = math.max(math.max(parentMinCompletion, rcMinCompletion), end + 1) val completionEnd = math.min(parentMaxCompletion, rcMaxCompletion) - Span(completionBegin, completionEnd) } @@ -621,7 +606,7 @@ object RefinedChartMarginal { if (bInside != Double.NegativeInfinity && pOutside != Double.NegativeInfinity) { val ruleScore = refined.scoreBinaryRule(completion, begin, end, r, refR) val score = bInside + ruleScore + pOutside - if(score != Double.NegativeInfinity) { + if (score != Double.NegativeInfinity) { scoreArray(refA)(offsets(refA)) = score offsets(refA) += 1 // buffer full @@ -642,7 +627,6 @@ object RefinedChartMarginal { } } - private def feasibleSpanForRightCompletion[L, W](begin: Int, end: Int, p: Int, refP: Int, lc: Int, refB: Int, inside: RefinedParseChart[L]) = { val parentMinCompletion = inside.bot.leftMostBeginForEnd(end)(p)(refP) val rcMinCompletion = inside.top.leftMostBeginForEnd(begin)(lc)(refB) @@ -674,13 +658,13 @@ object RefinedChartMarginal { for(bi <- chart.bot.enteredLabelIndexes(begin, end); refB <- chart.bot.enteredLabelRefinements(begin, end, bi)) { val b = bi val bScore = chart.bot.labelScore(begin, end, b, refB) - if(bScore != Double.NegativeInfinity) { + if (bScore != Double.NegativeInfinity) { val rules = 
grammar.indexedUnaryRulesWithChild(b) var j = 0 - while(j < rules.length) { + while (j < rules.length) { val r = rules(j) val a = grammar.parent(r) - if(refined.sparsityPattern.top.isAllowedLabeledSpan(begin, end, a)) { + if (refined.sparsityPattern.top.isAllowedLabeledSpan(begin, end, a)) { for (refR <- refined.validUnaryRuleRefinementsGivenChild(begin, end, r, refB)) { val refA = refined.parentRefinement(r, refR) val ruleScore: Double = refined.scoreUnaryRule(begin, end, r, refR) @@ -709,15 +693,15 @@ object RefinedChartMarginal { val bScore = chart.top.labelScore(begin, end, a, refA) val rules = grammar.indexedUnaryRulesWithParent(a) var j = 0 - while(j < rules.length) { + while (j < rules.length) { val r = rules(j) val b = grammar.child(r) - if(inside.bot.isLabelEntered(begin, end, b)) + if (inside.bot.isLabelEntered(begin, end, b)) for(refR <- refined.validRuleRefinementsGivenParent(begin, end, rules(j), refA)) { val refB = refined.childRefinement(rules(j), refR) val ruleScore: Double = refined.scoreUnaryRule(begin, end, rules(j), refR) val prob: Double = bScore + ruleScore - if(prob != Double.NegativeInfinity) { + if (prob != Double.NegativeInfinity) { chart.bot.enter(begin, end, b, refB, sum(chart.bot.labelScore(begin, end, b, refB), prob)) } } @@ -727,8 +711,6 @@ object RefinedChartMarginal { } - - } diff --git a/src/main/scala/epic/parser/RefinedFeaturizer.scala b/src/main/scala/epic/parser/RefinedFeaturizer.scala index a42cb491..7e81eb8b 100644 --- a/src/main/scala/epic/parser/RefinedFeaturizer.scala +++ b/src/main/scala/epic/parser/RefinedFeaturizer.scala @@ -19,7 +19,6 @@ import epic.trees.{LexicalProduction, Production, Rule} import breeze.util.Index import epic.framework.Feature - /** * * @author dlwh diff --git a/src/main/scala/epic/parser/RefinedParseChart.scala b/src/main/scala/epic/parser/RefinedParseChart.scala index 7f6e8516..89068b4b 100644 --- a/src/main/scala/epic/parser/RefinedParseChart.scala +++ b/src/main/scala/epic/parser/RefinedParseChart.scala @@ -48,15 +48,13 @@ class RefinedParseChart[L](val index: Index[L], /** (begin,end) -> label -> refinement -> score */ // fill in arrays for spans we might touch val score: TriangularArray[Array[Array[Double]]] = TriangularArray.tabulate(length+1){(begin, end) => - if(sparsity.isAllowedSpan(begin, end)) { + if (sparsity.isAllowedSpan(begin, end)) { makeGrammarScoreArray(begin, end) } else { null } } - - /** (begin,end) -> which labels are on */ val enteredLabels: Array[BitSet] = mkBitSetArray(TriangularArray.arraySize(length+1)) /** (begin,end) -> label -> which refinements of label are on */ @@ -94,19 +92,19 @@ class RefinedParseChart[L](val index: Index[L], def enteredLabelScores(begin: Int, end: Int) = { val scoreArray = score(begin, end) - if(scoreArray eq null) Iterator.empty + if (scoreArray eq null) Iterator.empty else enteredLabels(TriangularArray.index(begin, end)).iterator.map { i => (index.get(i), scoreArray(i))} } def decodedLabelScores(begin: Int, end: Int):Counter2[L,Int,Double] = { val scoreArray = score(begin, end) - if(scoreArray eq null) Counter2() + if (scoreArray eq null) Counter2() else { val ret = Counter2[L, Int, Double]() for(i <- enteredLabels(TriangularArray.index(begin, end))) { val l = index.get(i) for((v,s) <- scoreArray(i).zipWithIndex) { - if(v != zero) + if (v != zero) ret(l,s) = v } } @@ -116,18 +114,17 @@ class RefinedParseChart[L](val index: Index[L], def decodedLabelScores(begin: Int, end: Int, label: Int):Counter[Int,Double] = { val scoreArray = score(begin, end) - if(scoreArray == 
null || scoreArray(label) == null) Counter() + if (scoreArray == null || scoreArray(label) == null) Counter() else { val ret = Counter[Int, Double]() for((v,s) <- scoreArray(label).zipWithIndex) { - if(v != zero) + if (v != zero) ret(s) = v } ret } } - private def rawEnter(begin: Int, end: Int, parent: Int, ref: Int, w: Double) = { val arrx = score(begin, end) val arr = arrx(parent) @@ -136,11 +133,10 @@ class RefinedParseChart[L](val index: Index[L], oldScore } - def enter(begin: Int, end: Int, parent: Int, ref: Int, w: Double): Unit = { val oldScore = rawEnter(begin, end, parent, ref, w) - if(oldScore == zero) { + if (oldScore == zero) { val index = TriangularArray.index(begin, end) updateExtents(index, parent, ref, begin, end) } @@ -190,22 +186,19 @@ class RefinedParseChart[L](val index: Index[L], narrowR < end } - - def feasibleSplitPoints(begin: Int, end: Int, b: Int, c: Int) = { val narrowR = coarseLeftMostEndForBegin(begin)(b) val narrowL = coarseRightMostBeginForEnd(end)(c) var split = math.max(narrowR, coarseLeftMostBeginForEnd(end)(c)) val endSplit = math.min(coarseRightMostEndForBegin(begin)(b), narrowL) + 1 val canBuildThisRule = narrowR < end && narrowL >= narrowR && split <= narrowL && split < endSplit - if(!canBuildThisRule) + if (!canBuildThisRule) split = endSplit - Span(split, endSplit) } def feasibleSplitPoints(begin: Int, end: Int, b: Int, refB: Int, c: Int, refC: Int) = { - if(leftMostEndForBegin(begin)(b) == null || rightMostBeginForEnd(end)(c) == null) { + if (leftMostEndForBegin(begin)(b) == null || rightMostBeginForEnd(end)(c) == null) { Span(0,0) } else { val narrowR = leftMostEndForBegin(begin)(b)(refB) @@ -213,9 +206,8 @@ class RefinedParseChart[L](val index: Index[L], var split = math.max(narrowR, leftMostBeginForEnd(end)(c)(refC)) val endSplit = math.min(rightMostEndForBegin(begin)(b)(refB), narrowL) + 1 val canBuildThisRule = narrowR < end && narrowL >= narrowR && split <= narrowL && split < endSplit - if(!canBuildThisRule) + if (!canBuildThisRule) split = endSplit - Span(split, endSplit) } } @@ -228,7 +220,6 @@ class RefinedParseChart[L](val index: Index[L], arr } - private def makeGrammarScoreArray(begin: Int, end: Int): Array[Array[Double]] = { val arr = new Array[Array[Double]](index.size) var l = 0 @@ -245,61 +236,38 @@ class RefinedParseChart[L](val index: Index[L], protected final def zero = Double.NegativeInfinity } - - object RefinedParseChart { def apply[L](g: Index[L], refinements: Array[Int], length: Int, constraints: ChartConstraints[L]): RefinedParseChart[L] = { new RefinedParseChart(g, refinements, length, constraints) } - // all of these methods could be replaced by an Array.fill or Array.tabulate, but // those were showing up in the profile. 
private def mkGrammarVector(grammarSize: Int, fill: Double) = { - val arr = new Array[Double](grammarSize) - Arrays.fill(arr, fill) - arr + Array.fill(grammarSize)(fill) } private def mkBitSetArray(grammarSize: Int) = { - val arr = new Array[collection.mutable.BitSet](grammarSize) - var i = 0 - while (i < arr.length) { - arr(i) = new collection.mutable.BitSet() - i += 1 - } - arr + Array.fill[collection.mutable.BitSet](grammarSize)(collection.mutable.BitSet()) } private def mkRefinementArray(length: Int, grammarSize: Int): Array[Array[BitSet]] = { - val arr = new Array[Array[BitSet]](length) - var i = 0 - while (i < arr.length) { - arr(i) = mkBitSetArray(grammarSize) - i += 1 - } - arr + Array.fill[Array[BitSet]](length)(mkBitSetArray(grammarSize)) } - private def makeRefinedExtentArray(len: Int, refinementsFor: Array[Int], fillValue: Int): Array[Array[Array[Int]]] = { val arr = Array.ofDim[Array[Int]](len, refinementsFor.length) var pos = 0 while (pos < len) { var l = 0 - while(l < refinementsFor.length) { - val myArr = new Array[Int](refinementsFor(l)) - util.Arrays.fill(myArr, fillValue) - arr(pos)(l) = myArr + while (l < refinementsFor.length) { + arr(pos)(l) = Array.fill[Int](refinementsFor(l))(fillValue) l += 1 } - pos += 1 } - - arr } } diff --git a/src/main/scala/epic/parser/RuleFeaturizer.scala b/src/main/scala/epic/parser/RuleFeaturizer.scala index 4cc87d57..2bfa3529 100644 --- a/src/main/scala/epic/parser/RuleFeaturizer.scala +++ b/src/main/scala/epic/parser/RuleFeaturizer.scala @@ -41,6 +41,5 @@ class RuleFeaturizer[L, W](grammar: RuleTopology[L]) extends RefinedFeaturizer[L } } - override def lock = this } diff --git a/src/main/scala/epic/parser/RuleTopology.scala b/src/main/scala/epic/parser/RuleTopology.scala index 91befb7f..e44cdba1 100644 --- a/src/main/scala/epic/parser/RuleTopology.scala +++ b/src/main/scala/epic/parser/RuleTopology.scala @@ -44,12 +44,11 @@ final class RuleTopology[L] private ( val rootIndex = labelIndex(root) - def labelEncoder = Encoder.fromIndex(labelIndex) // Accessors for properties of indexed rules /** Returns the parent label index from the rule index */ - def parent(r: Int):Int = indexedRules(r).parent + def parent(r: Int): Int = indexedRules(r).parent /** Returns the left child label index from the rule index */ def leftChild(r: Int): Int = indexedRules(r).asInstanceOf[BinaryRule[Int]].left /** Returns the right child label index from the rule index */ @@ -85,7 +84,7 @@ final class RuleTopology[L] private ( for( (parent,block) <- blocks) { var first = true for (r <- block) { - if(!first) + if (!first) builder ++= (" "*startLength) else builder ++= labelStrings(parent).padTo(startLength, ' ') @@ -94,7 +93,7 @@ final class RuleTopology[L] private ( r match { case UnaryRule(a, b, chain) => - if(chain.nonEmpty) + if (chain.nonEmpty) chain.addString(builder, "(", "^", ")^") builder ++= labelStrings(b) case BinaryRule(a, b, c) => @@ -120,10 +119,10 @@ final class RuleTopology[L] private ( object RuleTopology { /** Builds a grammar just from some productions */ def apply[L, W](root: L, productions: TraversableOnce[Rule[L]]): RuleTopology[L] = { - val index = Index[L](); + val index = Index[L]() val ruleIndex = Index[Rule[L]]() for(r <- productions) { - index.index(r.parent); + index.index(r.parent) r.children.foreach(index.index(_)) ruleIndex.index(r) } @@ -187,8 +186,6 @@ object RuleTopology { unaryRulesByChild.map(_.toArray)) } - - @SerialVersionUID(1) private class SerializedForm[L](var root: L, var labelIndex: Index[L], var ri: Index[Rule[L]]) 
extends Serializable { @throws(classOf[ObjectStreamException]) diff --git a/src/main/scala/epic/parser/SimpleChartMarginal.scala b/src/main/scala/epic/parser/SimpleChartMarginal.scala index ba2bacb9..f2da72ca 100644 --- a/src/main/scala/epic/parser/SimpleChartMarginal.scala +++ b/src/main/scala/epic/parser/SimpleChartMarginal.scala @@ -26,18 +26,18 @@ final case class SimpleChartMarginal[L, L2, W](anchoring: SimpleGrammar.Anchorin } override def feasibleSplitPoints(begin: Int, end: Int, leftChild: Int, leftChildRef: Int, rightChild: Int, rightChildRef: Int): IndexedSeq[Int] = { - (begin + 1) until (end) + (begin + 1) until end } override def visitPostorder(spanVisitor: AnchoredVisitor[L], spanThreshold: Double): Unit = { - if(logPartition.isInfinite) throw new RuntimeException("No parse for " + words) - if(logPartition.isNaN) throw new RuntimeException("NaN prob!") + if (logPartition.isInfinite) throw new RuntimeException("No parse for " + words) + if (logPartition.isNaN) throw new RuntimeException("NaN prob!") val refinedTopology = anchoring.refinedTopology val lexLoc = anchoring.lexicon.anchor(anchoring.words) // handle lexical - for (i <- 0 until words.length) { + for (i <- words.indices) { var visitedSomething = false for { a <- lexLoc.allowedTags(i) @@ -62,20 +62,20 @@ final case class SimpleChartMarginal[L, L2, W](anchoring: SimpleGrammar.Anchorin val end = begin + span val aOutside = outside.bot(begin, end, parent) val labelMarginal = inside.bot(begin, end, parent) + aOutside - logPartition - if(labelMarginal > spanThreshold) { + if (labelMarginal > spanThreshold) { val aCoarse = anchoring.refinements.labels.project(parent) val aRef = anchoring.refinements.labels.localize(parent) spanVisitor.visitSpan(begin, end, aCoarse, aRef, math.exp(labelMarginal)) - if(!spanVisitor.skipBinaryRules) { + if (!spanVisitor.skipBinaryRules) { val rules = anchoring.refinedTopology.indexedBinaryRulesWithParent(parent) var i = 0 - while(i < rules.length) { + while (i < rules.length) { val r = rules(i) val b = refinedTopology.leftChild(r) val c = refinedTopology.rightChild(r) var split = begin + 1 - while(split < end) { + while (split < end) { val bInside = inside.top.labelScore(begin, split, b) val cInside = inside.top.labelScore(split, end, c) val ruleScore = anchoring.grammar.ruleScore(r) @@ -85,7 +85,7 @@ final case class SimpleChartMarginal[L, L2, W](anchoring: SimpleGrammar.Anchorin val margScore = bInside + cInside + ruleScore + aOutside - logPartition - if(margScore != Double.NegativeInfinity) { + if (margScore != Double.NegativeInfinity) { spanVisitor.visitBinaryRule(begin, split, end, coarseR, refR, math.exp(margScore)) } @@ -100,7 +100,7 @@ final case class SimpleChartMarginal[L, L2, W](anchoring: SimpleGrammar.Anchorin } } - if(!spanVisitor.skipUnaryRules) + if (!spanVisitor.skipUnaryRules) for { span <- 1 to words.length begin <- 0 to (words.length - span) @@ -136,7 +136,7 @@ object SimpleChartMarginal { } def apply[L, L2, W](anchoring: SimpleGrammar.Anchoring[L, L2, W], maxMarginal: Boolean): SimpleChartMarginal[L, L2, W] = { - val sum = if(maxMarginal) MaxSummer else LogSummer + val sum = if (maxMarginal) MaxSummer else LogSummer val inside = buildInsideChart(anchoring, sum) val outside = buildOutsideChart(anchoring, inside, sum) SimpleChartMarginal(anchoring, inside, outside, maxMarginal) @@ -188,24 +188,23 @@ object SimpleChartMarginal { val rdoff = rcell.offset var lc = 0 - while(lc < numSyms) { + while (lc < numSyms) { val lcSpan = tensor.leftChildRange(lc) var rcOff = lcSpan.begin 
val rcEnd = lcSpan.end val bInside = ldata(ldoff + lc) - if(bInside != Double.NegativeInfinity) { - while(rcOff < rcEnd) { + if (bInside != Double.NegativeInfinity) { + while (rcOff < rcEnd) { val rc = tensor.rightChildForOffset(rcOff) val cInside = rdata(rdoff + rc) val rcSpan = tensor.rightChildRange(rcOff) val withoutRule = bInside + cInside - - if(cInside != Double.NegativeInfinity) { + if (cInside != Double.NegativeInfinity) { var pOff = rcSpan.begin val pEnd = rcSpan.end - while(pOff < pEnd) { + while (pOff < pEnd) { val p = tensor.parentForOffset(pOff) val score = tensor.ruleScoreForOffset(pOff) + withoutRule pdata(p + pdoff) = sum(pdata(p + pdoff), score) @@ -215,8 +214,6 @@ object SimpleChartMarginal { } - - rcOff += 1 } } @@ -228,7 +225,6 @@ object SimpleChartMarginal { split += 1 } - updateInsideUnaries(chart, anchoring, begin, end, sum) } @@ -246,7 +242,7 @@ object SimpleChartMarginal { val numSyms = tensor.numLeftChildren for { - span <- (length) until 0 by (-1) + span <- length until 0 by (-1) begin <- 0 to (length-span) } { val end = begin + span @@ -257,7 +253,7 @@ object SimpleChartMarginal { val pdoff = pcell.offset var a = 0 - while(a < numSyms) { + while (a < numSyms) { val outsideA = pdata(pdoff + a) if (outsideA != Double.NegativeInfinity) { val pSpan = tensor.leftChildRange(a) @@ -280,23 +276,23 @@ object SimpleChartMarginal { val ordoff = orcell.offset var lcOff = pSpan.begin - while(lcOff < lcEnd) { + while (lcOff < lcEnd) { val lc = tensor.rightChildForOffset(lcOff) val bInside = ldata(ldoff + lc) - if(bInside != Double.NegativeInfinity) { + if (bInside != Double.NegativeInfinity) { val lcSpan = tensor.rightChildRange(lcOff) var rcOff = lcSpan.begin val rcEnd = lcSpan.end - while(rcOff < rcEnd) { + while (rcOff < rcEnd) { val rc = tensor.parentForOffset(rcOff) val score = tensor.ruleScoreForOffset(rcOff) + outsideA val cInside = rdata(rdoff + rc) if (cInside != Double.NegativeInfinity) { oldata(oldoff + lc) = sum(oldata(oldoff + lc), cInside + score) ordata(ordoff + rc) = sum(ordata(ordoff + rc), bInside + score) -// outside.top.enter(begin, split, lc, sum(outside.top.labelScore(begin, split, lc), cInside + score)) -// outside.top.enter(split, end, rc, sum(outside.top.labelScore(split, end, rc), bInside + score)) + // outside.top.enter(begin, split, lc, sum(outside.top.labelScore(begin, split, lc), cInside + score)) + // outside.top.enter(split, end, rc, sum(outside.top.labelScore(split, end, rc), bInside + score)) } rcOff += 1 } @@ -312,7 +308,6 @@ object SimpleChartMarginal { outside } - private def updateInsideUnaries[L, L2, W](chart: SimpleParseChart[L2], anchoring: SimpleGrammar.Anchoring[L, L2, W], begin: Int, end: Int, sum: Summer) = { @@ -320,10 +315,8 @@ object SimpleChartMarginal { val parentCell = chart.top.cell(begin, end) val tensor = anchoring.grammar.insideTensor doMatrixMultiply(childCell, parentCell, tensor, sum) - } - private def doMatrixMultiply[W, L2, L](childCell: DenseVector[Double], parentCell: DenseVector[Double], tensor: SparseRuleTensor[L2], sum: RefinedChartMarginal.Summer) { val numSyms = childCell.size val cdata = childCell.data @@ -345,7 +338,6 @@ object SimpleChartMarginal { aOff += 1 } } - b += 1 } } @@ -357,7 +349,6 @@ object SimpleChartMarginal { val parentCell = outside.top.cell(begin, end) val tensor = anchoring.grammar.outsideTensor doMatrixMultiply(parentCell, childCell, tensor, sum) - } case class SimpleChartFactory[L, L2, W](refinedGrammar: SimpleGrammar[L, L2, W], maxMarginal: Boolean = false) extends 
ParseMarginal.Factory[L, W] { @@ -368,8 +359,6 @@ object SimpleChartMarginal { } - - @SerialVersionUID(1) final class SimpleParseChart[L](val index: Index[L], val length: Int) extends Serializable { @@ -386,7 +375,6 @@ final class SimpleParseChart[L](val index: Index[L], val length: Int) extends Se //scores(::, TriangularArray.index(begin, end)) } - def apply(begin: Int, end: Int, label: L):Double = apply(begin, end, index(label)) def labelScore(begin: Int, end: Int, label: L):Double = apply(begin, end, index(label)) @@ -397,6 +385,3 @@ final class SimpleParseChart[L](val index: Index[L], val length: Int) extends Se } } - - - diff --git a/src/main/scala/epic/parser/SimpleGrammar.scala b/src/main/scala/epic/parser/SimpleGrammar.scala index 61c14123..7ebb56ae 100644 --- a/src/main/scala/epic/parser/SimpleGrammar.scala +++ b/src/main/scala/epic/parser/SimpleGrammar.scala @@ -103,7 +103,6 @@ object SimpleGrammar { } - private def doCloseUnaries(matrix: DenseMatrix[Double], closureType: CloseUnaries.Value, syms: Index[AnnotatedLabel]): immutable.IndexedSeq[(UnaryRule[AnnotatedLabel], Double)] = closureType match { case CloseUnaries.None => val probs = breeze.numerics.log(matrix) @@ -161,13 +160,13 @@ object SimpleGrammar { val binaryIn = Source.fromInputStream(new FileInputStream(prefix+".binary")) for ( line <- binaryIn.getLines()) { val Array(_a,_b,_c, score) = line.split("\\s+") - val a = if(_a.startsWith("ROOT")) "TOP_0" else preprocessSymbol(_a) - val b = if(_b.startsWith("ROOT")) "TOP_0" else preprocessSymbol(_b) - val c = if(_c.startsWith("ROOT")) "TOP_0" else preprocessSymbol(_c) + val a = if (_a.startsWith("ROOT")) "TOP_0" else preprocessSymbol(_a) + val b = if (_b.startsWith("ROOT")) "TOP_0" else preprocessSymbol(_b) + val c = if (_c.startsWith("ROOT")) "TOP_0" else preprocessSymbol(_c) val logScore = math.log(score.toDouble) - if(logScore >= threshold) { + if (logScore >= threshold) { val ruleId = rules.index(BinaryRule(a,b,c).map(AnnotatedLabel(_))) - if(ruleId == ruleScores.length) + if (ruleId == ruleScores.length) ruleScores += logScore syms.index(AnnotatedLabel(a)) syms.index(AnnotatedLabel(b)) @@ -182,10 +181,10 @@ object SimpleGrammar { val unclosedUnaries: DenseMatrix[Double] = DenseMatrix.eye[Double](syms.size) for ( line <- unaryIn.getLines()) { val Array(_a, _b,score) = line.split("\\s+") - val a = if(_a.startsWith("ROOT")) "TOP_0" else preprocessSymbol(_a) - val b = if(_b.startsWith("ROOT")) "TOP_0" else preprocessSymbol(_b) + val a = if (_a.startsWith("ROOT")) "TOP_0" else preprocessSymbol(_a) + val b = if (_b.startsWith("ROOT")) "TOP_0" else preprocessSymbol(_b) val logScore = math.log(score.toDouble) - if(logScore >= threshold) { + if (logScore >= threshold) { val ai = syms(AnnotatedLabel(a)) val bi = syms(AnnotatedLabel(b)) require(ai >= 0 && bi >= 0, a + " " + b + " " + syms) @@ -231,7 +230,6 @@ object SimpleGrammar { scorer) } - def preprocessSymbol(_sym: String): String = { val sym = if (_sym == "ROOT") "TOP" else if (_sym == "PRT|ADVP") "PRT" else _sym sym.replaceAll("ROOT","TOP").replaceAll("PRT\\|ADVP_[0-9]*", "PRT_0") @@ -248,7 +246,6 @@ object SimpleGrammar { case class Anchoring[L, L2, W](grammar: SimpleGrammar[L, L2, W], words: IndexedSeq[W], override val sparsityPattern: ChartConstraints[L]) extends ProjectionsGrammarAnchoring[L, L2, W] { - override def addConstraints(constraints: ChartConstraints[L]): GrammarAnchoring[L, W] = copy(sparsityPattern = sparsityPattern & constraints) def topology = grammar.topology @@ -260,7 +257,7 @@ object SimpleGrammar { 
override def toString() = "SimpleRefinedGrammar.Anchoring(...)" def scoreSpan(begin: Int, end: Int, label: Int, ref: Int) = { - val baseScore = if(begin + 1 == end) { + val baseScore = if (begin + 1 == end) { val fullId = refinements.labels.globalize(label, ref) tagAnchoring.scoreTag(begin, refinements.labels.fineIndex.get(fullId)) } else { @@ -279,5 +276,4 @@ object SimpleGrammar { } - } diff --git a/src/main/scala/epic/parser/SparseRuleTensor.scala b/src/main/scala/epic/parser/SparseRuleTensor.scala index f38cc5ab..3332277a 100644 --- a/src/main/scala/epic/parser/SparseRuleTensor.scala +++ b/src/main/scala/epic/parser/SparseRuleTensor.scala @@ -21,7 +21,7 @@ final class SparseRuleTensor[L] private(val leftChildOffsets: Array[Int], val unaryParentIndicesAndScores: Array[Int], outside: Boolean) extends Serializable { - val numLeftChildren = leftChildOffsets.size - 1 + val numLeftChildren = leftChildOffsets.length - 1 def leftChildRange(lc: Int):Span = Span(leftChildOffsets(lc), leftChildOffsets(lc+1) ) @@ -37,7 +37,6 @@ final class SparseRuleTensor[L] private(val leftChildOffsets: Array[Int], java.lang.Double.longBitsToDouble((first.toLong << 32) | (second.toLong&0xFFFFFFFFL)) } - def unaryChildRange(lc: Int):Span = Span(unaryChildPtrs(lc), unaryChildPtrs(lc+1) ) def unaryParentForOffset(off: Int) = unaryParentIndicesAndScores(off * 3) @@ -57,7 +56,7 @@ final class SparseRuleTensor[L] private(val leftChildOffsets: Array[Int], p = parentForOffset(pOff) score = ruleScoreForOffset(pOff) } yield { - if(outside) + if (outside) BinaryRule(lc, rc, p) -> score else BinaryRule(p, lc, rc) -> score @@ -79,27 +78,26 @@ object SparseRuleTensor { var lastRcOffset = 0 var lastOffset = 0 -// leftChildOffsets += 0 + // leftChildOffsets += 0 for(r <- orderedRuleIndices) { val lc = leftChild(r) var endRightChild = false assert(lastLc <= lc) - while(lastLc != lc) { + while (lastLc != lc) { lastLc += 1 leftChildOffsets += lastRcOffset endRightChild = true } val rc = rightChild(r) - if(endRightChild || rc != lastRc) { + if (endRightChild || rc != lastRc) { rightChildIndicesAndOffsets += rc rightChildIndicesAndOffsets += lastOffset lastRc = rc lastRcOffset += 1 } - val p = parent(r) val rs = grammar.ruleScore(r) val span: Span = new Span(java.lang.Double.doubleToLongBits(rs)) @@ -123,7 +121,7 @@ object SparseRuleTensor { for(r <- unaryRules) { val lc = child(r) assert(lastLc <= lc) - while(lastLc != lc) { + while (lastLc != lc) { unaryChildOffsets += lastOffset lastLc += 1 } @@ -136,7 +134,7 @@ object SparseRuleTensor { unaryParentIndicesAndScores += (p, encodedFirst, encodedSecond) lastOffset += 1 } - while(lastLc <= grammar.refinedTopology.labelIndex.size) { + while (lastLc <= grammar.refinedTopology.labelIndex.size) { lastLc += 1 unaryChildOffsets += lastOffset } @@ -161,27 +159,26 @@ object SparseRuleTensor { var lastRcOffset = 0 var lastOffset = 0 -// leftChildOffsets += 0 + // leftChildOffsets += 0 for(r <- orderedRuleIndices) { val lc = parent(r) var endRightChild = false assert(lastLc <= lc) - while(lastLc != lc) { + while (lastLc != lc) { lastLc += 1 leftChildOffsets += lastRcOffset endRightChild = true } val rc = leftChild(r) - if(endRightChild || rc != lastRc) { + if (endRightChild || rc != lastRc) { rightChildIndicesAndOffsets += rc rightChildIndicesAndOffsets += lastOffset lastRc = rc lastRcOffset += 1 } - val p = rightChild(r) val rs = grammar.ruleScore(r) val span: Span = new Span(java.lang.Double.doubleToLongBits(rs)) @@ -204,7 +201,7 @@ object SparseRuleTensor { lastOffset = 0 for(r <- 
unaryRules) { val lc = parent(r) - while(lastLc != lc) { + while (lastLc != lc) { unaryChildOffsets += lastOffset lastLc += 1 } @@ -217,19 +214,17 @@ object SparseRuleTensor { unaryParentIndicesAndScores += (p, encodedFirst, encodedSecond) lastOffset += 1 } - while(lastLc <= labelIndex.size) { + while (lastLc <= labelIndex.size) { lastLc += 1 unaryChildOffsets += lastOffset } val ret = new SparseRuleTensor[L2](leftChildOffsets.toArray, rightChildIndicesAndOffsets.toArray, parentIndicesAndScores.toArray, unaryChildOffsets.toArray, unaryParentIndicesAndScores.toArray, true) - assert(ret.ruleIterator.map(_._1).toIndexedSeq == orderedRuleIndices.map(indexedRules(_)), s"\n${ret.ruleIterator.map(_._1).toIndexedSeq}\n${orderedRuleIndices.map(indexedRules(_))}") assert(ret.ruleIterator.map(_._2).toIndexedSeq == orderedRuleIndices.map(grammar.ruleScore(_))) ret } - } \ No newline at end of file diff --git a/src/main/scala/epic/parser/TreeMarginal.scala b/src/main/scala/epic/parser/TreeMarginal.scala index f66d084e..ed8090a9 100644 --- a/src/main/scala/epic/parser/TreeMarginal.scala +++ b/src/main/scala/epic/parser/TreeMarginal.scala @@ -38,16 +38,16 @@ case class TreeMarginal[L, W](anchoring: GrammarAnchoring[L, W], case n@NullaryTree( (a, ref), span ) => val aI = topology.labelIndex(a) score += anchoring.scoreSpan(span.begin, span.end, aI, ref) - if(score.isInfinite) throw new Exception(s"Could not score the terminal with tag ${a -> ref} at $span. $words") + if (score.isInfinite) throw new Exception(s"Could not score the terminal with tag ${a -> ref} at $span. $words") case UnaryTree( (a, refA), child@Tree((b, refB), _, _), chain, span) => val r = topology.index(UnaryRule(a, b, chain)) assert(r != -1, "Could not find rule " + UnaryRule(a, b, chain)) val ruleRef = anchoring.ruleRefinementFromRefinements(r, refA, refB) - if(ruleRef < 0) throw new Exception(s"Bad refined rule in gold tree!: ${UnaryRule(a, b, chain)} aRef: $refA bRef: $refB") + if (ruleRef < 0) throw new Exception(s"Bad refined rule in gold tree!: ${UnaryRule(a, b, chain)} aRef: $refA bRef: $refB") score += anchoring.scoreUnaryRule(t.span.begin, t.span.end, r, ruleRef) - if(score.isInfinite) throw new Exception(s"Could not score gold tree!\n Partial Tree: ${t.render(words)}\n Full Tree: ${tree.render(words)}\n ") + if (score.isInfinite) throw new Exception(s"Could not score gold tree!\n Partial Tree: ${t.render(words)}\n Full Tree: ${tree.render(words)}\n ") rec(child) case t@BinaryTree( (a, refA), bt@Tree( (b, refB), _, _), ct@Tree((c, refC), _, _), span) => val aI = topology.labelIndex(a) @@ -55,13 +55,12 @@ case class TreeMarginal[L, W](anchoring: GrammarAnchoring[L, W], val ruleRef = anchoring.ruleRefinementFromRefinements(rule, refA, refB, refC) score += anchoring.scoreSpan(t.span.begin, t.span.end, aI, refA) score += anchoring.scoreBinaryRule(t.span.begin, bt.span.end, t.span.end, rule, ruleRef) - if(score.isInfinite) throw new Exception("Could not score gold tree!" + t.render(words)) + if (score.isInfinite) throw new Exception("Could not score gold tree!" 
+ t.render(words)) rec(bt) rec(ct) } rec(tree) - score } @@ -73,7 +72,7 @@ case class TreeMarginal[L, W](anchoring: GrammarAnchoring[L, W], case t@UnaryTree( (a, refA), Tree((b, refB), _, _), chain, span) => val r = topology.index(UnaryRule(a, b, chain)) val ruleRef = anchoring.ruleRefinementFromRefinements(r, refA, refB) - if(ruleRef < 0) throw new Exception(s"Bad refined rule in gold tree!: ${UnaryRule(a, b, chain)} aRef: $refA bRef: $refB") + if (ruleRef < 0) throw new Exception(s"Bad refined rule in gold tree!: ${UnaryRule(a, b, chain)} aRef: $refA bRef: $refB") visitor.visitUnaryRule(t.span.begin, t.span.end, r, ruleRef, 1.0) case t@BinaryTree( (a, refA), bt@Tree( (b, refB), _, _), Tree((c, refC), _, _), span) => val aI = topology.labelIndex(a) @@ -99,7 +98,6 @@ case class TreeMarginal[L, W](anchoring: GrammarAnchoring[L, W], case Some(UnaryTree(_, Tree(a, _, span2), chain, span)) => logI(a == (sym -> ref)) case _ => Double.NegativeInfinity } - } override def insideTopScore(begin: Int, end: Int, sym: Int, ref: Int): Double = { @@ -107,7 +105,6 @@ case class TreeMarginal[L, W](anchoring: GrammarAnchoring[L, W], case Some(UnaryTree(a, _, chain, span)) => logI(a == (sym -> ref)) case _ => Double.NegativeInfinity } - } def marginalAt(begin: Int, end: Int): Counter2[L, Int, Double] = { diff --git a/src/main/scala/epic/parser/UnrefinedGrammarAnchoring.scala b/src/main/scala/epic/parser/UnrefinedGrammarAnchoring.scala index 95be5b8e..4d66fc5d 100644 --- a/src/main/scala/epic/parser/UnrefinedGrammarAnchoring.scala +++ b/src/main/scala/epic/parser/UnrefinedGrammarAnchoring.scala @@ -63,12 +63,11 @@ trait UnrefinedGrammarAnchoring[L, W] extends GrammarAnchoring[L, W] with Factor override def *(other: UnrefinedGrammarAnchoring[L, W]): UnrefinedGrammarAnchoring[L, W] = { // hacky multimethod dispatch is hacky if (other eq null) this // ugh - else if(other.isInstanceOf[UnrefinedGrammarAnchoring.Identity[L, W]]) this.addConstraints(other.sparsityPattern) - else if(this.isInstanceOf[UnrefinedGrammarAnchoring.Identity[L, W]]) other.addConstraints(this.sparsityPattern) + else if (other.isInstanceOf[UnrefinedGrammarAnchoring.Identity[L, W]]) this.addConstraints(other.sparsityPattern) + else if (this.isInstanceOf[UnrefinedGrammarAnchoring.Identity[L, W]]) other.addConstraints(this.sparsityPattern) else new ProductUnrefinedGrammarAnchoring(this,other) } - /** * The annotationTag controls if two grammars are over the same refinements. * If they are, then * and / can be much faster. 
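
A minimal sketch (hypothetical types, not epic's API) of the invariant the epic/parser/TreeMarginal.scala hunks above enforce: a gold tree's log score is the sum of its span and rule scores, and a -Infinity anywhere means the (refined) grammar cannot produce the annotated tree, hence the eager exceptions.

```scala
// Hypothetical stand-ins for the anchoring's scoreSpan/scoreBinaryRule.
sealed trait T { def label: String }
case class Leaf(label: String) extends T
case class Branch(label: String, left: T, right: T) extends T

def goldLogScore(t: T,
                 spanScore: String => Double,
                 ruleScore: (String, String, String) => Double): Double = t match {
  case Leaf(a) => spanScore(a)
  case Branch(a, l, r) =>
    val s = spanScore(a) + ruleScore(a, l.label, r.label) +
      goldLogScore(l, spanScore, ruleScore) + goldLogScore(r, spanScore, ruleScore)
    // mirrors `if (score.isInfinite) throw new Exception("Could not score gold tree!...")`
    if (s.isInfinite) throw new Exception(s"Could not score gold subtree at $a")
    s
}
```
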
@@ -92,8 +91,8 @@ trait UnrefinedGrammarAnchoring[L, W] extends GrammarAnchoring[L, W] with Factor def /(other: UnrefinedGrammarAnchoring[L, W]) = { // hacky multimethod dispatch is hacky if (other eq null) this // ugh - else if(this eq other) new UnrefinedGrammarAnchoring.Identity[L, W](topology, lexicon, words, this.sparsityPattern) - else if(other.isInstanceOf[UnrefinedGrammarAnchoring.Identity[L, W]]) this.addConstraints(other.sparsityPattern) + else if (this eq other) new UnrefinedGrammarAnchoring.Identity[L, W](topology, lexicon, words, this.sparsityPattern) + else if (other.isInstanceOf[UnrefinedGrammarAnchoring.Identity[L, W]]) this.addConstraints(other.sparsityPattern) else new ProductUnrefinedGrammarAnchoring(this, other, -1) } @@ -195,23 +194,11 @@ object UnrefinedGrammarAnchoring { */ @SerialVersionUID(1L) case class Identity[L, W](topology: RuleTopology[L], lexicon: Lexicon[L, W], words: IndexedSeq[W], sparsityPattern: ChartConstraints[L]) extends UnrefinedGrammarAnchoring[L, W] { - - // def sparsityPattern = ChartConstraints.noSparsity[L] override def addConstraints(cs: ChartConstraints[L]): UnrefinedGrammarAnchoring[L, W] = copy(sparsityPattern = sparsityPattern & cs) - def scoreBinaryRule(begin: Int, split: Int, end: Int, rule: Int) = 0.0 - def scoreUnaryRule(begin: Int, end: Int, rule: Int) = 0.0 - def scoreSpan(begin: Int, end: Int, tag: Int) = 0.0 - - } } - - - - - diff --git a/src/main/scala/epic/parser/kbest/AStarKBestParser.scala b/src/main/scala/epic/parser/kbest/AStarKBestParser.scala index f337270a..1535e05f 100644 --- a/src/main/scala/epic/parser/kbest/AStarKBestParser.scala +++ b/src/main/scala/epic/parser/kbest/AStarKBestParser.scala @@ -4,7 +4,6 @@ package kbest import epic.parser.projections.{AnchoredRuleMarginalProjector, ChartProjector} import epic.trees.BinarizedTree - /** * Uses Top Down KBest A* (as implemented in [[epic.parser.kbest.TopDownKBestAStar]]) to generate * kbest lists. diff --git a/src/main/scala/epic/parser/kbest/KBestListMarginal.scala b/src/main/scala/epic/parser/kbest/KBestListMarginal.scala index 2b585fef..41b3fa6e 100644 --- a/src/main/scala/epic/parser/kbest/KBestListMarginal.scala +++ b/src/main/scala/epic/parser/kbest/KBestListMarginal.scala @@ -19,7 +19,6 @@ import epic.parser._ import breeze.linalg._ import breeze.numerics._ - case class KBestListMarginal[L, W](anchoring: GrammarAnchoring[L, W], marginals: IndexedSeq[ParseMarginal[L, W]]) extends ParseMarginal[L, W] { @@ -41,12 +40,10 @@ case class KBestListMarginal[L, W](anchoring: GrammarAnchoring[L, W], m.visitPostorder(new AnchoredVisitor[L] { def visitUnaryRule(begin: Int, end: Int, rule: Int, ref: Int, score: Double) { spanVisitor.visitUnaryRule(begin, end, rule, ref, score * probsPerTree(i)) - } def visitSpan(begin: Int, end: Int, tag: Int, ref: Int, score: Double) { spanVisitor.visitSpan(begin, end, tag, ref, score * probsPerTree(i)) - } def visitBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int, score: Double) { @@ -56,8 +53,6 @@ case class KBestListMarginal[L, W](anchoring: GrammarAnchoring[L, W], } } - - override def insideBotScore(begin: Int, end: Int, sym: Int, ref: Int): Double = ??? override def insideTopScore(begin: Int, end: Int, sym: Int, ref: Int): Double = ??? 
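
A note on epic/parser/kbest/KBestListMarginal.scala above: each component tree's visitor contributions are scaled by probsPerTree(i). Those weights are presumably the exponentiated, renormalized log scores of the k trees (their definition sits outside these hunks and uses breeze); a plain-Scala sketch of that normalization, max-shifted for numerical stability:

```scala
// Turn per-tree log scores into normalized weights via log-sum-exp.
def treeWeights(logScores: IndexedSeq[Double]): IndexedSeq[Double] = {
  val m = logScores.max
  val logZ = m + math.log(logScores.map(s => math.exp(s - m)).sum)
  logScores.map(s => math.exp(s - logZ))
}

// treeWeights(IndexedSeq(-2.0, -2.7, -4.1)) sums to 1.0,
// with the best parse getting the largest weight.
```
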
diff --git a/src/main/scala/epic/parser/kbest/KBestParseTreebank.scala b/src/main/scala/epic/parser/kbest/KBestParseTreebank.scala index 5d2bcd3c..339de1ec 100644 --- a/src/main/scala/epic/parser/kbest/KBestParseTreebank.scala +++ b/src/main/scala/epic/parser/kbest/KBestParseTreebank.scala @@ -44,7 +44,7 @@ object KBestParseTreebank { def parse(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], out: PrintWriter) = { val parred = trainTrees.par - if(params.threads > 0) + if (params.threads > 0) parred.tasksupport = new ForkJoinTaskSupport(new ForkJoinPool(params.threads)) parred .map(ti => ti.words -> kbest.bestKParses(ti.words, params.k)) diff --git a/src/main/scala/epic/parser/kbest/TopDownKBestAStar.scala b/src/main/scala/epic/parser/kbest/TopDownKBestAStar.scala index 04736739..f9c33275 100644 --- a/src/main/scala/epic/parser/kbest/TopDownKBestAStar.scala +++ b/src/main/scala/epic/parser/kbest/TopDownKBestAStar.scala @@ -13,11 +13,10 @@ object TopDownKBestAStar { def apply[L, W](chart: RefinedChartMarginal[L, W], k: Int):IndexedSeq[(BinarizedTree[L], Double)] = { import chart._ val root = chart.topology.rootIndex - val kbestList = new ArrayBuffer[(BinarizedTree[L], Double)]() val queue = new mutable.PriorityQueue[TKAItem[(Int, Int)]] queue.enqueue(StartItem) - while(!queue.isEmpty && kbestList.size < k) { + while (queue.nonEmpty && kbestList.size < k) { queue.dequeue() match { case StartItem => val begin = 0 @@ -40,7 +39,7 @@ object TopDownKBestAStar { val chain = topology.chain(r) val refB = anchoring.childRefinement(r, refR) val bScore = inside.bot.labelScore(begin, end, b, refB) - if(!bScore.isInfinite) { + if (!bScore.isInfinite) { val rScore = anchoring.scoreUnaryRule(begin, end, r, refR) val newWeight = weight - aScore + bScore + rScore val newParentLabel = (b,refB) @@ -63,7 +62,7 @@ object TopDownKBestAStar { val end = zipper.end val aScore = inside.bot.labelScore(begin, end, root, rootRef) - val traceOn = (begin == 0 && end == 4) + val traceOn = begin == 0 && end == 4 val spanScore = anchoring.scoreSpan(begin, end, root, rootRef) for { r <- topology.indexedBinaryRulesWithParent(root) @@ -83,7 +82,7 @@ object TopDownKBestAStar { ) assert(score <= aScore + 1E-4, score -> aScore) val newWeight = weight - aScore + score - if(!newWeight.isInfinite) { + if (!newWeight.isInfinite) { val newZipper = zipper.copy(BinaryTree(zipper.tree.label, NullaryTree(b -> refB, Span(begin,split)), NullaryTree(c -> refC, Span(split, end)), zipper.tree.span)).down.get @@ -91,10 +90,7 @@ object TopDownKBestAStar { queue += TopItem(newZipper, newWeight) } } - - } - } kbestList } @@ -109,7 +105,6 @@ object TopDownKBestAStar { private case class BotItem[L](zipper: Zipper[L], weight: Double) extends TKAItem[L] private case class CompleteTreeItem[L](tree: BinarizedTree[L], weight: Double) extends TKAItem[L] - } diff --git a/src/main/scala/epic/parser/models/AnnotatedParserInference.scala b/src/main/scala/epic/parser/models/AnnotatedParserInference.scala index 3540fa6b..c32ca88f 100644 --- a/src/main/scala/epic/parser/models/AnnotatedParserInference.scala +++ b/src/main/scala/epic/parser/models/AnnotatedParserInference.scala @@ -36,7 +36,6 @@ case class AnnotatedParserInference[L, W](featurizer: RefinedFeaturizer[L, W, Fe grammar: Grammar[L, W], constrainer: ChartConstraints.Factory[L, W]) extends ParserInference[L, W] { - override def forTesting = copy(featurizer.forTesting, constrainer = ChartConstraints.Factory.noSparsity) def goldMarginal(scorer: Scorer, ti: TreeInstance[L, W], aug: 
UnrefinedGrammarAnchoring[L, W]): Marginal = { @@ -45,5 +44,4 @@ case class AnnotatedParserInference[L, W](featurizer: RefinedFeaturizer[L, W, Fe TreeMarginal(scorer, annotated) } - } diff --git a/src/main/scala/epic/parser/models/EPParserModelFactory.scala b/src/main/scala/epic/parser/models/EPParserModelFactory.scala index 24e5284a..ccd7910c 100644 --- a/src/main/scala/epic/parser/models/EPParserModelFactory.scala +++ b/src/main/scala/epic/parser/models/EPParserModelFactory.scala @@ -44,14 +44,12 @@ case class EPParserModelFactory(ep: EPParams, oldWeights: File = null) extends ParserExtractableModelFactory[AnnotatedLabel, String] { type MyModel = EPParserModel[AnnotatedLabel, String] - override def make(train: IndexedSeq[TreeInstance[AnnotatedLabel, String]], topology: RuleTopology[AnnotatedLabel], lexicon: Lexicon[AnnotatedLabel, String], constrainer: Factory[AnnotatedLabel, String]): MyModel = { type ModelType = EPModel.CompatibleModel[TreeInstance[AnnotatedLabel, String], UnrefinedGrammarAnchoring[AnnotatedLabel, String]] val models = model.filterNot(_ eq null) map { model => model.make(train, topology, lexicon, constrainer): ModelType } val featureCounter = readWeights(oldWeights) - new EPParserModel[AnnotatedLabel, String](topology, lexicon, constrainer, ep.maxIterations, featureCounter.get, false, ep.dropOutFraction)(models:_*) } } diff --git a/src/main/scala/epic/parser/models/FeaturizedLexicon.scala b/src/main/scala/epic/parser/models/FeaturizedLexicon.scala index f4cd26bf..63d99e13 100644 --- a/src/main/scala/epic/parser/models/FeaturizedLexicon.scala +++ b/src/main/scala/epic/parser/models/FeaturizedLexicon.scala @@ -26,7 +26,6 @@ import epic.lexicon.TagScorer class FeaturizedLexicon[L, L2, W](val weights: DenseVector[Double], val featureIndexer: IndexedFeaturizer[L, L2, W]) extends TagScorer[L2, W] { - def anchor(w: IndexedSeq[W]): Anchoring = new Anchoring { val fi = featureIndexer.anchor(w) def words: IndexedSeq[W] = w diff --git a/src/main/scala/epic/parser/models/IndexedFeaturizer.scala b/src/main/scala/epic/parser/models/IndexedFeaturizer.scala index 9c2b088f..41539e9c 100644 --- a/src/main/scala/epic/parser/models/IndexedFeaturizer.scala +++ b/src/main/scala/epic/parser/models/IndexedFeaturizer.scala @@ -38,7 +38,6 @@ class IndexedFeaturizer[L, L2, W](val index: CrossProductIndex[Feature, Feature] indexedProjections: GrammarRefinements[L, L2], ruleCache: Array[Array[Int]]) extends RefinedFeaturizer[L, W, Feature] with Encoder[Feature] with Serializable { outer => - import indexedProjections._ def labelIndex = labels.fineIndex @@ -49,7 +48,6 @@ class IndexedFeaturizer[L, L2, W](val index: CrossProductIndex[Feature, Feature] def anchor(words: IndexedSeq[W]) = new Spec(words) - case class Spec private[IndexedFeaturizer](words: IndexedSeq[W]) extends super.Anchoring { val anch = wGen.anchor(words) @@ -74,12 +72,9 @@ class IndexedFeaturizer[L, L2, W](val index: CrossProductIndex[Feature, Feature] index.crossProduct(feat.featuresForLabel(tag), anch.featuresForWord(pos), usePlainLabelFeatures = false) } - def computeWeight(pos: Int, l: Int, weights: DenseVector[Double]) = new FeatureVector(featuresFor(pos, l)) dot weights } - - } object IndexedFeaturizer { diff --git a/src/main/scala/epic/parser/models/LatentParserModel.scala b/src/main/scala/epic/parser/models/LatentParserModel.scala index 2b57e97c..a0e334b3 100644 --- a/src/main/scala/epic/parser/models/LatentParserModel.scala +++ b/src/main/scala/epic/parser/models/LatentParserModel.scala @@ -102,7 +102,6 @@ You can also 
epic.trees.annotations.KMAnnotator to get more or less Klein and Ma oldWeights: File = null) extends ParserModelFactory[AnnotatedLabel, String] with SafeLogging { type MyModel = LatentParserModel[AnnotatedLabel, (AnnotatedLabel, Int), String] - override def make(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], topology: RuleTopology[AnnotatedLabel], lexicon: Lexicon[AnnotatedLabel, String], constrainer: Factory[AnnotatedLabel, String]): MyModel = { @@ -119,7 +118,7 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma AnnotatedLabel(split(0)) -> split(1).toInt } pairs.toMap + (xbarGrammar.root -> 1) - } else if(splitUselessStates) { + } else if (splitUselessStates) { Map(xbarGrammar.root -> 1) } else { LatentModelFactory.statesToNotSplit.iterator.map(s => AnnotatedLabel(s) -> 1).toMap + (xbarGrammar.root -> 1) diff --git a/src/main/scala/epic/parser/models/LexModel.scala b/src/main/scala/epic/parser/models/LexModel.scala index 1f40590e..b2f7f7fa 100644 --- a/src/main/scala/epic/parser/models/LexModel.scala +++ b/src/main/scala/epic/parser/models/LexModel.scala @@ -103,19 +103,14 @@ class IndexedLexFeaturizer[L, L2, W](grammar: RuleTopology[L], private val unaryOffset = index.componentOffset(2) private val splitOffset = index.componentOffset(3) - override def lock = this - def joinTagRef(head: Int, ref: Int, length: Int) : Int = { head + ref * length } - def anchor(datum: IndexedSeq[W]):Spec = new Spec(datum) - - class Spec(val words: IndexedSeq[W]) extends Anchoring { private val fspec = ruleFeaturizer.anchor(words) private val bilexSpec = bilexFeaturizer.anchor(words) @@ -127,34 +122,34 @@ class IndexedLexFeaturizer[L, L2, W](grammar: RuleTopology[L], def featuresForUnaryRule(begin: Int, end: Int, rule: Int, ref: Int) = { val head = unaryHeadIndex(ref) - if(head < begin || head >= end) throw new RuntimeException(s"Head $head not in bounds for rule $rule in span [$begin, $end)}") + if (head < begin || head >= end) throw new RuntimeException(s"Head $head not in bounds for rule $rule in span [$begin, $end)}") val ruleRef = unaryRuleRefinement(ref) val globalizedRule = refinements.rules.globalize(rule, ruleRef) var rcache = headCache(head) - if(rcache eq null) { + if (rcache eq null) { rcache = new OpenAddressHashArray[Array[Int]](refinements.rules.fineIndex.size) headCache(head) = rcache } var headCached = rcache(globalizedRule) - if(headCached == null) { + if (headCached == null) { val surfFeatures = unarySpec.featuresForWord(head) val rFeatures = fspec.featuresForUnaryRule(begin, end, rule, ruleRef) headCached = unaryFeatureIndex.crossProduct(rFeatures, surfFeatures, unaryOffset) rcache(globalizedRule) = headCached } - if(splitSpanSpec.isEmpty) { + if (splitSpanSpec.isEmpty) { headCached } else { var ucache = unarySpanCache(begin, end) - if(ucache eq null) { + if (ucache eq null) { ucache = new OpenAddressHashArray[Array[Int]](refinements.rules.fineIndex.size) unarySpanCache(begin, end) = ucache } var surfCached = ucache(globalizedRule) - if(surfCached == null) { + if (surfCached == null) { surfCached = splitSpanFeatureIndex.crossProduct(fspec.featuresForUnaryRule(begin, end, rule, ruleRef), getSpanFeatures(begin, end), splitOffset, true) ucache(globalizedRule) = surfCached @@ -168,42 +163,36 @@ class IndexedLexFeaturizer[L, L2, W](grammar: RuleTopology[L], val localTagRef = tagRefinement(ref) val refinedTag = refinements.labels.globalize(tag, localTagRef) val head = headTagIndex(ref) - if(head < begin || head >= end) throw new 
RuntimeException(s"Head $head not in bounds for tag $tag in span [$begin, $end)}") + if (head < begin || head >= end) throw new RuntimeException(s"Head $head not in bounds for tag $tag in span [$begin, $end)}") var rcache = wordCache(head) - if(rcache eq null) { + if (rcache eq null) { rcache = new OpenAddressHashArray[Array[Int]](refinements.labels.fineIndex.size, null:Array[Int], 2) wordCache(head) = rcache } var cache = rcache(refinedTag) - if(cache == null) { + if (cache == null) { cache = wordFeatureIndex.crossProduct(fspec.featuresForSpan(begin, end, tag, localTagRef), wordSpec.featuresForWord(head), offset = wordOffset, usePlainLabelFeatures = false) rcache(refinedTag) = cache } - if(splitSpanSpec.nonEmpty && begin < end - 1) { + if (splitSpanSpec.nonEmpty && begin < end - 1) { var labelCache = spanCache(begin, end) - if(labelCache eq null) { + if (labelCache eq null) { labelCache = new OpenAddressHashArray[Array[Int]](refinements.labels.fineIndex.size) spanCache(begin, end) = labelCache } var lcached = labelCache(refinedTag) - if(lcached == null) { + if (lcached == null) { val spanFeats: Array[Int] = fspec.featuresForSpan(begin, end, tag, localTagRef) lcached = splitSpanFeatureIndex.crossProduct(spanFeats, getSpanFeatures(begin, end), splitOffset, true) labelCache(refinedTag) = lcached } - lcached - - cache = Arrays.concatenate(cache, lcached) } cache } - - - def featuresForBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int) = { val head = headIndex(ref) val dep = depIndex(ref) @@ -214,11 +203,11 @@ class IndexedLexFeaturizer[L, L2, W](grammar: RuleTopology[L], val arrays = new ArrayBuffer[Array[Int]]() - if(useBilexRuleFeatures) { + if (useBilexRuleFeatures) { arrays += featuresForHeadDepRule(begin, split, end, head, dep, rule, ruleRef) } - if(splitSpanSpec.nonEmpty) { + if (splitSpanSpec.nonEmpty) { arrays += featuresForSplitRule(begin, split, end, rule, ruleRef) } @@ -226,7 +215,6 @@ class IndexedLexFeaturizer[L, L2, W](grammar: RuleTopology[L], val refinedTag = refinements.labels.globalize(tag, refinements.parentRefinement(rule, ruleRef)) arrays += featuresForAttach(head, dep, refinedTag) - Arrays.concatenate(arrays:_*) } @@ -245,7 +233,7 @@ class IndexedLexFeaturizer[L, L2, W](grammar: RuleTopology[L], bilexCache(head)(dep) = bilexFeatures } - val fi = Arrays.concatenate(rawLabelFeatures(refinedTag), if(head < dep) rawDirFeatures(0) else rawDirFeatures(1)) + val fi = Arrays.concatenate(rawLabelFeatures(refinedTag), if (head < dep) rawDirFeatures(0) else rawDirFeatures(1)) feats = bilexFeatureIndex.crossProduct(fi, bilexFeatures, offset = bilexOffset, usePlainLabelFeatures = false) cache(refinedTag) = feats @@ -253,17 +241,15 @@ class IndexedLexFeaturizer[L, L2, W](grammar: RuleTopology[L], feats } - def featuresForHeadDepRule(begin: Int, split: Int, end: Int, head: Int, dep: Int, rule: Int, ruleRef: Int): Array[Int] = { var cache = ruleCache(head)(dep) if (cache == null) { cache = new OpenAddressHashArray[Array[Int]](refinements.rules.fineIndex.size, null:Array[Int], 256) ruleCache(head)(dep) = cache } - -// val x = cache.activeSize * 1.0/cache.size -// val y = cache.activeSize * 1.0/cache.data.length -// if(math.random < .01) println(x + " " + y + " " + cache.size) + // val x = cache.activeSize * 1.0/cache.size + // val y = cache.activeSize * 1.0/cache.data.length + // if (math.random < .01) println(x + " " + y + " " + cache.size) var feats = cache(refinements.rules.globalize(rule, ruleRef)) if (feats == null) { var bilexFeatures: Array[Int] = 
bilexCache(head)(dep) @@ -271,18 +257,16 @@ class IndexedLexFeaturizer[L, L2, W](grammar: RuleTopology[L], bilexFeatures = bilexSpec.featuresForAttachment(head, dep) bilexCache(head)(dep) = bilexFeatures } - val fi = fspec.featuresForBinaryRule(begin, split, end, rule, ruleRef) feats = bilexFeatureIndex.crossProduct(fi, bilexFeatures, offset = bilexOffset, usePlainLabelFeatures = true) cache(refinements.rules.globalize(rule, ruleRef)) = feats - } feats } def featuresForSplitRule(begin: Int, split: Int, end: Int, rule: Int, ruleRef: Int): Array[Int] = { val globalizedRule = refinements.rules.globalize(rule, ruleRef) - + var ucache = binaryCache(begin, end) if (ucache eq null) { ucache = new Array[OpenAddressHashArray[Array[Int]]](end - begin) @@ -297,12 +281,12 @@ class IndexedLexFeaturizer[L, L2, W](grammar: RuleTopology[L], var lcached = scache(globalizedRule) if (lcached == null) { -// val spanFeatures = getSpanFeatures(begin, end) -// lcached = splitSpanFeatureIndex.crossProduct(fspec.featuresForBinaryRule(begin, split, end, rule, ruleRef), spanFeatures, splitOffset, true) + // val spanFeatures = getSpanFeatures(begin, end) + // lcached = splitSpanFeatureIndex.crossProduct(fspec.featuresForBinaryRule(begin, split, end, rule, ruleRef), spanFeatures, splitOffset, true) lcached = splitSpanFeatureIndex.crossProduct(fspec.featuresForBinaryRule(begin, split, end, rule, ruleRef), getSplitFeatures(begin, split, end), splitOffset, true) -// if (forSplit.length > 0) -// lcached = Arrays.concatenate(lcached, forSplit) + // if (forSplit.length > 0) + // lcached = Arrays.concatenate(lcached, forSplit) scache(globalizedRule) = lcached } lcached @@ -340,18 +324,17 @@ class IndexedLexFeaturizer[L, L2, W](grammar: RuleTopology[L], private def getSpanFeatures(begin: Int, end: Int):Array[Int] = { var cache = rawSpanCache(begin, end) - if(cache eq null) { + if (cache eq null) { cache = splitSpanSpec.get.featuresForSpan(begin, end) rawSpanCache(begin, end) = cache } cache } - private def getSplitFeatures(begin: Int, split: Int, end: Int):Array[Int] = { var cache = rawSplitCache(begin, end) - if(cache eq null) { + if (cache eq null) { cache = new Array[Array[Int]](end- begin) rawSplitCache(begin, end) = cache } @@ -415,7 +398,7 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L], // binaryRule is (head * words.length + dep) // unaryRule is (head) // parent/leftchild/rightchild is (head) - final case class Spec(val words: IndexedSeq[W], val sparsityPattern: ChartConstraints[L]) extends GrammarAnchoring[L, W] { + final case class Spec(words: IndexedSeq[W], sparsityPattern: ChartConstraints[L]) extends GrammarAnchoring[L, W] { override def addConstraints(constraints: ChartConstraints[L]): GrammarAnchoring[L, W] = copy(sparsityPattern = sparsityPattern & constraints) override def annotationTag: Int = 1 @@ -427,7 +410,7 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L], private def dot(features: Array[Int]) = { var i = 0 var score = 0.0 - while(i < features.length) { + while (i < features.length) { score += weights(features(i)) i += 1 } @@ -442,7 +425,6 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L], dot(f.featuresForUnaryRule(begin, end, rule, ref)) } - val attachCache = Array.ofDim[OpenAddressHashArray[Double]](words.length, words.length) val ruleCache = new TriangularArray[Array[OpenAddressHashArray[Double]]](words.length + 1) def scoreBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int): Double = { @@ -462,13 +444,13 @@ final class LexGrammar[L, 
L2, W](val topology: RuleTopology[L],
       val ruleRef = this.binaryRuleRef(ref)
       val refinedTag = refinements.labels.globalize(tag, refinements.parentRefinement(rule, ruleRef))
       var attachScore = cache(refinedTag)
-      if(java.lang.Double.isNaN(attachScore)) {
+      if (java.lang.Double.isNaN(attachScore)) {
         attachScore = dot(f.featuresForAttach(head, dep, refinedTag))
         cache(refinedTag) = attachScore
       }
       score += attachScore
-      if(f.splitSpanSpec.nonEmpty) {
+      if (f.splitSpanSpec.nonEmpty) {
         var ucache = ruleCache(begin, end)
         if (ucache eq null) {
           ucache = new Array[OpenAddressHashArray[Double]](end - begin)
@@ -492,7 +474,7 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L],
         score += lcached
       }
-      if(featurizer.useBilexRuleFeatures) {
+      if (featurizer.useBilexRuleFeatures) {
         score += dot(f.featuresForHeadDepRule(begin, split, end, head, dep, rule, ruleRef))
       }
@@ -512,7 +494,6 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L],
       head + ref * words.length * words.length
     }
-
     def joinUnaryRuleRef(head: Int, ref: Int) : Int = {
       head + ref * words.length
     }
@@ -529,13 +510,12 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L],
       epic.util.Arrays.crossProduct(lexRefs, ruleRefs, words.length)
     }
-
     def validLabelRefinements(begin: Int, end: Int, label: Int) = joinTagRefs(Array.range(begin,end), refinements.labels.localRefinements(label))
     def numValidRefinements(label: Int) = joinTagRef(words.length, refinements.labels.numRefinements(label))
     def numValidRuleRefinements(rule: Int): Int = {
-      if(binaries(rule)) {
+      if (binaries(rule)) {
        joinBinaryRuleRef(words.length * words.length, refinements.rules.numRefinements(rule))
      } else {
        joinUnaryRuleRef(words.length, refinements.rules.numRefinements(rule))
@@ -543,36 +523,19 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L],
     }
     def validRuleRefinementsGivenParent(begin: Int, end: Int, rule: Int, parentRef: Int) = {
-      if(!binaries(rule)) {
+      if (!binaries(rule)) {
        val lexicalizedRefinements = Array(unaryHeadIndex(parentRef))
        val ruleRefs = refinements.ruleRefinementsCompatibleWithParentRef(rule, tagRef(parentRef))
        joinUnaryRuleRefs(lexicalizedRefinements, ruleRefs)
      } else {
-        val lexicalizedRefinements = if(isHeadOnLeftForRule(rule)) {
+        val lexicalizedRefinements = if (isHeadOnLeftForRule(rule)) {
          val head = unaryHeadIndex(parentRef)
          // val x = Array.range(0,numValidRuleRefinements(rule)).filter(x => leftChildRefinement(rule,x) == parentRef && rightChildRefinement(rule, x) > parentRef && rightChildRefinement(rule, x) < end)
-          val result = new Array[Int](end - (head+1))
-          var ref = head * words.length + head + 1
-          var i = 0
-          while(i < result.length) {
-            result(i) = ref
-            ref += 1
-            i += 1
-          }
-          result
+          Array.tabulate[Int](end - (head + 1)) { i => head * words.length + head + 1 + i }
        } else {
          val head = unaryHeadIndex(parentRef)
          // val x = Array.range(0,numValidRuleRefinements(rule)).filter(x => rightChildRefinement(rule,x) == parentRef && leftChildRefinement(rule, x) < parentRef && leftChildRefinement(rule, x) >= begin)
-          val result = new Array[Int](head - begin)
-          var ref = head * words.length + begin
-          var i = 0
-          while(i < result.length) {
-            result(i) = ref
-            i += 1
-            ref += 1
-          }
-          // assert(x.toSet == result.toSet)
-          result
+          Array.tabulate[Int](head - begin) { i => head * words.length + begin + i }
        }
        val ruleRefs = refinements.ruleRefinementsCompatibleWithParentRef(rule, tagRef(parentRef))
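
Every while-loop removed in this hunk fills `result(i) = base + i` for i in [0, n); `Array.tabulate(n)(f)` is the standard-library form of exactly that pattern. A quick sanity check with made-up values (`length`, `head`, and `end` here are hypothetical, not taken from the code above):

```scala
val length = 5 // sentence length, i.e. words.length
val head = 2   // head position
val end = 5
val refs = Array.tabulate[Int](end - (head + 1)) { i => head * length + head + 1 + i }
assert(refs.sameElements(Array(13, 14))) // packed refs for dependents at positions 3 and 4
```
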
@@ -581,15 +544,14 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L],
    }
-
    override def validRuleRefinementsGivenParent(begin: Int, splitBegin: Int, splitEnd: Int, end: Int, rule: Int, parentRef: Int): Array[Int] = {
-      if(!binaries(rule)) {
+      if (!binaries(rule)) {
        val lexicalizedRefinements = Array(parentRef:Int)
        val ruleRefs = refinements.ruleRefinementsCompatibleWithParentRef(rule, tagRef(parentRef))
        joinUnaryRuleRefs(lexicalizedRefinements, ruleRefs)
      } else {
        val headIndex = unaryHeadIndex(parentRef)
-        val lexicalizedRefinements = if(isHeadOnLeftForRule(rule)) {
+        val lexicalizedRefinements = if (isHeadOnLeftForRule(rule)) {
          // if the head is on the left, then the dependent
          // can be in Span(math.max(splitBegin, ref1+1), end).
          // Further, if the ref1 is <= splitEnd, then
@@ -598,17 +560,9 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L],
          //  ^------ref1------^
          // max:      ^------^----dep---------^
          //
-          if(splitEnd <= headIndex) return Array.empty
+          if (splitEnd <= headIndex) return Array.empty
          val firstPossibleStart = math.max(headIndex +1, splitBegin)
-          val result = new Array[Int](end - firstPossibleStart)
-          var ref = headIndex * words.length + firstPossibleStart
-          var i = 0
-          while(i < result.length) {
-            result(i) = ref
-            ref += 1
-            i += 1
-          }
-          result
+          Array.tabulate[Int](end - firstPossibleStart)(i => headIndex * words.length + firstPossibleStart + i)
        } else {
          // if the head is on the right, then the dependent
          // can be in (begin until math.min(splitEnd,ref1))
@@ -618,20 +572,12 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L],
          //      ^--------ref1------^
          // ^-----------dep---^-----^ : min
          //
-          if(splitBegin >= headIndex) return Array.empty
+          if (splitBegin >= headIndex) return Array.empty
          val lastPossibleEnd = math.min(headIndex, splitEnd)
-          val result = new Array[Int](lastPossibleEnd - begin)
-          var ref = headIndex * words.length + begin
-          var i = 0
-          while(i < result.length) {
-            result(i) = ref
-            i += 1
-            ref += 1
-          }
-          result
+          Array.tabulate[Int](lastPossibleEnd - begin)(i => headIndex * words.length + begin + i)
        }
-        if(lexicalizedRefinements.isEmpty) {
+        if (lexicalizedRefinements.isEmpty) {
          lexicalizedRefinements
        } else {
          val ruleRefs = refinements.ruleRefinementsCompatibleWithParentRef(rule, tagRef(parentRef))
@@ -642,57 +588,21 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L],
      }
    }
    def validRuleRefinementsGivenLeftChild(begin: Int, split: Int, completionBegin:Int, completionEnd: Int, rule: Int, lcRef: Int) = {
-      val lexicalizedRefinements = if(isHeadOnLeftForRule(rule)) {
-        val result = new Array[Int](completionEnd - split)
-        val lc = unaryHeadIndex(lcRef)
-        var ref = lc * words.length + split
-        var i = 0
-        while(i < result.length) {
-          result(i) = ref
-          ref += 1
-          i += 1
-        }
-        result
-      } else {
-        val lc = unaryHeadIndex(lcRef)
-        val result = new Array[Int](completionEnd - split)
-        var ref = split * words.length + lc
-        var i = 0
-        while(i < result.length) {
-          result(i) = ref
-          i += 1
-          ref += words.length
-        }
-        result
-      }
+      val lc = unaryHeadIndex(lcRef)
+      val lexicalizedRefinements = if (isHeadOnLeftForRule(rule))
+        Array.tabulate[Int](completionEnd - split)(i => lc * words.length + split + i)
+      else
+        Array.tabulate[Int](completionEnd - split)(i => (split + i) * words.length + lc)
      val ruleRefs = refinements.ruleRefinementsCompatibleWithLeftRef(rule, tagRef(lcRef))
      joinBinaryRuleRefs(lexicalizedRefinements, ruleRefs)
    }
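
For reference, the encoding these enumeration methods produce: a binary refinement packs (head, dep) as head * words.length + dep (see ruleRefinementFromRefinements below, which builds hA * words.length + hC), so each branch above fixes one side at the left child's head `lc` and lets the other range over the allowed positions. A minimal round-trip sketch, with `length` standing in for words.length:

```scala
def joinHeadDep(head: Int, dep: Int, length: Int): Int = head * length + dep
def headOf(code: Int, length: Int): Int = code / length
def depOf(code: Int, length: Int): Int = code % length

// round trip for a hypothetical length-7 sentence:
assert(joinHeadDep(3, 5, 7) == 26)
assert(headOf(26, 7) == 3 && depOf(26, 7) == 5)
```
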
-
    def validRuleRefinementsGivenRightChild(completionBegin: Int, completionEnd: Int, split: Int, end: Int, rule: Int, rcRef: Int): Array[Int] = {
      val rc = unaryHeadIndex(rcRef)
-      val lexicalizedRefinements = if(!isHeadOnLeftForRule(rule)) {
-        val result = new Array[Int](split - completionBegin)
-        var ref = rc * words.length + completionBegin
-        var i = 0
-        while(i < result.length) {
-          result(i) = ref
-          ref += 1
-          i += 1
-        }
-        result
-      } else {
-        val result = new Array[Int](split - completionBegin)
-        var ref = completionBegin * words.length + rc
-        var i = 0
-        while(i < result.length) {
-          result(i) = ref
-          i += 1
-          ref += words.length
-        }
-        result
-      }
+      val lexicalizedRefinements = if (!isHeadOnLeftForRule(rule))
+        Array.tabulate[Int](split - completionBegin)(i => rc * words.length + completionBegin + i)
+      else
+        Array.tabulate[Int](split - completionBegin)(i => (completionBegin + i) * words.length + rc)
      val ruleRefs = refinements.ruleRefinementsCompatibleWithRightRef(rule, tagRef(rcRef))
      joinBinaryRuleRefs(lexicalizedRefinements, ruleRefs)
    }
@@ -703,45 +613,39 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L],
      joinUnaryRuleRefs(lexicalizedRefinements, ruleRefs)
    }
-
    def leftChildRefinement(rule: Int, ruleRef: Int) = {
-      val word = if(isHeadOnLeftForRule(rule)) {
+      val word = if (isHeadOnLeftForRule(rule)) {
        headIndex(ruleRef)
      } else {
        depIndex(ruleRef)
      }
-
      val refinedRuleId = refinements.rules.globalize(rule, binaryRuleRef(ruleRef))
      val tagref = refinements.labels.localize(refinedGrammar.leftChild(refinedRuleId))
-
      joinTagRef(word, tagref)
    }
    def rightChildRefinement(rule: Int, ruleRef: Int) = {
-      val word = if(isHeadOnRightForRule(rule)) {
+      val word = if (isHeadOnRightForRule(rule)) {
        headIndex(ruleRef)
      } else {
        depIndex(ruleRef)
      }
-
      val refinedRuleId = refinements.rules.globalize(rule, binaryRuleRef(ruleRef))
      val tagref = refinements.labels.localize(refinedGrammar.rightChild(refinedRuleId))
      joinTagRef(word, tagref)
    }
    def parentRefinement(rule: Int, ruleRef: Int) = {
-      val word = if(binaries(rule)) {
+      val word = if (binaries(rule)) {
        headIndex(ruleRef)
      } else {
        unaryHeadIndex(ruleRef)
      }
-
-      val rr = if(binaries(rule)) {
+      val rr = if (binaries(rule)) {
        binaryRuleRef(ruleRef)
      } else {
        unaryRuleRef(ruleRef)
      }
-
      val refinedRuleId = refinements.rules.globalize(rule, rr)
      val tagref = refinements.labels.localize(refinedGrammar.parent(refinedRuleId))
      joinTagRef(word, tagref)
@@ -749,7 +653,6 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L],
    def childRefinement(rule: Int, ruleRef: Int) = {
      val word = unaryHeadIndex(ruleRef)
-
      val refinedRuleId = refinements.rules.globalize(rule, unaryRuleRef(ruleRef))
      val tagref = refinements.labels.localize(refinedGrammar.child(refinedRuleId))
      joinTagRef(word, tagref)
@@ -767,12 +670,11 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L],
      val b2 = refinements.labels.globalize(b, labelB)
      val rule = UnaryRule(refinements.labels.fineIndex.get(a2), refinements.labels.fineIndex.get(b2), topology.chain(r))
      val refinedRuleIndex = refinements.rules.fineIndex(rule)
-      val refR = if(refinedRuleIndex < 0) {
+      val refR = if (refinedRuleIndex < 0) {
        -1
      } else {
        refinements.rules.localize(refinedRuleIndex)
      }
-
      joinUnaryRuleRef(hA, refR)
    }
@@ -780,19 +682,16 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L],
      val hA = unaryHeadIndex(refA)
      val hB = unaryHeadIndex(refB)
      val hC = unaryHeadIndex(refC)
-
-      val lexRef = if(isHeadOnLeftForRule(r)) {
+      val lexRef = if (isHeadOnLeftForRule(r)) {
        require(hA == hB)
        hA * words.length + hC
      } else {
        require(hA == hC)
        hA * words.length + hB
      }
-
      val labelA = tagRef(refA)
      val labelB = tagRef(refB)
      val labelC = tagRef(refC)
-
      val a = topology.parent(r)
      val b = topology.leftChild(r)
      val c
= topology.rightChild(r) @@ -802,8 +701,7 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L], val refR = refinements.rules.localize(refinements.rules.fineIndex(BinaryRule(refinements.labels.fineIndex.get(a2), refinements.labels.fineIndex.get(b2), refinements.labels.fineIndex.get(c2) - )) ) - + ))) assert(headIndex(lexRef) == hA) joinBinaryRuleRef(lexRef, refR) } @@ -817,11 +715,9 @@ final class LexGrammar[L, L2, W](val topology: RuleTopology[L], if (isHeadOnLeftForRule(rule)) Array.range(begin, splitEnd) else Array.range(splitBegin, end) } - joinTagRefs(lexRefs, refinements.parentRefinementsCompatibleWithRule(rule)) } - def validLeftChildRefinementsGivenRule(begin: Int, splitBegin: Int, splitEnd: Int, end: Int, rule: Int): Array[Int] = { val lexRefs = Array.range(begin, splitEnd) joinTagRefs(lexRefs, refinements.leftChildRefinementsCompatibleWithRule(rule)) @@ -847,7 +743,7 @@ case class LexGrammarBundle[L, L2, W](topology: RuleTopology[L], for( (rule@BinaryRule(a, b,c), r) <- bg.index.iterator.zipWithIndex) { binaries(r) = true val headChild = headFinder.findHeadChild(rule) - if(headChild == 0) { + if (headChild == 0) { leftRules(r) = true } else { rightRules(r) = true @@ -897,7 +793,7 @@ object IndexedLexFeaturizer extends LazyLogging { val words = hasWords.get(ti) val tree = ann(hasTree.get(ti), words) // returns head - def rec(t: BinarizedTree[L2]):Int= t match { + def rec(t: BinarizedTree[L2]): Int= t match { case NullaryTree(a, span) => val (ai, aref) = refinements.labels.indexAndLocalize(a) wordBuilder.add(ruleSpec.featuresForSpan(span.begin, span.end, ai, aref), @@ -909,14 +805,14 @@ object IndexedLexFeaturizer extends LazyLogging { val (ri, rref) = refinements.rules.indexAndLocalize(r) unaryBuilder.add(ruleSpec.featuresForUnaryRule(span.begin, span.end, ri, rref), unarySpec.featuresForWord(head)) - if(splitSpanSpec.nonEmpty) + if (splitSpanSpec.nonEmpty) splitBuilder.add(ruleSpec.featuresForUnaryRule(span.begin, span.end, ri, rref), splitSpanSpec.get.featuresForSpan(span.begin, span.end)) head case t@BinaryTree(a, b, c, span) => val (leftHead,rightHead) = (rec(t.leftChild), rec(t.rightChild)) val headIsLeft = headFinder.findHeadChild(t) == 0 - val (head, dep) = if(headIsLeft) leftHead -> rightHead else rightHead -> leftHead + val (head, dep) = if (headIsLeft) leftHead -> rightHead else rightHead -> leftHead val r = BinaryRule[L2](a, b.label, c.label) val (ri, rref) = refinements.rules.indexAndLocalize(r) val bilexFeatures = bilexSpec.featuresForAttachment(head, dep) @@ -926,18 +822,18 @@ object IndexedLexFeaturizer extends LazyLogging { wordSpec.featuresForWord(head)) val aglob = refinements.labels.fineIndex(a) - if(useBilexRuleFeatures) + if (useBilexRuleFeatures) bilexBuilder.add(ruleSpec.featuresForBinaryRule(span.begin, split, span.end, ri, rref), bilexFeatures) bilexBuilder.add(labelFeatures(aglob), bilexFeatures) - bilexBuilder.add(attachFeatures(if(headIsLeft) 0 else 1), bilexFeatures) + bilexBuilder.add(attachFeatures(if (headIsLeft) 0 else 1), bilexFeatures) - if(splitSpanFeaturizer.nonEmpty) splitBuilder.add(ruleSpec.featuresForBinaryRule(span.begin, t.splitPoint, span.end, ri, rref), + if (splitSpanFeaturizer.nonEmpty) splitBuilder.add(ruleSpec.featuresForBinaryRule(span.begin, t.splitPoint, span.end, ri, rref), splitSpanSpec.get.featuresForSpan(span.begin, span.end)) - if(splitSpanFeaturizer.nonEmpty) splitBuilder.add(ruleSpec.featuresForBinaryRule(span.begin, t.splitPoint, span.end, ri, rref), + if (splitSpanFeaturizer.nonEmpty) 
splitBuilder.add(ruleSpec.featuresForBinaryRule(span.begin, t.splitPoint, span.end, ri, rref), splitSpanSpec.get.featuresForSplit(span.begin, t.splitPoint, span.end)) - if(splitSpanFeaturizer.nonEmpty) splitBuilder.add(ruleSpec.featuresForSpan(span.begin, span.end, ai, aref), + if (splitSpanFeaturizer.nonEmpty) splitBuilder.add(ruleSpec.featuresForSpan(span.begin, span.end, ai, aref), splitSpanSpec.get.featuresForSpan(span.begin, span.end)) head } @@ -951,7 +847,6 @@ object IndexedLexFeaturizer extends LazyLogging { val ufi = unaryBuilder.result() val sfi = splitBuilder.result() - new IndexedLexFeaturizer(ruleFeaturizer.topology, labelFeatures, attachFeatures, @@ -978,7 +873,6 @@ case class LexModelFactory(@Help(text= "The kind of annotation to do on the refi useBilexRuleFeatures: Boolean = true) extends ParserModelFactory[AnnotatedLabel, String] with SafeLogging { type MyModel = LexModel[AnnotatedLabel, AnnotatedLabel, String] - override def make(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], topology: RuleTopology[AnnotatedLabel], lexicon: Lexicon[AnnotatedLabel, String], constrainer: Factory[AnnotatedLabel, String]): MyModel = { @@ -1002,22 +896,21 @@ case class LexModelFactory(@Help(text= "The kind of annotation to do on the refi + offsets(dep) ) - bilexF = bilexF (wf, lfsuf + offsets, bilexF) } - val spanFeaturizer = if(!useSpanFeatures) { + val spanFeaturizer = if (!useSpanFeatures) { new ZeroSplitSpanFeaturizer[String] } else { val dsl = new WordFeaturizer.DSL(initLexicon) with SurfaceFeaturizer.DSL with SplitSpanFeaturizer.DSL import dsl._ // class(split + 1) - val baseCat = (lfsuf) + val baseCat = lfsuf - val leftOfSplit = ((baseCat)(-1)apply (split)) + val leftOfSplit = baseCat(-1)apply split var featurizer: SplitSpanFeaturizer[String] = zeroSplit[String] // if (useFirstLast) { @@ -1029,20 +922,20 @@ case class LexModelFactory(@Help(text= "The kind of annotation to do on the refi featurizer += baseCat(end) // } -// if(useSplits) { +// if (useSplits) { featurizer += leftOfSplit featurizer += baseCat(split) // } -// if(useSpanLength) { +// if (useSpanLength) { featurizer += length // } -// if(useShape) { +// if (useShape) { featurizer += spanShape // } -// if(useBinaryLengths) { +// if (useBinaryLengths) { featurizer += distance[String](begin, split) featurizer += distance[String](split, end) // } @@ -1060,19 +953,17 @@ case class LexModelFactory(@Help(text= "The kind of annotation to do on the refi } val indexedSplitSpanFeaturizer = { - if(useSpanFeatures) + if (useSpanFeatures) Some(IndexedSplitSpanFeaturizer.fromData(spanFeaturizer, trees)) else None } - - def labelFeaturizer(l: AnnotatedLabel) = Set(l, l.baseAnnotatedLabel).toSeq def ruleFeaturizer(r: Rule[AnnotatedLabel]) = r match { case r@BinaryRule(a,b,c) => val headIsLeft = headFinder.findHeadChild(r) == 0 - val dir = if(headIsLeft) AttachLeft else AttachRight + val dir = if (headIsLeft) AttachLeft else AttachRight Set(r, r.map(_.baseAnnotatedLabel), dir).toSeq case r@UnaryRule(a,b,c) => Set(r, r.map(_.baseAnnotatedLabel)).toSeq diff --git a/src/main/scala/epic/parser/models/NeuralModel.scala b/src/main/scala/epic/parser/models/NeuralModel.scala index 3a22fb77..f2dae6c2 100644 --- a/src/main/scala/epic/parser/models/NeuralModel.scala +++ b/src/main/scala/epic/parser/models/NeuralModel.scala @@ -40,8 +40,6 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma topology: RuleTopology[AnnotatedLabel], lexicon: Lexicon[AnnotatedLabel, String], constrainer: Factory[AnnotatedLabel, 
String]): MyModel = { - - val annTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]] = trainTrees.map(annotator(_)) println("Here's what the annotation looks like on the first few trees") annTrees.slice(0, Math.min(3, annTrees.size)).foreach(tree => println(tree.render(false))) @@ -61,12 +59,10 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma span += new SingleWordSpanFeaturizer[String](wf) - val indexedSurface = IndexedSplitSpanFeaturizer.fromData(span, annTrees, bloomFilter = false) - def labelFeaturizer(l: AnnotatedLabel) = Set(l, l.baseAnnotatedLabel).toSeq - def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if(r.isInstanceOf[UnaryRule[AnnotatedLabel]]) Set(r.parent, r.parent.baseAnnotatedLabel).toSeq else Seq.empty + def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if (r.isInstanceOf[UnaryRule[AnnotatedLabel]]) Set(r.parent, r.parent.baseAnnotatedLabel).toSeq else Seq.empty val featurizer = new ProductionFeaturizer[AnnotatedLabel, AnnotatedLabel, String](xbarGrammar, indexedRefinements, lGen=labelFeaturizer, @@ -75,15 +71,18 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma val transform = new AffineTransform( featurizer.index.size, numOutputs, - new TanhTransform(new AffineTransform(numOutputs, numHidden, - new TanhTransform[FeatureVector](numHidden, indexedSurface.featureIndex.size, true)))) - + new TanhTransform( + new AffineTransform(numOutputs, numHidden, + new TanhTransform[FeatureVector](numHidden, indexedSurface.featureIndex.size, true))) + ) - new TransformModel(annotator.latent, + new TransformModel( + annotator.latent, constrainer, topology, lexicon, refGrammar, indexedRefinements, featurizer, indexedSurface, - transform) + transform + ) } } \ No newline at end of file diff --git a/src/main/scala/epic/parser/models/NeuralParserTrainer.scala b/src/main/scala/epic/parser/models/NeuralParserTrainer.scala index bcb07011..e6f59e1e 100644 --- a/src/main/scala/epic/parser/models/NeuralParserTrainer.scala +++ b/src/main/scala/epic/parser/models/NeuralParserTrainer.scala @@ -82,7 +82,7 @@ object NeuralParserTrainer extends epic.parser.ParserPipeline with LazyLogging { import params._ import extraPTParams._ -// if(threads >= 1) +// if (threads >= 1) // collection.parallel.ForkJoinTasks.defaultForkJoinPool.setParallelism(params.threads) val initialParser = params.parser match { @@ -107,14 +107,14 @@ object NeuralParserTrainer extends epic.parser.ParserPipeline with LazyLogging { var theTrees = trainTrees.toIndexedSeq.filterNot(sentTooLong(_, params.maxParseLength)) - if(useConstraints && enforceReachability) { + if (useConstraints && enforceReachability) { val treebankGrammar = GenerativeParser.annotated(initialParser.topology, initialParser.lexicon, TreeAnnotator.identity, trainTrees) val markovizedGrammar = GenerativeParser.annotated(initialParser.topology, initialParser.lexicon, annotator, trainTrees) val proj = new OracleParser(treebankGrammar, markovizedGrammar) theTrees = theTrees.par.map(ti => ti.copy(tree=proj.forTree(ti.tree, ti.words, constraints.constraints(ti.words)))).seq.toIndexedSeq } - val baseMeasure = if(useConstraints) { + val baseMeasure = if (useConstraints) { constraints } else { ChartConstraints.Factory.noSparsity[AnnotatedLabel, String] @@ -126,10 +126,10 @@ object NeuralParserTrainer extends epic.parser.ParserPipeline with LazyLogging { val cachedObj = new CachedBatchDiffFunction(obj) println("Initializing weights custom for model " + model.getClass) val init = 
model.initialWeightVector(initWeightsScale, initializerSpec) - if(checkGradient) { + if (checkGradient) { val cachedObj2 = new CachedBatchDiffFunction(new ModelObjective(model, theTrees.take(opt.batchSize), params.threads)) - val defaultIndices = (0 until 10).map(i => if(i < 0) model.featureIndex.size + i else i) - val indices = if (model.transforms.size > 0) { + val defaultIndices = (0 until 10).map(i => if (i < 0) model.featureIndex.size + i else i) + val indices = if (model.transforms.nonEmpty) { model.transforms(0).getInterestingWeightIndicesForGradientCheck(0) } else { defaultIndices @@ -152,7 +152,6 @@ object NeuralParserTrainer extends epic.parser.ParserPipeline with LazyLogging { } } - val name = Option(params.name).orElse(Option(model.getClass.getSimpleName).filter(_.nonEmpty)).getOrElse("DiscrimParser") val itr: Iterator[FirstOrderMinimizer[DenseVector[Double], BatchDiffFunction[DenseVector[Double]]]#State] = if (determinizeTraining) { val scanningBatchesObj = cachedObj.withScanningBatches(params.opt.batchSize) @@ -208,7 +207,7 @@ object NeuralParserTrainer extends epic.parser.ParserPipeline with LazyLogging { def evaluateNow = { val sentinel = new File("EVALUATE_NOW") - if(sentinel.exists()) { + if (sentinel.exists()) { sentinel.delete() logger.info("Evaluating now!!!!") true diff --git a/src/main/scala/epic/parser/models/ParserExtractable.scala b/src/main/scala/epic/parser/models/ParserExtractable.scala index 3b6517d0..4f8a17f7 100644 --- a/src/main/scala/epic/parser/models/ParserExtractable.scala +++ b/src/main/scala/epic/parser/models/ParserExtractable.scala @@ -36,11 +36,10 @@ trait ParserExtractable[L, W] { def extractParser(weights: DenseVector[Double])(implicit deb: Debinarizer[L]): Parser[L, W] } - trait ParserExtractableModelFactory[L,W] { def make(train: IndexedSeq[TreeInstance[L, W]], topology: RuleTopology[L], lexicon: Lexicon[L, W], constraintsFactory: ChartConstraints.Factory[L, W]): MyModel - def readWeights(in: File):Counter[Feature, Double] = if(in != null && in.exists) { + def readWeights(in: File):Counter[Feature, Double] = if (in != null && in.exists) { try { val ctr = breeze.util.readObject[Counter[Feature, Double]](in) ctr @@ -53,7 +52,6 @@ trait ParserExtractableModelFactory[L,W] { type MyModel <: Model[TreeInstance[L,W]] with ParserExtractable[L,W] - protected def extractBasicCounts[L, W](trees: IndexedSeq[TreeInstance[L, W]]): (Counter2[L, W, Double], Counter2[L, BinaryRule[L], Double], Counter2[L, UnaryRule[L], Double]) = { GenerativeParser.extractCounts(trees) } diff --git a/src/main/scala/epic/parser/models/ParserModel.scala b/src/main/scala/epic/parser/models/ParserModel.scala index 45a9ce62..274ef114 100644 --- a/src/main/scala/epic/parser/models/ParserModel.scala +++ b/src/main/scala/epic/parser/models/ParserModel.scala @@ -35,12 +35,8 @@ trait ParserModel[L, W] extends epic.framework.StandardExpectedCounts.Model[Tree val inf = inferenceFromWeights(weights).forTesting Parser(constrainer, inf.grammar, ChartDecoder[L, W]()) } - - - } - trait ParserInference[L, W] extends ProjectableInference[TreeInstance[L, W], UnrefinedGrammarAnchoring[L, W]] { type ExpectedCounts = StandardExpectedCounts[Feature] type Marginal = epic.parser.ParseMarginal[L, W] @@ -51,12 +47,10 @@ trait ParserInference[L, W] extends ProjectableInference[TreeInstance[L, W], Unr override def forTesting: ParserInference[L, W] = this - def scorer(v: TreeInstance[L, W]): Scorer = { grammar.anchor(v.words, constrainer.constraints(v.words)) } - /** * Produces the "guess marginal" which is 
the marginal conditioned on only the input data * @param v the example @@ -80,7 +74,6 @@ trait ParserInference[L, W] extends ProjectableInference[TreeInstance[L, W], Unr def baseAugment(v: TreeInstance[L, W]) = UnrefinedGrammarAnchoring.identity(grammar.topology, grammar.lexicon, v.words, ChartConstraints.noSparsity) - def project(v: TreeInstance[L, W], s: Scorer, m: Marginal, oldAugment: UnrefinedGrammarAnchoring[L, W]): UnrefinedGrammarAnchoring[L, W] = { projector.project(this, v, m) } @@ -90,7 +83,6 @@ trait ParserInference[L, W] extends ProjectableInference[TreeInstance[L, W], Unr trait ParserModelFactory[L, W] extends ParserExtractableModelFactory[L, W] { type MyModel <: ParserModel[L, W] - } diff --git a/src/main/scala/epic/parser/models/ParserTrainer.scala b/src/main/scala/epic/parser/models/ParserTrainer.scala index 931da01f..303ff4a7 100644 --- a/src/main/scala/epic/parser/models/ParserTrainer.scala +++ b/src/main/scala/epic/parser/models/ParserTrainer.scala @@ -87,7 +87,7 @@ object ParserTrainer extends epic.parser.ParserPipeline with LazyLogging { validate: (Parser[AnnotatedLabel, String]) => Statistics, params: Params) = { import params._ -// if(threads >= 1) +// if (threads >= 1) // collection.parallel.ForkJoinTasks.defaultForkJoinPool.setParallelism(params.threads) val initialParser = params.parser match { @@ -112,14 +112,14 @@ object ParserTrainer extends epic.parser.ParserPipeline with LazyLogging { var theTrees = trainTrees.toIndexedSeq.filterNot(sentTooLong(_, params.maxParseLength)) - if(useConstraints && enforceReachability) { + if (useConstraints && enforceReachability) { val treebankGrammar = GenerativeParser.annotated(initialParser.topology, initialParser.lexicon, TreeAnnotator.identity, trainTrees) val markovizedGrammar = GenerativeParser.annotated(initialParser.topology, initialParser.lexicon, annotator, trainTrees) val proj = new OracleParser(treebankGrammar, markovizedGrammar) theTrees = theTrees.par.map(ti => ti.copy(tree=proj.forTree(ti.tree, ti.words, constraints.constraints(ti.words)))).seq.toIndexedSeq } - val baseMeasure = if(useConstraints) { + val baseMeasure = if (useConstraints) { constraints } else { ChartConstraints.Factory.noSparsity[AnnotatedLabel, String] @@ -129,9 +129,9 @@ object ParserTrainer extends epic.parser.ParserPipeline with LazyLogging { val obj = new ModelObjective(model, theTrees, params.threads) val cachedObj = new CachedBatchDiffFunction(obj) val init = obj.initialWeightVector(randomize) - if(checkGradient) { + if (checkGradient) { val cachedObj2 = new CachedBatchDiffFunction(new ModelObjective(model, theTrees.take(opt.batchSize), params.threads)) - val indices = (0 until 10).map(i => if(i < 0) model.featureIndex.size + i else i) + val indices = (0 until 10).map(i => if (i < 0) model.featureIndex.size + i else i) println("testIndices: " + indices) GradientTester.testIndices(cachedObj2, obj.initialWeightVector(randomize = true), indices, toString={(i: Int) => model.featureIndex.get(i).toString}, skipZeros = true) println("test") @@ -150,7 +150,6 @@ object ParserTrainer extends epic.parser.ParserPipeline with LazyLogging { } } - val name = Option(params.name).orElse(Option(model.getClass.getSimpleName).filter(_.nonEmpty)).getOrElse("DiscrimParser") val itr: Iterator[FirstOrderMinimizer[DenseVector[Double], BatchDiffFunction[DenseVector[Double]]]#State] = if (determinizeTraining) { val scanningBatchesObj = cachedObj.withScanningBatches(params.opt.batchSize) @@ -190,14 +189,13 @@ object ParserTrainer extends epic.parser.ParserPipeline 
with LazyLogging { def evaluateNow = { val sentinel = new File("EVALUATE_NOW") - if(sentinel.exists()) { + if (sentinel.exists()) { sentinel.delete() logger.info("Evaluating now!!!!") true } else { false } - } def computeLL(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], model: Model[TreeInstance[AnnotatedLabel, String]], weights: DenseVector[Double]) { @@ -216,25 +214,18 @@ object ParserTrainer extends epic.parser.ParserPipeline with LazyLogging { } } - object Suffixes extends LazyLogging { + def main(args: Array[String]):Unit = { val tb = CommandLineParser.readIn[ProcessedTreebank](args) - val counts = GenerativeParser.extractCounts(tb.trainTrees)._1 - val marginalized: Counter[String, Double] = sum(counts(::, *)) - val lfs = LongestFrequentSuffixFeaturizer(marginalized) - for(ti <- tb.trainTrees) { val suffixes = lfs.lookupSentence(ti.words) println("original: " + ti.words.mkString(" ")) println("suffixes: " + suffixes.mkString(" ")) } - - } - } diff --git a/src/main/scala/epic/parser/models/PositionalNeuralModel.scala b/src/main/scala/epic/parser/models/PositionalNeuralModel.scala index dc474d8c..7f9ec2a6 100644 --- a/src/main/scala/epic/parser/models/PositionalNeuralModel.scala +++ b/src/main/scala/epic/parser/models/PositionalNeuralModel.scala @@ -42,7 +42,7 @@ class PositionalNeuralModel[L, L2, W](annotator: (BinarizedTree[L], IndexedSeq[W val decoupledTransforms: Seq[OutputTransform[Array[Int],DenseVector[Double]]]) extends ParserModel[L, W] with Serializable { def mergeWeightsForEnsembling(x1: DenseVector[Double], x2: DenseVector[Double]) = { - require(decoupledTransforms.size == 0) + require(decoupledTransforms.isEmpty) require(x1.size == x2.size) // Stack up the dense parts, average the sparse parts if (maybeSparseSurfaceFeaturizer.isDefined) { @@ -55,11 +55,11 @@ class PositionalNeuralModel[L, L2, W](annotator: (BinarizedTree[L], IndexedSeq[W } def cloneModelForEnsembling = { - require(decoupledTransforms.size == 0) + require(decoupledTransforms.isEmpty) // Note that duping the transforms is okay because they still produce distinct // layers, so caching behavior is unaffected - val newTransforms = transforms ++ transforms; - val newDepTransforms = depTransforms ++ depTransforms; + val newTransforms = transforms ++ transforms + val newDepTransforms = depTransforms ++ depTransforms new PositionalNeuralModel(annotator, constrainer, topology, lexicon, refinedTopology, refinements, labelFeaturizer, surfaceFeaturizer, depFeaturizer, newTransforms, maybeSparseSurfaceFeaturizer, newDepTransforms, decoupledTransforms) } @@ -67,7 +67,7 @@ class PositionalNeuralModel[L, L2, W](annotator: (BinarizedTree[L], IndexedSeq[W override type Inference = PositionalNeuralModel.Inference[L, L2, W] override def accumulateCounts(inf: Inference, s: Scorer, d: TreeInstance[L, W], m: Marginal, accum: ExpectedCounts, scale: Double): Unit = { -// println("Extracting ecounts") + // println("Extracting ecounts") inf.grammar.extractEcounts(m, accum.counts, scale) if (maybeSparseSurfaceFeaturizer.isDefined) { @@ -75,10 +75,10 @@ class PositionalNeuralModel[L, L2, W](annotator: (BinarizedTree[L], IndexedSeq[W val innerAccum = StandardExpectedCounts.zero(f.index) m.expectedCounts(maybeSparseSurfaceFeaturizer.get, innerAccum, scale) // val totalTransformSize = transform.index.size - val totalTransformSize = transforms.map(_.index.size).foldLeft(0)(_ + _) + depTransforms.map(_.index.size).foldLeft(0)(_ + _) + decoupledTransforms.map(_.index.size).foldLeft(0)(_ + _) + val totalTransformSize = 
transforms.map(_.index.size).sum + depTransforms.map(_.index.size).sum + decoupledTransforms.map(_.index.size).sum accum.counts += DenseVector.vertcat(DenseVector.zeros[Double](totalTransformSize), innerAccum.counts) } -// println("Ecounts extracted") + // println("Ecounts extracted") accum.loss += scale * m.logPartition } @@ -87,16 +87,16 @@ class PositionalNeuralModel[L, L2, W](annotator: (BinarizedTree[L], IndexedSeq[W * @return */ val index = if (maybeSparseSurfaceFeaturizer.isDefined) { - SegmentedIndex((transforms.map(_.index) ++ depTransforms.map(_.index) ++ decoupledTransforms.map(_.index) ++ IndexedSeq(maybeSparseSurfaceFeaturizer.get.index)):_*) + SegmentedIndex(transforms.map(_.index) ++ depTransforms.map(_.index) ++ decoupledTransforms.map(_.index) ++ IndexedSeq(maybeSparseSurfaceFeaturizer.get.index):_*) } else { - SegmentedIndex((transforms.map(_.index) ++ depTransforms.map(_.index) ++ decoupledTransforms.map(_.index)):_*) + SegmentedIndex(transforms.map(_.index) ++ depTransforms.map(_.index) ++ decoupledTransforms.map(_.index):_*) } def initialWeightVector(initWeightsScale: Double, initializerSpec: String, trulyRandom: Boolean = false): DenseVector[Double] = { val rng = if (trulyRandom) new Random() else new Random(0) - val initTransformWeights = DenseVector.vertcat(transforms.map(_.initialWeightVector(initWeightsScale, rng, true, initializerSpec)):_*); - val initDepWeights = DenseVector.vertcat(depTransforms.map(_.initialWeightVector(initWeightsScale, rng, true, initializerSpec)):_*); - val initDecoupledWeights = DenseVector.vertcat(decoupledTransforms.map(_.initialWeightVector(initWeightsScale, rng, true, initializerSpec)):_*); + val initTransformWeights = DenseVector.vertcat(transforms.map(_.initialWeightVector(initWeightsScale, rng, true, initializerSpec)):_*) + val initDepWeights = DenseVector.vertcat(depTransforms.map(_.initialWeightVector(initWeightsScale, rng, true, initializerSpec)):_*) + val initDecoupledWeights = DenseVector.vertcat(decoupledTransforms.map(_.initialWeightVector(initWeightsScale, rng, true, initializerSpec)):_*) val newInitVector: DenseVector[Double] = if (maybeSparseSurfaceFeaturizer.isDefined) { DenseVector.vertcat(initTransformWeights, initDepWeights, initDecoupledWeights, DenseVector.zeros(maybeSparseSurfaceFeaturizer.get.index.size)) } else { @@ -111,16 +111,16 @@ class PositionalNeuralModel[L, L2, W](annotator: (BinarizedTree[L], IndexedSeq[W override def inferenceFromWeights(weights: DenseVector[Double]): Inference = inferenceFromWeights(weights, true) def inferenceFromWeights(weights: DenseVector[Double], forTrain: Boolean): Inference = { - val layersAndInnerLayers = for (i <- 0 until transforms.size) yield { + val layersAndInnerLayers = transforms.indices.map { i => transforms(i).extractLayerAndPenultimateLayer(weights(index.componentOffset(i) until index.componentOffset(i) + index.indices(i).size), forTrain) } val layers: IndexedSeq[OutputTransform[Array[Int],DenseVector[Double]]#OutputLayer] = layersAndInnerLayers.map(_._1) val innerLayers: IndexedSeq[epic.dense.Transform.Layer[Array[Int],DenseVector[Double]]] = layersAndInnerLayers.map(_._2) - val depLayers: IndexedSeq[OutputTransform[Array[Int],DenseVector[Double]]#OutputLayer] = for (i <- 0 until depTransforms.size) yield { + val depLayers: IndexedSeq[OutputTransform[Array[Int],DenseVector[Double]]#OutputLayer] = depTransforms.indices.map { i => val idxIdx = transforms.size + i depTransforms(i).extractLayer(weights(index.componentOffset(idxIdx) until index.componentOffset(idxIdx) + 
index.indices(idxIdx).size), forTrain) } - val decoupledLayersAndInner = for (i <- 0 until decoupledTransforms.size) yield { + val decoupledLayersAndInner = decoupledTransforms.indices.map { i => val idxIdx = transforms.size + depTransforms.size + i decoupledTransforms(i).extractLayerAndPenultimateLayer(weights(index.componentOffset(idxIdx) until index.componentOffset(idxIdx) + index.indices(idxIdx).size), forTrain) } @@ -136,7 +136,7 @@ class PositionalNeuralModel[L, L2, W](annotator: (BinarizedTree[L], IndexedSeq[W */ def extractParser(weights: DenseVector[Double], trainExs: Seq[TreeInstance[L,W]])(implicit deb: Debinarizer[L]) = { val inf = inferenceFromWeights(weights).forTesting - inf.relativizeToData(trainExs.slice(0, Math.min(trainExs.size, 200)).asInstanceOf[Seq[TreeInstance[AnnotatedLabel,String]]]); + inf.relativizeToData(trainExs.slice(0, Math.min(trainExs.size, 200)).asInstanceOf[Seq[TreeInstance[AnnotatedLabel,String]]]) Parser(constrainer, inf.grammar, ChartDecoder[L, W]()) } @@ -186,16 +186,15 @@ object PositionalNeuralModel { val SpanLayerIdx = 0 val UnaryLayerIdx = 1 val BinaryLayerIdx = 2 - val dcSpanFeatOffset = layers.map(_.index.size).foldLeft(0)(_ + _) + depLayers.map(_.index.size).foldLeft(0)(_ + _) - val dcUnaryFeatOffset = dcSpanFeatOffset + (if (decoupledLayers.size > 0) decoupledLayers(0).index.size else 0) - val dcBinaryFeatOffset = dcUnaryFeatOffset + (if (decoupledLayers.size > 0) decoupledLayers(1).index.size else 0) + val dcSpanFeatOffset = layers.map(_.index.size).sum + depLayers.map(_.index.size).sum + val dcUnaryFeatOffset = dcSpanFeatOffset + (if (decoupledLayers.nonEmpty) decoupledLayers(0).index.size else 0) + val dcBinaryFeatOffset = dcUnaryFeatOffset + (if (decoupledLayers.nonEmpty) decoupledLayers(1).index.size else 0) override def withPermissiveLexicon: Grammar[L, W] = { new PositionalNeuralGrammar(topology, lexicon.morePermissive, refinedTopology, refinements, labelFeaturizer, surfaceFeaturizer, depFeaturizer, layers, penultimateLayers, depLayers, maybeSparseSurfaceFeaturizer, decoupledLayers, penultimateDecoupledLayers, weights, origPTModel) } - /** * N.B. 
does not extract expected counts for sparse features; this is done outside this loop */ @@ -206,7 +205,7 @@ object PositionalNeuralModel { val depSpec = depFeaturizer.anchor(w) val lspec = labelFeaturizer.anchor(w) -// val maxTetraLen = ((w.size + 2) * (w.size + 3) * (w.size + 4))/6 + ((w.size + 1) * (w.size + 2))/2 + w.size + 2 + // val maxTetraLen = ((w.size + 2) * (w.size + 3) * (w.size + 4))/6 + ((w.size + 1) * (w.size + 2))/2 + w.size + 2 def tetra(begin: Int, split: Int, end: Int) = { (end * (end + 1) * (end + 2))/6 + ((split + 1) * split / 2 + begin) } @@ -217,10 +216,10 @@ object PositionalNeuralModel { val unaryRuleCountsPerState = new HashMap[Int,SparseVector[Double]] val binaryRuleCountsPerState = new HashMap[Int,SparseVector[Double]] val spanCountsPerState = new HashMap[Int,SparseVector[Double]] -// val ruleCountsPerState = Array.fill(maxTetraLen)(SparseVector.zeros[Double](labelFeaturizer.index.size)) -// val countsPerHeadDepPair = Array.tabulate(w.size, w.size)((i, j) => 0.0) -// val statesUsed = Array.fill(maxTetraLen)(false) -// val untetra = Array.fill(maxTetraLen)((-1, -1, -1)) + // val ruleCountsPerState = Array.fill(maxTetraLen)(SparseVector.zeros[Double](labelFeaturizer.index.size)) + // val countsPerHeadDepPair = Array.tabulate(w.size, w.size)((i, j) => 0.0) + // val statesUsed = Array.fill(maxTetraLen)(false) + // val untetra = Array.fill(maxTetraLen)((-1, -1, -1)) val untetra = new HashMap[Int,(Int,Int,Int)] m visit new AnchoredVisitor[L] { @@ -231,7 +230,7 @@ object PositionalNeuralModel { val fv = new FeatureVector(lspec.featuresForUnaryRule(begin, end, rule, ref)) if (!ruleCountsPerState.contains(tetraIdx)) ruleCountsPerState.put(tetraIdx, SparseVector.zeros[Double](labelFeaturizer.index.size)) axpy(score, fv, ruleCountsPerState(tetraIdx)) - if (!decoupledLayers.isEmpty) { + if (decoupledLayers.nonEmpty) { if (!unaryRuleCountsPerState.contains(tetraIdx)) unaryRuleCountsPerState.put(tetraIdx, SparseVector.zeros[Double](labelFeaturizer.index.size)) axpy(score, fv, unaryRuleCountsPerState(tetraIdx)) } @@ -243,7 +242,7 @@ object PositionalNeuralModel { val fv = new FeatureVector(lspec.featuresForSpan(begin, end, tag, ref)) if (!ruleCountsPerState.contains(tetraIdx)) ruleCountsPerState.put(tetraIdx, SparseVector.zeros[Double](labelFeaturizer.index.size)) axpy(score, fv, ruleCountsPerState(tetraIdx)) - if (!decoupledLayers.isEmpty) { + if (decoupledLayers.nonEmpty) { if (!spanCountsPerState.contains(tetraIdx)) spanCountsPerState.put(tetraIdx, SparseVector.zeros[Double](labelFeaturizer.index.size)) axpy(score, fv, spanCountsPerState(tetraIdx)) } @@ -255,7 +254,7 @@ object PositionalNeuralModel { val fv = new FeatureVector(lspec.featuresForBinaryRule(begin, split, end, rule, ref)) if (!ruleCountsPerState.contains(tetraIdx)) ruleCountsPerState.put(tetraIdx, SparseVector.zeros[Double](labelFeaturizer.index.size)) axpy(score, fv, ruleCountsPerState(tetraIdx)) - if (!decoupledLayers.isEmpty) { + if (decoupledLayers.nonEmpty) { if (!binaryRuleCountsPerState.contains(tetraIdx)) binaryRuleCountsPerState.put(tetraIdx, SparseVector.zeros[Double](labelFeaturizer.index.size)) axpy(score, fv, binaryRuleCountsPerState(tetraIdx)) } @@ -266,12 +265,12 @@ object PositionalNeuralModel { val (begin, split, end) = untetra(key) val ffeats = if (end > length) sspec.featuresForSpan(begin, split) else sspec.featuresForSplit(begin, split, end) var layerSizeTally = 0 - for (j <- 0 until layers.size) { + layers.indices.foreach { j => layers(j).tallyDerivative(deriv(layerSizeTally until
layerSizeTally + layers(j).index.size), { ruleCountsPerState(key) * scale }, ffeats) - layerSizeTally += layers(j).index.size; + layerSizeTally += layers(j).index.size } } - if (!decoupledLayers.isEmpty) { + if (decoupledLayers.nonEmpty) { for (key <- spanCountsPerState.keySet) { val (begin, end, _) = untetra(key) val ffeats = sspec.reducedFeaturesForSpan(begin, end) @@ -328,23 +327,23 @@ object PositionalNeuralModel { val depSpec = depFeaturizer.anchor(w) val lspec = labelFeaturizer.anchor(w) val fspec = if (maybeSparseSurfaceFeaturizer.isDefined) maybeSparseSurfaceFeaturizer.get.anchor(w) else null - val sparseFeatsStart = if (maybeSparseSurfaceFeaturizer.isDefined) (layers.map(_.index.size).foldLeft(0)(_ + _) + depLayers.map(_.index.size).foldLeft(0)(_ + _) + decoupledLayers.map(_.index.size).foldLeft(0)(_ + _)) else -1 + val sparseFeatsStart = if (maybeSparseSurfaceFeaturizer.isDefined) layers.map(_.index.size).sum + depLayers.map(_.index.size).sum + decoupledLayers.map(_.index.size).sum else -1 private def tetra(begin: Int, split: Int, end: Int) = { (end * (end + 1) * (end + 2))/6 + ((split + 1) * split / 2 + begin) } def scoreBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int) = { - var total = 0.0; + var total = 0.0 val tetraIdx = tetra(begin, split, end) val rfeats = lspec.featuresForBinaryRule(begin, split, end, rule, ref) - for (layerIdx <- 0 until layers.size) { + layers.indices.foreach { layerIdx => val fs = getOrElseUpdate(layerIdx, tetraIdx, { penultimateLayers(layerIdx).activations(sspec.featuresForSplit(begin, split, end)) }) for (rfeat <- rfeats) { total += getOrElseUpdateFinal(layerIdx, tetraIdx, rfeat, labelFeaturizer.index.size, { layers(layerIdx).activationsFromPenultimateDot(fs, rfeat) }) } } - if (!decoupledLayers.isEmpty) { + if (decoupledLayers.nonEmpty) { val layerIdx = layers.size + BinaryLayerIdx val fs = getOrElseUpdate(layerIdx, tetraIdx, { penultimateDecoupledLayers(BinaryLayerIdx).activations(sspec.featuresForSplit(begin, split, end)) }) for (rfeat <- rfeats) { @@ -358,16 +357,16 @@ object PositionalNeuralModel { } def scoreUnaryRule(begin: Int, end: Int, rule: Int, ref: Int) = { - var total = 0.0; + var total = 0.0 val tetraIdx = tetra(begin, end, length + 1) val rfeats = lspec.featuresForUnaryRule(begin, end, rule, ref) - for (layerIdx <- 0 until layers.size) { + layers.indices.foreach { layerIdx => val fs = getOrElseUpdate(layerIdx, tetraIdx, { penultimateLayers(layerIdx).activations(sspec.featuresForSpan(begin, end)) }) for (rfeat <- rfeats) { total += getOrElseUpdateFinal(layerIdx, tetraIdx, rfeat, labelFeaturizer.index.size, { layers(layerIdx).activationsFromPenultimateDot(fs, rfeat) }) } } - if (!decoupledLayers.isEmpty) { + if (decoupledLayers.nonEmpty) { val layerIdx = layers.size + UnaryLayerIdx val fs = getOrElseUpdate(layerIdx, tetraIdx, { penultimateDecoupledLayers(UnaryLayerIdx).activations(sspec.reducedFeaturesForSpan(begin, end)) }) for (rfeat <- rfeats) { @@ -381,16 +380,16 @@ object PositionalNeuralModel { } def scoreSpan(begin: Int, end: Int, tag: Int, ref: Int) = { - var total = 0.0; + var total = 0.0 val tetraIdx = tetra(begin, end, length + 2) val rfeats = lspec.featuresForSpan(begin, end, tag, ref) - for (layerIdx <- 0 until layers.size) { + layers.indices.foreach { layerIdx => val fs = getOrElseUpdate(layerIdx, tetraIdx, { penultimateLayers(layerIdx).activations(sspec.featuresForSpan(begin, end)) }) for (rfeat <- rfeats) { total += getOrElseUpdateFinal(layerIdx, tetraIdx, rfeat, labelFeaturizer.index.size, { 
layers(layerIdx).activationsFromPenultimateDot(fs, rfeat) }) } } - if (!decoupledLayers.isEmpty) { + if (decoupledLayers.nonEmpty) { val layerIdx = layers.size + SpanLayerIdx val fs = getOrElseUpdate(layerIdx, tetraIdx, { penultimateDecoupledLayers(SpanLayerIdx).activations(sspec.reducedFeaturesForSpan(begin, end)) }) for (rfeat <- rfeats) { @@ -407,7 +406,7 @@ object PositionalNeuralModel { var i = 0 var score = 0.0 val wdata = weights.data - while(i < features.length) { + while (i < features.length) { score += wdata(features(i) + sparseFeaturesOffset) i += 1 } diff --git a/src/main/scala/epic/parser/models/PositionalNeuralModelFactory.scala b/src/main/scala/epic/parser/models/PositionalNeuralModelFactory.scala index 6031526f..68d897f2 100644 --- a/src/main/scala/epic/parser/models/PositionalNeuralModelFactory.scala +++ b/src/main/scala/epic/parser/models/PositionalNeuralModelFactory.scala @@ -112,8 +112,6 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma type MyModel = PositionalNeuralModel[AnnotatedLabel, AnnotatedLabel, String] - - override def make(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], topology: RuleTopology[AnnotatedLabel], lexicon: Lexicon[AnnotatedLabel, String], @@ -141,7 +139,6 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma val prodFeaturizer = new ProductionFeaturizer[AnnotatedLabel, AnnotatedLabel, String](xbarGrammar, indexedRefinements, lGen=labelFeaturizer, rGen=ruleFeaturizer) - /////////////////////// // READ IN WORD VECTORS val tagCountsLexicon = TagSpanShapeGenerator.makeStandardLexicon(annTrees) @@ -162,8 +159,8 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma Word2Vec.smartLoadVectorsForVocabulary(word2vecPath.split(":"), voc.toSet, summedWordCounts, if (embeddingType == "trivial") 1 else Int.MaxValue, true, randomizeUnks) } // Convert Array[Float] values to Array[Double] values and rescale them - val word2vecDoubleVect = word2vec.map(keyValue => (keyValue._1 -> keyValue._2.map(_.toDouble * vectorRescaling))) -// val word2vecDoubleVect = word2vec.map(keyValue => (keyValue._1 -> new DenseVector[Double](keyValue._2.map(_.toDouble)))) + val word2vecDoubleVect = word2vec.map(keyValue => keyValue._1 -> keyValue._2.map(_.toDouble * vectorRescaling)) + // val word2vecDoubleVect = word2vec.map(keyValue => (keyValue._1 -> new DenseVector[Double](keyValue._2.map(_.toDouble)))) val word2vecIndexed: Word2VecIndexed[String] = if (embeddingType == "normalpos") { Word2VecIndexed(word2vecDoubleVect, (str: String) => Word2Vec.convertWord(str, lowercasedVectors)).augment(freqTagger.tagTypesIdx.size, freqTagger.convertToFeaturizer) } else { @@ -229,7 +226,7 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma annotator.latent, indexedRefinements, xbarGrammar, - if(dummyFeats < 0) HashFeature.Absolute(-dummyFeats.toInt) else HashFeature.Relative(dummyFeats), + if (dummyFeats < 0) HashFeature.Absolute(-dummyFeats.toInt) else HashFeature.Relative(dummyFeats), filterUnseenFeatures = false, minFeatCount = 1, trainTrees) @@ -289,8 +286,7 @@ object PositionalNeuralModelFactory { val innerTransform = buildNetInnerTransforms(word2vecIndexed, inputSize, numHidden, numHiddenLayers, nonLinType, dropoutRate, backpropIntoEmbeddings) new AffineOutputTransform(outputSize, if (numHiddenLayers >= 1) numHidden else inputSize, innerTransform) } - - + def buildNetOutputEmbedding(word2vecIndexed: Word2VecIndexed[String], inputSize: Int, numHidden: 
Int, @@ -333,6 +329,6 @@ object PositionalNeuralModelFactory { } } -case class ParentFeature(f: Feature) extends Feature; -case class LeftChildFeature(f: Feature) extends Feature; -case class RightChildFeature(f: Feature) extends Feature; +case class ParentFeature(f: Feature) extends Feature +case class LeftChildFeature(f: Feature) extends Feature +case class RightChildFeature(f: Feature) extends Feature diff --git a/src/main/scala/epic/parser/models/ProductParserModelFactory.scala b/src/main/scala/epic/parser/models/ProductParserModelFactory.scala index fdfd8f9c..20a2dc1e 100644 --- a/src/main/scala/epic/parser/models/ProductParserModelFactory.scala +++ b/src/main/scala/epic/parser/models/ProductParserModelFactory.scala @@ -45,21 +45,19 @@ case class ProductParserModelFactory(annotator: TreeAnnotator[AnnotatedLabel, St oldWeights: File = null, splitFactor: Int = 1) extends ParserModelFactory[AnnotatedLabel, String] with SafeLogging { - type MyModel = LatentParserModel[AnnotatedLabel, (AnnotatedLabel, Seq[Int]), String] def genSplits(numModels: Int, numStates: Int):Seq[IndexedSeq[Int]] = { - if(numModels == 0) Seq(IndexedSeq.empty) + if (numModels == 0) Seq(IndexedSeq.empty) else for(r <- genSplits(numModels -1, numStates); i <- 0 until numStates) yield i +: r } def split(x: AnnotatedLabel, counts: Map[AnnotatedLabel, Int]):Seq[(AnnotatedLabel, Seq[Int])] = { - for (split <- genSplits(numModels, counts.getOrElse(x, numStates))) yield (x -> split) + for (split <- genSplits(numModels, counts.getOrElse(x, numStates))) yield x -> split } def unsplit(x: (AnnotatedLabel, Seq[Int])) = x._1 - def splitRule[L, L2](r: Rule[L], split: L=>Seq[L2]):Seq[Rule[L2]] = r match { case BinaryRule(a, b, c) => for(aa <- split(a); bb <- split(b); cc <- split(c)) yield BinaryRule(aa, bb, cc) // don't allow non-identity rule refinements for identity rewrites @@ -67,12 +65,10 @@ case class ProductParserModelFactory(annotator: TreeAnnotator[AnnotatedLabel, St case UnaryRule(a, b, chain) => for(aa <- split(a); bb <- split(b)) yield UnaryRule(aa, bb, chain) } - override def make(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], topology: RuleTopology[AnnotatedLabel], lexicon: Lexicon[AnnotatedLabel, String], constrainer: Factory[AnnotatedLabel, String]): MyModel = { val annTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]] = trainTrees.map(annotator(_)) val (annWords, annBinaries, annUnaries) = this.extractBasicCounts(annTrees) - val (xbarGrammar, xbarLexicon) = topology -> lexicon val cFactory = constrainer @@ -112,7 +108,6 @@ case class ProductParserModelFactory(annotator: TreeAnnotator[AnnotatedLabel, St Counter[Feature, Double]() } - def latentAnnotator(t: BinarizedTree[AnnotatedLabel], w: IndexedSeq[String]) = { annotator(t, w).map(finalRefinements.labels.refinementsOf) } diff --git a/src/main/scala/epic/parser/models/SpanModel.scala b/src/main/scala/epic/parser/models/SpanModel.scala index 919b7942..510e4055 100644 --- a/src/main/scala/epic/parser/models/SpanModel.scala +++ b/src/main/scala/epic/parser/models/SpanModel.scala @@ -55,7 +55,6 @@ class SpanModel[L, L2, W](val featurizer: RefinedFeaturizer[L, W, Feature], initialFeatureVal: (Feature => Option[Double]) = { _ => None }) extends ParserModel[L, W] with Serializable { type Inference = LatentParserInference[L, L2, W] - override def initialValueForFeature(f: Feature) = initialFeatureVal(f) getOrElse 0.0 def inferenceFromWeights(weights: DenseVector[Double]) = { @@ -63,13 +62,11 @@ class SpanModel[L, L2, W](val featurizer: 
RefinedFeaturizer[L, W, Feature], new LatentParserInference(featurizer, annotator, dpGrammar, constrainer, refinements) } - def accumulateCounts(inf: Inference, s: Scorer, d: TreeInstance[L, W], m: Marginal, accum: ExpectedCounts, scale: Double): Unit = { m.expectedCounts(featurizer, accum, scale) } } - @SerialVersionUID(4749637878577393596L) class DotProductGrammar[L, L2, W, Feature](val topology: RuleTopology[L], val lexicon: Lexicon[L, W], @@ -78,14 +75,12 @@ class DotProductGrammar[L, L2, W, Feature](val topology: RuleTopology[L], val weights: DenseVector[Double], val featurizer: RefinedFeaturizer[L, W, Feature]) extends Grammar[L, W] { - override def withPermissiveLexicon: Grammar[L, W] = { new DotProductGrammar(topology, lexicon.morePermissive, refinedTopology, refinements, weights, featurizer) } def anchor(w: IndexedSeq[W], cons: ChartConstraints[L]):GrammarAnchoring[L, W] = new ProjectionsGrammarAnchoring[L, L2, W] { - override def addConstraints(constraints: ChartConstraints[L]): GrammarAnchoring[L, W] = { anchor(w, cons & constraints) } @@ -117,14 +112,13 @@ class DotProductGrammar[L, L2, W, Feature](val topology: RuleTopology[L], var i = 0 var score = 0.0 val wdata = weights.data - while(i < features.length) { + while (i < features.length) { score += wdata(features(i)) i += 1 } score } - } } @@ -162,14 +156,14 @@ case class IndexedSpanFeaturizer[L, L2, W](wordFeatureIndex: CrossProductIndex[F val ind = TriangularArray.index(begin, end) var rcache = spanCache(ind) - if(rcache eq null) { + if (rcache eq null) { rcache = new OpenAddressHashArray[Array[Int]](refinements.labels.fineIndex.size) spanCache(ind) = rcache } var cache = rcache(globalized) - if(cache == null) { + if (cache == null) { val spanFeats: Array[Int] = fspec.featuresForSpan(begin, end, tag, ref) - cache = if(begin + 1 == end) { + cache = if (begin + 1 == end) { wordFeatureIndex.crossProduct(spanFeats, wspec.featuresForWord(begin), wordOffset) } else { require(rspec.featuresForSpan(begin, end, tag, ref).isEmpty, "Span features on the extraProductionFeaturizer currently unsupported") @@ -184,12 +178,12 @@ case class IndexedSpanFeaturizer[L, L2, W](wordFeatureIndex: CrossProductIndex[F val globalized = refinements.rules.globalize(rule, ref) val ind = TriangularArray.index(begin, end) var rcache = unaryCache(ind) - if(rcache eq null) { + if (rcache eq null) { rcache = new OpenAddressHashArray[Array[Int]](refinements.rules.fineIndex.size) unaryCache(ind) = rcache } var cache = rcache(globalized) - if(cache == null) { + if (cache == null) { require(rspec.featuresForUnaryRule(begin, end, rule, ref).isEmpty, "Span features on the extraProductionFeaturizer currently unsupported") cache = spanFeatureIndex.crossProduct(fspec.featuresForUnaryRule(begin, end, rule, ref), getSpanFeatures(begin, end), spanOffset, true) @@ -202,24 +196,24 @@ case class IndexedSpanFeaturizer[L, L2, W](wordFeatureIndex: CrossProductIndex[F val globalized = refinements.rules.globalize(rule, ref) val ind = TriangularArray.index(begin, end) var rcache = binaryCache(ind) - if(rcache eq null) { + if (rcache eq null) { rcache = new Array[OpenAddressHashArray[Array[Int]]](end - begin) binaryCache(ind) = rcache } var scache = rcache(split - begin) - if(scache eq null) { + if (scache eq null) { scache = new OpenAddressHashArray[Array[Int]](refinements.rules.fineIndex.size) rcache(split - begin) = scache } var cache = scache(globalized) - if(cache == null) { + if (cache == null) { val spanFeatures = getSpanFeatures(begin, end) cache = 
spanFeatureIndex.crossProduct(fspec.featuresForBinaryRule(begin, split, end, rule, ref),spanFeatures, spanOffset, true) -// val forSplit = spanFeatureIndex.crossProduct(fspec.featuresForBinaryRule(begin, split, end, rule, ref), sspec.featuresForSplit(begin, split, end), spanOffset, false) + // val forSplit = spanFeatureIndex.crossProduct(fspec.featuresForBinaryRule(begin, split, end, rule, ref), sspec.featuresForSplit(begin, split, end), spanOffset, false) val ruleAndSpansFeatures = RuleAndSpansFeaturizer.indexAndOffset(ruleAndSpansFeatureIndex, rspec.featuresForBinaryRule(begin, split, end, rule, ref), ruleAndSpansOffset) val forSplit = Arrays.concatenate(spanFeatureIndex.crossProduct(fspec.featuresForBinaryRule(begin, split, end, rule, ref), sspec.featuresForSplit(begin, split, end), spanOffset, false), ruleAndSpansFeatures) - if(forSplit.length > 0) + if (forSplit.length > 0) cache = Arrays.concatenate(cache, forSplit) scache(globalized) = cache } @@ -230,7 +224,7 @@ case class IndexedSpanFeaturizer[L, L2, W](wordFeatureIndex: CrossProductIndex[F private def getSpanFeatures(begin: Int, end: Int):Array[Int] = { val ind = TriangularArray.index(begin, end) var cache = rawSpanCache(ind) - if(cache eq null) { + if (cache eq null) { cache = sspec.featuresForSpan(begin, end) rawSpanCache(ind) = cache } @@ -263,7 +257,7 @@ object IndexedSpanFeaturizer { minFeatCount: Int, trees: Traversable[TreeInstance[L, W]]): IndexedSpanFeaturizer[L, L2, W] = { - def seenSet = if(filterUnseenFeatures) new ThreadLocalBloomFilter[Long](8 * 1024 * 1024 * 50, 3) else AlwaysSeenSet + def seenSet = if (filterUnseenFeatures) new ThreadLocalBloomFilter[Long](8 * 1024 * 1024 * 50, 3) else AlwaysSeenSet val spanBuilder = new CrossProductIndex.Builder(featurizer.index, surfaceFeaturizer.featureIndex, dummyFeatScale, seenSet = seenSet, minCount = minFeatCount) val wordBuilder = new CrossProductIndex.Builder(featurizer.index, wordFeaturizer.featureIndex, dummyFeatScale, seenSet = seenSet, includeLabelOnlyFeatures = false) @@ -285,7 +279,7 @@ object IndexedSpanFeaturizer { for(a <- as; b <- bs.label) { val r = UnaryRule(a, b, chain) val (ri, rref) = refinements.rules.indexAndLocalize(r) - if(rref != -1) { + if (rref != -1) { spanBuilder.add(spec.featuresForUnaryRule(span.begin, span.end, ri, rref), sspec.featuresForSpan(span.begin, span.end)) RuleAndSpansFeaturizer.addToIndex(ruleAndSpansIndex, rspec.featuresForUnaryRule(span.begin, span.end, ri, rref)) } @@ -295,7 +289,7 @@ object IndexedSpanFeaturizer { val (ai, aref) = refinements.labels.indexAndLocalize(a) val r = BinaryRule(a, b, c) val (ri, rref) = refinements.rules.indexAndLocalize(r) - if(rref != -1) { + if (rref != -1) { spanBuilder.add(spec.featuresForBinaryRule(span.begin, t.splitPoint, span.end, ri, rref), sspec.featuresForSpan(span.begin, span.end)) spanBuilder.add(spec.featuresForBinaryRule(span.begin, t.splitPoint, span.end, ri, rref), @@ -313,7 +307,6 @@ object IndexedSpanFeaturizer { } } - @SerialVersionUID(-155022487059445275L) case class ExtraParams(useHackyLexicalFeatures:Boolean = false, hackyLexicalFeatureDesc:String = "", @@ -349,7 +342,6 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma type MyModel = SpanModel[AnnotatedLabel, AnnotatedLabel, String] - override def make(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], topology: RuleTopology[AnnotatedLabel], lexicon: Lexicon[AnnotatedLabel, String], @@ -376,35 +368,31 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma 
var wf = posFeaturizer.getOrElse( SpanModelFactory.defaultPOSFeaturizer(annWords)) - if(useMorph) + if (useMorph) wf += mf - - - var span: SplitSpanFeaturizer[String] = spanFeaturizer.getOrElse(SpanModelFactory.goodFeaturizer(annWords, commonWordThreshold, useShape = useShape)) - if(useRichSpanContext) + if (useRichSpanContext) span += spanShapeBetter - if(useNGrams) + if (useNGrams) span += ngramF - if(useTagSpanShape) + if (useTagSpanShape) span += tagSpanShape - if(useFullShape) + if (useFullShape) span += fullShape val indexedWord = IndexedWordFeaturizer.fromData(wf, annTrees.map{_.words}, deduplicateFeatures = pruneRedundantFeatures) val surface = IndexedSplitSpanFeaturizer.fromData(span, annTrees, bloomFilter = false, deduplicateFeatures = pruneRedundantFeatures) - - + def labelFeaturizer(l: AnnotatedLabel) = Set(l, l.baseAnnotatedLabel).toSeq -// def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if(useGrammar) Set(r, r.map(_.baseAnnotatedLabel)).toSeq else if(r.isInstanceOf[UnaryRule[AnnotatedLabel]]) Set(r.parent, r.parent.baseAnnotatedLabel).toSeq else Seq.empty - def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if(useGrammar) { +// def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if (useGrammar) Set(r, r.map(_.baseAnnotatedLabel)).toSeq else if (r.isInstanceOf[UnaryRule[AnnotatedLabel]]) Set(r.parent, r.parent.baseAnnotatedLabel).toSeq else Seq.empty + def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if (useGrammar) { if (useChildFeats && r.isInstanceOf[BinaryRule[AnnotatedLabel]]) { Set(r, r.map(_.baseAnnotatedLabel), @@ -413,13 +401,12 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma } else { Set(r, r.map(_.baseAnnotatedLabel)).toSeq } - } else if(r.isInstanceOf[UnaryRule[AnnotatedLabel]]) { + } else if (r.isInstanceOf[UnaryRule[AnnotatedLabel]]) { Set(r.parent, r.parent.baseAnnotatedLabel).toSeq } else { Seq.empty } - - + val featurizer = new ProductionFeaturizer[AnnotatedLabel, AnnotatedLabel, String](xbarGrammar, indexedRefinements, lGen=labelFeaturizer, rGen=ruleFeaturizer, filterRedundantFeatures = pruneRedundantFeatures) @@ -439,7 +426,7 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma annotator.latent, indexedRefinements, xbarGrammar, - if(dummyFeats < 0) HashFeature.Absolute(-dummyFeats.toInt) else HashFeature.Relative(dummyFeats), + if (dummyFeats < 0) HashFeature.Absolute(-dummyFeats.toInt) else HashFeature.Relative(dummyFeats), filterUnseenFeatures = false, minFeatCount, trainTrees) @@ -453,13 +440,8 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma new SpanModel[AnnotatedLabel, AnnotatedLabel, String](indexed, indexed.index, annotator.latent, constrainer, xbarGrammar, xbarLexicon, refGrammar, indexedRefinements,featureCounter.get(_)) } - - } - - - case class LatentSpanModelFactory(inner: SpanModelFactory, @Help(text="Path to substates to use for each symbol. 
Uses numStates for missing states.") substates: File = null, @@ -470,7 +452,6 @@ case class LatentSpanModelFactory(inner: SpanModelFactory, type MyModel = SpanModel[AnnotatedLabel, (AnnotatedLabel, Int), String] - override def make(train: IndexedSeq[TreeInstance[AnnotatedLabel, String]], topology: RuleTopology[AnnotatedLabel], lexicon: Lexicon[AnnotatedLabel, String], constrainer: Factory[AnnotatedLabel, String]): MyModel = { import inner.{logger => _, _} import extraParams._ @@ -482,7 +463,6 @@ case class LatentSpanModelFactory(inner: SpanModelFactory, val xbarLexicon = lexicon - val substateMap = if (substates != null && substates.exists) { val in = Source.fromFile(substates).getLines() val pairs = for (line <- in) yield { @@ -490,7 +470,7 @@ case class LatentSpanModelFactory(inner: SpanModelFactory, AnnotatedLabel(split(0)) -> split(1).toInt } pairs.toMap + (topology.root -> 1) - } else if(splitUselessStates) { + } else if (splitUselessStates) { Map(topology.root -> 1) } else { LatentModelFactory.statesToNotSplit.iterator.map(s => AnnotatedLabel(s) -> 1).toMap + (topology.root -> 1) @@ -540,22 +520,20 @@ case class LatentSpanModelFactory(inner: SpanModelFactory, var span: SplitSpanFeaturizer[String] = SpanModelFactory.goodFeaturizer(annWords, commonWordThreshold) - if(useRichSpanContext) + if (useRichSpanContext) span += spanShapeBetter - if(useNGrams) + if (useNGrams) span += ngramF - - if(useFullShape) + if (useFullShape) span += fullShape val indexedWord = IndexedWordFeaturizer.fromData(wf, annTrees.map{_.words}) val surface = IndexedSplitSpanFeaturizer.fromData(span, annTrees) - def labelFeaturizer(l: (AnnotatedLabel, Int)) = Set[Feature](IndicatorFeature(l), l._1, l._1.baseAnnotatedLabel).toSeq - def ruleFeaturizer(r: Rule[(AnnotatedLabel, Int)]) = if(useGrammar) Set(r, r.map(_._1)).toSeq else if(r.isInstanceOf[UnaryRule[(AnnotatedLabel, Int)]]) labelFeaturizer(r.parent) else Seq.empty + def ruleFeaturizer(r: Rule[(AnnotatedLabel, Int)]) = if (useGrammar) Set(r, r.map(_._1)).toSeq else if (r.isInstanceOf[UnaryRule[(AnnotatedLabel, Int)]]) labelFeaturizer(r.parent) else Seq.empty val featurizer = new ProductionFeaturizer[AnnotatedLabel, (AnnotatedLabel, Int), String](topology, finalRefinements, lGen=labelFeaturizer, @@ -573,23 +551,23 @@ case class LatentSpanModelFactory(inner: SpanModelFactory, annotator(t, w).map(finalRefinements.labels.refinementsOf) } - val indexed = IndexedSpanFeaturizer.extract[AnnotatedLabel, (AnnotatedLabel, Int), String](indexedWord, + val indexed = IndexedSpanFeaturizer.extract[AnnotatedLabel, (AnnotatedLabel, Int), String]( + indexedWord, surface, featurizer, ruleAndSpansFeaturizer, latentAnnotator, finalRefinements, topology, - if(dummyFeats < 0) HashFeature.Absolute(-dummyFeats.toInt) else HashFeature.Relative(dummyFeats), -// filterUnseenFeatures = true, + if (dummyFeats < 0) HashFeature.Absolute(-dummyFeats.toInt) else HashFeature.Relative(dummyFeats), + // filterUnseenFeatures = true, filterUnseenFeatures = false, - 1, - train) - + 1, + train + ) val featureCounter = this.readWeights(oldWeights) - val refGrammar = RuleTopology(finalRefinements.labels.refinementsOf(topology.root)(0), finalRefinements.labels.fineIndex, finalRefinements.rules.fineIndex) @@ -611,7 +589,7 @@ object SpanModelFactory { import dsl._ // class(split + 1) - var baseCat: WordFeaturizer[String] = new ZeroFeaturizer[String]; + var baseCat: WordFeaturizer[String] = new ZeroFeaturizer[String] if (useLfsuf) { baseCat += lfsuf } @@ -619,7 +597,7 @@ object SpanModelFactory { baseCat += 
new BrownClusterFeaturizer(Array(4, 10)) } - val leftOfSplit: SplitSpanFeaturizer[String] = ((baseCat)(-1)apply (split)) + val leftOfSplit: SplitSpanFeaturizer[String] = baseCat(-1)apply split var featurizer: SplitSpanFeaturizer[String] = zeroSplit[String] featurizer += baseCat(begin) @@ -634,13 +612,13 @@ object SpanModelFactory { featurizer += baseCat(end-2) featurizer += baseCat(begin+1) featurizer += baseCat(end+1) - featurizer += ((baseCat)(-2)apply (split)) - featurizer += ((baseCat)(1)apply (split)) + featurizer += baseCat(-2)apply split + featurizer += baseCat(1)apply split } featurizer += distance[String](begin, split) featurizer += distance[String](split, end) - if(useShape) + if (useShape) featurizer += spanShape featurizer } @@ -677,7 +655,6 @@ object SpanModelFactory { new CachedChartConstraintsFactory[AnnotatedLabel, String](uncached) } - val mf = new SpanModelFactory(annotator = annotator, posFeaturizer = posFeaturizer, spanFeaturizer = spanFeaturizer).make(trees, topo, lexicon, constraints) val mobj = new ModelObjective(mf, trees) diff --git a/src/main/scala/epic/parser/models/StructModel.scala b/src/main/scala/epic/parser/models/StructModel.scala index d8bb0a9c..3a28762e 100644 --- a/src/main/scala/epic/parser/models/StructModel.scala +++ b/src/main/scala/epic/parser/models/StructModel.scala @@ -56,7 +56,6 @@ class StructModel[L, L2, W](indexedFeatures: IndexedFeaturizer[L, L2, W], initialFeatureVal: (Feature => Option[Double]) = { _ => None }) extends ParserModel[L, W] with Serializable { type Inference = AnnotatedParserInference[L, W] - def featureIndex = indexedFeatures.index override def initialValueForFeature(f: Feature) = initialFeatureVal(f) getOrElse 0.0 @@ -71,7 +70,6 @@ class StructModel[L, L2, W](indexedFeatures: IndexedFeaturizer[L, L2, W], new AnnotatedParserInference(indexedFeatures, reannotate, grammar, constrainer) } - def accumulateCounts(inf: Inference, s: Scorer, d: TreeInstance[L, W], m: Marginal, accum: ExpectedCounts, scale: Double): Unit = { m.expectedCounts(indexedFeatures, accum, scale) } @@ -84,14 +82,13 @@ case class StructModelFactory(@Help(text= "The kind of annotation to do on the r annotatedTreesDumpPath: File = null) extends ParserModelFactory[AnnotatedLabel, String] { type MyModel = StructModel[AnnotatedLabel, AnnotatedLabel, String] - override def make(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], topology: RuleTopology[AnnotatedLabel], lexicon: Lexicon[AnnotatedLabel, String], constrainer: Factory[AnnotatedLabel, String]): MyModel = { val transformed = trainTrees.par.map(annotator).seq.toIndexedSeq - if(annotatedTreesDumpPath != null) { + if (annotatedTreesDumpPath != null) { val ps = new PrintStream(new FileOutputStream(annotatedTreesDumpPath)) for( (x,y) <- trainTrees zip transformed) { ps.println("Treebank:\n" + x.render() + "\nAnnotated:\n" + y.render() + "\n==========\n") @@ -106,13 +103,10 @@ case class StructModelFactory(@Help(text= "The kind of annotation to do on the r val surfaceFeaturizer = { val dsl = new WordFeaturizer.DSL(initLexicon) import dsl._ - - ( - unigrams(word + clss, 1) - + suffixes() - + prefixes() - + props - ) + unigrams(word + clss, 1) + + suffixes() + + prefixes() + + props } val wordFeaturizer = IndexedWordFeaturizer.fromData(surfaceFeaturizer, transformed.map{_.words}) def labelFlattener(l: AnnotatedLabel): Seq[AnnotatedLabel] = { diff --git a/src/main/scala/epic/parser/models/ThreePointModel.scala b/src/main/scala/epic/parser/models/ThreePointModel.scala index 8edd3d7e..72bc18ba 100644 --- 
a/src/main/scala/epic/parser/models/ThreePointModel.scala +++ b/src/main/scala/epic/parser/models/ThreePointModel.scala @@ -35,8 +35,8 @@ class ThreePointModel[L, L2, W](annotator: (BinarizedTree[L], IndexedSeq[W]) => labelFeaturizer: RefinedFeaturizer[L, W, Feature], wordFeaturizer: IndexedWordFeaturizer[W], rank: Int) extends ParserModel[L, W] { - override type Inference = ThreePointModel.ThreePointInference[L, L2, W] + override type Inference = ThreePointModel.ThreePointInference[L, L2, W] override def accumulateCounts(inf: Inference, s: Scorer, d: TreeInstance[L, W], m: Marginal, accum: ExpectedCounts, scale: Double): Unit = { inf.grammar.extractEcounts(m, accum.counts, scale) @@ -44,9 +44,8 @@ class ThreePointModel[L, L2, W](annotator: (BinarizedTree[L], IndexedSeq[W]) => } override val featureIndex = new SegmentedIndex(new AffineTransform.Index(rank, labelFeaturizer.index.size, false) +: IndexedSeq.fill(3)(new AffineTransform.Index(rank, wordFeaturizer.featureIndex.size, false))) - override def inferenceFromWeights(weights: DenseVector[Double]): Inference = { - + override def inferenceFromWeights(weights: DenseVector[Double]): Inference = { val grammar = new ThreePointModel.Grammar[L, L2, W](topology, lexicon, refinedTopology, refinements, labelFeaturizer, wordFeaturizer, featureIndex, @@ -57,6 +56,7 @@ class ThreePointModel[L, L2, W](annotator: (BinarizedTree[L], IndexedSeq[W]) => } override def initialValueForFeature(f: Feature): Double = f.hashCode().toDouble / 1000 % 2 + } object ThreePointModel { @@ -79,8 +79,6 @@ object ThreePointModel { LatentTreeMarginal(product, annotated) } - - } @SerialVersionUID(1L) @@ -95,7 +93,6 @@ object ThreePointModel { val IndexedSeq(ruleMatrix, wordMatrices@ _*) = reshapeWeightMatrices(weights) assert(wordMatrices.length == 3) - private def reshapeWeightMatrices(weights: DenseVector[Double]): IndexedSeq[DenseMatrix[Double]] = { val segments = featureIndex.shardWeights(weights) (featureIndex.indices zip segments).map { case (index, segment) => index.makeMatrix(segment)} @@ -136,12 +133,12 @@ object ThreePointModel { // doesn't include split point, which we'll do online val precachedSpanActivations = TriangularArray.tabulate(words.length + 1) { (i, j) => - if(sparsityPattern.isAllowedSpan(i, j) && i != j) { + if (sparsityPattern.isAllowedSpan(i, j) && i != j) { val result = DenseVector.ones[Double](wordActivations.head.head.size) result :*= actForPos(i, Point.First) result :*= actForPos(j - 1, Point.Last) -// println(result) + // println(result) result } else { @@ -149,10 +146,9 @@ object ThreePointModel { } } - def scoreBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int) = { val surfaceAct = precachedSpanActivations(begin, end) - if(surfaceAct == null) { + if (surfaceAct == null) { Double.NegativeInfinity } else { val rfeats = lspec.featuresForBinaryRule(begin, split, end, rule, ref) @@ -162,7 +158,7 @@ object ThreePointModel { def scoreUnaryRule(begin: Int, end: Int, rule: Int, ref: Int) = { val surfaceAct = precachedSpanActivations(begin, end) - if(surfaceAct == null) { + if (surfaceAct == null) { Double.NegativeInfinity } else { val rfeats = lspec.featuresForUnaryRule(begin, end, rule, ref) @@ -172,7 +168,7 @@ object ThreePointModel { def scoreSpan(begin: Int, end: Int, tag: Int, ref: Int) = { val surfaceAct = precachedSpanActivations(begin, end) - if(surfaceAct == null) { + if (surfaceAct == null) { Double.NegativeInfinity } else { val rfeats = lspec.featuresForSpan(begin, end, tag, ref) @@ -199,9 +195,9 @@ object ThreePointModel { 
def checkFlush(begin: Int, split: Int, end: Int) { val state: (Int, Int) = (begin, end) val oldState: (Int, Int) = states(split) - if(oldState != state) { - if(oldState != UNUSED) { - val ffeats = if(split >= length) sspec.featuresForSpan(oldState._1, oldState._2) else sspec.featuresForSplit(oldState._1, split, oldState._2) + if (oldState != state) { + if (oldState != UNUSED) { + val ffeats = if (split >= length) sspec.featuresForSpan(oldState._1, oldState._2) else sspec.featuresForSplit(oldState._1, split, oldState._2) layer.tallyDerivative(deriv, ruleCountsPerState(split) *= scale, new FeatureVector(ffeats)) ruleCountsPerState(split) := 0.0 } @@ -252,8 +248,6 @@ object ThreePointModel { for(f <- sspec.featuresForWord(end - 1)) { axpy(score * scale, actWithoutEnd, dWeights(Point.Last.id)(::, f)) } - - } override def visitBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int, score: Double): Unit = { @@ -279,15 +273,12 @@ object ThreePointModel { for(f <- sspec.featuresForWord(split)) { axpy(score * scale, splitAct, dWeights(Point.Split.id)(::, f)) } - } } } } - - } case class ThreePointModelFactory(@Help(text= @@ -311,8 +302,6 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma type MyModel = ThreePointModel[AnnotatedLabel, AnnotatedLabel, String] - - override def make(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], topology: RuleTopology[AnnotatedLabel], lexicon: Lexicon[AnnotatedLabel, String], @@ -342,19 +331,18 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma } } - if(useMorph) + if (useMorph) wf += MorphFeaturizer(pathsToMorph.split(",")) val indexedWord = IndexedWordFeaturizer.fromData(wf, annTrees.map{_.words}) def labelFeaturizer(l: AnnotatedLabel) = Set(l, l.baseAnnotatedLabel).toSeq - def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if(useGrammar) Set(r, r.map(_.baseAnnotatedLabel)).toSeq else if(r.isInstanceOf[UnaryRule[AnnotatedLabel]]) Set(r.parent, r.parent.baseAnnotatedLabel).toSeq else Seq.empty + def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if (useGrammar) Set(r, r.map(_.baseAnnotatedLabel)).toSeq else if (r.isInstanceOf[UnaryRule[AnnotatedLabel]]) Set(r.parent, r.parent.baseAnnotatedLabel).toSeq else Seq.empty val featurizer = new ProductionFeaturizer[AnnotatedLabel, AnnotatedLabel, String](xbarGrammar, indexedRefinements, lGen=labelFeaturizer, rGen=ruleFeaturizer) - new ThreePointModel(annotator.latent, constrainer, topology, lexicon, @@ -364,6 +352,4 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma } - - } diff --git a/src/main/scala/epic/parser/models/TransformModel.scala b/src/main/scala/epic/parser/models/TransformModel.scala index c0c60de9..cc35e123 100644 --- a/src/main/scala/epic/parser/models/TransformModel.scala +++ b/src/main/scala/epic/parser/models/TransformModel.scala @@ -34,7 +34,6 @@ class TransformModel[L, L2, W](annotator: (BinarizedTree[L], IndexedSeq[W]) => B val transform: Transform[FeatureVector, Vector[Double]]) extends ParserModel[L, W] { override type Inference = TransformModel.Inference[L, L2, W, transform.type] - override def accumulateCounts(inf: Inference, s: Scorer, d: TreeInstance[L, W], m: Marginal, accum: ExpectedCounts, scale: Double): Unit = { inf.grammar.extractEcounts(m, accum.counts, scale) accum.loss += scale * m.logPartition @@ -87,7 +86,6 @@ object TransformModel { new TransformGrammar(topology, lexicon.morePermissive, refinedTopology, refinements, labelFeaturizer, surfaceFeaturizer, layer) } - def 
extractEcounts(m: ParseMarginal[L, W], deriv: DenseVector[Double], scale: Double): Unit = { val w = m.words val length = w.length @@ -105,9 +103,9 @@ object TransformModel { def checkFlush(begin: Int, split: Int, end: Int) { val state: (Int, Int) = (begin, end) val oldState: (Int, Int) = states(split) - if(oldState != state) { - if(oldState != UNUSED) { - val ffeats = if(split >= length) sspec.featuresForSpan(oldState._1, oldState._2) else sspec.featuresForSplit(oldState._1, split, oldState._2) + if (oldState != state) { + if (oldState != UNUSED) { + val ffeats = if (split >= length) sspec.featuresForSpan(oldState._1, oldState._2) else sspec.featuresForSplit(oldState._1, split, oldState._2) layer.tallyDerivative(deriv, ruleCountsPerState(split) *= scale, new FeatureVector(ffeats)) ruleCountsPerState(split) := 0.0 } @@ -120,34 +118,32 @@ object TransformModel { override def visitUnaryRule(begin: Int, end: Int, rule: Int, ref: Int, score: Double): Unit = { checkFlush(begin, length, end) axpy(score, new FeatureVector(lspec.featuresForUnaryRule(begin, end, rule, ref)), ruleCountsPerState(length)) -// val ffeats = sspec.featuresForSpan(begin, end) -// layer.tallyDerivative(deriv, SparseVector(labelFeaturizer.index.size)(lspec.featuresForUnaryRule(begin, end, rule, ref).map(_ -> (scale * score)):_*), new FeatureVector(ffeats)) + // val ffeats = sspec.featuresForSpan(begin, end) + // layer.tallyDerivative(deriv, SparseVector(labelFeaturizer.index.size)(lspec.featuresForUnaryRule(begin, end, rule, ref).map(_ -> (scale * score)):_*), new FeatureVector(ffeats)) } override def visitSpan(begin: Int, end: Int, tag: Int, ref: Int, score: Double): Unit = { checkFlush(begin, length + 1, end) axpy(score, new FeatureVector(lspec.featuresForSpan(begin, end, tag, ref)), ruleCountsPerState(length + 1)) -// val ffeats = sspec.featuresForSpan(begin, end) -// layer.tallyDerivative(deriv, SparseVector(labelFeaturizer.index.size)(lspec.featuresForSpan(begin, end, tag, ref).map(_ -> (scale * score)):_*), new FeatureVector(ffeats)) - + // val ffeats = sspec.featuresForSpan(begin, end) + // layer.tallyDerivative(deriv, SparseVector(labelFeaturizer.index.size)(lspec.featuresForSpan(begin, end, tag, ref).map(_ -> (scale * score)):_*), new FeatureVector(ffeats)) } override def visitBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int, score: Double): Unit = { -// val ffeats = sspec.featuresForSplit(begin, split, end) -// layer.tallyDerivative(deriv, SparseVector(labelFeaturizer.index.size)(lspec.featuresForBinaryRule(begin, split, end, rule, ref).map(_ -> (scale * score)):_*), new FeatureVector(ffeats)) + // val ffeats = sspec.featuresForSplit(begin, split, end) + // layer.tallyDerivative(deriv, SparseVector(labelFeaturizer.index.size)(lspec.featuresForBinaryRule(begin, split, end, rule, ref).map(_ -> (scale * score)):_*), new FeatureVector(ffeats)) checkFlush(begin, split, end) axpy(score, new FeatureVector(lspec.featuresForBinaryRule(begin, split, end, rule, ref)), ruleCountsPerState(split)) } } - for(i <- 0 until states.length) { + states.indices.foreach { i => checkFlush(-1, i, -1) // force a flush } } def anchor(w: IndexedSeq[W], cons: ChartConstraints[L]):GrammarAnchoring[L, W] = new ProjectionsGrammarAnchoring[L, L2, W] { - override def addConstraints(constraints: ChartConstraints[L]): GrammarAnchoring[L, W] = { anchor(w, cons & constraints) } @@ -168,7 +164,7 @@ object TransformModel { private def tetra(begin: Int, split: Int, end: Int) = { (end.toLong * (end + 1) * (end + 2))/6 + ((split + 1) * split 
/ 2 + begin) -// (begin, split, end) + // (begin, split, end) } def scoreBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int) = { @@ -176,11 +172,10 @@ object TransformModel { val sfeats = sspec.featuresForSplit(begin, split, end) layer.activations(new FeatureVector(sfeats)) }) -// if(fs != layer.activations(new FeatureVector( sspec.featuresForSplit(begin, split, end)))) { -// println("!!!!") -// } + // if (fs != layer.activations(new FeatureVector( sspec.featuresForSplit(begin, split, end)))) { + // println("!!!!") + // } val rfeats = lspec.featuresForBinaryRule(begin, split, end, rule, ref) - new FeatureVector(rfeats) dot fs } @@ -205,12 +200,8 @@ object TransformModel { } } - - - } - case class TransformModelFactory(@Help(text= """The kind of annotation to do on the refined grammar. Default uses just parent annotation. You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Manning 2003. @@ -232,8 +223,6 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma type MyModel = TransformModel[AnnotatedLabel, AnnotatedLabel, String] - - override def make(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], topology: RuleTopology[AnnotatedLabel], lexicon: Lexicon[AnnotatedLabel, String], @@ -255,41 +244,37 @@ You can also epic.trees.annotations.KMAnnotator to get more or less Klein and Ma val summedWordCounts: Counter[String, Double] = sum(annWords, Axis._0) lazy val ngramF = new NGramSpanFeaturizer(summedWordCounts, NGramSpanFeaturizer.countBigrams(annTrees), annTrees.map(_.words), ngramCountThreshold, maxNGramOrder, useNot = false) lazy val tagSpanShape = new TagSpanShapeFeaturizer(TagSpanShapeGenerator.makeBaseLexicon(trainTrees)) -// lazy val fullShape = new FullWordSpanShapeFeaturizer(summedWordCounts.iterator.filter(_._2 > commonWordThreshold * 10).map(_._1).toSet, numSpanContextWords, useRichSpanContext) + // lazy val fullShape = new FullWordSpanShapeFeaturizer(summedWordCounts.iterator.filter(_._2 > commonWordThreshold * 10).map(_._1).toSet, numSpanContextWords, useRichSpanContext) var wf = posFeaturizer.getOrElse( SpanModelFactory.defaultPOSFeaturizer(annWords)) - if(useMorph) + if (useMorph) wf += mf - var span: SplitSpanFeaturizer[String] = spanFeaturizer.getOrElse(SpanModelFactory.goodFeaturizer(annWords, commonWordThreshold, useShape = false)) - if(useNGrams) + if (useNGrams) span += ngramF span += new SingleWordSpanFeaturizer[String](wf) - val indexedSurface = IndexedSplitSpanFeaturizer.fromData(span, annTrees, bloomFilter = false) - def labelFeaturizer(l: AnnotatedLabel) = Set(l, l.baseAnnotatedLabel).toSeq - def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if(useGrammar) Set(r, r.map(_.baseAnnotatedLabel)).toSeq else if(r.isInstanceOf[UnaryRule[AnnotatedLabel]]) Set(r.parent, r.parent.baseAnnotatedLabel).toSeq else Seq.empty + def ruleFeaturizer(r: Rule[AnnotatedLabel]) = if (useGrammar) Set(r, r.map(_.baseAnnotatedLabel)).toSeq else if (r.isInstanceOf[UnaryRule[AnnotatedLabel]]) Set(r.parent, r.parent.baseAnnotatedLabel).toSeq else Seq.empty val featurizer = new ProductionFeaturizer[AnnotatedLabel, AnnotatedLabel, String](xbarGrammar, indexedRefinements, lGen=labelFeaturizer, rGen=ruleFeaturizer) - - new TransformModel(annotator.latent, + new TransformModel( + annotator.latent, constrainer, topology, lexicon, refGrammar, indexedRefinements, featurizer, indexedSurface, - new AffineTransform(featurizer.index.size, rank, new AffineTransform(rank, indexedSurface.featureIndex.size, new 
IdentityTransform[FeatureVector]()))) + new AffineTransform(featurizer.index.size, rank, new AffineTransform(rank, indexedSurface.featureIndex.size, new IdentityTransform[FeatureVector]())) + ) } - - } \ No newline at end of file diff --git a/src/main/scala/epic/parser/morph/MorphFeat.scala b/src/main/scala/epic/parser/morph/MorphFeat.scala index 8f627378..3cb00ba7 100644 --- a/src/main/scala/epic/parser/morph/MorphFeat.scala +++ b/src/main/scala/epic/parser/morph/MorphFeat.scala @@ -1,22 +1,22 @@ package epic.parser.morph -case class MorphFeat(label: String, value: String); +case class MorphFeat(label: String, value: String) object MorphFeat { def readMorphFeatsFromBit(morphBit: String): Set[MorphFeat] = { if (morphBit == "_") { - Set(); + Set() } else { - val morphFeats = morphBit.split("\\|").filter(_ != "_"); + val morphFeats = morphBit.split("\\|").filter(_ != "_") val morphFeatsSeq = for (feat <- morphFeats) yield { if (feat.contains("=")) { - val equalsIndex = feat.indexOf("="); - MorphFeat(feat.substring(0, equalsIndex), feat.substring(equalsIndex + 1)); + val equalsIndex = feat.indexOf("=") + MorphFeat(feat.substring(0, equalsIndex), feat.substring(equalsIndex + 1)) } else { - MorphFeat(feat, ""); + MorphFeat(feat, "") } } - morphFeatsSeq.toSet; + morphFeatsSeq.toSet } } } diff --git a/src/main/scala/epic/parser/projections/AnchoredForestProjector.scala b/src/main/scala/epic/parser/projections/AnchoredForestProjector.scala index 0b3c19f5..b8e34c96 100644 --- a/src/main/scala/epic/parser/projections/AnchoredForestProjector.scala +++ b/src/main/scala/epic/parser/projections/AnchoredForestProjector.scala @@ -46,7 +46,7 @@ class AnchoredForestProjector(threshold: Double) extends Serializable { } def getOrElseUpdate[T<:AnyRef](arr: Array[T], i: Int, t : =>T) = { - if(arr(i) == null) { + if (arr(i) == null) { arr(i) = t } arr(i) @@ -63,10 +63,10 @@ class AnchoredForestProjector(threshold: Double) extends Serializable { val visitor = new AnchoredVisitor[L] { def visitSpan(begin: Int, end: Int, tag: Int, ref: Int, score: Double) { // fill in spans with 0 if they're active - if(score > 0.0) { + if (score > 0.0) { val index = TriangularArray.index(begin, end) getOrElseUpdate(lexicalScores, index, projVector())(tag) = 1.0 - if(totals(index) eq null) { + if (totals(index) eq null) { totals(index) = projVector() } totals(index)(tag) += score @@ -74,16 +74,16 @@ class AnchoredForestProjector(threshold: Double) extends Serializable { } def visitBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int, count: Double) { - if(count > 0.0) { + if (count > 0.0) { val index = TriangularArray.index(begin, end) var forSpan = binaryScores(index) - if(forSpan eq null) { + if (forSpan eq null) { val numSplits = end - begin forSpan = new Array[OpenAddressHashArray[Double]](numSplits) binaryScores(index) = forSpan } - val parentArray = if(forSpan(split-begin) eq null) { + val parentArray = if (forSpan(split-begin) eq null) { forSpan(split-begin) = projRuleVector() forSpan(split-begin) } else { @@ -95,14 +95,14 @@ class AnchoredForestProjector(threshold: Double) extends Serializable { def visitUnaryRule(begin: Int, end: Int, rule: Int, ref: Int, count: Double) { val index = TriangularArray.index(begin, end) - val parentArray = if(unaryScores(index) eq null) { + val parentArray = if (unaryScores(index) eq null) { unaryScores(index) = projRuleVector() unaryScores(index) } else { unaryScores(index) } parentArray(rule) += count - if(totalsUnaries(index) eq null) { + if (totalsUnaries(index) eq null) { 
totalsUnaries(index) = projVector() } totalsUnaries(index)(charts.topology.parent(rule)) += count @@ -116,7 +116,6 @@ class AnchoredForestProjector(threshold: Double) extends Serializable { } } - object AnchoredForestProjector { /** diff --git a/src/main/scala/epic/parser/projections/AnchoredSpanProjector.scala b/src/main/scala/epic/parser/projections/AnchoredSpanProjector.scala index 2d3a6fc0..bf139ecb 100644 --- a/src/main/scala/epic/parser/projections/AnchoredSpanProjector.scala +++ b/src/main/scala/epic/parser/projections/AnchoredSpanProjector.scala @@ -45,21 +45,17 @@ class AnchoredSpanProjector(threshold: Double = Double.NegativeInfinity) extends val totals = TriangularArray.fill[DenseVector[Double]](length+1)(labelBeliefs) val totalsUnaries = TriangularArray.fill[DenseVector[Double]](length+1)(labelBeliefs) - val visitor = new AnchoredVisitor[L] { def visitSpan(begin: Int, end: Int, tag: Int, ref: Int, score: Double): Unit = { // fill in spans with 0 if they're active - if(score > 0.0) { + if (score > 0.0) { totals(begin, end)(tag) += score } } - override def skipBinaryRules: Boolean = true - def visitBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int, count: Double): Unit = { - - } + def visitBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int, count: Double): Unit = () def visitUnaryRule(begin: Int, end: Int, rule: Int, ref: Int, count: Double): Unit = { if (count > 0.0) @@ -68,18 +64,12 @@ class AnchoredSpanProjector(threshold: Double = Double.NegativeInfinity) extends } - charts.visitPostorder(visitor, threshold) new AnchoredSpanProjector.AnchoredData(totalsUnaries, totals) } } - - - - - object AnchoredSpanProjector { /** diff --git a/src/main/scala/epic/parser/projections/ChartProjector.scala b/src/main/scala/epic/parser/projections/ChartProjector.scala index 5cd07f80..8a21b074 100644 --- a/src/main/scala/epic/parser/projections/ChartProjector.scala +++ b/src/main/scala/epic/parser/projections/ChartProjector.scala @@ -33,7 +33,7 @@ trait ChartProjector[L, W] { def project(charts: ParseMarginal[L, W], goldTagPolicy: GoldTagPolicy[L] = GoldTagPolicy.noGoldTags[L]):MyAnchoring = { - if(charts.logPartition.isInfinite) throw new NoParseException("infinite partition", charts.words) + if (charts.logPartition.isInfinite) throw new NoParseException("infinite partition", charts.words) val ruleData = proj.projectRulePosteriors(charts, goldTagPolicy) createAnchoring(charts, ruleData, charts.logPartition) } diff --git a/src/main/scala/epic/parser/projections/ConstraintAnchoring.scala b/src/main/scala/epic/parser/projections/ConstraintAnchoring.scala index 93ff2d82..fe55d70c 100644 --- a/src/main/scala/epic/parser/projections/ConstraintAnchoring.scala +++ b/src/main/scala/epic/parser/projections/ConstraintAnchoring.scala @@ -61,7 +61,6 @@ class ParserChartConstraintsFactory[L, W](val parser: Parser[L, W], val prunedtags = new AtomicInteger(0) val notprunedtags = new AtomicInteger(0) - private val synthetics = BitSet.empty ++ (0 until topology.labelIndex.size).filter(l => isIntermediate(labelIndex.get(l))) def constraints(w: IndexedSeq[W]):ChartConstraints[L] = constraints(w, GoldTagPolicy.noGoldTags[L]) @@ -74,7 +73,7 @@ class ParserChartConstraintsFactory[L, W](val parser: Parser[L, W], logger.debug(s"Building Constraints for ${marg.words}") assert(marg.isMaxMarginal) val length = marg.length - if(marg.logPartition.isInfinite) + if (marg.logPartition.isInfinite) throw new NoParseException("No parse for sentence we're trying to constrain!", marg.words) val 
(botLabelScores, unaryScores) = computeScores(length, marg) @@ -92,20 +91,19 @@ class ParserChartConstraintsFactory[L, W](val parser: Parser[L, W], assert(labelThresholds(i, i+1) != null && labelThresholds(i,i+1).nonEmpty, "label thresholds" + labelThresholds(i, i+1)) assert(topLabelThresholds(i, i+1) != null && topLabelThresholds(0,length).nonEmpty, "top label thresholds" + topLabelThresholds(i, i+1)) } - if(topLabelThresholds(0,length) == null || !topLabelThresholds(0,length).contains(marg.topology.rootIndex)) + if (topLabelThresholds(0,length) == null || !topLabelThresholds(0,length).contains(marg.topology.rootIndex)) throw new NoParseException("No score at the root!", marg.words) -// val hasMaximalProjection: BitSet = BitSet.empty ++ (0 to length).filter{ i => -// ((labelThresholds(i) ne null) && (topLabelThresholds(i) ne null)) && ((labelThresholds(i)|topLabelThresholds(i)) -- synthetics).nonEmpty -// } + // val hasMaximalProjection: BitSet = BitSet.empty ++ (0 to length).filter{ i => + // ((labelThresholds(i) ne null) && (topLabelThresholds(i) ne null)) && ((labelThresholds(i)|topLabelThresholds(i)) -- synthetics).nonEmpty + // } - //, hasMaximalProjection) + //, hasMaximalProjection) val con = ChartConstraints[L](topLabelThresholds, labelThresholds) -// PrecacheConstraints.checkConstraints(TreeInstance("viterbi", vit, marg.words), con, this) + // PrecacheConstraints.checkConstraints(TreeInstance("viterbi", vit, marg.words), con, this) con } - private def extractLabelThresholds(length: Int, numLabels: Int, scores: Array[Array[Double]], index: Index[_], @@ -115,25 +113,25 @@ class ParserChartConstraintsFactory[L, W](val parser: Parser[L, W], val thresholdedTags = if (arr eq null) { BitSet.empty } else { - BitSet.empty ++ (0 until arr.length filter { s => + BitSet.empty ++ (arr.indices filter { s => arr(s) >= threshold }) } - if(arr ne null) - if(j == i) { - } else if(j - i > 1) { + if (arr ne null) + if (j == i) { + } else if (j - i > 1) { this.notpruned.addAndGet(thresholdedTags.size) this.pruned.addAndGet(arr.count(_ != 0.0) - thresholdedTags.size) } else { - if(thresholdedTags.isEmpty) assert(false, arr.toIndexedSeq) + if (thresholdedTags.isEmpty) assert(false, arr.toIndexedSeq) this.notprunedtags.addAndGet(thresholdedTags.size) this.prunedtags.addAndGet(arr.count(_ != 0.0) - thresholdedTags.size) } val goldTags = (0 until numLabels).filter { isGold(i, j, _) } for(t <- goldTags if arr == null || arr(t) < threshold) { - if(arr == null) { + if (arr == null) { logger.warn(s"Can't even construct span that has gold tag ${labelIndex.get(t)}!") } else { logger.warn(s"Got a below threshold for a goldTag! 
${arr(t)} $threshold ${labelIndex.get(t)} " @@ -166,12 +164,11 @@ class ParserChartConstraintsFactory[L, W](val parser: Parser[L, W], for(c <- 0 until topology.labelIndex.size) { thresholds += arr(c) nConstructed += 1 - if(gold.isGoldBotTag(i, j, c)) { - if(arr(c) != 0) + if (gold.isGoldBotTag(i, j, c)) { + if (arr(c) != 0) nGoldConstructed += 1 else { throw new RuntimeException("Can't construct gold tree for " + " " + marg.words) - counts(c) += 1 } gThresholds += arr(c) } @@ -183,8 +180,8 @@ class ParserChartConstraintsFactory[L, W](val parser: Parser[L, W], for(c <- 0 until grammar.labelIndex.size) { thresholds += arr(c) nConstructed += 1 - if(gold.isGoldTopTag(i, j, c)) { - if(arr(c) != 0) + if (gold.isGoldTopTag(i, j, c)) { + if (arr(c) != 0) nGoldConstructed += 1 else counts(c) += 1 gThresholds += arr(c) @@ -203,7 +200,6 @@ class ParserChartConstraintsFactory[L, W](val parser: Parser[L, W], val visitor = new AnchoredVisitor[L] { def visitBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int, score: Double) {} - override def skipBinaryRules: Boolean = true def visitUnaryRule(begin: Int, end: Int, rule: Int, ref: Int, score: Double) { @@ -216,7 +212,6 @@ class ParserChartConstraintsFactory[L, W](val parser: Parser[L, W], } } - def visitSpan(begin: Int, end: Int, tag: Int, ref: Int, score: Double) { val index = TriangularArray.index(begin, end) if (score != 0.0) { @@ -240,7 +235,7 @@ object ParserChartConstraintsFactory { case class PruningStatistics(data: Array[Double], nConstructed: Double, pruningCounts: DenseVector[Double]) { def merge(other: PruningStatistics, nAllowed:Int = data.length): PruningStatistics = { - if(nAllowed >= data.length + other.data.length) { + if (nAllowed >= data.length + other.data.length) { PruningStatistics(data ++ other.data, this.nConstructed + other.nConstructed, pruningCounts + other.pruningCounts) } else { val subsetThisSize = new Binomial(nAllowed, nConstructed/(other.nConstructed + nConstructed)).draw() @@ -257,9 +252,6 @@ object ParserChartConstraintsFactory { } - - - /** * Object for creating [[epic.constraints.CachedChartConstraintsFactory]] * from a parser and prepopulating it with the contents of a treebank. @@ -283,20 +275,19 @@ object PrecacheConstraints extends LazyLogging { **/ def forTreebank(constrainer: ParserChartConstraintsFactory[AnnotatedLabel, String], treebank: ProcessedTreebank, tableName: String = "parseConstraints", verifyNoGoldPruningInTrain: Boolean = true)(implicit broker: CacheBroker) = { val cached = forTrainingSet(constrainer, treebank.trainTrees.par.map(ti => ti.copy(tree = ti.tree.map(_.baseAnnotatedLabel))), tableName, verifyNoGoldPruning = verifyNoGoldPruningInTrain) - (treebank.devTrees).par.foreach { ti => + treebank.devTrees.par.foreach { ti => logger.info(s"Ensuring existing constraint for dev tree ${ti.id} ${ti.words}") val constraints = cached.constraints(ti.words) - if(verifyNoGoldPruningInTrain) + if (verifyNoGoldPruningInTrain) checkConstraints(ti.copy(tree = ti.tree.map(_.baseAnnotatedLabel)), constraints, constrainer) } - (treebank.testTrees).par.foreach { ti => + treebank.testTrees.par.foreach { ti => logger.info(s"Ensuring existing constraint for test sentence ${ti.id} ${ti.words}") cached.constraints(ti.words) } cached } - /** * Method for creating [[epic.constraints.CachedChartConstraintsFactory]] * from a parser and prepopulating it with constraints for a training set. 
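+   * A sketch of the intended wiring (the names `constrainer`, `trainTrees`, `words`,
+   * and the implicit [[epic.util.CacheBroker]] are illustrative, not part of this change):
+   * {{{
+   * implicit val broker: CacheBroker = ...
+   * val cached = PrecacheConstraints.forTrainingSet(constrainer, trainTrees.par)
+   * val cons = cached.constraints(words) // answered from the cache after the first computation
+   * }}}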
@@ -316,16 +307,16 @@ object PrecacheConstraints extends LazyLogging { logger.info(s"Building constraints for ${ti.id} ${ti.words}") constrainer.constraints(ti.words) }) - if(located) { + if (located) { logger.info(s"Already had constraints for ${ti.id} ${ti.words}.") - } else if(verifyNoGoldPruning) { + } else if (verifyNoGoldPruning) { checkConstraints(ti, constraints, constrainer) } val count: Int = parsed.incrementAndGet() - if(count % 10 == 0) { + if (count % 10 == 0) { logger.info("Pruning statistics so far: " + constrainer.overallStatistics) } - if(count % 100 == 0) { + if (count % 100 == 0) { logger.info(s"Parsed $count/$len.") } @@ -338,7 +329,6 @@ object PrecacheConstraints extends LazyLogging { new CachedChartConstraintsFactory(constrainer, cache) } - def checkConstraints[W, L](ti: TreeInstance[L, W], constraints: ChartConstraints[L], constrainer: ParserChartConstraintsFactory[L, W]) { // val decoded = new ViterbiDecoder[L, W].extractBestParse(marg) var printTree = true diff --git a/src/main/scala/epic/parser/projections/EnumeratedAnchoring.scala b/src/main/scala/epic/parser/projections/EnumeratedAnchoring.scala index f8f6f2b7..0757fb4f 100644 --- a/src/main/scala/epic/parser/projections/EnumeratedAnchoring.scala +++ b/src/main/scala/epic/parser/projections/EnumeratedAnchoring.scala @@ -31,12 +31,12 @@ case class AnchoredPCFGProjector[L, W](threshold: Double = Double.NegativeInfini type MyAnchoring = EnumeratedAnchoring[L, W] private def normalize(grammar: RuleTopology[L], ruleScores: OpenAddressHashArray[Double], totals: OpenAddressHashArray[Double]):OpenAddressHashArray[Double] = { - if(ruleScores eq null) null + if (ruleScores eq null) null else { val r = new OpenAddressHashArray[Double](ruleScores.length, Double.NegativeInfinity, ruleScores.activeSize) for( (rule, score) <- ruleScores.activeIterator) { val parent = grammar.parent(rule) - if(score > 0) + if (score > 0) r(rule) = math.log(score) - math.log(totals(parent)) } r @@ -44,7 +44,7 @@ case class AnchoredPCFGProjector[L, W](threshold: Double = Double.NegativeInfini } private def logify(ruleScores: OpenAddressHashArray[Double]):OpenAddressHashArray[Double] = { - if(ruleScores eq null) null + if (ruleScores eq null) null else { val r = new OpenAddressHashArray[Double](ruleScores.length, Double.NegativeInfinity, ruleScores.activeSize) for( (rule, score) <- ruleScores.activeIterator) { @@ -61,7 +61,7 @@ case class AnchoredPCFGProjector[L, W](threshold: Double = Double.NegativeInfini } val normBinaries:Array[Array[OpenAddressHashArray[Double]]] = for ((splits, totals) <- binaryScores zip totalsBinaries) yield { - if(splits eq null) null + if (splits eq null) null else for(ruleScores <- splits) yield normalize(charts.topology, ruleScores, totals) } val sparsity = charts.anchoring.sparsityPattern @@ -70,7 +70,6 @@ case class AnchoredPCFGProjector[L, W](threshold: Double = Double.NegativeInfini } - /** * Creates anchorings for a set of trees from some parser using p(rule | sentence) marginals. 
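+ * Unlike [[AnchoredPCFGProjector]], whose normalize renormalizes each rule's posterior by
+ * its parent label's total mass, this projector's normalize takes no totals and uses the
+ * rule posteriors directly.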
* @author dlwh @@ -78,7 +77,7 @@ case class AnchoredPCFGProjector[L, W](threshold: Double = Double.NegativeInfini @SerialVersionUID(469174684243960202L) case class AnchoredRuleMarginalProjector[L, W](threshold: Double = Double.NegativeInfinity) extends ChartProjector[L, W] { private def normalize(ruleScores: OpenAddressHashArray[Double]):OpenAddressHashArray[Double] = { - if(ruleScores eq null) null + if (ruleScores eq null) null else { val r = new OpenAddressHashArray[Double](ruleScores.length, Double.NegativeInfinity, ruleScores.activeSize) for( (rule, score) <- ruleScores.activeIterator) { @@ -97,7 +96,7 @@ case class AnchoredRuleMarginalProjector[L, W](threshold: Double = Double.Negati val normUnaries:Array[OpenAddressHashArray[Double]] = unaryScores.map(normalize) val normBinaries:Array[Array[OpenAddressHashArray[Double]]] = for (splits <- binaryScores) yield { - if(splits eq null) null + if (splits eq null) null else splits.map(normalize) } val sparsity = charts.anchoring.sparsityPattern @@ -127,7 +126,6 @@ case class EnumeratedAnchoring[L, W](topology: RuleTopology[L], override def addConstraints(cs: ChartConstraints[L]): UnrefinedGrammarAnchoring[L, W] = copy(sparsityPattern = sparsityPattern & cs) - /** * Computes the pointwise division of two grammars, augmenting * their refinement space to reflect this. If they share the same annotationTag, @@ -143,7 +141,6 @@ case class EnumeratedAnchoring[L, W](topology: RuleTopology[L], } } - /** * Computes the point-wise division of this grammar with some other grammar. * @@ -156,12 +153,11 @@ case class EnumeratedAnchoring[L, W](topology: RuleTopology[L], case that: EnumeratedAnchoring[L, W] => EnumeratedAnchoring.divide(this, that) case _ => super./(other) } - } def scoreUnaryRule(begin: Int, end: Int, rule: Int) = { val forSpan = unaryScores(TriangularArray.index(begin, end)) - if(forSpan eq null) Double.NegativeInfinity + if (forSpan eq null) Double.NegativeInfinity else forSpan(rule) } @@ -169,35 +165,32 @@ case class EnumeratedAnchoring[L, W](topology: RuleTopology[L], val ti = TriangularArray.index(begin, end) val forSpan = binaryScores(ti) val cached = checkCache(split, rule, ti) - if(!java.lang.Double.isNaN(cached)) { + if (!java.lang.Double.isNaN(cached)) { cached - } else if(forSpan eq null) { + } else if (forSpan eq null) { Double.NegativeInfinity } else { val forSplit = forSpan(split - begin) - val result = if(forSplit eq null) Double.NegativeInfinity + val result = if (forSplit eq null) Double.NegativeInfinity else forSplit(rule) - updateCache(split, rule, ti, result) - result } } def scoreSpan(begin: Int, end: Int, tag: Int): Double = { val scores = spanScores(TriangularArray.index(begin, end)) - if(scores ne null) scores(tag) + if (scores ne null) scores(tag) else Double.NegativeInfinity } // (1 entry for each position (a split point), an entry has a rule index, a begin/end pair, and a score - private val cache = new Array[Int](length * (1 + 1 + 2)) - util.Arrays.fill(cache, -1) // + private val cache = Array.fill[Int](length * (1 + 1 + 2))(-1) private def checkCache(splitPoint: Int, rule: Int, ti: Int) = { val crule = cache(splitPoint * 4) val cti = cache(splitPoint * 4 + 1) - if(rule == crule && cti == ti) { + if (rule == crule && cti == ti) { java.lang.Double.longBitsToDouble(Span(cache(splitPoint * 4 + 2), cache(splitPoint * 4 + 3)).encoded) } else { Double.NaN @@ -219,7 +212,7 @@ object EnumeratedAnchoring { val newSpanScores = Array.tabulate(a.spanScores.length) { i => val oldA = a.spanScores(i) val oldB = 
b.spanScores(i) - if(null == oldA || null == oldB) { + if (null == oldA || null == oldB) { null } else { doDivide(oldA, oldB) @@ -229,24 +222,23 @@ object EnumeratedAnchoring { val newUnaryScores = Array.tabulate(a.unaryScores.length) { i => val oldA = a.unaryScores(i) val oldB = b.unaryScores(i) - if(null == oldA || null == oldB) { + if (null == oldA || null == oldB) { null } else { doDivide(oldA, oldB) } } - val newBinaryScores = Array.tabulate(a.binaryScores.length) { i => val aArray = a.binaryScores(i) val bArray = b.binaryScores(i) - if(null == aArray || null == bArray) { + if (null == aArray || null == bArray) { null } else { Array.tabulate(aArray.length) { split => val oldA = aArray(split) val oldB = bArray(split) - if(null == oldA || null == oldB) { + if (null == oldA || null == oldB) { null } else { doDivide(oldA, oldB) @@ -260,21 +252,19 @@ object EnumeratedAnchoring { } private def doDivide(a: OpenAddressHashArray[Double], b: OpenAddressHashArray[Double]) = { - if(a == null || b == null) { + if (a == null || b == null) { null } else { val oah = new OpenAddressHashArray[Double](a.size, a.default, a.activeSize min b.activeSize) - var off = 0 - while(off < a.iterableSize) { - if(a.isActive(off)) { + while (off < a.iterableSize) { + if (a.isActive(off)) { val aa = a.valueAt(off) val ii = a.indexAt(off) val bb = b(ii) - if(aa != Double.NegativeInfinity && bb != Double.NegativeInfinity) { + if (aa != Double.NegativeInfinity && bb != Double.NegativeInfinity) { oah(ii) = aa - bb } - } off += 1 } diff --git a/src/main/scala/epic/parser/projections/GoldTagPolicy.scala b/src/main/scala/epic/parser/projections/GoldTagPolicy.scala index 3ac8a080..cf560e28 100644 --- a/src/main/scala/epic/parser/projections/GoldTagPolicy.scala +++ b/src/main/scala/epic/parser/projections/GoldTagPolicy.scala @@ -27,7 +27,7 @@ import breeze.collection.mutable.TriangularArray * @tparam L */ trait GoldTagPolicy[L] { - def isGoldSpan(start: Int, end: Int):Boolean + def isGoldSpan(start: Int, end: Int): Boolean def isGoldTopTag(start: Int, end: Int, tag: Int): Boolean def isGoldBotTag(start: Int, end: Int, tag: Int): Boolean } @@ -36,16 +36,16 @@ object GoldTagPolicy { def noGoldTags[L]:GoldTagPolicy[L] = new GoldTagPolicy[L] { def isGoldTopTag(start: Int, end: Int, tag: Int): Boolean = false def isGoldBotTag(start: Int, end: Int, tag: Int): Boolean = false - def isGoldSpan(start: Int, end: Int):Boolean = false + def isGoldSpan(start: Int, end: Int): Boolean = false } def goldTreeForcing[L](trees: BinarizedTree[Int]*):GoldTagPolicy[L] ={ val goldTop = TriangularArray.raw(trees.head.span.end+1,collection.mutable.BitSet()) val goldBot = TriangularArray.raw(trees.head.span.end+1,collection.mutable.BitSet()) for(tree <- trees) { - if(tree != null) { + if (tree != null) { for( t <- tree.allChildren if t.label != -1) { - if(t.children.size == 1) + if (t.children.size == 1) goldTop(TriangularArray.index(t.span.begin,t.span.end)) += t.label else goldBot(TriangularArray.index(t.span.begin,t.span.end)) += t.label @@ -53,7 +53,7 @@ object GoldTagPolicy { } } new GoldTagPolicy[L] { - def isGoldSpan(start: Int, end: Int):Boolean = { + def isGoldSpan(start: Int, end: Int): Boolean = { val set = goldTop(TriangularArray.index(start,end)) set != null && set.nonEmpty } diff --git a/src/main/scala/epic/parser/projections/GrammarRefinements.scala b/src/main/scala/epic/parser/projections/GrammarRefinements.scala index 229c58ff..91c8e15c 100644 --- a/src/main/scala/epic/parser/projections/GrammarRefinements.scala +++ 
b/src/main/scala/epic/parser/projections/GrammarRefinements.scala @@ -69,19 +69,15 @@ final case class GrammarRefinements[C, F](labels: ProjectionIndexer[C, F], rules } /** Gives the localized refinement of each parent */ - def parentRefinement(r: Int, ref: Int):Int = parentRefinements(r)(ref) + def parentRefinement(r: Int, ref: Int): Int = parentRefinements(r)(ref) private val parentRefinements: Array[Array[Int]] = Array.tabulate(rules.coarseIndex.size) { r => val parent = labels.coarseIndex(rules.coarseIndex.get(r).parent) - rules.refinementsOf(r).map { ref => labels.localize(rules.fineIndex.get(ref).parent)._2 } - - } - // rule -> parentRef -> [ruleRef] private val parentCompatibleRefinements: Array[Array[Array[Int]]] = Array.tabulate(rules.coarseIndex.size) { r => val parent = labels.coarseIndex(rules.coarseIndex.get(r).parent) @@ -94,7 +90,7 @@ final case class GrammarRefinements[C, F](labels: ProjectionIndexer[C, F], rules } private val leftChildCompatibleRefinements: Array[Array[Array[Int]]] = Array.tabulate(rules.coarseIndex.size) { r => - if(rules.coarseIndex.get(r).isInstanceOf[UnaryRule[C]]) { + if (rules.coarseIndex.get(r).isInstanceOf[UnaryRule[C]]) { null } else { val leftChild = labels.coarseIndex(rules.coarseIndex.get(r).asInstanceOf[BinaryRule[C]].left) @@ -108,10 +104,9 @@ final case class GrammarRefinements[C, F](labels: ProjectionIndexer[C, F], rules } private val rightChildCompatibleRefinements: Array[Array[Array[Int]]] = Array.tabulate(rules.coarseIndex.size) { r => - if(rules.coarseIndex.get(r).isInstanceOf[UnaryRule[C]]) { + if (rules.coarseIndex.get(r).isInstanceOf[UnaryRule[C]]) { null } else { - val rightChild = labels.coarseIndex(rules.coarseIndex.get(r).asInstanceOf[BinaryRule[C]].right) val rightChildRefs = Array.fill(labels.refinementsOf(rightChild).length){ArrayBuffer[Int]()} for(ruleRef <- rules.refinementsOf(r)) { @@ -124,7 +119,7 @@ final case class GrammarRefinements[C, F](labels: ProjectionIndexer[C, F], rules // rule -> parentRef -> [ruleRef] private val childCompatibleRefinements: Array[Array[Array[Int]]] = Array.tabulate(rules.coarseIndex.size) { r => - if(rules.coarseIndex.get(r).isInstanceOf[UnaryRule[C]]) { + if (rules.coarseIndex.get(r).isInstanceOf[UnaryRule[C]]) { val child = labels.coarseIndex(rules.coarseIndex.get(r).asInstanceOf[UnaryRule[C]].child) val childRefs = Array.fill(labels.refinementsOf(child).length){ArrayBuffer[Int]()} for(ruleRef <- rules.refinementsOf(r)) { @@ -138,14 +133,13 @@ final case class GrammarRefinements[C, F](labels: ProjectionIndexer[C, F], rules } private val coarseRulesGivenParentRefinement = Array.tabulate(labels.coarseIndex.size) { p => - // refinement -> rules + // refinement -> rules val result = Array.fill(labels.refinementsOf(p).length)(ArrayBuffer[Int]()) - for( (rule, r) <- rules.coarseIndex.pairs if labels.coarseIndex(rule.parent) == p && rule.isInstanceOf[BinaryRule[_]]; ref <- 0 until result.length) { - if(parentCompatibleRefinements(r)(ref).nonEmpty) { + for( (rule, r) <- rules.coarseIndex.pairs if labels.coarseIndex(rule.parent) == p && rule.isInstanceOf[BinaryRule[_]]; ref <- result.indices) { + if (parentCompatibleRefinements(r)(ref).nonEmpty) { result(ref) += r } } - result.map(_.toArray) } @@ -156,7 +150,7 @@ final case class GrammarRefinements[C, F](labels: ProjectionIndexer[C, F], rules } private val leftChildRefinementsGivenCoarseRule:Array[Array[Int]] = Array.tabulate(rules.coarseIndex.size) { r => - if(rules.coarseIndex.get(r).isInstanceOf[UnaryRule[_]]) Array.empty + if 
(rules.coarseIndex.get(r).isInstanceOf[UnaryRule[_]]) Array.empty else { def fineLeftChild(r: Int) = labels.fineIndex(rules.fineIndex.get(r).asInstanceOf[BinaryRule[F]].left) rules.refinementsOf(r).map(fineLeftChild).toSet.toArray.map(labels.localize).sorted @@ -164,7 +158,7 @@ final case class GrammarRefinements[C, F](labels: ProjectionIndexer[C, F], rules } private val rightChildRefinementsGivenCoarseRule:Array[Array[Int]] = Array.tabulate(rules.coarseIndex.size) { r => - if(rules.coarseIndex.get(r).isInstanceOf[UnaryRule[_]]) Array.empty + if (rules.coarseIndex.get(r).isInstanceOf[UnaryRule[_]]) Array.empty else { def fineRightChild(r: Int) = labels.fineIndex(rules.fineIndex.get(r).asInstanceOf[BinaryRule[F]].right) rules.refinementsOf(r).map(fineRightChild).toSet.toArray.map(labels.localize).sorted diff --git a/src/main/scala/epic/parser/projections/LabeledSpanProjector.scala b/src/main/scala/epic/parser/projections/LabeledSpanProjector.scala index ad347718..9369f4ba 100644 --- a/src/main/scala/epic/parser/projections/LabeledSpanProjector.scala +++ b/src/main/scala/epic/parser/projections/LabeledSpanProjector.scala @@ -29,14 +29,14 @@ case class LabeledSpanProjector[L, W](topology: RuleTopology[L], threshold: Doub type MyAnchoring = SpanAnchoring[L, W] private def normalize(ruleScores: OpenAddressHashArray[Double], totals: OpenAddressHashArray[Double]):OpenAddressHashArray[Double] = { - if(ruleScores eq null) null + if (ruleScores eq null) null else { val r = new OpenAddressHashArray[Double](ruleScores.length, Double.NegativeInfinity) for( (rule, score) <- ruleScores.activeIterator) { val parent = topology.parent(rule) - if(score > 0.9999999) { + if (score > 0.9999999) { r(rule) = 10 - } else if(score > 0) { + } else if (score > 0) { r(rule) = math.log(score) - math.log1p(-score) } } @@ -45,13 +45,13 @@ case class LabeledSpanProjector[L, W](topology: RuleTopology[L], threshold: Doub } private def normalizeSpans(totals: OpenAddressHashArray[Double]):OpenAddressHashArray[Double] = { - if(totals eq null) null + if (totals eq null) null else { val r = new OpenAddressHashArray[Double](totals.length, Double.NegativeInfinity) for( (parent, score) <- totals.activeIterator) { - if(score > 0.9999999) { + if (score > 0.9999999) { r(parent) = 10 - } else if(score > 0) { + } else if (score > 0) { r(parent) = math.log(score) - math.log1p(-score) } } @@ -97,13 +97,13 @@ case class SpanAnchoring[L, W](topology: RuleTopology[L], def scoreUnaryRule(begin: Int, end: Int, rule: Int) = { val forSpan = unaryScores(TriangularArray.index(begin, end)) - if(forSpan eq null) Double.NegativeInfinity + if (forSpan eq null) Double.NegativeInfinity else forSpan(rule) } def scoreSpan(begin: Int, end: Int, tag: Int) = { val scores = spanScores(TriangularArray.index(begin, end)) - if(scores ne null) scores(tag) + if (scores ne null) scores(tag) else Double.NegativeInfinity } } diff --git a/src/main/scala/epic/parser/projections/OracleParser.scala b/src/main/scala/epic/parser/projections/OracleParser.scala index a44b9ff9..4a2b3aeb 100644 --- a/src/main/scala/epic/parser/projections/OracleParser.scala +++ b/src/main/scala/epic/parser/projections/OracleParser.scala @@ -52,15 +52,15 @@ class OracleParser[L, L2, W](val grammar: SimpleGrammar[L, L2, W], backupGrammar val projectedTree: BinarizedTree[L] = tree.map(grammar.refinements.labels.project) cache.getOrElseUpdate(words, { val treeconstraints = ChartConstraints.fromTree(grammar.topology.labelIndex, projectedTree) - if(constraints.top.containsAll(treeconstraints.top) && 
constraints.bot.containsAll(treeconstraints.bot)) { + if (constraints.top.containsAll(treeconstraints.top) && constraints.bot.containsAll(treeconstraints.bot)) { synchronized(total += 1) tree } else try { logger.warn { val ratio = synchronized { - problems += 1; - total += 1; + problems += 1 + total += 1 problems * 1.0 / total } f"Gold tree for $words not reachable. $ratio%.2f are bad so far. " @@ -97,7 +97,6 @@ class OracleParser[L, L2, W](val grammar: SimpleGrammar[L, L2, W], backupGrammar throw ex } - def makeGoldPromotingAnchoring(grammar: SimpleGrammar[L, L2, W], w: IndexedSeq[W], tree: BinarizedTree[L2], @@ -118,7 +117,6 @@ class OracleParser[L, L2, W](val grammar: SimpleGrammar[L, L2, W], backupGrammar makeGoldPromotingAnchoring(grammar, w, tree, treeconstraints, constraints & cs) } - override def sparsityPattern: ChartConstraints[L] = constraints def scoreBinaryRule(begin: Int, split: Int, end: Int, rule: Int, ref: Int): Double = { @@ -148,7 +146,6 @@ class OracleParser[L, L2, W](val grammar: SimpleGrammar[L, L2, W], backupGrammar } } - } def oracleMarginalFactory(trees: IndexedSeq[TreeInstance[L2, W]]):ParseMarginal.Factory[L, W] = new ParseMarginal.Factory[L, W] { @@ -227,7 +224,6 @@ object OracleParser { case e: Exception => e.printStackTrace() } - val name = params.name println("Parser " + name) @@ -239,7 +235,6 @@ object OracleParser { println(stats) } - } } diff --git a/src/main/scala/epic/parser/projections/ProjectingCoreGrammar.scala b/src/main/scala/epic/parser/projections/ProjectingCoreGrammar.scala index f7ea0062..84be4b32 100644 --- a/src/main/scala/epic/parser/projections/ProjectingCoreGrammar.scala +++ b/src/main/scala/epic/parser/projections/ProjectingCoreGrammar.scala @@ -25,12 +25,10 @@ import epic.constraints.ChartConstraints case class ProjectingCoreGrammar[L, W](parser: Parser[L, W], projector: ChartProjector[L, W]) extends Grammar[L, W] { - def topology = parser.topology def lexicon = parser.lexicon - override def withPermissiveLexicon: Grammar[L, W] = { ??? 
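+    // left as ???: requesting a permissive lexicon on a projecting grammar
+    // currently throws scala.NotImplementedError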
} @@ -46,4 +44,3 @@ case class ProjectingCoreGrammar[L, W](parser: Parser[L, W], } } - diff --git a/src/main/scala/epic/parser/projections/ProjectionIndexer.scala b/src/main/scala/epic/parser/projections/ProjectionIndexer.scala index deae6bd6..e9e837d2 100644 --- a/src/main/scala/epic/parser/projections/ProjectionIndexer.scala +++ b/src/main/scala/epic/parser/projections/ProjectionIndexer.scala @@ -28,7 +28,7 @@ final class ProjectionIndexer[C, F] private (val coarseIndex: Index[C], for( (coarse, fine) <- indexedProjections.zipWithIndex if coarse != -1) { result(coarse) += fine } - result.map(arr => (arr.toArray)) + result.map(arr => arr.toArray) } // globaleRefined -> localRefined @@ -41,37 +41,37 @@ final class ProjectionIndexer[C, F] private (val coarseIndex: Index[C], Array.range(0, arr.length) } - def localize(f: Int):Int = localizationArray(f) - def globalize(c: Int, f: Int):Int = globalRefinements(c)(f) + def localize(f: Int): Int = localizationArray(f) + def globalize(c: Int, f: Int): Int = globalRefinements(c)(f) def globalize(c: C, f: Int):F = fineIndex.get(globalRefinements(coarseIndex(c))(f)) def indexAndLocalize(f: F):(Int, Int) = { val glob = fineIndex(f) - if(glob < 0) (-1, -1) + if (glob < 0) (-1, -1) else project(glob) -> localize(glob) } - def localize(f: F):(C, Int) = { val i = fineIndex(f) - if(i < 0) throw new RuntimeException(s"Not in fine index: $f") + if (i < 0) throw new RuntimeException(s"Not in fine index: $f") coarseIndex.get(indexedProjections(i)) -> localizationArray(i) } def refinementsOf(c: Int):Array[Int] = globalRefinements(c) + def localRefinements(c: Int):Array[Int] = perSymbolRefinements(c) def numRefinements(c: Int): Int = perSymbolRefinements(c).length def refinementsOf(c: C):IndexedSeq[F] = { val ci = coarseIndex(c) - if(ci < 0) throw new RuntimeException("Not a coarse symbol: " + c) + if (ci < 0) throw new RuntimeException("Not a coarse symbol: " + c) globalRefinements(ci).map(fineIndex.get _) } /** * Computes the projection of the indexed fine label f to an indexed coarse label. 
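+   * (For example, with split labels the fine indices of NP_0 and NP_1 would both
+   * project to the coarse index of NP.)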
*/ - def project(f: Int):Int = indexedProjections(f) + def project(f: Int): Int = indexedProjections(f) def project(f: F):C = coarseIndex.get(project(fineIndex(f))) @@ -93,7 +93,6 @@ final class ProjectionIndexer[C, F] private (val coarseIndex: Index[C], coarseIndex.map(x => x -> refinementsOf(x)).mkString("ProjectionIndexer(", ", ", ")") } - def localizeArray[T:ClassTag](array: Array[T]):Array[Array[T]] = { require(array.length == fineIndex.size) Array.tabulate(coarseIndex.size) { c => @@ -110,8 +109,8 @@ object ProjectionIndexer { val indexedProjections = Encoder.fromIndex(fineIndex).fillArray(-1) for( (l, idx) <- fineIndex.zipWithIndex) { val projectedIdx = coarseIndex(proj(l)) - if(projectedIdx < 0) { - if(!skipMissingCoarse) + if (projectedIdx < 0) { + if (!skipMissingCoarse) throw new RuntimeException("error while indexing" + l + " to " + proj(l) + fineIndex(l)) } else { indexedProjections(idx) = projectedIdx @@ -130,7 +129,6 @@ object ProjectionIndexer { } } new ProjectionIndexer(coarseIndex, fineIndex, indexedProjections) - } def fromSplitter[C, F](coarseIndex: Index[C], split: C=>Seq[F]) = { @@ -141,7 +139,6 @@ object ProjectionIndexer { indexedProjections += cf } new ProjectionIndexer(coarseIndex, fineIndex, indexedProjections.toArray) - } } diff --git a/src/main/scala/epic/parser/repl/DSLGrammar.scala b/src/main/scala/epic/parser/repl/DSLGrammar.scala index e22563fe..46537bd2 100644 --- a/src/main/scala/epic/parser/repl/DSLGrammar.scala +++ b/src/main/scala/epic/parser/repl/DSLGrammar.scala @@ -38,8 +38,6 @@ object DSLGrammar { case DSLLex(a, word, w) => lexicon(a, word) = w } - - val grammar = RuleTopology("S", binaryProductions, unaryProductions) val unsmoothed = new UnsmoothedLexicon(grammar.labelIndex, lexicon.keySet.toSet) Grammar.generative(grammar, unsmoothed, binaryProductions, unaryProductions, lexicon) diff --git a/src/main/scala/epic/parser/repl/ReplGrammar.scala b/src/main/scala/epic/parser/repl/ReplGrammar.scala index b195840c..d362267f 100644 --- a/src/main/scala/epic/parser/repl/ReplGrammar.scala +++ b/src/main/scala/epic/parser/repl/ReplGrammar.scala @@ -32,7 +32,7 @@ class ReplGrammar(treebankPath: String, binarizationKind: String = "xbar") { case "head" => HeadFinder.collins case _ => HeadFinder.collins } - Trees.binarize((_:Tree[String]), headRules) + Trees.binarize(_:Tree[String], headRules) } val maxLength = 15 diff --git a/src/main/scala/epic/preprocess/JavaWordTokenizer.scala b/src/main/scala/epic/preprocess/JavaWordTokenizer.scala index ab27eb61..0d7a068b 100644 --- a/src/main/scala/epic/preprocess/JavaWordTokenizer.scala +++ b/src/main/scala/epic/preprocess/JavaWordTokenizer.scala @@ -34,7 +34,6 @@ import epic.slab.Sentence class JavaWordTokenizer(locale: Locale) extends Tokenizer { def this() = this(Locale.getDefault) - override def apply[In <: Sentence](slab: StringSlab[In]): StringSlab[In with Token] = { slab.addLayer[Token](slab.iterator[Sentence].flatMap { s => val breaker = BreakIterator.getWordInstance(locale) diff --git a/src/main/scala/epic/preprocess/MLSentenceSegmenter.scala b/src/main/scala/epic/preprocess/MLSentenceSegmenter.scala index 6c04d9e4..a604c39a 100644 --- a/src/main/scala/epic/preprocess/MLSentenceSegmenter.scala +++ b/src/main/scala/epic/preprocess/MLSentenceSegmenter.scala @@ -27,9 +27,9 @@ class MLSentenceSegmenter(inf: MLSentenceSegmenter.ClassificationInference) exte slab.addLayer[Sentence]( Iterators.fromProducer { def rec():Option[(Span, Sentence)] = { - if(iter.hasNext) { + if (iter.hasNext) { val pos = iter.next() - 
if(!iter.hasNext || inf.classify(MLSentenceSegmenter.featuresForEndPointDetection(text, pos))) { + if (!iter.hasNext || inf.classify(MLSentenceSegmenter.featuresForEndPointDetection(text, pos))) { val res = Some(Span(lastOffset, math.min(pos + 1, text.length)) -> Sentence()) lastOffset = pos + 1 res @@ -43,7 +43,6 @@ class MLSentenceSegmenter(inf: MLSentenceSegmenter.ClassificationInference) exte rec() }.filterNot(s => text.substring(s._1.begin, s._1.end).forall(_.isWhitespace)) ) - } override def toString = "MLSentenceSegmenter(...)" } @@ -57,7 +56,7 @@ object MLSentenceSegmenter { val oin = new ObjectInputStream(new GZIPInputStream(strm)) oin.readObject().asInstanceOf[MLSentenceSegmenter] } finally { - if(strm != null) + if (strm != null) strm.close() } } @@ -67,11 +66,11 @@ object MLSentenceSegmenter { breeze.util.readObject[MLSentenceSegmenter](file) } - def nextPotentialSentenceBoundary(text: String, offset: Int):Int = { + def nextPotentialSentenceBoundary(text: String, offset: Int): Int = { var start = offset + 1 while (start < text.length) { val codepoint = text.codePointAt(start) - if(isPotentialSentenceBoundary(text, start, codepoint)) { + if (isPotentialSentenceBoundary(text, start, codepoint)) { return start } start += Character.charCount(codepoint) @@ -80,15 +79,13 @@ object MLSentenceSegmenter { } def codepointToString(cp: Int) = { - if(Character.charCount(cp) == 1 && !Character.isISOControl(cp) && !Character.isSpaceChar(cp)) { + if (Character.charCount(cp) == 1 && !Character.isISOControl(cp) && !Character.isSpaceChar(cp)) { cp.toChar.toString } else { Character.getName(cp) } - } - case class CodePointFeature(cp: String, offset: Int = 0) extends Feature case class NextRealLetterFeature(ct: Int) extends Feature { override def toString = { @@ -115,7 +112,7 @@ object MLSentenceSegmenter { case class JavaDistFeature(x: Int) extends Feature case object LineIsShortFeature extends Feature - private def stringForCharType(ct: Int):String = { + private def stringForCharType(ct: Int): String = { val characterClass = Class.forName("java.lang.Character") val fields = characterClass.getDeclaredFields() for (f <- fields) { @@ -136,16 +133,16 @@ object MLSentenceSegmenter { Array(BiasFeature, EOFFeature) } else { val buf = new ArrayBuffer[Feature] -// val break = BreakIterator.getSentenceInstance -// break.setText(text) -// val pos = break.following(math.max(offset - 3, 0)) -// buf += JavaDistFeature(math.min(pos - offset, 5)) + // val break = BreakIterator.getSentenceInstance + // break.setText(text) + // val pos = break.following(math.max(offset - 3, 0)) + // buf += JavaDistFeature(math.min(pos - offset, 5)) buf += BiasFeature // baseline features for the current char val curCharFeatures: IndexedSeq[Feature] = addCharFeatures(text, offset, 0) buf ++= curCharFeatures - if(previousLineIsShort(text, offset)) { + if (previousLineIsShort(text, offset)) { buf += LineIsShortFeature for(m <- curCharFeatures) { buf += CrossProductFeature(LineIsShortFeature, m) @@ -176,34 +173,31 @@ object MLSentenceSegmenter { buf += CrossProductFeature(f1, CrossProductFeature(fmid, f2)) } - for(f1 <- addCharFeatures(text, offset, -1); f2 <- addCharFeatures(text, offset, 2)) { buf += CrossProductFeature(f1, f2) } - val prevSpace = math.max(text.lastIndexWhere(!_.isLetterOrDigit, offset - 2), -1) // -1 is ok, assume BOS is space buf += ContextWord(text.substring(prevSpace + 1, offset)) buf += LastWordLength(offset - prevSpace) val nextNotSpace = text.indexWhere(c => !c.isSpaceChar && !c.isControl, offset + 1) - 
if(nextNotSpace >= 0) { + if (nextNotSpace >= 0) { val nextWordEnd = if (text.charAt(nextNotSpace).isLetterOrDigit){ text.indexWhere(c => !c.isLetterOrDigit, nextNotSpace + 1) } else { text.indexWhere(c => Character.getType(c) != text.charAt(nextNotSpace), nextNotSpace + 1) } buf += ContextWord(text.substring(prevSpace + 1, prevSpace + 2)+"--" + text.substring(nextNotSpace, nextNotSpace + 1), -3) -// if(nextWordEnd >= 0) { -// buf += ContextWord(text.substring(nextNotSpace, nextWordEnd), 1) -// } + // if (nextWordEnd >= 0) { + // buf += ContextWord(text.substring(nextNotSpace, nextWordEnd), 1) + // } } val nextLetterPos = text.indexWhere(_.isLetterOrDigit, offset + 1) - if(nextLetterPos >= 0) { + if (nextLetterPos >= 0) { buf += NextRealLetterFeature(Character.getType(text.charAt(nextLetterPos))) } - buf += SurroundingCharFeature(if (offset == 0) "BOS" else codepointToString(text.codePointBefore(offset)), if (nextNotSpace < 0) "EOS" else codepointToString(text.codePointAt(nextNotSpace))) @@ -215,15 +209,13 @@ object MLSentenceSegmenter { buf.toArray } - } - def addCharFeatures(text: String, base: Int, rel: Int): IndexedSeq[Feature] = { val buf = new ArrayBuffer[Feature] - val next = try {text.offsetByCodePoints(base, rel)} catch { case ex: IndexOutOfBoundsException => if(rel > 0) text.length else 0} + val next = try {text.offsetByCodePoints(base, rel)} catch { case ex: IndexOutOfBoundsException => if (rel > 0) text.length else 0} val (cp, cps) = - if(next < 0 || next >= text.length) { + if (next < 0 || next >= text.length) { 0 -> "###" } else { val cp = text.codePointAt(next) @@ -242,12 +234,11 @@ object MLSentenceSegmenter { case Character.OTHER_PUNCTUATION if ch == '\'' || ch == '"' => true case _ => false } - } // http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/SentenceBreakProperty.txt // http://www.unicode.org/reports/tr29/#Sentence_Boundaries - def isPotentialSentenceBoundary(text: String, offset: Int, codepoint: Int):Boolean = { + def isPotentialSentenceBoundary(text: String, offset: Int, codepoint: Int): Boolean = { Character.getType(codepoint) match { case Character.OTHER_PUNCTUATION => codepoint != ',' && isProbablyNotContraction(text, offset, codepoint, '\'') case Character.INITIAL_QUOTE_PUNCTUATION => true @@ -265,7 +256,7 @@ object MLSentenceSegmenter { } case Character.CONTROL => isControl(codepoint) && (offset == 0 || -// !isPotentialSentenceBoundary(text, offset - Character.charCount(codepoint), text.codePointBefore(offset)) + // !isPotentialSentenceBoundary(text, offset - Character.charCount(codepoint), text.codePointBefore(offset)) text.codePointBefore(offset)!= ',' && (offset == text.length - 1 || isControl(text.codePointAt(offset + 1)) || previousLineIsShort(text, offset) || Character.isUpperCase(text.codePointAt(offset + 1))) ) @@ -275,20 +266,17 @@ object MLSentenceSegmenter { } - def isControl(codepoint: Int): Boolean = { codepoint == '\r' || codepoint == '\n' || codepoint == '\t' } - def previousLineIsShort(s: String, pos: Int): Boolean = { val SHORT_LINE = 35 // in characters (pos - s.lastIndexOf('\n', pos - 1) ) < SHORT_LINE } - def isProbablyNotContraction(text: String, offset: Int, codepoint: Int, quote: Char): Boolean = { - (codepoint != quote || offset >= text.length - 1 || offset == 0 || !Character.isLetterOrDigit(text.codePointAt(offset + 1)) || !Character.isLetterOrDigit(text.codePointBefore(offset))) + codepoint != quote || offset >= text.length - 1 || offset == 0 || !Character.isLetterOrDigit(text.codePointAt(offset + 1)) || 
!Character.isLetterOrDigit(text.codePointBefore(offset)) } def potentialSentenceBoundariesIterator(text: String):Iterator[Int] = new Iterator[Int] { @@ -309,43 +297,41 @@ object MLSentenceSegmenter { var lastSpan = Span(0, 0) val mapped = for(s@Span(begin, _p) <- endPoints if !lastSpan.crosses(s) && !lastSpan.contains(s)) yield { var p = math.max(_p, 0) - var cp = text.codePointAt(p) - if(p > 0 && !Character.isSpaceChar(cp) && !isPotentialSentenceBoundary(text, p, cp)) { + if (p > 0 && !Character.isSpaceChar(cp) && !isPotentialSentenceBoundary(text, p, cp)) { p -= Character.charCount(cp) cp = text.codePointAt(p) } var earliestControlChar = p val nextNonSpacePos = text.indexWhere(!_.isSpaceChar, p) - if(nextNonSpacePos > p) { + if (nextNonSpacePos > p) { val ccp = text.charAt(nextNonSpacePos) if (ccp == '\n' || ccp == '\t' || ccp == '\r') { earliestControlChar = nextNonSpacePos } } - while(p > 0 && (Character.isSpaceChar(cp) || cp == '\n' || cp == '\t' || cp == '\r')) { - if(!Character.isSpaceChar(cp)) { + while (p > 0 && (Character.isSpaceChar(cp) || cp == '\n' || cp == '\t' || cp == '\r')) { + if (!Character.isSpaceChar(cp)) { earliestControlChar = p } p -= Character.charCount(cp) cp = text.codePointAt(p) } - - if(!isPotentialSentenceBoundary(text, p, cp)) { + if (!isPotentialSentenceBoundary(text, p, cp)) { p += Character.charCount(cp) cp = text.codePointAt(p) } - if(Character.isSpaceChar(cp) && p < text.length) { + if (Character.isSpaceChar(cp) && p < text.length) { p = earliestControlChar cp = text.codePointAt(p) } - if(lastSpan.crosses(s) || lastSpan.contains(s)) { + if (lastSpan.crosses(s) || lastSpan.contains(s)) { println(text.substring(lastSpan.begin, lastSpan.end)) println(text.substring(s.begin, s.end)) println(text.charAt(p)) @@ -382,7 +368,6 @@ object MLSentenceSegmenter { def main(args: Array[String]):Unit = { val mascDir = new File(args(0)) - var sentenceBoundaryProblems = for(dir <- new File(new File(mascDir,"data"), "written").listFiles() if !dir.toString.contains("twitter") && dir.isDirectory; f <- dir.listFiles(new FilenameFilter { @@ -391,19 +376,18 @@ object MLSentenceSegmenter { val slab = MascSlab(f.toURI.toURL) val slabWithSentences = MascSlab.s(slab) - val guessPoints: IndexedSeq[Int] = potentialSentenceBoundariesIterator(slabWithSentences.content).toIndexedSeq val text = slab.content val goldPoints = adjustGoldSentenceBoundaries(text, slabWithSentences.iterator[Sentence].map(_._1)) -// println("<<<<" + f ) -// printOutSentenceBoundaries(text, guessPoints.toSet, goldPoints) + // println("<<<<" + f ) + // printOutSentenceBoundaries(text, guessPoints.toSet, goldPoints) for(guess <- guessPoints) yield { val contextBegin = math.max(0, guess - 50) val contextEnd = math.min(text.length, guess + 50) - val context = if(guess != text.length) { + val context = if (guess != text.length) { text.substring(contextBegin, guess) + "[[" + text.charAt(guess) + "]]" + text.substring(guess + 1, contextEnd) } else { text.substring(contextBegin, guess) + "[[]]" @@ -415,14 +399,13 @@ object MLSentenceSegmenter { } } - val extraInstances = { for ( (text, goldPoints) <- extraExamples) yield { val guessPoints: IndexedSeq[Int] = potentialSentenceBoundariesIterator(text).toIndexedSeq for (guess <- guessPoints) yield { val contextBegin = math.max(0, guess - 50) val contextEnd = math.min(text.length, guess + 50) - val context = if(guess != text.length) { + val context = if (guess != text.length) { text.substring(contextBegin, guess) + "[[" + text.charAt(guess) + "]]" + text.substring(guess + 1, 
contextEnd) } else { text.substring(contextBegin, guess) + "[[]]" @@ -455,7 +438,7 @@ object MLSentenceSegmenter { val inf = model.inferenceFromWeights(bestWeights) - val decoded = (Encoder.fromIndex(featureIndex).decode(bestWeights)) + val decoded = Encoder.fromIndex(featureIndex).decode(bestWeights) println("Train") evalDev(inf, train, decoded) @@ -464,7 +447,6 @@ object MLSentenceSegmenter { println("Special") evalDev(inf, extraInstances.flatten, decoded) - val segmenter: MLSentenceSegmenter = new MLSentenceSegmenter(inf) breeze.util.writeObject(new File("en-sent-segmenter.model.ser.gz"), segmenter) @@ -476,8 +458,8 @@ object MLSentenceSegmenter { var tN, fN = 0 var tP, fP = 0 for(inst <- dev) { - if(inst.label != inf.classify(inst.features)) { - val weights = (inst.features.toIndexedSeq.map( f => f -> decoded(f))) + if (inst.label != inf.classify(inst.features)) { + val weights = inst.features.toIndexedSeq.map(f => f -> decoded(f)) val sum: Double = weights.map(_._2).sum println("===========") println(inst.label, inst.id, sum) @@ -516,7 +498,6 @@ object MLSentenceSegmenter { } } - case class Marginal(prob: Double, logPartition: Double) extends epic.framework.Marginal class ClassificationModel(val featureIndex: Index[Feature]) extends StandardExpectedCounts.Model[SentenceDecisionInstance] { @@ -526,7 +507,6 @@ object MLSentenceSegmenter { type Inference = MLSentenceSegmenter.ClassificationInference type Scorer = ClassificationInference - override def inferenceFromWeights(weights: DenseVector[Double]): Inference = new ClassificationInference(featureIndex, weights) override def accumulateCounts(inf: Inference, s: Scorer, d: SentenceDecisionInstance, m: Marginal, accum: ExpectedCounts, scale: Double): Unit = { @@ -536,7 +516,6 @@ object MLSentenceSegmenter { } } - @SerialVersionUID(1L) case class ClassificationInference(featureIndex: Index[Feature], weights: DenseVector[Double]) extends epic.framework.Inference[SentenceDecisionInstance] { type Scorer = ClassificationInference @@ -565,7 +544,6 @@ object MLSentenceSegmenter { val fs = new FeatureVector(v.features.map(featureIndex).filterNot(_ == -1)) val act = weights dot fs val prob = sigmoid(act) - Marginal(prob, -log1p(-prob)) } } diff --git a/src/main/scala/epic/preprocess/NewLineSentenceSegmenter.scala b/src/main/scala/epic/preprocess/NewLineSentenceSegmenter.scala index 6ed29679..75f4ab4b 100644 --- a/src/main/scala/epic/preprocess/NewLineSentenceSegmenter.scala +++ b/src/main/scala/epic/preprocess/NewLineSentenceSegmenter.scala @@ -19,15 +19,14 @@ class NewLineSentenceSegmenter(locale: Locale = Locale.getDefault) extends Sente val spans = new ArrayBuffer[(Span, Sentence)]() var start = 0 - while(m.find()) { + while (m.find()) { val end = m.end() - if(end - start > 1) + if (end - start > 1) spans += (Span(start, end) -> Sentence()) start = end } spans += Span(start, slab.content.length) -> Sentence() - slab.addLayer[Sentence](spans) } } @@ -35,7 +34,7 @@ class NewLineSentenceSegmenter(locale: Locale = Locale.getDefault) extends Sente class SegmentingIterator(inner: BreakIterator, private var start: Int = 0, private val last: Int = -1) extends Iterator[Span] { private var end = inner.following(start) - def hasNext = (end != BreakIterator.DONE && (last == -1 || end <= last)) + def hasNext = end != BreakIterator.DONE && (last == -1 || end <= last) def next = { val res = Span(start, end) diff --git a/src/main/scala/epic/preprocess/RegexSearchTokenizer.scala b/src/main/scala/epic/preprocess/RegexSearchTokenizer.scala index 
32160148..f2d54840 100644 --- a/src/main/scala/epic/preprocess/RegexSearchTokenizer.scala +++ b/src/main/scala/epic/preprocess/RegexSearchTokenizer.scala @@ -28,19 +28,14 @@ import epic.trees.Span case class RegexSearchTokenizer(pattern : String) extends Tokenizer { private val compiled = pattern.r - def apply[In <: Sentence](slab:StringSlab[In]):StringSlab[In with Token] = { slab.addLayer[Token](slab.iterator[Sentence].flatMap { s => compiled.findAllMatchIn(slab.spanned(s._1)).map{ m => Span(m.start, m.end) -> new Token(m.group(0))} }) } - - - - -// override def apply(doc : String) = new Iterable[String] { -// override def iterator = (pattern.r.findAllIn(doc)); -// } + // override def apply(doc : String) = new Iterable[String] { + // override def iterator = (pattern.r.findAllIn(doc)); + // } override def toString: String = ScalaRunTime._toString(this) } diff --git a/src/main/scala/epic/preprocess/RegexSplitTokenizer.scala b/src/main/scala/epic/preprocess/RegexSplitTokenizer.scala index 9fd3a7f4..115c257f 100644 --- a/src/main/scala/epic/preprocess/RegexSplitTokenizer.scala +++ b/src/main/scala/epic/preprocess/RegexSplitTokenizer.scala @@ -45,7 +45,7 @@ case class RegexSplitTokenizer(pattern : String) extends Tokenizer { spans += (Span(start, end) -> Token(slab.content.substring(start, end))) start = m.end() } - if(start != slab.content.length) + if (start != slab.content.length) spans += Span(start, slab.content.length) -> Token(slab.content.substring(start, slab.content.length)) slab.addLayer[Token](spans) } diff --git a/src/main/scala/epic/preprocess/SentenceSegmenter.scala b/src/main/scala/epic/preprocess/SentenceSegmenter.scala index 040db71b..9bb0361f 100644 --- a/src/main/scala/epic/preprocess/SentenceSegmenter.scala +++ b/src/main/scala/epic/preprocess/SentenceSegmenter.scala @@ -20,7 +20,6 @@ trait SentenceSegmenter extends StringAnalysisFunction[Any, Sentence] with (Stri } - object SegmentSentences { case class Params(splitOnNewline: Boolean = false) def main(_args: Array[String]):Unit = { @@ -28,7 +27,7 @@ object SegmentSentences { val params = config.readIn[Params]() import params._ - val ins = if(args.isEmpty) IndexedSeq(System.in) else args.toStream.map(new FileInputStream(_)) + val ins = if (args.isEmpty) IndexedSeq(System.in) else args.toStream.map(new FileInputStream(_)) val streaming = new StreamSentenceSegmenter(MLSentenceSegmenter.bundled().get, segmentOnNewLines = params.splitOnNewline) for(in <- ins) { try { diff --git a/src/main/scala/epic/preprocess/StreamSentenceSegmenter.scala b/src/main/scala/epic/preprocess/StreamSentenceSegmenter.scala index add7d5bf..c03be66c 100644 --- a/src/main/scala/epic/preprocess/StreamSentenceSegmenter.scala +++ b/src/main/scala/epic/preprocess/StreamSentenceSegmenter.scala @@ -39,11 +39,11 @@ class StreamSentenceSegmenter(val baseSegmenter: SentenceSegmenter, segmentOnNew val buffer = new Array[Char](1024 * 1024) var done = false Iterators.fromProducer { - if(done) { + if (done) { None } else { val numRead = reader.read(buffer) - if(numRead == -1) { + if (numRead == -1) { done = true None } else { diff --git a/src/main/scala/epic/preprocess/TextExtractor.scala b/src/main/scala/epic/preprocess/TextExtractor.scala index be42595f..f399be49 100644 --- a/src/main/scala/epic/preprocess/TextExtractor.scala +++ b/src/main/scala/epic/preprocess/TextExtractor.scala @@ -73,14 +73,11 @@ object TextExtractor { val textHandler = new ToTextContentHandler() { override def ignorableWhitespace(ch: Array[Char], start: Int, length: Int): Unit = 
characters(ch, start, length) - override def startElement(uri: String, localName: String, qName: String, attributes: Attributes): Unit = { super.startElement(uri, localName, qName, attributes) - if (newLineTags(qName.toLowerCase)) { ignorableWhitespace(Array('\n'), 0, 1) } - } override def endElement(uri: String, localName: String, qName: String): Unit = { @@ -91,7 +88,7 @@ object TextExtractor { } } - val handler = if(extractMainContentOnly) { + val handler = if (extractMainContentOnly) { new BoilerpipeContentHandler(textHandler, ArticleExtractor.getInstance()) { // stupid handler doesn't pass whitespace /* @@ -103,9 +100,6 @@ object TextExtractor { } } */ - - - setIncludeMarkup(true) } } else { @@ -125,14 +119,11 @@ object TextExtractor { stream.close() } - val content = textHandler.toString.trim Slab(content).addLayer(Span(0, content.length) -> epic.slab.Source(url)) } - - /* TODO: I'd like to be able to keep the XHTML formatting in the text, but right now that looks like it's going to cause problems with the way slabs work. (Namely, we'll get discontiguous blocks of text, even in the middle of words. * Uses boilerpipe to extract the content from an XHTML document @@ -168,11 +159,9 @@ object TextExtractor { sb.append("\"") } - sb.append('>') } - override def endElement(uri: String, localName: String, qName: String): Unit = { sb.append(" Content(getLabelsForTextElement(doc, index))) } } @@ -204,7 +193,6 @@ object TextExtractor { } - private def getLabelsForTextElement(doc: TextDocument, index: Int): Set[String] = { doc.getTextBlocks.asScala.find(_.getContainedTextElements.get(index)).map(b => Option(b.getLabels).map(_.asScala).iterator.flatten.toSet).getOrElse(Set.empty) } @@ -213,12 +201,9 @@ object TextExtractor { def extractXHTML(url: URL) = { val metadata = new Metadata() val stream: InputStream = TikaInputStream.get(url, metadata) - val loader = new Loader() new Tika().getParser.parse(stream, loader, metadata, new ParseContext) - loader.value - } def foo(url: URL)= { @@ -241,7 +226,7 @@ import scala.xml._ override def endDocument() { newAdapter.endDocument() // the pdf parser sends two end documents... 
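+      // so guard the pop below: the second endDocument would otherwise pop an empty scope stack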
- if(newAdapter.scopeStack.nonEmpty) + if (newAdapter.scopeStack.nonEmpty) newAdapter.scopeStack.pop() } override def endElement(uri: String, localName: String, qName: String) { @@ -261,8 +246,6 @@ import scala.xml._ override def ignorableWhitespace(ch: Array[Char], start: Int, length: Int): Unit = { characters(ch, start, length) } - - } def hasTika = { diff --git a/src/main/scala/epic/preprocess/Textify.scala b/src/main/scala/epic/preprocess/Textify.scala index aef153be..40702ca2 100644 --- a/src/main/scala/epic/preprocess/Textify.scala +++ b/src/main/scala/epic/preprocess/Textify.scala @@ -18,17 +18,12 @@ object Textify { println(f) val out = new File(outdir, f.getName) val toks = preprocess(f) - - val oo = new PrintWriter(new FileWriter(out)) - for(line <- toks) { oo.println(line.mkString("\t")) } - oo.close() } - } } diff --git a/src/main/scala/epic/preprocess/TreebankTokenizer.scala b/src/main/scala/epic/preprocess/TreebankTokenizer.scala index 656fb5f9..470d7576 100644 --- a/src/main/scala/epic/preprocess/TreebankTokenizer.scala +++ b/src/main/scala/epic/preprocess/TreebankTokenizer.scala @@ -29,7 +29,6 @@ class TreebankTokenizer() extends Tokenizer with Serializable { }) } - } object TreebankTokenizer extends TreebankTokenizer { @@ -71,7 +70,6 @@ object TreebankTokenizer extends TreebankTokenizer { slabWithTokens.iterator[Sentence].map{sent => val gold = slabWithTokens.covered[Segment](sent._1).map { case (span, tok) => slab.spanned(span)} val guess = TreebankTokenizer(slab.spanned(sent._1)) - (gold, guess, slab.spanned(sent._1)) } } diff --git a/src/main/scala/epic/preprocess/WhitespaceTokenizer.scala b/src/main/scala/epic/preprocess/WhitespaceTokenizer.scala index ebdddb65..facc4cfc 100644 --- a/src/main/scala/epic/preprocess/WhitespaceTokenizer.scala +++ b/src/main/scala/epic/preprocess/WhitespaceTokenizer.scala @@ -20,14 +20,12 @@ package epic.preprocess * * @author dramage */ -class WhitespaceTokenizer() extends RegexSplitTokenizer("\\s+"); +class WhitespaceTokenizer() extends RegexSplitTokenizer("\\s+") object WhitespaceTokenizer { - def apply() : WhitespaceTokenizer = new WhitespaceTokenizer; - - private val _instance : WhitespaceTokenizer = apply(); - def apply(in : String) : Iterable[String] = _instance(in); - + def apply() : WhitespaceTokenizer = new WhitespaceTokenizer + private val _instance : WhitespaceTokenizer = apply() + def apply(in : String) : Iterable[String] = _instance(in) } diff --git a/src/main/scala/epic/preprocess/package.scala b/src/main/scala/epic/preprocess/package.scala index b01f062f..da3ac3f3 100644 --- a/src/main/scala/epic/preprocess/package.scala +++ b/src/main/scala/epic/preprocess/package.scala @@ -11,7 +11,7 @@ import java.net.URL package object preprocess { def tokenize(sentence: String): IndexedSeq[String] = TreebankTokenizer(sentence) - def loadContent(url: URL):String = TextExtractor.extractText(url) + def loadContent(url: URL): String = TextExtractor.extractText(url) def preprocess(url: URL):IndexedSeq[IndexedSeq[String]] = { preprocess(loadContent(url)) @@ -25,7 +25,6 @@ package object preprocess { preprocess(file.toURI.toURL) } - private lazy val _seg = MLSentenceSegmenter.bundled().get } diff --git a/src/main/scala/epic/sentiment/SentimentEvaluator.scala b/src/main/scala/epic/sentiment/SentimentEvaluator.scala index acbc679a..e7c6ed96 100644 --- a/src/main/scala/epic/sentiment/SentimentEvaluator.scala +++ b/src/main/scala/epic/sentiment/SentimentEvaluator.scala @@ -8,7 +8,7 @@ object SentimentEvaluator { 8 1133 25941 1213 10 1 392 1340 
3926 122 2 84 106 914 572 -"""); +""") def socherDevRootMatrix = readStringMatrix(""" 14 112 7 6 0 @@ -16,7 +16,7 @@ object SentimentEvaluator { 2 112 41 72 2 1 68 14 165 31 1 17 1 100 46 -"""); +""") def socherNonneutralDevSpanMatrix = readStringMatrix(""" 119 690 114 35 1 @@ -24,7 +24,7 @@ object SentimentEvaluator { 6 745 20576 855 5 1 314 1088 3306 113 2 78 97 828 509 -"""); +""") def socherNonneutralDevRootMatrix = readStringMatrix(""" 14 112 7 6 0 @@ -32,7 +32,7 @@ object SentimentEvaluator { 0 0 0 0 0 1 68 14 165 31 1 17 1 100 46 -"""); +""") def socherNonneutralTestSpanMatrix = readStringMatrix(""" 295 1166 250 89 0 @@ -40,7 +40,7 @@ object SentimentEvaluator { 11 1592 42470 1694 7 3 538 2404 6205 216 3 142 253 2013 1123 -"""); +""") def socherNonneutralTestRootMatrix = readStringMatrix(""" 44 193 23 19 0 @@ -48,7 +48,7 @@ object SentimentEvaluator { 0 0 0 0 0 0 131 31 297 51 0 36 8 255 100 -"""); +""") def socherNonneutralTestSpanMatrixNew = transpose(readStringMatrix(""" 294 147 11 3 3 @@ -56,7 +56,7 @@ object SentimentEvaluator { 250 2467 42469 2403 252 89 498 1700 6208 2014 0 0 7 215 1124 -""")); +""")) def socherNonneutralTestRootMatrixNew = transpose(readStringMatrix(""" 44 39 0 0 0 @@ -64,82 +64,82 @@ object SentimentEvaluator { 23 62 0 30 8 19 81 0 299 255 0 0 0 50 100 -""")); +""")) - def readStringMatrix(str: String) = str.split("\n").map(_.trim).filter(!_.isEmpty).map(_.split("\\s+").map(_.toInt)); + def readStringMatrix(str: String) = str.split("\n").map(_.trim).filter(!_.isEmpty).map(_.split("\\s+").map(_.toInt)) def transpose(arr: Array[Array[Int]]) = arr.transpose def printFromConfusionMatrix(mat: Array[Array[Int]]) { // println("Accuracy: " + accuracy(mat)); // agrees with the Stanford system's way of combining the matrix -// println("Ternary: " + ternaryCoarseEval(mat)); -// println("Binary: " + binaryCoarseEval(mat)); +// println("Ternary: " + ternaryCoarseEval(mat)) +// println("Binary: " + binaryCoarseEval(mat)) // println("Socher binary: " + socherCoarseEval(mat)); // agrees with the Stanford system's way of combining the matrix println("Accuracy: " + accuracy(mat, isCorrectNormal, isUsedAlways)); // agrees with the Stanford system's way of combining the matrix - println("Ternary: " + accuracy(mat, isCorrectTernary, isUsedAlways)); - println("Binary: " + accuracy(mat, isCorrectBinary, isUsedBinaryCoarse)); + println("Ternary: " + accuracy(mat, isCorrectTernary, isUsedAlways)) + println("Binary: " + accuracy(mat, isCorrectBinary, isUsedBinaryCoarse)) } def accuracy(mat: Array[Array[Int]]) = { - val numer = (0 until mat.size).map(i => mat(i)(i)).reduce(_+_); - val denom = (0 until mat.size).map(i => mat(i).reduce(_+_)).reduce(_+_); - renderNumerDenom(numer, denom); + val numer = mat.indices.map(i => mat(i)(i)).reduce(_+_) + val denom = mat.indices.map(i => mat(i).reduce(_+_)).reduce(_+_) + renderNumerDenom(numer, denom) } def accuracy(mat: Array[Array[Int]], isCorrect: (Int, Int) => Boolean, isUsed: (Int, Int) => Boolean) = { - val numer = (0 until mat.size).map(i => (0 until mat(i).size).map(j => { + val numer = mat.indices.map(i => mat(i).indices.map(j => { if (isUsed(i, j) && isCorrect(i, j)) mat(i)(j) else 0 - }).reduce(_+_)).reduce(_+_); - val denom = (0 until mat.size).map(i => (0 until mat(i).size).map(j => { + }).reduce(_+_)).reduce(_+_) + val denom = mat.indices.map(i => mat(i).indices.map(j => { if (isUsed(i, j)) mat(i)(j) else 0 - }).reduce(_+_)).reduce(_+_); - renderNumerDenom(numer, denom); + }).reduce(_+_)).reduce(_+_) + renderNumerDenom(numer, 
denom) } - def isCorrectNormal(gold: Int, guess: Int) = gold == guess; - def isCorrectTernary(gold: Int, guess: Int) = (gold < 2 && guess < 2) || (gold > 2 && guess > 2) || (gold == 2 && guess == 2); - def isCorrectBinary(gold: Int, guess: Int) = (gold < 2 && guess < 2) || (gold > 2 && guess > 2); + def isCorrectNormal(gold: Int, guess: Int) = gold == guess + def isCorrectTernary(gold: Int, guess: Int) = (gold < 2 && guess < 2) || (gold > 2 && guess > 2) || (gold == 2 && guess == 2) + def isCorrectBinary(gold: Int, guess: Int) = (gold < 2 && guess < 2) || (gold > 2 && guess > 2) - def isUsedAlways(gold: Int, guess: Int) = true; - def isUsedBinaryCoarse(gold: Int, guess: Int) = gold != 2; + def isUsedAlways(gold: Int, guess: Int) = true + def isUsedBinaryCoarse(gold: Int, guess: Int) = gold != 2 // def ternaryCoarseEval(mat: Array[Array[Int]]) = { // val numer = mat(0)(0) + mat(0)(1) + mat(1)(0) + mat(1)(1) + mat(2)(2) + mat(3)(3) + mat(3)(4) + mat(4)(3) + mat(4)(4); -// val denom = (0 until mat.size).map(i => mat(i).reduce(_+_)).reduce(_+_); -// renderNumerDenom(numer, denom); +// val denom = (0 until mat.size).map(i => mat(i).reduce(_+_)).reduce(_+_) +// renderNumerDenom(numer, denom) // } // // def binaryCoarseEval(mat: Array[Array[Int]]) = { -// val numer = mat(0)(0) + mat(0)(1) + mat(1)(0) + mat(1)(1) + mat(3)(3) + mat(3)(4) + mat(4)(3) + mat(4)(4); -// val denom = numer + mat(0)(3) + mat(0)(4) + mat(1)(3) + mat(1)(4) + mat(3)(0) + mat(3)(1) + mat(4)(0) + mat(4)(1) + mat(0)(2) + mat(1)(2) + mat(3)(2) + mat(4)(2); -// renderNumerDenom(numer, denom); +// val numer = mat(0)(0) + mat(0)(1) + mat(1)(0) + mat(1)(1) + mat(3)(3) + mat(3)(4) + mat(4)(3) + mat(4)(4) +// val denom = numer + mat(0)(3) + mat(0)(4) + mat(1)(3) + mat(1)(4) + mat(3)(0) + mat(3)(1) + mat(4)(0) + mat(4)(1) + mat(0)(2) + mat(1)(2) + mat(3)(2) + mat(4)(2) +// renderNumerDenom(numer, denom) // } // // def socherCoarseEval(mat: Array[Array[Int]]) = { -// val numer = mat(0)(0) + mat(0)(1) + mat(1)(0) + mat(1)(1) + mat(3)(3) + mat(3)(4) + mat(4)(3) + mat(4)(4); -// val denom = numer + mat(0)(3) + mat(0)(4) + mat(1)(3) + mat(1)(4) + mat(3)(0) + mat(3)(1) + mat(4)(0) + mat(4)(1); -// renderNumerDenom(numer, denom); +// val numer = mat(0)(0) + mat(0)(1) + mat(1)(0) + mat(1)(1) + mat(3)(3) + mat(3)(4) + mat(4)(3) + mat(4)(4) +// val denom = numer + mat(0)(3) + mat(0)(4) + mat(1)(3) + mat(1)(4) + mat(3)(0) + mat(3)(1) + mat(4)(0) + mat(4)(1) +// renderNumerDenom(numer, denom) // } def renderNumerDenom(numer: Int, denom: Int) = { - numer + " / " + denom + " = " + (numer.toDouble/denom.toDouble); + numer + " / " + denom + " = " + (numer.toDouble/denom.toDouble) } def main(args: Array[String]) { - println("DEV SPAN"); - printFromConfusionMatrix(socherDevSpanMatrix); - println("DEV ROOT"); - printFromConfusionMatrix(socherDevRootMatrix); - println("NONNEUTRAL DEV SPAN"); - printFromConfusionMatrix(socherNonneutralDevSpanMatrix); - println("NONNEUTRAL DEV ROOT"); - printFromConfusionMatrix(socherNonneutralDevRootMatrix); - println("NONNEUTRAL TEST SPAN"); - printFromConfusionMatrix(socherNonneutralTestSpanMatrix); - println("NONNEUTRAL TEST ROOT"); + println("DEV SPAN") + printFromConfusionMatrix(socherDevSpanMatrix) + println("DEV ROOT") + printFromConfusionMatrix(socherDevRootMatrix) + println("NONNEUTRAL DEV SPAN") + printFromConfusionMatrix(socherNonneutralDevSpanMatrix) + println("NONNEUTRAL DEV ROOT") + printFromConfusionMatrix(socherNonneutralDevRootMatrix) + println("NONNEUTRAL TEST SPAN") + 
printFromConfusionMatrix(socherNonneutralTestSpanMatrix) + println("NONNEUTRAL TEST ROOT") printFromConfusionMatrix(socherNonneutralTestRootMatrix) - println("NONNEUTRAL TEST SPAN NEW"); - printFromConfusionMatrix(socherNonneutralTestSpanMatrixNew); - println("NONNEUTRAL TEST ROOT NEW"); - printFromConfusionMatrix(socherNonneutralTestRootMatrixNew); + println("NONNEUTRAL TEST SPAN NEW") + printFromConfusionMatrix(socherNonneutralTestSpanMatrixNew) + println("NONNEUTRAL TEST ROOT NEW") + printFromConfusionMatrix(socherNonneutralTestRootMatrixNew) } } \ No newline at end of file diff --git a/src/main/scala/epic/sentiment/SentimentLossAugmentation.scala b/src/main/scala/epic/sentiment/SentimentLossAugmentation.scala index 01d4812c..8d37d037 100644 --- a/src/main/scala/epic/sentiment/SentimentLossAugmentation.scala +++ b/src/main/scala/epic/sentiment/SentimentLossAugmentation.scala @@ -20,7 +20,7 @@ case class SentimentLossAugmentation[W](trainTrees: IndexedSeq[TreeInstance[Anno val losses = Array.tabulate(5,5)(loss) - def projectedLabel(l: AnnotatedLabel) = if(l == AnnotatedLabel.TOP) -1 else l.label.toInt + def projectedLabel(l: AnnotatedLabel) = if (l == AnnotatedLabel.TOP) -1 else l.label.toInt val sentimentScores: Array[Int] = topology.labelEncoder.tabulateArray(projectedLabel) val trainingMap = trainTrees.iterator.map(ti => ti.words -> ti).toMap @@ -28,11 +28,9 @@ case class SentimentLossAugmentation[W](trainTrees: IndexedSeq[TreeInstance[Anno def lossAugmentation(datum: TreeInstance[AnnotatedLabel, W]): UnrefinedGrammarAnchoring[AnnotatedLabel, W] = { // drop the root val goldMap = datum.tree.map(projectedLabel).preorder.filter(_.label != -1).map{t => t.span -> t.label}.toMap - new SentimentLossAnchoring(topology, lexicon, datum.words, goldMap, constraintFactory.constraints(datum.words)) } - /** * Returns a [[epic.parser.UnrefinedGrammarAnchoring]] for this particular sentence. 
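
Aside on the evaluator cleaned up above: SentimentEvaluator.accuracy collapses the 5x5 confusion matrix with two predicates, isUsed picking the cells that count at all and isCorrect picking the hits, so the "ternary" and "binary" numbers are just different predicate pairs. A self-contained sketch of the same computation; the counts below are invented for illustration, not taken from the Socher tables:

object AccuracySketch {
  // rows = gold label 0..4, cols = guessed label 0..4 (hypothetical counts)
  val mat: Array[Array[Int]] = Array(
    Array(10, 3, 1, 0, 0),
    Array( 2, 8, 2, 1, 0),
    Array( 1, 2, 9, 2, 1),
    Array( 0, 1, 2, 8, 2),
    Array( 0, 0, 1, 3, 10))

  // isUsed filters cells out of the denominator; isCorrect marks the hits.
  def accuracy(mat: Array[Array[Int]],
               isCorrect: (Int, Int) => Boolean,
               isUsed: (Int, Int) => Boolean): String = {
    val used  = for (g <- mat.indices; p <- mat(g).indices if isUsed(g, p)) yield (g, p)
    val denom = used.map { case (g, p) => mat(g)(p) }.sum
    val numer = used.collect { case (g, p) if isCorrect(g, p) => mat(g)(p) }.sum
    s"$numer / $denom = ${numer.toDouble / denom}"
  }

  def main(args: Array[String]): Unit = {
    // binary eval: drop neutral gold rows (2); negative = {0,1}, positive = {3,4}
    println(accuracy(mat,
      (g, p) => (g < 2 && p < 2) || (g > 2 && p > 2),
      (g, _) => g != 2)) // prints 46 / 54 = 0.85185...
  }
}
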
* @param words @@ -60,10 +58,10 @@ case class SentimentLossAugmentation[W](trainTrees: IndexedSeq[TreeInstance[Anno case Some(goldLabel) => assert(goldLabel != -1) val guessLabel = sentimentScores(tag) - if(guessLabel == -1) { + if (guessLabel == -1) { breeze.numerics.I(goldLabel == guessLabel) * 10000 } else { - losses(goldLabel)(guessLabel) * (if (begin == 0 && end == words.size) rootLossScaling else 1.0); + losses(goldLabel)(guessLabel) * (if (begin == 0 && end == words.size) rootLossScaling else 1.0) } case None => 0 @@ -72,7 +70,6 @@ case class SentimentLossAugmentation[W](trainTrees: IndexedSeq[TreeInstance[Anno } - } object SentimentLossAugmentation { @@ -86,6 +83,6 @@ object SentimentLossAugmentation { if (guess == 2) 0 else 1 } } - def hammingLoss(gold: Int, guess: Int) = if (gold != guess) 1 else 0; + def hammingLoss(gold: Int, guess: Int) = if (gold != guess) 1 else 0 def noLoss(gold: Int, guess: Int) = 0 } diff --git a/src/main/scala/epic/sentiment/SentimentTreebankPipeline.scala b/src/main/scala/epic/sentiment/SentimentTreebankPipeline.scala index a733d207..7b27dc21 100644 --- a/src/main/scala/epic/sentiment/SentimentTreebankPipeline.scala +++ b/src/main/scala/epic/sentiment/SentimentTreebankPipeline.scala @@ -46,22 +46,20 @@ object SentimentTreebankPipeline extends LazyLogging { rootLossScaling: Double = 1.0, computeTrainLL: Boolean = false) - def main(args: Array[String]):Unit = { val params = CommandLineParser.readIn[Options](args) val treebank = new ProcessedTreebank(params.path, treebankType = "simple") var trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]] = treebank.trainTrees - if(params.evalOnTest && params.includeDevInTrain) + if (params.evalOnTest && params.includeDevInTrain) trainTrees ++= treebank.devTrees - println(trainTrees.size + " train trees, " + treebank.devTrees.size + " dev trees, " + treebank.testTrees.size + " test trees"); + println(trainTrees.size + " train trees, " + treebank.devTrees.size + " dev trees, " + treebank.testTrees.size + " test trees") val gen = GenerativeParser.fromTrees(trainTrees) - class GoldBracketingsConstraints extends ChartConstraints.Factory[AnnotatedLabel, String] { val trees = (trainTrees ++ treebank.devTrees ++ treebank.testTrees).map(ti => ti.words -> ti.tree).toMap -// val trees = ((if (params.includeDevInTrain) trainTrees else trainTrees ++ treebank.devTrees) ++ treebank.testTrees).map(ti => ti.words -> ti.tree).toMap + // val trees = ((if (params.includeDevInTrain) trainTrees else trainTrees ++ treebank.devTrees) ++ treebank.testTrees).map(ti => ti.words -> ti.tree).toMap def constraints(w: IndexedSeq[String]): ChartConstraints[AnnotatedLabel] = { val constraints = SpanConstraints.fromTree(trees.getOrElse(w, gen.bestBinarizedTree(w))) @@ -78,7 +76,7 @@ object SentimentTreebankPipeline extends LazyLogging { } else if (params.lossType == "hammingLoss") { SentimentLossAugmentation.hammingLoss } else { - SentimentLossAugmentation.noLoss; + SentimentLossAugmentation.noLoss } val constrainer = new SentimentLossAugmentation(trainTrees, gen.topology, @@ -87,7 +85,7 @@ object SentimentTreebankPipeline extends LazyLogging { sentimentLoss, params.rootLossScaling) -// val model = new SpanModelFactory(annotator = GenerativeParser.defaultAnnotator(vertical = params.v), dummyFeats = 0.5).make(trainTrees, constrainer) + // val model = new SpanModelFactory(annotator = GenerativeParser.defaultAnnotator(vertical = params.v), dummyFeats = 0.5).make(trainTrees, constrainer) val model = params.modelFactory.make(trainTrees, 
gen.topology, gen.lexicon, new GoldBracketingsConstraints) val obj = new ModelObjective(model, trainTrees) @@ -100,30 +98,28 @@ object SentimentTreebankPipeline extends LazyLogging { for ((state, iter) <- itr.take(params.maxIterations).zipWithIndex if iter % params.iterationsPerEval == 0) try { val parser = model.extractParser(state.x).copy(decoder=new MaxConstituentDecoder[AnnotatedLabel, String]) -// if(params.evalOnTest) -// println("Eval: " + evaluate(s"$name-$iter", parser, treebank.testTrees)) -// else -// println("Eval: " + evaluate(s"$name-$iter", parser, treebank.devTrees)) + // if (params.evalOnTest) + // println("Eval: " + evaluate(s"$name-$iter", parser, treebank.testTrees)) + // else + // println("Eval: " + evaluate(s"$name-$iter", parser, treebank.devTrees)) if (params.computeTrainLL) { computeLL(trainTrees, model, state.x) } if (params.evalOnTest) { - println("NORMAL DECODE: Eval: " + evaluate(s"$name-$iter", parser, treebank.testTrees, DecodeType.Normal)); + println("NORMAL DECODE: Eval: " + evaluate(s"$name-$iter", parser, treebank.testTrees, DecodeType.Normal)) } else { - println("NORMAL DECODE: Eval: " + evaluate(s"$name-$iter", parser, treebank.devTrees, DecodeType.Normal)); + println("NORMAL DECODE: Eval: " + evaluate(s"$name-$iter", parser, treebank.devTrees, DecodeType.Normal)) if (params.alsoEvalOnTest) { - println("TEST SET: Eval: " + evaluate(s"$name-$iter", parser, treebank.testTrees, DecodeType.Normal)); + println("TEST SET: Eval: " + evaluate(s"$name-$iter", parser, treebank.testTrees, DecodeType.Normal)) } } } catch { case e: Exception => e.printStackTrace(); throw e } - } - def renderArr(arr: Array[Array[Int]]) = arr.map(_.map(_.toString).reduce(_ + "\t" + _)).reduce(_ + "\n" + _); - + def renderArr(arr: Array[Array[Int]]) = arr.map(_.map(_.toString).reduce(_ + "\t" + _)).reduce(_ + "\n" + _) class Model[L, W](val inner: ParserModel[L, W]) extends epic.framework.Model[TreeInstance[L, W]] { type ExpectedCounts = inner.ExpectedCounts @@ -133,7 +129,6 @@ object SentimentTreebankPipeline extends LazyLogging { def emptyCounts = inner.emptyCounts - def accumulateCounts(inf: Inference, s: Scorer, d: TreeInstance[L, W], m: Marginal, accum: ExpectedCounts, scale: Double): Unit = { inner.accumulateCounts(inf.pm.asInstanceOf[inner.Inference], s, d, m, accum, scale) } @@ -164,7 +159,6 @@ object SentimentTreebankPipeline extends LazyLogging { pm.goldMarginal(scorer, v) } - def marginal(anch: Scorer, v: TreeInstance[L, W]): Inference[L, W]#Marginal = { LatentTreeMarginal[L, W](anch, v.tree.map(l => labels:scala.collection.IndexedSeq[(L, Int)])) } @@ -189,27 +183,25 @@ object SentimentTreebankPipeline extends LazyLogging { numRoots + stats.numRoots, rootsRightTernary + stats.rootsRightTernary, rootsRightBinary + stats.rootsRightBinary, - numBinaryRoots + stats.numBinaryRoots); + numBinaryRoots + stats.numBinaryRoots) - override def toString = { "Spans: " + SentimentEvaluator.renderNumerDenom(spansRight, numSpans) + " (Ternary: " + SentimentEvaluator.renderNumerDenom(spansRightTernary, numSpans) + - "), Roots: " + SentimentEvaluator.renderNumerDenom(rootsRight, numRoots) + " (Ternary: " + SentimentEvaluator.renderNumerDenom(rootsRightTernary, numRoots) + ")"; + "), Roots: " + SentimentEvaluator.renderNumerDenom(rootsRight, numRoots) + " (Ternary: " + SentimentEvaluator.renderNumerDenom(rootsRightTernary, numRoots) + ")" } - - -// override def toString = f"Stats(cspans=${coarseSpansRight.toDouble/coarseSpans}%.4f: $coarseSpansRight/$coarseSpans 
spans=${spansRight.toDouble/numSpans}%.4f: $spansRight/$numSpans, coarseRoots=${coarseRootsRight.toDouble/numCoarseRoots}: $coarseRootsRight/$numCoarseRoots , roots=${rootsRight.toDouble/numRoots}%.4f: $rootsRight/$numRoots)" + + // override def toString = f"Stats(cspans=${coarseSpansRight.toDouble/coarseSpans}%.4f: $coarseSpansRight/$coarseSpans spans=${spansRight.toDouble/numSpans}%.4f: $spansRight/$numSpans, coarseRoots=${coarseRootsRight.toDouble/numCoarseRoots}: $coarseRootsRight/$numCoarseRoots , roots=${rootsRight.toDouble/numRoots}%.4f: $rootsRight/$numRoots)" } object DecodeType extends Enumeration { type DecodeType = Value - val Normal, Binary, Ternary = Value; + val Normal, Binary, Ternary = Value } import DecodeType._ def evaluateSpanConfusions(name: String, parser: Parser[AnnotatedLabel, String], testTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], decodeType: DecodeType) = { testTrees.par.map { ti => - val spanConfusions = Array.tabulate(5, 5)((i, j) => 0); + val spanConfusions = Array.tabulate(5, 5)((i, j) => 0) val goldTree = ti.tree.children.head.map(_.label.toInt) val marg = parser.marginal(ti.words) val guessTree = decode(ti.tree.map(_ => ()), marg, decodeType).map(_.label.toInt) @@ -217,57 +209,57 @@ object SentimentTreebankPipeline extends LazyLogging { val guessMap: HashMap[Span,Int] = new HashMap[Span,Int]() ++ guess.map(_.swap) val gold: Set[(Int, Span)] = goldTree.preorder.map(t => (t.label, t.span)).toSet for ((gLabel, gSpan) <- gold) { - val pLabel = guessMap(gSpan); - spanConfusions(gLabel)(pLabel) += 1; + val pLabel = guessMap(gSpan) + spanConfusions(gLabel)(pLabel) += 1 } - spanConfusions; - }.reduce((arr1, arr2) => Array.tabulate(5, 5)((i, j) => arr1(i)(j) + arr2(i)(j))); + spanConfusions + }.reduce((arr1, arr2) => Array.tabulate(5, 5)((i, j) => arr1(i)(j) + arr2(i)(j))) } def evaluateRootConfusions(name: String, parser: Parser[AnnotatedLabel, String], testTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], decodeType: DecodeType) = { testTrees.par.map { ti => - val rootConfusions = Array.tabulate(5, 5)((i, j) => 0); + val rootConfusions = Array.tabulate(5, 5)((i, j) => 0) val goldTree = ti.tree.children.head.map(_.label.toInt) val marg = parser.marginal(ti.words) val guessTree = decode(ti.tree.map(_ => ()), marg, decodeType).map(_.label.toInt) - rootConfusions(goldTree.label)(guessTree.label) += 1; - rootConfusions; - }.reduce((arr1, arr2) => Array.tabulate(5, 5)((i, j) => arr1(i)(j) + arr2(i)(j))); + rootConfusions(goldTree.label)(guessTree.label) += 1 + rootConfusions + }.reduce((arr1, arr2) => Array.tabulate(5, 5)((i, j) => arr1(i)(j) + arr2(i)(j))) } def evaluate(name: String, parser: Parser[AnnotatedLabel, String], testTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], decodeType: DecodeType) = { - println("Evaluating at " + name); + println("Evaluating at " + name) testTrees.par.map { ti => val goldTree = ti.tree.children.head.map(_.label.toInt) val goldRoot = goldTree.label val marg = parser.marginal(ti.words) val guessTree = decode(ti.tree.map(_ => ()), marg, decodeType).map(_.label.toInt) - val guessRoot = guessTree.label; + val guessRoot = guessTree.label val guess: Set[(Int, Span)] = guessTree.preorder.map(t => (t.label, t.span)).toSet - val guessMap: HashMap[Span,Int] = new HashMap[Span,Int]() ++ guess.map(pair => (pair._2, pair._1)); + val guessMap: HashMap[Span,Int] = new HashMap[Span,Int]() ++ guess.map(pair => (pair._2, pair._1)) val gold: Set[(Int, Span)] = goldTree.preorder.map(t => (t.label, t.span)).toSet - var 
spansRight = 0; - var numSpans = 0; - var spansRightTernary = 0; - var spansRightBinary = 0; - var numBinarySpans = 0; + var spansRight = 0 + var numSpans = 0 + var spansRightTernary = 0 + var spansRightBinary = 0 + var numBinarySpans = 0 for ((gLabel, gSpan) <- gold) { - val pLabel = guessMap(gSpan); - spansRight += (if (SentimentEvaluator.isCorrectNormal(gLabel, pLabel)) 1 else 0); - numSpans += 1; - spansRightTernary += (if (SentimentEvaluator.isCorrectTernary(gLabel, pLabel)) 1 else 0); - spansRightBinary += (if (SentimentEvaluator.isUsedBinaryCoarse(gLabel, pLabel) && SentimentEvaluator.isCorrectBinary(gLabel, pLabel)) 1 else 0); - numBinarySpans += (if (SentimentEvaluator.isUsedBinaryCoarse(gLabel, pLabel)) 1 else 0); + val pLabel = guessMap(gSpan) + spansRight += (if (SentimentEvaluator.isCorrectNormal(gLabel, pLabel)) 1 else 0) + numSpans += 1 + spansRightTernary += (if (SentimentEvaluator.isCorrectTernary(gLabel, pLabel)) 1 else 0) + spansRightBinary += (if (SentimentEvaluator.isUsedBinaryCoarse(gLabel, pLabel) && SentimentEvaluator.isCorrectBinary(gLabel, pLabel)) 1 else 0) + numBinarySpans += (if (SentimentEvaluator.isUsedBinaryCoarse(gLabel, pLabel)) 1 else 0) } - val rootsRight = (if (SentimentEvaluator.isCorrectNormal(goldRoot, guessRoot)) 1 else 0); - val numRoots = 1; - val rootsRightTernary = if (SentimentEvaluator.isCorrectTernary(goldRoot, guessRoot)) 1 else 0; - val rootsRightBinary = (if (SentimentEvaluator.isUsedBinaryCoarse(goldRoot, guessRoot) && SentimentEvaluator.isCorrectBinary(goldRoot, guessRoot)) 1 else 0); - val numBinaryRoots = (if (SentimentEvaluator.isUsedBinaryCoarse(goldRoot, guessRoot)) 1 else 0); + val rootsRight = if (SentimentEvaluator.isCorrectNormal(goldRoot, guessRoot)) 1 else 0 + val numRoots = 1 + val rootsRightTernary = if (SentimentEvaluator.isCorrectTernary(goldRoot, guessRoot)) 1 else 0 + val rootsRightBinary = if (SentimentEvaluator.isUsedBinaryCoarse(goldRoot, guessRoot) && SentimentEvaluator.isCorrectBinary(goldRoot, guessRoot)) 1 else 0 + val numBinaryRoots = if (SentimentEvaluator.isUsedBinaryCoarse(goldRoot, guessRoot)) 1 else 0 Stats(spansRight, numSpans, spansRightTernary, spansRightBinary, numBinarySpans, rootsRight, numRoots, rootsRightTernary, rootsRightBinary, numBinaryRoots) - }.reduce(_+_); + }.reduce(_+_) } def decode(tree: BinarizedTree[Unit], marginal: ParseMarginal[AnnotatedLabel, String], decodeType: DecodeType) = { @@ -279,19 +271,18 @@ object SentimentTreebankPipeline extends LazyLogging { // Elsewhere, use the top chart topMarg(t.begin, t.end) } - if(decodeType == Binary) { - val neg = (summed(AnnotatedLabel("0")) + summed(AnnotatedLabel("1")) ) - val pos = (summed(AnnotatedLabel("3")) + summed(AnnotatedLabel("4")) ) - if(neg > pos) { + if (decodeType == Binary) { + val neg = summed(AnnotatedLabel("0")) + summed(AnnotatedLabel("1")) + val pos = summed(AnnotatedLabel("3")) + summed(AnnotatedLabel("4")) + if (neg > pos) { AnnotatedLabel("0") } else { AnnotatedLabel("4") } - } else if(decodeType == Ternary) { - val neg = (summed(AnnotatedLabel("0")) + summed(AnnotatedLabel("1")) ) - val pos = (summed(AnnotatedLabel("3")) + summed(AnnotatedLabel("4")) ) - val neutral = (summed(AnnotatedLabel("2"))); - + } else if (decodeType == Ternary) { + val neg = summed(AnnotatedLabel("0")) + summed(AnnotatedLabel("1")) + val pos = summed(AnnotatedLabel("3")) + summed(AnnotatedLabel("4")) + val neutral = summed(AnnotatedLabel("2")) if(neg > pos && neg > neutral) { AnnotatedLabel("0") } else if (pos > neg && pos > neutral) { @@ -304,8 
+295,7 @@ object SentimentTreebankPipeline extends LazyLogging { } } } - - + def computeLL(trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]], model: SpanModel[AnnotatedLabel,AnnotatedLabel,String], weights: DenseVector[Double]) { println("Computing final log likelihood on the whole training set...") val inf = model.inferenceFromWeights(weights) diff --git a/src/main/scala/epic/sequences/CRF.scala b/src/main/scala/epic/sequences/CRF.scala index 252f7c2e..5685cc08 100644 --- a/src/main/scala/epic/sequences/CRF.scala +++ b/src/main/scala/epic/sequences/CRF.scala @@ -83,7 +83,6 @@ object CRF { buildSimple(fixedData, false, gazetteer, opt = opt) } - trait Anchoring[L, W] extends TagConstraints[L] { def words : IndexedSeq[W] def length: Int = words.length @@ -92,7 +91,6 @@ object CRF { def startSymbol: L def validSymbols(pos: Int): Set[Int] - override def allowedTags(pos: Int): Set[Int] = validSymbols(pos) def *(other: Anchoring[L, W]):Anchoring[L, W] = { @@ -104,7 +102,6 @@ object CRF { } } - trait Marginal[L, W] extends VisitableMarginal[TransitionVisitor[L, W]] { def anchoring: Anchoring[L, W] @@ -125,7 +122,7 @@ object CRF { var prev = 0 val numLabels: Int = anchoring.labelIndex.size var sum = 0.0 - while(prev < numLabels) { + while (prev < numLabels) { sum += transitionMarginal(pos, prev, label) prev += 1 } @@ -133,7 +130,6 @@ object CRF { } } - object Marginal { def apply[L, W](scorer: Anchoring[L, W]):Marginal[L, W] = { @@ -143,7 +139,6 @@ object CRF { val partition = softmax(forwardScores.last) val _s = scorer - new Marginal[L, W] { def anchoring: Anchoring[L, W] = _s @@ -154,11 +149,11 @@ object CRF { while (pos < length) { var label = 0 while (label < numLabels) { - if(!backwardScore(pos+1)(label).isInfinite) { + if (!backwardScore(pos+1)(label).isInfinite) { var prevLabel = 0 while (prevLabel < numLabels) { val score = transitionMarginal(pos, prevLabel, label) - if(score != 0.0) + if (score != 0.0) f(pos, prevLabel, label, score) prevLabel += 1 } @@ -170,24 +165,19 @@ object CRF { } - /** Log-normalized probability of seing segment with transition */ def transitionMarginal(pos: Int, prev: Int, cur: Int): Double = { val withoutTrans = forwardScores(pos)(prev) + backwardScore(pos+1)(cur) - if(withoutTrans.isInfinite) 0.0 + if (withoutTrans.isInfinite) 0.0 else math.exp(withoutTrans + anchoring.scoreTransition(pos, prev, cur) - logPartition) } - - def logPartition: Double = partition // println(words + " " + partition) } } - - def goldMarginal[L, W](scorer: Anchoring[L, W], tags: IndexedSeq[L]):Marginal[L, W] = { var lastSymbol = scorer.labelIndex(scorer.startSymbol) var score = 0.0 @@ -220,14 +210,10 @@ object CRF { numerics.I(prev == indexedSymbols(pos) && cur == indexedSymbols(pos + 1)) } - def logPartition: Double = score } } - - - /** * * @param scorer @@ -247,20 +233,17 @@ object CRF { val cur = forwardScores(i+1) for ( next <- scorer.validSymbols(i)) { var offset = 0 - for ( previous <- if(i == 0) IndexedSeq(scorer.labelIndex(scorer.startSymbol)) else scorer.validSymbols(i-1)) { + for ( previous <- if (i == 0) IndexedSeq(scorer.labelIndex(scorer.startSymbol)) else scorer.validSymbols(i-1)) { val score = scorer.scoreTransition(i, previous, next) + forwardScores(i)(previous) - if(score != Double.NegativeInfinity) { + if (score != Double.NegativeInfinity) { cache(offset) = score offset += 1 } } cur(next) = softmax.array(cache, offset) } - - } - forwardScores } @@ -288,33 +271,27 @@ object CRF { for( next <- scorer.validSymbols(i)) { val nextScore = 
backwardScores(i+1)(next) val score = scorer.scoreTransition(i, curLabel, next) + nextScore - if(score != Double.NegativeInfinity) { + if (score != Double.NegativeInfinity) { accumArray(offset) = score offset += 1 } } cur(curLabel) = softmax(new DenseVector(accumArray, 0, 1, offset)) - } } backwardScores } - - } - trait TransitionVisitor[L, W] { def apply(pos: Int, prev: Int, cur: Int, count: Double) } trait IndexedFeaturizer[L, W] { def anchor(w: IndexedSeq[W]):AnchoredFeaturizer[L, W] - def startSymbol: L - def labelIndex: Index[L] def featureIndex: Index[Feature] } @@ -325,7 +302,6 @@ object CRF { def validSymbols(pos: Int):Set[Int] } - def viterbi[L, W](scorer: Anchoring[L ,W], id: String=""):TaggedSequence[L, W] = { val length = scorer.length val numLabels = scorer.labelIndex.size @@ -334,7 +310,6 @@ object CRF { forwardScores(0)(scorer.labelIndex(scorer.startSymbol)) = 0.0 val backPointer = Array.fill(length, numLabels)(-1) - // forward for(i <- 0 until length) { val cur = forwardScores(i+1) @@ -345,7 +320,7 @@ object CRF { for ( previous <- scorer.validSymbols(i-1)) { val score = scorer.scoreTransition(i, previous, next) + forwardScores(i)(previous) - if(score > currentMax) { + if (score > currentMax) { currentMax = score currentArgMax = previous } @@ -361,29 +336,27 @@ object CRF { def rec(end: Int, label: Int) { tags += scorer.labelIndex.get(label) - if(end > 0) { + if (end > 0) { val bestCurrentLabel = backPointer(end)(label) rec(end-1, bestCurrentLabel) } - } + rec(length-1, (0 until numLabels).maxBy(forwardScores(length)(_))) assert(tags.length == scorer.words.length, tags.reverse + " " + scorer.words) TaggedSequence(tags.reverse, scorer.words, id) } - def posteriorDecode[L, W](m: Marginal[L, W], id: String = "") = { val length = m.length val labels = (0 until length).map(pos => (0 until m.anchoring.labelIndex.size).maxBy(m.positionMarginal(pos, _))) - TaggedSequence(labels.map(m.anchoring.labelIndex.get), m.words, id) } case class ProductAnchoring[L, W](a: Anchoring[L ,W], b: Anchoring[L, W]) extends Anchoring[L, W] { - if((a.labelIndex ne b.labelIndex) && (a.labelIndex != b.labelIndex)) throw new IllegalArgumentException("Elements of product anchoring must have the same labelIndex!") - if(a.startSymbol != b.startSymbol) throw new IllegalArgumentException("Elements of product anchoring must have the same startSymbol!") + if ((a.labelIndex ne b.labelIndex) && (a.labelIndex != b.labelIndex)) throw new IllegalArgumentException("Elements of product anchoring must have the same labelIndex!") + if (a.startSymbol != b.startSymbol) throw new IllegalArgumentException("Elements of product anchoring must have the same startSymbol!") def words: IndexedSeq[W] = a.words @@ -402,11 +375,9 @@ object CRF { class IdentityAnchoring[L, W](val words: IndexedSeq[W], val validSyms: IndexedSeq[Set[Int]], val labelIndex: Index[L], val startSymbol: L) extends Anchoring[L, W] { def scoreTransition(pos: Int, prev: Int, cur: Int): Double = 0.0 - - def validSymbols(pos: Int): Set[Int] = validSyms(pos) - def canStartLongSegment(pos: Int): Boolean = true } + } diff --git a/src/main/scala/epic/sequences/CRFModel.scala b/src/main/scala/epic/sequences/CRFModel.scala index 103471eb..e04ee21a 100644 --- a/src/main/scala/epic/sequences/CRFModel.scala +++ b/src/main/scala/epic/sequences/CRFModel.scala @@ -43,7 +43,7 @@ class CRFModel[L, W](val featureIndex: Index[Feature], def apply(pos: Int, prev: Int, cur: Int, count: Double) { val feats = localization.featuresForTransition(pos, prev, cur) - if(count != 0) 
assert(feats ne null, (pos, prev, cur, marg.length, marg.anchoring.validSymbols(pos), marg.anchoring.validSymbols(pos-1))) + if (count != 0) assert(feats ne null, (pos, prev, cur, marg.length, marg.anchoring.validSymbols(pos), marg.anchoring.validSymbols(pos-1))) axpy(scale * count, feats, counts) } } @@ -59,16 +59,12 @@ class CRFInference[L, W](val weights: DenseVector[Double], val lexicon: TagConstraints.Factory[L, W], featurizer: CRF.IndexedFeaturizer[L, W]) extends AugmentableInference[TaggedSequence[L, W], CRF.Anchoring[L, W]] with CRF[L, W] with AnnotatingInference[TaggedSequence[L, W]] with Serializable { - - - def scorer(v: TaggedSequence[L, W]): Scorer = new Anchoring(v.words) def viterbi(sentence: IndexedSeq[W], anchoring: CRF.Anchoring[L, W]): TaggedSequence[L, W] = { CRF.viterbi(new Anchoring(sentence) * anchoring) } - def annotate(datum: TaggedSequence[L, W], m: Marginal): TaggedSequence[L, W] = { CRF.posteriorDecode(m) } @@ -81,21 +77,17 @@ class CRFInference[L, W](val weights: DenseVector[Double], def anchor(w: IndexedSeq[W]) = new Anchoring(w) - def labelIndex = featurizer.labelIndex def startSymbol = featurizer.startSymbol - def marginal(scorer: Scorer, v: TaggedSequence[L, W], aug: CRF.Anchoring[L, W]): CRFInference[L, W]#Marginal = { CRF.Marginal(scorer * aug) } - def goldMarginal(scorer: Scorer, v: TaggedSequence[L, W], aug: CRF.Anchoring[L, W]): Marginal = { CRF.Marginal.goldMarginal[L, W](new Anchoring(v.words) * aug, v.label) } - private val allLabels = (0 until labelIndex.size).toSet def baseAugment(v: TaggedSequence[L, W]): CRF.Anchoring[L, W] = { @@ -109,13 +101,11 @@ class CRFInference[L, W](val weights: DenseVector[Double], for(a <- transCache; b <- a) util.Arrays.fill(b, Double.NegativeInfinity) for(i <- 0 until length; c <- validSymbols(i); p <- validSymbols(i-1)) { val feats = localization.featuresForTransition(i, p, c) - if(feats ne null) + if (feats ne null) transCache(p)(c)(i) = weights dot feats else transCache(p)(c)(i) = Double.NegativeInfinity } - - def validSymbols(pos: Int): Set[Int] = localization.validSymbols(pos) def scoreTransition(pos: Int, prev: Int, cur: Int): Double = { @@ -127,7 +117,6 @@ class CRFInference[L, W](val weights: DenseVector[Double], def startSymbol = featurizer.startSymbol } - def posteriorDecode(m: Marginal):TaggedSequence[L, W] = { CRF.posteriorDecode(m) } @@ -146,10 +135,8 @@ class TaggedSequenceModelFactory[L](val startSymbol: L, val labelIndex: Index[L] = Index[L](Iterator(startSymbol) ++ train.iterator.flatMap(_.label)) val counts: Counter2[L, String, Double] = Counter2.count(train.flatMap(p => p.label zip p.words)).mapValues(_.toDouble) - val lexicon:TagConstraints.Factory[L, String] = new SimpleLexicon[L, String](labelIndex, counts) - var featurizer: WordFeaturizer[String] = wordFeaturizer.getOrElse(WordFeaturizer.goodPOSTagFeaturizer(counts)) featurizer = gazetteer.foldLeft(featurizer)(_ + _) val l2featurizer: WordFeaturizer[String] = transitionFeaturizer.getOrElse(WordFeaturizer.goodPOSTagTransitionFeaturizer(counts)) @@ -175,8 +162,8 @@ class TaggedSequenceModelFactory[L](val startSymbol: L, l <- lexLoc.allowedTags(b) } { lfBuilder.add(l, loc.featuresForWord(b)) - if(lexLoc.allowedTags(b).size > 1) { - for(prevTag <- if(b == 0) Set(labelIndex(startSymbol)) else lexLoc.allowedTags(b-1)) { + if (lexLoc.allowedTags(b).size > 1) { + for(prevTag <- if (b == 0) Set(labelIndex(startSymbol)) else lexLoc.allowedTags(b-1)) { l2Builder.add(label2Features(prevTag)(l), l2loc.featuresForWord(b)) } } @@ -184,7 +171,6 @@ class 
TaggedSequenceModelFactory[L](val startSymbol: L, progress.info(s"${lfBuilder.size + l2Builder.size}") } - val indexed = new IndexedStandardFeaturizer[L, String](indexedFeaturizer, indexedL2featurizer, lexicon, startSymbol, labelIndex, label2Features, lfBuilder.result(), l2Builder.result()) @@ -198,7 +184,6 @@ class TaggedSequenceModelFactory[L](val startSymbol: L, object TaggedSequenceModelFactory { - @SerialVersionUID(1L) class IndexedStandardFeaturizer[L, String](wordFeaturizer: IndexedWordFeaturizer[String], l2WordFeaturizer: IndexedWordFeaturizer[String], @@ -214,7 +199,6 @@ object TaggedSequenceModelFactory { private val loff = featureIndex.componentOffset(0) private val l2off = featureIndex.componentOffset(1) - private val startSymbolSet = Set(labelIndex(startSymbol)) def anchor(w: IndexedSeq[String]): AnchoredFeaturizer[L, String] = new AnchoredFeaturizer[L, String] { @@ -223,12 +207,10 @@ object TaggedSequenceModelFactory { val lexLoc = lexicon.anchor(w) def featureIndex: Index[Feature] = outer.featureIndex - def validSymbols(pos: Int): Set[Int] = if(pos < 0 || pos >= w.length) startSymbolSet else lexLoc.allowedTags(pos) + def validSymbols(pos: Int): Set[Int] = if (pos < 0 || pos >= w.length) startSymbolSet else lexLoc.allowedTags(pos) def length = w.length - - val featureArray = Array.ofDim[FeatureVector](length, labelIndex.size, labelIndex.size) private val posNeedsAmbiguity = Array.tabulate(length)(i => validSymbols(i).size > 1) for { @@ -239,7 +221,7 @@ object TaggedSequenceModelFactory { prevTag <- validSymbols(pos-1) } { val l2feats = l2loc.featuresForWord(pos) - val feats = if(posNeedsAmbiguity(pos)) { + val feats = if (posNeedsAmbiguity(pos)) { justLabel++ label2FeatureIndex.crossProduct(Array(label2Features(prevTag)(curTag)), l2feats, offset = l2off, usePlainLabelFeatures = true) } else { justLabel @@ -255,5 +237,4 @@ object TaggedSequenceModelFactory { } } - } diff --git a/src/main/scala/epic/sequences/GoldSegmentPolicy.scala b/src/main/scala/epic/sequences/GoldSegmentPolicy.scala index 0651b919..7a7f4ee7 100644 --- a/src/main/scala/epic/sequences/GoldSegmentPolicy.scala +++ b/src/main/scala/epic/sequences/GoldSegmentPolicy.scala @@ -30,7 +30,7 @@ object GoldSegmentPolicy { def goldSegmentForcing[L](trees: IndexedSeq[(Int,Span)]*):GoldSegmentPolicy[L] ={ val gold = TriangularArray.raw(trees.last.last._2.end+1,collection.mutable.BitSet()) for(tree <- trees) { - if(tree != null) { + if (tree != null) { for( (label, span) <- tree) { gold(TriangularArray.index(span.begin,span.end)) += label } diff --git a/src/main/scala/epic/sequences/HMM.scala b/src/main/scala/epic/sequences/HMM.scala index bafab0c2..f8d0ee4c 100644 --- a/src/main/scala/epic/sequences/HMM.scala +++ b/src/main/scala/epic/sequences/HMM.scala @@ -38,7 +38,7 @@ object HMM { val wcs = w.map(wordCounts(_)) val validSyms = w.map { w => - if(wordCounts(w) >= 10) { + if (wordCounts(w) >= 10) { emissions(::, w).findAll( _ > 0).map(labelIndex(_)).toSet } else { allSyms @@ -57,12 +57,12 @@ object HMM { emitScore + encodedTransitions(prev, cur) } - def scoreEmission(cur: Int, pos: Int): Double = if(smoothEmissions) { + def scoreEmission(cur: Int, pos: Int): Double = if (smoothEmissions) { val w = words(pos) var cWord = wcs(pos) var cTagWord = emissions(labelIndex.get(cur), w) assert(cWord >= cTagWord) - if(cWord < 10) { + if (cWord < 10) { cWord += 1.0 cTagWord += indexedLabelCounts(cur) / wordCounts.size } diff --git a/src/main/scala/epic/sequences/HammingLossAugmentation.scala 
b/src/main/scala/epic/sequences/HammingLossAugmentation.scala index ffeaeeed..f9c7025c 100644 --- a/src/main/scala/epic/sequences/HammingLossAugmentation.scala +++ b/src/main/scala/epic/sequences/HammingLossAugmentation.scala @@ -29,7 +29,7 @@ object HammingLossAugmentation { def scoreTransition(prev: Int, cur: Int, begin: Int, end: Int): Double = { - if(gt.isGoldSegment(begin, end, cur)) -precisionScale + if (gt.isGoldSegment(begin, end, cur)) -precisionScale else recallScale } diff --git a/src/main/scala/epic/sequences/SegmentText.scala b/src/main/scala/epic/sequences/SegmentText.scala index 7cd32980..3dcdba2a 100644 --- a/src/main/scala/epic/sequences/SegmentText.scala +++ b/src/main/scala/epic/sequences/SegmentText.scala @@ -9,7 +9,6 @@ import epic.util.ProcessTextMain */ object SegmentText extends ProcessTextMain[SemiCRF[Any, String], Segmentation[Any, String]] { - override def render(model: SemiCRF[Any, String], ann: Segmentation[Any, String], tokens: IndexedSeq[String]): String = { ann.render } diff --git a/src/main/scala/epic/sequences/Segmentation.scala b/src/main/scala/epic/sequences/Segmentation.scala index ecb87eac..fe366695 100644 --- a/src/main/scala/epic/sequences/Segmentation.scala +++ b/src/main/scala/epic/sequences/Segmentation.scala @@ -12,7 +12,6 @@ case class Segmentation[+L, +W](segments: IndexedSeq[(L, Span)], words: IndexedSeq[W], id: String = "") extends Example[IndexedSeq[(L, Span)], IndexedSeq[W]] { - def render: String = { segmentsWithOutside.map { case (None, span) => words.slice(span.begin, span.end).mkString(" ") @@ -20,7 +19,6 @@ case class Segmentation[+L, +W](segments: IndexedSeq[(L, Span)], }.mkString(" ") } - def features = words def length: Int = words.length @@ -41,7 +39,6 @@ case class Segmentation[+L, +W](segments: IndexedSeq[(L, Span)], for (i <- 0 until length) { - if (currentSegment < segments.length && segments(currentSegment)._2.end == i) { if (newSpanBegin != newOffset) newSegments += (segments(currentSegment)._1 -> Span(newSpanBegin, newOffset)) @@ -63,16 +60,14 @@ case class Segmentation[+L, +W](segments: IndexedSeq[(L, Span)], new Segmentation(newSegments, newWords, s"$id-filtered") } - def segmentsWithOutside: Iterator[(Option[L], Span)] = { val segs = for { qq@IndexedSeq((pL, pSpan), (l, span)) <- (segments.headOption.map(pair => pair._1 -> Span(0, 0)).toIndexedSeq ++ segments).sliding(2) padding = Iterator.range(pSpan.end, span.begin).map(i => None -> Span(i, i + 1)) pair <- padding ++ Iterator((Some(l), span)) } yield { - pair - } - + pair + } val lastSpanEnd = segments.lastOption match { case Some((_, Span(_, end))) => end @@ -82,23 +77,21 @@ case class Segmentation[+L, +W](segments: IndexedSeq[(L, Span)], segs ++ (lastSpanEnd until length).map(i => None -> Span(i, i + 1)) } - def asBIOSequence[LL>:L](outsideLabel: LL): TaggedSequence[BIOETag[L], W] = { val outLabels = new ArrayBuffer[BIOETag[L]]() for((l,span) <- segments if !span.isEmpty) { - while(outLabels.length < span.begin) { + while (outLabels.length < span.begin) { outLabels += BIOETag.Outside } - - if(l == outsideLabel) + if (l == outsideLabel) outLabels += BIOETag.Outside else outLabels += BIOETag.Begin(l) for(i <- (span.begin+1) until (span.end) ) { - outLabels += {if(l != outsideLabel) BIOETag.Inside(l) else BIOETag.Outside} + outLabels += {if (l != outsideLabel) BIOETag.Inside(l) else BIOETag.Outside} } } - while(outLabels.length < words.length) { + while (outLabels.length < words.length) { outLabels += BIOETag.Outside } assert(outLabels.length == words.length) @@ -108,7 
+101,7 @@ case class Segmentation[+L, +W](segments: IndexedSeq[(L, Span)], def asFlatTaggedSequence[LL>:L]: TaggedSequence[Option[LL], W] = { val outLabels = new ArrayBuffer[Option[LL]]() for((l,span) <- segments if !span.isEmpty) { - while(outLabels.length < span.begin) { + while (outLabels.length < span.begin) { outLabels += None } @@ -116,7 +109,7 @@ case class Segmentation[+L, +W](segments: IndexedSeq[(L, Span)], outLabels += Some(l) } } - while(outLabels.length < words.length) { + while (outLabels.length < words.length) { outLabels += None } assert(outLabels.length == words.length) @@ -127,15 +120,14 @@ case class Segmentation[+L, +W](segments: IndexedSeq[(L, Span)], val outLabels = new ArrayBuffer[(LL, Span)]() for((l,span) <- segments if !span.isEmpty) { val lastEnd = outLabels.lastOption.map(_._2.end).getOrElse(0) - if(lastEnd < span.begin) { + if (lastEnd < span.begin) { outLabels += (outsideLabel -> Span(lastEnd, span.begin)) } - outLabels += (l -> span) } val lastEnd = outLabels.lastOption.map(_._2.end).getOrElse(0) - if(lastEnd < words.length) { + if (lastEnd < words.length) { outLabels += (outsideLabel -> Span(lastEnd, words.length)) } @@ -154,20 +146,20 @@ object Segmentation { for(i <- 0 until seq.length) { seq.label(i) match { case Begin(l) => - if(currentStart < i) + if (currentStart < i) spans += (currentLabel -> Span(currentStart, i)) currentStart = i currentLabel = l case Inside(l) => - if(currentLabel != l) { - if(currentStart < i) + if (currentLabel != l) { + if (currentStart < i) spans += (currentLabel -> Span(currentStart, i)) currentStart = i currentLabel = l } case End(l) => - if(currentLabel != l) { - if(currentStart < i) + if (currentLabel != l) { + if (currentStart < i) spans += (currentLabel -> Span(currentStart, i)) currentStart = i currentLabel = l @@ -175,18 +167,17 @@ object Segmentation { spans += (currentLabel -> Span(currentStart, i+1)) currentStart = i + 1 case Outside => - if(currentLabel != outsideLabel) { - if(currentStart < i) + if (currentLabel != outsideLabel) { + if (currentStart < i) spans += (currentLabel -> Span(currentStart, i)) currentStart = i currentLabel = outsideLabel } spans += (currentLabel -> Span(currentStart, i+1)) currentStart = i + 1 - } } - if(currentStart < seq.length) + if (currentStart < seq.length) spans += (currentLabel -> Span(currentStart, seq.length)) Segmentation(spans, seq.words, seq.id.replaceAll("-bio","-seg")) } diff --git a/src/main/scala/epic/sequences/SegmentationEval.scala b/src/main/scala/epic/sequences/SegmentationEval.scala index b000ff53..d0a3ea98 100644 --- a/src/main/scala/epic/sequences/SegmentationEval.scala +++ b/src/main/scala/epic/sequences/SegmentationEval.scala @@ -11,11 +11,12 @@ import com.typesafe.scalalogging.slf4j.LazyLogging * @author dlwh */ object SegmentationEval extends LazyLogging { + def eval[L ,W](crf: SemiCRF[L, W], examples: IndexedSeq[Segmentation[L, W]], logOnlyErrors: Boolean = true):Stats = { examples.par.aggregate(new Stats(0,0,0)) ({ (stats, gold )=> val guess = crf.bestSequence(gold.words, gold.id +"-guess") try { - if(guess.label != gold.label) + if (guess.label != gold.label) logger.trace(s"gold = $gold guess = $guess " + s"guess logPartition = ${crf.goldMarginal(guess.segments, guess.words).logPartition} " + s"gold logPartition =${crf.goldMarginal(gold.segments, gold.words).logPartition}") @@ -23,11 +24,10 @@ object SegmentationEval extends LazyLogging { case ex: Exception => logger.debug("Can't recover gold for " + gold) } val myStats = evaluateExample(Set(), guess, gold) - 
if(!logOnlyErrors || myStats.f1 < 1.0) + if (!logOnlyErrors || myStats.f1 < 1.0) logger.info("Guess:\n" + guess.render + "\n Gold:\n" + gold.render+ "\n" + myStats) stats + myStats }, {_ + _}) - } def evaluateExample[W, L](outsideLabel: Set[L], guess: Segmentation[L, W], gold: Segmentation[L, W]): SegmentationEval.Stats = { diff --git a/src/main/scala/epic/sequences/Segmenter.scala b/src/main/scala/epic/sequences/Segmenter.scala index 0c031fd8..9048356a 100644 --- a/src/main/scala/epic/sequences/Segmenter.scala +++ b/src/main/scala/epic/sequences/Segmenter.scala @@ -14,6 +14,7 @@ import scala.reflect.ClassTag * @author dlwh **/ trait Segmenter[Tag] extends StringAnalysisFunction[Sentence with Token, Tag] with (IndexedSeq[String]=>IndexedSeq[(Tag, Span)]) { + implicit protected def tagTag: ClassTag[Tag] override def apply[In <: Sentence with Token](slab: StringSlab[In]): StringSlab[In with Tag] = { val annotatedSentences = for((span, sent) <- slab.iterator[Sentence]) yield { @@ -23,7 +24,6 @@ trait Segmenter[Tag] extends StringAnalysisFunction[Sentence with Token, Tag] wi Span(tokens(espan.begin)._1.begin, tokens(espan.end - 1)._1.end) -> lbl } } - slab.addLayer[Tag](annotatedSentences.flatten) } @@ -32,7 +32,6 @@ trait Segmenter[Tag] extends StringAnalysisFunction[Sentence with Token, Tag] wi object Segmenter { def nerSystem[L](crf: SemiCRF[L, String]) = fromCRF(crf, (a: L) => EntityMention(a.toString)) - def fromCRF[L, Tag:ClassTag](crf: SemiCRF[L, String], lToTag: L=>Tag):Segmenter[Tag] = new SemiCRFSegmenter(crf, lToTag) case class SemiCRFSegmenter[L, Tag:ClassTag] (crf: SemiCRF[L, String], lToTag: L=>Tag) extends Segmenter[Tag] { @@ -41,4 +40,5 @@ object Segmenter { crf.bestSequence(v1).segments.map { case (l, span) => lToTag(l) -> span} } } + } diff --git a/src/main/scala/epic/sequences/SemiCRF.scala b/src/main/scala/epic/sequences/SemiCRF.scala index bc986970..21423aa6 100644 --- a/src/main/scala/epic/sequences/SemiCRF.scala +++ b/src/main/scala/epic/sequences/SemiCRF.scala @@ -59,7 +59,7 @@ object SemiCRF { val obj = new ModelObjective(model, data) val cached = new CachedBatchDiffFunction(obj) val weights = opt.minimize(cached, obj.initialWeightVector(false)) -// GradientTester.test(cached, weights, randFraction = 1.0, toString={(i: Int) => model.featureIndex.get(i).toString}, tolerance=0.0) + // GradientTester.test(cached, weights, randFraction = 1.0, toString={(i: Int) => model.featureIndex.get(i).toString}, tolerance=0.0) val crf = model.extractCRF(weights) crf @@ -102,7 +102,6 @@ object SemiCRF { override def labelIndex: OptionIndex[L] = new OptionIndex(crf.labelIndex) } - /** * An Anchoring encodes all the information needed to score Semimarkov models. 
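
Aside on the span-pruning hunk just below: SemiCRF.Marginal.computeSpanConstraints keeps, for each span, only the labels whose posterior marginal clears a threshold, packing the survivors into BitSets. A standalone sketch with the epic types stripped away (the marginals and threshold here are invented; spanMarginals(s)(l) is assumed to be the posterior of label l on span s):

import scala.collection.immutable.BitSet

// For each span, keep only the labels whose marginal clears the threshold.
def allowedLabels(spanMarginals: Array[Array[Double]], threshold: Double): Array[BitSet] =
  spanMarginals.map(arr => BitSet.empty ++ arr.indices.filter(i => arr(i) >= threshold))

// allowedLabels(Array(Array(0.90, 0.0003), Array(0.20, 0.70)), 0.01)
//   yields Array(BitSet(0), BitSet(0, 1))
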
* @@ -175,13 +174,13 @@ object SemiCRF { val allowedLabels = spanMarginals.map { arr => BitSet.empty ++ (0 until arr.length).filter(i => arr(i) >= threshold) -// BitSet.empty ++ (0 until arr.length) + // BitSet.empty ++ (0 until arr.length) } LabeledSpanConstraints(allowedLabels) } - def hasSupportOver(m: Marginal[L, W]):Boolean = { + def hasSupportOver(m: Marginal[L, W]): Boolean = { object FailureException extends Exception try { m visit new TransitionVisitor[L, W] { @@ -224,7 +223,6 @@ object SemiCRF { val partition = softmax(forwardScores.last) val _s = scorer - new Marginal[L, W] { def anchoring: Anchoring[L, W] = _s @@ -270,7 +268,6 @@ object SemiCRF { } - /** Log-normalized probability of seing segment with transition */ def transitionMarginal(prev: Int, cur: Int, begin: Int, end: Int): Double = { val withoutTrans = forwardScores(begin)(prev) + backwardScore(end)(cur) @@ -330,7 +327,6 @@ object SemiCRF { numerics.I(goldEnds(begin) == end && goldLabels(begin) == cur && goldPrevLabels(begin) == prev) } - def logPartition: Double = score } } @@ -455,8 +451,6 @@ object SemiCRF { backwardScores } - - } trait ConstraintSemiCRF[L, W] extends SemiCRF[L, W] with LabeledSpanConstraints.Factory[L, W] { @@ -466,17 +460,15 @@ object SemiCRF { @SerialVersionUID(1L) class IdentityConstraintSemiCRF[L, W](val labelIndex: OptionIndex[L]) extends ConstraintSemiCRF[L, W] with Serializable { outer => + def scorer(w: IndexedSeq[W]) = new Anchoring[L,W]() { def words = w def scoreTransition(prev: Int, cur: Int, begin: Int, end: Int) = 0.0 def labelIndex = outer.labelIndex - def constraints: LabeledSpanConstraints[L] = NoConstraints } - def constraints(w: IndexedSeq[W]) = NoConstraints - def constraints(seg: Segmentation[L, W], keepGold: Boolean) = NoConstraints } @@ -502,7 +494,6 @@ object SemiCRF { c = crf.marginal(w).computeSpanConstraints(threshold) cache.put(w, c) } - c } @@ -515,34 +506,24 @@ object SemiCRF { } } - def scorer(w: IndexedSeq[W]): Anchoring[L, W] = { val c = constraints(w) new Anchoring[L, W] { def words: IndexedSeq[W] = w - - def constraints: LabeledSpanConstraints[L] = c - def labelIndex:OptionIndex[L] = crf.labelIndex - def scoreTransition(prev: Int, cur: Int, begin: Int, end: Int): Double = numerics.logI(c.isAllowedLabeledSpan(begin, end, cur)) - } } } - trait IndexedFeaturizer[L, W] { def anchor(w: IndexedSeq[W]):AnchoredFeaturizer[L, W] - - def labelIndex: OptionIndex[L] def featureIndex: Index[Feature] - def hasTransitionFeatures: Boolean = true } @@ -551,7 +532,6 @@ object SemiCRF { def featuresForTransition(prev: Int, cur: Int, begin: Int, end: Int):FeatureVector } - def viterbi[L, W](anchoring: Anchoring[L ,W], id: String=""):Segmentation[L, W] = { val length = anchoring.length val numLabels = anchoring.labelIndex.size @@ -608,7 +588,6 @@ object SemiCRF { Segmentation(segments.reverse, anchoring.words, id) } - def posteriorDecode[L, W](m: Marginal[L, W], id: String = "") = { val length = m.length val numLabels = m.anchoring.labelIndex.size @@ -681,7 +660,6 @@ object SemiCRF { class IdentityAnchoring[L, W](val words: IndexedSeq[W], val labelIndex: OptionIndex[L], val constraints: LabeledSpanConstraints[L]) extends Anchoring[L, W] { def scoreTransition(prev: Int, cur: Int, beg: Int, end: Int): Double = 0.0 - def canStartLongSegment(pos: Int): Boolean = true } diff --git a/src/main/scala/epic/sequences/SemiCRFModel.scala b/src/main/scala/epic/sequences/SemiCRFModel.scala index a9d9bf49..69944950 100644 --- a/src/main/scala/epic/sequences/SemiCRFModel.scala +++ 
b/src/main/scala/epic/sequences/SemiCRFModel.scala @@ -124,7 +124,7 @@ class SemiCRFInference[L, W](weights: DenseVector[Double], val m = SemiCRF.Marginal(aug * scorer) val partition: Double = m.logPartition val partition1: Double = SemiCRF.Marginal.goldMarginal[L, W](scorer * aug, v.label).logPartition - if(partition1 > partition) + if (partition1 > partition) println(v + " " + SemiCRF.posteriorDecode(m).render + " " + v.render + " " + partition + " " + partition1) m } @@ -181,13 +181,13 @@ class SemiCRFInference[L, W](weights: DenseVector[Double], private def cachedSpanScore(prev: Int, cur: Int, beg: Int, end: Int):Double = { val tind: Int = TriangularArray.index(beg, end) var spanCell = spanCache(tind) - if(spanCache(tind) == null) { + if (spanCache(tind) == null) { spanCell = new Array[Array[Double]](labelIndex.size) spanCache(tind) = spanCell } var curLabelCell = spanCell(cur) - if(curLabelCell == null) { + if (curLabelCell == null) { val span = localization.featuresForSpan(prev, cur, beg, end) if (span eq null) { @@ -264,8 +264,6 @@ class SegmentationModelFactory[L](wordFeaturizer: Optional[WordFeaturizer[String model } - - } object SegmentationModelFactory { @@ -297,13 +295,10 @@ object SegmentationModelFactory { case class TransitionFeature[L](label: L, label2: L) extends Feature case object OutsideFeature extends Feature - object FeatureKinds extends Enumeration { val Begin, Interior, Span, Label = Value } - - @SerialVersionUID(2L) class IndexedStandardFeaturizer[L, W] private (wordFeaturizer: IndexedWordFeaturizer[W], surfaceFeaturizer: IndexedSurfaceFeaturizer[W], @@ -345,7 +340,7 @@ object SegmentationModelFactory { } else { var features = spanFeatureIndex.crossProduct(bioeFeatures(cur)(Span.id), loc.featuresForSpan(begin, end), spanOffset) - if(end - begin == 1) { + if (end - begin == 1) { features ++= wordFeatureIndex.crossProduct(bioeFeatures(cur)(Span.id), wloc.featuresForWord(begin), wordOffset) } @@ -365,12 +360,12 @@ object SegmentationModelFactory { (data: IndexedSeq[Segmentation[L, W]]):IndexedStandardFeaturizer[L, W] = { val labelPartIndex = Index[Feature]() val outsideFeature = labelPartIndex.index(OutsideFeature) - val bioeFeatures = Array.tabulate(labelIndex.size, FeatureKinds.maxId)((i,j) => if(i == labelIndex.size - 1) Array.empty[Int] else Array(labelPartIndex.index(Label1Feature(labelIndex.get(i).get, FeatureKinds(j))))) + val bioeFeatures = Array.tabulate(labelIndex.size, FeatureKinds.maxId)((i,j) => if (i == labelIndex.size - 1) Array.empty[Int] else Array(labelPartIndex.index(Label1Feature(labelIndex.get(i).get, FeatureKinds(j))))) val transitionFeatures = Array.tabulate(labelIndex.size, labelIndex.size) { (i, j) => val li = labelIndex.get(i).fold(OutsideFeature:Any)(identity) val lj = labelIndex.get(j).fold(OutsideFeature:Any)(identity) - if(lj == OutsideFeature) + if (lj == OutsideFeature) Array(labelPartIndex.index(TransitionFeature(li, lj)), outsideFeature) else Array(labelPartIndex.index(TransitionFeature(li, lj))) @@ -397,7 +392,7 @@ object SegmentationModelFactory { } // span spanBuilder.add(bioeFeatures(li)(FeatureKinds.Span.id), feats.featuresForSpan(span.begin, span.end)) - if(span.length == 1) { + if (span.length == 1) { wordBuilder.add(bioeFeatures(li)(FeatureKinds.Span.id), wordFeats.featuresForWord(span.begin)) } last = li @@ -416,5 +411,4 @@ object SegmentationModelFactory { } - } diff --git a/src/main/scala/epic/sequences/SemiNERPipeline.scala b/src/main/scala/epic/sequences/SemiNERPipeline.scala index 3b4c89da..b5a29582 100644 --- 
a/src/main/scala/epic/sequences/SemiNERPipeline.scala +++ b/src/main/scala/epic/sequences/SemiNERPipeline.scala @@ -16,7 +16,6 @@ import epic.preprocess.TreebankTokenizer import epic.corpora.CONLLSequenceReader import epic.framework.Example - /** * * @author dlwh @@ -52,14 +51,13 @@ object SemiNerPipeline extends LazyLogging { instances.splitAt(instances.length * 9 / 10) } - val gazetteer = None//Gazetteer.ner("en") // build feature Index val model = new SegmentationModelFactory(gazetteer = gazetteer).makeModel(train) val obj = new ModelObjective(model, train, params.nthreads) val cached = new CachedBatchDiffFunction(obj) - if(params.checkGradient) { + if (params.checkGradient) { GradientTester.test(cached, obj.initialWeightVector(true), toString = {(x: Int) => model.featureIndex.get(x).toString}) } @@ -68,7 +66,7 @@ object SemiNerPipeline extends LazyLogging { println("Eval + " + (state.iter+1) + " " + SegmentationEval.eval(crf, test)) } - val finalState = params.opt.iterations(cached, obj.initialWeightVector(randomize=false)).tee(state => if((state.iter +1) % params.iterPerEval == 0) eval(state)).take(params.opt.maxIterations).last + val finalState = params.opt.iterations(cached, obj.initialWeightVector(randomize=false)).tee(state => if ((state.iter +1) % params.iterPerEval == 0) eval(state)).take(params.opt.maxIterations).last eval(finalState) breeze.util.writeObject(params.modelOut, model.extractCRF(finalState.x)) @@ -77,8 +75,6 @@ object SemiNerPipeline extends LazyLogging { } - - object SemiConllNerPipeline extends LazyLogging { def makeSegmentation(ex: Example[IndexedSeq[String],IndexedSeq[IndexedSeq[String]]]): Segmentation[String, String] = { @@ -88,22 +84,22 @@ object SemiConllNerPipeline extends LazyLogging { val out = new ArrayBuffer[(String, Span)]() var start = labels.length var i = 0 - while(i < labels.length) { + while (i < labels.length) { val l = labels(i) l(0) match { case 'O' => - if(start < i) + if (start < i) out += (labels(start).replaceAll(".-","").intern -> Span(start, i)) -// out += ("O".intern -> Span(i, i+1)) + // out += ("O".intern -> Span(i, i+1)) start = i + 1 case 'B' => - if(start < i) + if (start < i) out += (labels(start).replaceAll(".-","").intern -> Span(start, i)) start = i case 'I' => - if(start >= i) { + if (start >= i) { start = i - } else if(labels(start) != l){ + } else if (labels(start) != l) { out += (labels(start).replaceAll(".-","").intern -> Span(start, i)) start = i } // else, still in a field, do nothing.
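
Aside on makeSegmentation above: the 'O'/'B'/'I' cases amount to open-close bookkeeping over half-open spans, with start == labels.length as the "no open segment" sentinel. A compact standalone rendition (plain (start, end) tuples instead of epic.trees.Span; assumes well-formed CoNLL-style tags such as "I-PER" or "B-LOC"):

import scala.collection.mutable.ArrayBuffer

def makeSegments(labels: IndexedSeq[String]): IndexedSeq[(String, (Int, Int))] = {
  val out = new ArrayBuffer[(String, (Int, Int))]()
  var start = labels.length // sentinel: no segment currently open
  def close(i: Int): Unit =
    if (start < i) out += (labels(start).replaceAll(".-", "") -> (start, i))
  for (i <- labels.indices) labels(i)(0) match {
    case 'O' => close(i); start = i + 1
    case 'B' => close(i); start = i
    case 'I' =>
      if (start >= i) start = i // a segment opens here
      else if (labels(start) != labels(i)) { close(i); start = i }
      // else: still inside the current segment, do nothing
    case _ => () // tolerate malformed tags
  }
  close(labels.length)
  out.toIndexedSeq
}

// makeSegments(IndexedSeq("I-PER", "I-PER", "O", "I-LOC"))
//   yields Vector(("PER", (0, 2)), ("LOC", (3, 4)))
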
@@ -113,15 +109,13 @@ object SemiConllNerPipeline extends LazyLogging { i += 1 } - if(start < i) + if (start < i) out += (labels(start).replaceAll(".-","").intern -> Span(start, i)) -// assert(out.nonEmpty && out.last._2.end == words.length, out + " " + words + " " + labels) + // assert(out.nonEmpty && out.last._2.end == words.length, out + " " + words + " " + labels) Segmentation(out, words, ex.id) } - - case class Params(train: File, test: File, nsents: Int = 100000, @@ -141,13 +135,12 @@ object SemiConllNerPipeline extends LazyLogging { standardTrain.take(params.nsents).map(makeSegmentation) -> standardTest.map(makeSegmentation) } - // build feature Index val model: SemiCRFModel[String, String] = new SegmentationModelFactory(/*, gazetteer = Gazetteer.ner("en" )*/).makeModel(train) val obj = new ModelObjective(model, train, params.nthreads) val cached = new CachedBatchDiffFunction(obj) - if(params.checkGradient) { + if (params.checkGradient) { GradientTester.test(cached, obj.initialWeightVector(true), toString={(i: Int) => model.featureIndex.get(i).toString}) } @@ -163,12 +156,11 @@ object SemiConllNerPipeline extends LazyLogging { stats } - val weights = params.opt.iterations(cached, obj.initialWeightVector(randomize=false)).tee(state => if((state.iter +1) % params.iterPerEval == 0) eval(state)).take(params.opt.maxIterations).last + val weights = params.opt.iterations(cached, obj.initialWeightVector(randomize=false)).tee(state => if ((state.iter +1) % params.iterPerEval == 0) eval(state)).take(params.opt.maxIterations).last val stats = eval(weights) breeze.util.writeObject(params.modelOut, model.extractCRF(weights.x)) println(stats) - } } diff --git a/src/main/scala/epic/sequences/TagText.scala b/src/main/scala/epic/sequences/TagText.scala index 2a725246..0edebaee 100644 --- a/src/main/scala/epic/sequences/TagText.scala +++ b/src/main/scala/epic/sequences/TagText.scala @@ -10,7 +10,6 @@ import epic.trees.AnnotatedLabel */ object TagText extends ProcessTextMain[CRF[AnnotatedLabel, String], TaggedSequence[AnnotatedLabel, String]] { - override def render(model: CRF[AnnotatedLabel, String], ann: TaggedSequence[AnnotatedLabel, String], tokens: IndexedSeq[String]): String = ann.render override def annotate(model: CRF[AnnotatedLabel, String], text: IndexedSeq[String]): TaggedSequence[AnnotatedLabel, String] = { diff --git a/src/main/scala/epic/sequences/TaggedSequence.scala b/src/main/scala/epic/sequences/TaggedSequence.scala index adeddf72..4d04dbe2 100644 --- a/src/main/scala/epic/sequences/TaggedSequence.scala +++ b/src/main/scala/epic/sequences/TaggedSequence.scala @@ -14,17 +14,10 @@ case class TaggedSequence[+L, +W](tags: IndexedSeq[L], require(tags.length == words.length) - def render = { - (tags zip words map { case (t, w) => w +"/" + t}).mkString(" ") - } - + def render = (tags zip words map { case (t, w) => w +"/" + t}).mkString(" ") def pairs = tags zip words - def features = words - def length: Int = words.length - def label: IndexedSeq[L] = tags - def asSegmentation = Segmentation(tags.zipWithIndex.map{case (l, i) => (l -> Span(i, i+1))}, words, id+"-seg") } diff --git a/src/main/scala/epic/sequences/TaggedSequenceEval.scala b/src/main/scala/epic/sequences/TaggedSequenceEval.scala index 84b3a7f9..eaa46ef1 100644 --- a/src/main/scala/epic/sequences/TaggedSequenceEval.scala +++ b/src/main/scala/epic/sequences/TaggedSequenceEval.scala @@ -17,7 +17,7 @@ object TaggedSequenceEval { val myStats = evaluateExample(guess, gold) val sent = for( ((p,g),w) <- guess.label zip gold.label zip 
guess.words) yield if (g == p) s"$w/$g" else s"$w/[G:$g,P:$p]" - if(myStats.exact != 1) + if (myStats.exact != 1) println(sent.mkString(" ") + "\n" + myStats) stats + myStats }, {_ + _}) @@ -28,7 +28,7 @@ object TaggedSequenceEval { val confusion = Counter2({for( (p, g) <- guess.tags zip gold.tags if p != g) yield (p,g,1)}:_*) val nRight = guess.length - confusion.size val nTotal = guess.length - val myStats = new Stats(nRight, nTotal, if(nRight == nTotal) 1 else 0, 1, confusion) + val myStats = new Stats(nRight, nTotal, if (nRight == nTotal) 1 else 0, 1, confusion) myStats } diff --git a/src/main/scala/epic/sequences/Tagger.scala b/src/main/scala/epic/sequences/Tagger.scala index cf91e1b1..08774fe5 100644 --- a/src/main/scala/epic/sequences/Tagger.scala +++ b/src/main/scala/epic/sequences/Tagger.scala @@ -20,10 +20,8 @@ trait Tagger[Tag] extends StringAnalysisFunction[Sentence with Token, Tag] with val tagSeq = apply(tokens.map(_._2.token)) tokens.map(_._1) zip tagSeq } - slab.addLayer[Tag](annotatedSentences.flatten) } - } object Tagger { diff --git a/src/main/scala/epic/sequences/TrainPosTagger.scala b/src/main/scala/epic/sequences/TrainPosTagger.scala index a1144b90..0b2509a3 100644 --- a/src/main/scala/epic/sequences/TrainPosTagger.scala +++ b/src/main/scala/epic/sequences/TrainPosTagger.scala @@ -48,9 +48,9 @@ object SemiPOSTagger extends LazyLogging { val crf = SemiCRF.buildSimple(train, opt = opt) val inf = crf.asInstanceOf[SemiCRFInference[_, _]] -// val out = new PrintWriter(new BufferedOutputStream(new FileOutputStream("weights.txt"))) -// Encoder.fromIndex(inf.featureIndex).decode(inf.weights).iterator foreach {case (x, v) if v.abs > 1E-6 => out.println(x -> v) case _ => } -// out.close() + // val out = new PrintWriter(new BufferedOutputStream(new FileOutputStream("weights.txt"))) + // Encoder.fromIndex(inf.featureIndex).decode(inf.weights).iterator foreach {case (x, v) if v.abs > 1E-6 => out.println(x -> v) case _ => } + // out.close() val stats = SegmentationEval.eval(crf, test) println("Final Stats: " + stats) } diff --git a/src/main/scala/epic/slab/AnalysisFunction.scala b/src/main/scala/epic/slab/AnalysisFunction.scala index bd582d1d..0b15e599 100644 --- a/src/main/scala/epic/slab/AnalysisFunction.scala +++ b/src/main/scala/epic/slab/AnalysisFunction.scala @@ -33,7 +33,6 @@ case class ComposedAnalysisFunction[C, B, I, O, II >: (I with O), +OO](a: Analys } - object StringIdentityAnalyzer extends StringAnalysisFunction[Any, Any] { def apply[In](slab: StringSlab[In]):StringSlab[In] = slab } @@ -50,7 +49,6 @@ object RegexTokenizer extends Tokenizer { }) } - object AnalysisPipeline { import AnnotatedSpan._ @@ -82,5 +80,4 @@ object AnalysisPipeline { } - } diff --git a/src/main/scala/epic/slab/Slab.scala b/src/main/scala/epic/slab/Slab.scala index 9e776c9b..434ddf05 100644 --- a/src/main/scala/epic/slab/Slab.scala +++ b/src/main/scala/epic/slab/Slab.scala @@ -45,7 +45,7 @@ trait Slab[ContentType, RegionType, +AnnotationTypes] { /** useful for downcasting */ def checkedCast[A: ClassTag]:Option[Slab[ContentType, RegionType, AnnotationTypes with A]] = { - if(!hasLayer[A]) { + if (!hasLayer[A]) { None } else { Some(this.asInstanceOf[Slab[ContentType, RegionType, AnnotationTypes with A]]) @@ -81,10 +81,8 @@ trait Slab[ContentType, RegionType, +AnnotationTypes] { def stringRep[A >: AnnotationTypes: ClassTag] = { iterator[A].mkString("\n") } - - -} +} object AnnotatedSpan { @@ -124,7 +122,7 @@ case class EntityMention(entityType: String, id: Option[String] = None) object Slab { trait 
ExtractRegion[Region, T] { - def apply(region: Region, t: T):T + def apply(region: Region, t: T): T } implicit object SpanStringExtractRegion extends ExtractRegion[Span, String] { @@ -148,7 +146,6 @@ object Slab { val annotations: Map[Class[_], Vector[(Span, Any)]] = Map.empty, val reverseAnnotations: Map[Class[_], Vector[(Span, Any)]] = Map.empty)(implicit extract: ExtractRegion[Span, ContentType]) extends Slab[ContentType, Span, AnnotationType] { - override def spanned(region: Span): ContentType = extract(region, content) override def addLayer[A:ClassTag](annotations: TraversableOnce[(Span, A)]): Slab[ContentType, Span, AnnotationType with A] = { @@ -164,14 +161,12 @@ object Slab { new SortedSequenceSlab(content, newAnnotations, reverseAnnotations) } - override def removeLayer[A >: AnnotationType: ClassTag]: Slab[ContentType, Span, AnnotationType] = { new SortedSequenceSlab(content, annotations - implicitly[ClassTag[A]].runtimeClass, reverseAnnotations - implicitly[ClassTag[A]].runtimeClass) } - /** Queries whether we have annotations of this type, even if the slab * doesn't have this type. Sometimes you just have to cast... */ override def hasLayer[A: ClassTag]: Boolean = { @@ -181,23 +176,23 @@ object Slab { override def following[A >: AnnotationType: ClassTag](region: Span): Iterator[(Span, A)] = { val annotations = selectAnnotations[A] var pos = BinarySearch.interpolationSearch(annotations, (_:(Span, Any))._1.begin, region.end) - if(pos < 0) pos = ~pos + if (pos < 0) pos = ~pos annotations.view(pos, annotations.length).iterator } override def preceding[A >: AnnotationType : ClassTag](region: Span): Iterator[(Span, A)] = { val annotations = selectReverse[A] var pos = BinarySearch.interpolationSearch(annotations, (_:(Span, Any))._1.end, region.begin + 1) - if(pos < 0) pos = ~pos + if (pos < 0) pos = ~pos annotations.view(0, pos).reverseIterator } override def covered[A >: AnnotationType : ClassTag](region: Span): IndexedSeq[(Span, A)] = { val annotations = selectAnnotations[A] var begin = BinarySearch.interpolationSearch(annotations, (_:(Span, Any))._1.begin, region.begin) - if(begin < 0) begin = ~begin + if (begin < 0) begin = ~begin var end = annotations.indexWhere(_._1.end > region.end, begin) - if(end < 0) end = annotations.length + if (end < 0) end = annotations.length annotations.slice(begin, end) } @@ -223,7 +218,4 @@ object Slab { } - - - } diff --git a/src/main/scala/epic/slab/package.scala b/src/main/scala/epic/slab/package.scala index 414a99b5..4ecbe423 100644 --- a/src/main/scala/epic/slab/package.scala +++ b/src/main/scala/epic/slab/package.scala @@ -9,9 +9,6 @@ import epic.trees.Span **/ package object slab { // some type aliases - - type StringAnalysisFunction[I, O] = AnalysisFunction[String, Span, I, O] type StringSlab[+AnnotationTypes] = Slab[String, Span, AnnotationTypes] - } diff --git a/src/main/scala/epic/trees/AnnotatedLabel.scala b/src/main/scala/epic/trees/AnnotatedLabel.scala index 4ba2645c..ead8b6e2 100644 --- a/src/main/scala/epic/trees/AnnotatedLabel.scala +++ b/src/main/scala/epic/trees/AnnotatedLabel.scala @@ -45,7 +45,6 @@ case class AnnotatedLabel(label: String, index: Int = -1) extends Feature with CachedHashCode { def hasAnnotation(f: Annotation): Boolean = features.contains(f) - def annotate(sym: Annotation*) = copy(features = features ++ sym) def isIntermediate = label.nonEmpty && label.charAt(0) == '@' @@ -60,17 +59,16 @@ case class AnnotatedLabel(label: String, if (index != -1) { x += s"-$index" } - x } override def toString = { val components = 
new ArrayBuffer[String]() headTag.foreach(components += _) - if(parents.nonEmpty) { + if (parents.nonEmpty) { components += parents.mkString("^","^","") } - if(siblings.nonEmpty) { + if (siblings.nonEmpty) { val b = new StringBuilder() siblings foreach { case Left(sib) => @@ -82,13 +80,13 @@ case class AnnotatedLabel(label: String, } components += b.toString } - if(features.nonEmpty) + if (features.nonEmpty) components ++= features.iterator.map(_.toString) - if(index != -1) + if (index != -1) components += s"_$index" - if(components.nonEmpty) components.mkString(label+"[", ", ", "]") + if (components.nonEmpty) components.mkString(label+"[", ", ", "]") else label } } @@ -110,7 +108,7 @@ object AnnotatedLabel { label.split("[-=#]") } - if(label.isEmpty) return AnnotatedLabel.TOP + if (label.isEmpty) return AnnotatedLabel.TOP val tag = fields.head diff --git a/src/main/scala/epic/trees/Debinarizer.scala b/src/main/scala/epic/trees/Debinarizer.scala index 48d413a7..50f754db 100644 --- a/src/main/scala/epic/trees/Debinarizer.scala +++ b/src/main/scala/epic/trees/Debinarizer.scala @@ -8,20 +8,17 @@ import java.io.ObjectStreamException * * @author dlwh **/ -trait Debinarizer[L] extends (BinarizedTree[L]=>Tree[L]) with Serializable { - -} +trait Debinarizer[L] extends (BinarizedTree[L] => Tree[L]) with Serializable object Debinarizer { @SerialVersionUID(1L) implicit object AnnotatedLabelDebinarizer extends Debinarizer[AnnotatedLabel] { def apply(t: BinarizedTree[AnnotatedLabel]): Tree[AnnotatedLabel] = { - Trees.debinarize(replaceUnaries(t), {(_:AnnotatedLabel).isIntermediate}).map(_.baseAnnotatedLabel) + Trees.debinarize(replaceUnaries(t), {(_: AnnotatedLabel).isIntermediate}).map(_.baseAnnotatedLabel) } - - def replaceUnaries(t: Tree[AnnotatedLabel]):Tree[AnnotatedLabel] = t match { + def replaceUnaries(t: Tree[AnnotatedLabel]): Tree[AnnotatedLabel] = t match { case UnaryTree(a, child, chain, span) if a.label == child.label.label && chain.isEmpty => replaceUnaries(child) case UnaryTree(a, child, chain, span) => @@ -37,7 +34,6 @@ object Debinarizer { case _ => t } - } @SerialVersionUID(1L) @@ -47,7 +43,7 @@ object Debinarizer { Trees.debinarize(Trees.deannotate(replaceUnaries(t))) } - def replaceUnaries(t: Tree[String]):Tree[String] = t match { + def replaceUnaries(t: Tree[String]): Tree[String] = t match { case UnaryTree(a, child, chain, span) if a == child.label && chain.isEmpty => replaceUnaries(child) case UnaryTree(a, child, chain, span) => @@ -64,7 +60,4 @@ object Debinarizer { } } - - - } diff --git a/src/main/scala/epic/trees/DependencyTree.scala b/src/main/scala/epic/trees/DependencyTree.scala index 8248329b..d7c02c59 100644 --- a/src/main/scala/epic/trees/DependencyTree.scala +++ b/src/main/scala/epic/trees/DependencyTree.scala @@ -7,7 +7,7 @@ package epic.trees case class DependencyTree[+L, +W](dependencies: IndexedSeq[(L, Int)], words: IndexedSeq[W]) { def render : String = { for(((label, head),dep) <- dependencies.zipWithIndex) yield { - if(head == words.length) s"ROOT(${words(dep)}-$dep)" else s"$label(${words(head)}-$head, ${words(dep)}-$dep)" + if (head == words.length) s"ROOT(${words(dep)}-$dep)" else s"$label(${words(head)}-$head, ${words(dep)}-$dep)" } }.mkString("\n") @@ -24,10 +24,10 @@ object DependencyTree { val deps = new Array[(L, Int)](words.length) for( subtree <- annotated.allChildren) { for(t <- subtree.children if t.label._2 != subtree.label._2) { - deps(t.label._2) = (t.label._1 -> subtree.label._2) + deps(t.label._2) = t.label._1 -> subtree.label._2 } } - 
deps(annotated.label._2) = (annotated.label._1 -> words.length) + deps(annotated.label._2) = annotated.label._1 -> words.length DependencyTree(deps, words) } } diff --git a/src/main/scala/epic/trees/HeadFinder.scala b/src/main/scala/epic/trees/HeadFinder.scala index c9794249..ca6d056e 100644 --- a/src/main/scala/epic/trees/HeadFinder.scala +++ b/src/main/scala/epic/trees/HeadFinder.scala @@ -30,17 +30,14 @@ import breeze.util.Lens */ object HeadFinder { def left[L]: HeadFinder[L] = new RuleBasedHeadFinder[L](Left, HeadRules.empty) - def right[L]: HeadFinder[L] = new RuleBasedHeadFinder[L](Right, HeadRules.empty) - - val collins = new RuleBasedHeadFinder(Left, rules = HeadRules.collinsHeadRules); - + val collins = new RuleBasedHeadFinder(Left, rules = HeadRules.collinsHeadRules) implicit def lensed[L, U](hf: HeadFinder[L])(implicit lens: Lens[U, L]) = hf.projected(lens.get(_: U)) } trait HeadFinder[L] { - def findHeadChild(l: L, children: L*):Int + def findHeadChild(l: L, children: L*): Int def findHeadChild(r: Rule[L]): Int = r match { case UnaryRule(_, _, _) => 0 @@ -50,33 +47,31 @@ trait HeadFinder[L] { def findHeadChild(t: Tree[L]): Int = findHeadChild(t.label, t.children.map(c => c.label): _*) - def findHeadWord[W](t: Tree[L], words: Seq[W]) = words(findHeadWordIndex(t)); + def findHeadWord[W](t: Tree[L], words: Seq[W]) = words(findHeadWordIndex(t)) def findHeadWordIndex(t: Tree[L]): Int = { - if (t.isLeaf) t.span.begin; + if (t.isLeaf) t.span.begin else { - findHeadWordIndex(t.children(findHeadChild(t))); + findHeadWordIndex(t.children(findHeadChild(t))) } } def findHeadTag(t: Tree[L]): L = { if (t.isLeaf) t.label else { - findHeadTag(t.children(findHeadChild(t))); + findHeadTag(t.children(findHeadChild(t))) } } - def annotateHeadIndices[W](t: Tree[L]): Tree[(L, Int)] = t match { case t:BinarizedTree[L] => annotateHeadIndices(t) - case Tree(l, children, span) if children.length == 0 => Tree(l -> t.span.begin, IndexedSeq.empty, t.span) + case Tree(l, children, span) if children.isEmpty => Tree(l -> t.span.begin, IndexedSeq.empty, t.span) case Tree(l, children, span) => val headChild = findHeadChild(t) val rec = children.map(annotateHeadIndices(_)) Tree(l -> rec(headChild).label._2, rec, t.span) } - def annotateHeadIndices(t: BinarizedTree[L]): BinarizedTree[(L, Int)] = t match { case NullaryTree(l, span) => NullaryTree(l -> t.span.begin, t.span) case u@UnaryTree(a, b, chain, span) => @@ -86,7 +81,7 @@ trait HeadFinder[L] { val headChild = findHeadChild(t) val recB = annotateHeadIndices(b) val recC = annotateHeadIndices(c) - val head = if(headChild == 0) recB.label._2 else recC.label._2 + val head = if (headChild == 0) recB.label._2 else recC.label._2 BinaryTree(a -> head, recB, recC, t.span) } @@ -100,15 +95,13 @@ trait HeadFinder[L] { val headChild = findHeadChild(t) val recB: BinarizedTree[(L, L)] = annotateHeadTags(b) val recC: BinarizedTree[(L, L)] = annotateHeadTags(c) - val head = if(headChild == 0) recB.label._2 else recC.label._2 + val head = if (headChild == 0) recB.label._2 else recC.label._2 BinaryTree(a -> head, recB, recC, t.span) } - def projected[U](f: U => L): HeadFinder[U] } - /** * Can annotate a tree with the head word. 
Usually * you should just use HeadFinder.collinsHeadFinder @@ -127,14 +120,13 @@ class RuleBasedHeadFinder[L](defaultDirection: Dir = Left, rules: HeadRules[L]) } def annotateHeadWords[W](t: Tree[L], words: Seq[W]): Tree[(L, W)] = t match { - case Tree(l, children, span) if children.length == 0 => Tree(l -> words(t.span.begin), IndexedSeq.empty, t.span) + case Tree(l, children, span) if children.isEmpty => Tree(l -> words(t.span.begin), IndexedSeq.empty, t.span) case Tree(l, children, span) => val headChild = findHeadChild(t) val rec = children.map(annotateHeadWords(_, words)) Tree(l -> rec(headChild).label._2, rec, t.span) } - def projected[U](f: U => L): HeadFinder[U] = new RuleBasedHeadFinder[U](defaultDirection, rules.projected(f)) def lensed[U](implicit lens: Lens[U, L]) = HeadFinder.lensed(this) @@ -167,10 +159,9 @@ case class HeadRule[L](dir: Dir, dis: Boolean, heads: Seq[L]) { rule => val candidates = for (l <- rule.heads.iterator) yield { if (rule.dir == Left) children.indexOf(l) else children.lastIndexOf(l) - }; - candidates.find(_ >= 0) getOrElse -1; + } + candidates.find(_ >= 0) getOrElse -1 } - } } @@ -186,10 +177,7 @@ trait HeadRules[L] extends Serializable { outer => val myRules: Seq[HeadRule[InnerLabel]] = findRules(proj(parent)) val mapped = children.map(proj) val answers = myRules.view.map(_.findMatchIndex(mapped: _*)).filterNot(_ == -1) - if (answers.nonEmpty) { - Some(answers.head) - } - else None + answers.headOption } def lensed[U](implicit lens: Lens[U, L]) = projected(lens.get(_: U)) @@ -206,28 +194,22 @@ trait HeadRules[L] extends Serializable { outer => object HeadRules { - /** * Search direction for the match. */ - sealed trait Dir; - - case object Left extends Dir; - - case object Right extends Dir; + sealed trait Dir + case object Left extends Dir + case object Right extends Dir def empty[L]: HeadRules[L] = fromMap[L](Map.empty) def fromMap[L](map: Map[L, Seq[HeadRule[L]]]): HeadRules[L] = new HeadRules[L] { protected type InnerLabel = L - protected def findRules(l: L) = map.getOrElse(l, Seq.empty) - protected def proj(l: L) = l - } - private def shr[L](dir: Dir, dis: Boolean, heads: L*) = HeadRule(dir, dis, heads); + private def shr[L](dir: Dir, dis: Boolean, heads: L*) = HeadRule(dir, dis, heads) val collinsHeadRules = fromMap[String] { val allNonTerms = shr(Right, false, "ROOT", "TOP", "ADJP", "ADVP", "CONJP", "FRAG", "S", "INTJ", "LST", "NAC", "NX", "PP", "PRN", "PRT", "QP", "RRC", "S", "SBAR", "SBARQ", "SINV", "SQ", "UCP", "VP", "WHADJP", "WHADVP", "WHNP", "WHPP", "X", "NML", "NP", "NN", "NNP", "NNPS", "NNS", "VB", "VBZ", "VBG", "VBD", "JJ", "JJR", "JJS", "CC", "VBP", "PRP", "PRP$", "PRPS", "CD", "IN", "TO", "WDT", "WP", "WP$", "WRB", "RB", "SYM", "RB", "UH", "RP", "RBR", "RBS", "DT") @@ -282,7 +264,6 @@ object HeadRules { shr(Right, true, "JJ", "JJS", "RB", "QP")) ) - //add in binarized symbols, and look for the binarized symbol first (basic ++ basic.map { case (k, v) => ("@" + k, v) @@ -296,32 +277,28 @@ object HeadRules { } : Map[String, Seq[HeadRule[String]]] } - - } /* - - object NegraHeadFinder extends HeadFinder[AnnotatedLabel] { def findHeadChild(l: AnnotatedLabel, children: AnnotatedLabel*): Int = l.label match { case "ISU" => var index = children.indexWhere(a => a.hasAnnotation(FunctionalTag("UC"))) - if(index < 0) { + if (index < 0) { children.length - 1 } else { index } case "DL" => var index = children.indexWhere(a => a.hasAnnotation(FunctionalTag("HD")) || a.hasAnnotation(FunctionalTag("DH"))) - if(index < 0) { + if (index < 0) { 
children.length - 1 } else { index } case _ => var index = children.indexWhere(a => a.hasAnnotation(FunctionalTag("HD")) || a.hasAnnotation(FunctionalTag("PH"))) - if(index < 0) { + if (index < 0) { index = children.length - 1 } index diff --git a/src/main/scala/epic/trees/PartialTreeProcessor.scala b/src/main/scala/epic/trees/PartialTreeProcessor.scala index ee3dd1e5..898a230a 100644 --- a/src/main/scala/epic/trees/PartialTreeProcessor.scala +++ b/src/main/scala/epic/trees/PartialTreeProcessor.scala @@ -7,7 +7,7 @@ case class PartialTreeProcessor() { def apply(tree: Tree[String]):Tree[String] = { var transformed = xox(ens(tree).get) - transformed = if(transformed.children.length != 1) { + transformed = if (transformed.children.length != 1) { Tree("", IndexedSeq(transformed), transformed.span) } else { transformed diff --git a/src/main/scala/epic/trees/PennTreeReader.scala b/src/main/scala/epic/trees/PennTreeReader.scala index c195fa87..ada4b4a4 100644 --- a/src/main/scala/epic/trees/PennTreeReader.scala +++ b/src/main/scala/epic/trees/PennTreeReader.scala @@ -52,17 +52,15 @@ class PennTreeReader(reader: Reader, private var nextTree = readRootTree() - def hasNext = (nextTree != null) + def hasNext = nextTree != null def next() = { if (!hasNext) throw new NoSuchElementException() val tree = nextTree - nextTree = readRootTree() - if(nextTree == null) { + if (nextTree == null) { in.close() } - tree } @@ -73,10 +71,8 @@ class PennTreeReader(reader: Reader, val tree = readTree(true, 0) tree } - } - private def readTree(isRoot : Boolean, pos : Int) : (Tree[String],IndexedSeq[String]) = { readLeftParen() val label = { @@ -110,7 +106,6 @@ class PennTreeReader(reader: Reader, ch = in.read() atLeastOne = false } - in.unread(ch) sb.toString() } @@ -144,14 +139,13 @@ class PennTreeReader(reader: Reader, } private def isTextParen() = { - var numRead = 0 var ch = in.read() while (isLeftParen(ch)) { numRead += 1 ch = in.read() } - val yes = numRead > 0 && (isRightParen(ch)) + val yes = numRead > 0 && isRightParen(ch) in.unread(ch) for (i <- 0 until numRead) { in.unread('(') @@ -167,9 +161,9 @@ class PennTreeReader(reader: Reader, private def readLeaf() = { var label = readText(true, true) - if(unescapeTokens) + if (unescapeTokens) label = TreebankTokenizer.treebankTokenToToken(label) - if(label.startsWith("/") && label.length == 2 && label(1) != '/') { + if (label.startsWith("/") && label.length == 2 && label(1) != '/') { label = label.drop(1) // ontonotes escapes periods as /. 
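 // Worked example (illustrative, not part of the patch): with
 // unescapeTokens set, a treebank escape such as "-LRB-" is presumably
 // mapped back to "(" by TreebankTokenizer.treebankTokenToToken, while an
 // OntoNotes-escaped period arrives here as "/." and the drop(1) above
 // returns it as a plain ".".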
} label @@ -196,7 +190,7 @@ class PennTreeReader(reader: Reader, } private def isWhiteSpace(ch : Int) = { - (ch == ' ' || ch == '\t' || ch == '\f' || ch == '\r' || ch == '\n') + ch == ' ' || ch == '\t' || ch == '\f' || ch == '\r' || ch == '\n' } private def isLeftParen(ch : Int) = { @@ -208,6 +202,3 @@ class PennTreeReader(reader: Reader, } } - - - diff --git a/src/main/scala/epic/trees/ProcessedTreebank.scala b/src/main/scala/epic/trees/ProcessedTreebank.scala index 34a2210f..523d96f8 100644 --- a/src/main/scala/epic/trees/ProcessedTreebank.scala +++ b/src/main/scala/epic/trees/ProcessedTreebank.scala @@ -48,7 +48,7 @@ case class ProcessedTreebank(@Help(text="Location of the treebank directory") case "conllonto" => Treebank.fromOntonotesDirectory(path) case "spmrl" => var trainPath: File = new File(path, "train") - if(!trainPath.exists) + if (!trainPath.exists) trainPath = new File(path, "train5k") val train = trainPath.listFiles().filter(_.getName.endsWith("ptb")) val dev = new File(path, "dev").listFiles().filter(_.getName.endsWith("ptb")) @@ -66,17 +66,16 @@ case class ProcessedTreebank(@Help(text="Location of the treebank directory") lazy val trainTrees: IndexedSeq[TreeInstance[AnnotatedLabel, String]] = { var train = transformTrees(treebank.train, maxLength, collapseUnaries = true) - if(includeDevInTrain) train ++= transformTrees(treebank.dev, maxLength, collapseUnaries = true) + if (includeDevInTrain) train ++= transformTrees(treebank.dev, maxLength, collapseUnaries = true) train.take(numSentences) } lazy val devTrees = transformTrees(treebank.dev, 100000) lazy val testTrees = transformTrees(treebank.test, 1000000) - def transformTrees(portion: treebank.Portion, maxL: Int, collapseUnaries: Boolean = false): IndexedSeq[TreeInstance[AnnotatedLabel, String]] = { val binarizedAndTransformed = for ( ((tree, words), index) <- portion.trees.zipWithIndex if words.length <= maxL; - w2 = if(debuckwalterize) words.map(ArabicNormalization.buckwalterToUnicode) else words + w2 = if (debuckwalterize) words.map(ArabicNormalization.buckwalterToUnicode) else words ) yield { val name = s"${portion.name}-$index" makeTreeInstance(name, tree, w2, collapseUnaries) @@ -85,7 +84,6 @@ case class ProcessedTreebank(@Help(text="Location of the treebank directory") binarizedAndTransformed.toIndexedSeq } - def makeTreeInstance(name: String, tree: Tree[String], words: IndexedSeq[String], collapseUnaries: Boolean): TreeInstance[AnnotatedLabel, String] = { var transformed = process(tree.map(AnnotatedLabel.parseTreebank)) if (collapseUnaries) { @@ -100,7 +98,7 @@ case class ProcessedTreebank(@Help(text="Location of the treebank directory") case "xbar" | "right" => HeadFinder.right[String] case "leftXbar" | "left" => HeadFinder.left[String] case "head" => if (treebankType .startsWith("spmrl")) { - SupervisedHeadFinder.trainHeadFinderFromFiles(supervisedHeadFinderPtbPath, supervisedHeadFinderConllPath); + SupervisedHeadFinder.trainHeadFinderFromFiles(supervisedHeadFinderPtbPath, supervisedHeadFinderConllPath) } else { HeadFinder.collins } diff --git a/src/main/scala/epic/trees/Rule.scala b/src/main/scala/epic/trees/Rule.scala index 146fa208..8ced6dba 100644 --- a/src/main/scala/epic/trees/Rule.scala +++ b/src/main/scala/epic/trees/Rule.scala @@ -27,33 +27,24 @@ sealed trait Production[@specialized(Int) +L, +W] extends Feature { sealed trait Rule[@specialized(Int) +L] extends Production[L, Nothing] { def parent: L - def children: Seq[L] - def symbols = parent +: children - def map[A](f: L => A): Rule[A] - def 
mapChildren[A >: L](f: L => A): Rule[A] } @SerialVersionUID(8613629952079423488L) final case class BinaryRule[@specialized(Int) +L](parent: L, left: L, right: L) extends Rule[L] { def children = Seq(left, right) - def map[A](f: L => A) = BinaryRule(f(parent), f(left), f(right)) - def mapChildren[A >: L](f: L => A) = BinaryRule(parent, f(left), f(right)) } @SerialVersionUID(8559479322874082992L) final case class UnaryRule[@specialized(Int) +L](parent: L, child: L, chain: IndexedSeq[String]) extends Rule[L] { def children = Seq(child) - def map[A](f: L => A) = UnaryRule(f(parent), f(child), chain) - def mapChildren[A >: L](f: L => A) = UnaryRule(parent, f(child), chain) - def isIdentity = chain.isEmpty && parent == child } @@ -65,7 +56,6 @@ case class NullRule[@specialized(Int) +L](parent: L) extends Production[L, Nothi def map[A](f: (L) => A): NullRule[A] = NullRule(f(parent)) } - object BinaryRule { def leftChildFirstOrdering[L:Ordering]:Ordering[BinaryRule[L]] = Ordering.Tuple3[L, L, L].on(br => (br.left, br.right, br.parent)) def parentFirstOrdering[L:Ordering]:Ordering[BinaryRule[L]] = Ordering.Tuple3[L, L, L].on(br => (br.parent, br.left, br.right)) diff --git a/src/main/scala/epic/trees/Span.scala b/src/main/scala/epic/trees/Span.scala index 0f8639a2..3810d718 100644 --- a/src/main/scala/epic/trees/Span.scala +++ b/src/main/scala/epic/trees/Span.scala @@ -30,16 +30,16 @@ class Span(val encoded: Long) extends AnyVal with Serializable { def length = end - begin - def map[U](f: Int=>U) = Range(begin,end).map(f) + def map[U](f: Int => U) = Range(begin, end).map(f) @inline - def foreach(f: Int=>Unit) = { + def foreach(f: Int => Unit) = { cfor.cfor(begin)(_ < end, _ +1) { f } } def iterator = toRange.iterator - def toRange = Range(begin,end) + def toRange = Range(begin, end) def contains(pos: Int) = pos >= begin && pos < end @@ -53,7 +53,6 @@ class Span(val encoded: Long) extends AnyVal with Serializable { || (other.begin < begin && other.end < end && other.end > begin) ) - // override def hashCode(): Int = { // (begin, end).hashCode() // } @@ -66,7 +65,7 @@ class Span(val encoded: Long) extends AnyVal with Serializable { /** * Return true if this' range contains the other range. 
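 * For example, Span(1, 5) contains Span(2, 4) and Span(1, 5) itself,
 * but not Span(4, 6).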
*/ - def contains(other:Span) = { + def contains(other: Span) = { begin <= other.begin && end >= other.end } @@ -77,6 +76,5 @@ class Span(val encoded: Long) extends AnyVal with Serializable { object Span { def apply(begin: Int, end: Int) = new Span((begin.toLong << 32) | (end.toLong&0xFFFFFFFFL)) - def unapply(span: Span) = Some((span.begin, span.end)) } diff --git a/src/main/scala/epic/trees/StandardTreeProcessor.scala b/src/main/scala/epic/trees/StandardTreeProcessor.scala index 63af9570..44fba167 100644 --- a/src/main/scala/epic/trees/StandardTreeProcessor.scala +++ b/src/main/scala/epic/trees/StandardTreeProcessor.scala @@ -42,15 +42,14 @@ case class StandardTreeProcessor(headFinder: HeadFinder[AnnotatedLabel] = HeadFi oin.defaultReadObject() } - def apply(rawTree: Tree[AnnotatedLabel]):BinarizedTree[AnnotatedLabel] = { -// val ann = tree.map { AnnotatedLabel.parseTreebank } + // val ann = tree.map { AnnotatedLabel.parseTreebank } var detraced = traceProcessor(rawTree) if (removeTraces) { detraced = detraced.map(_.copy(index = -1)) } var transformed = xox(detraced) - transformed = if(transformed.children.length != 1) { + transformed = if (transformed.children.length != 1) { Tree(AnnotatedLabel.TOP, IndexedSeq(transformed), transformed.span) } else { transformed @@ -70,7 +69,4 @@ case class StandardTreeProcessor(headFinder: HeadFinder[AnnotatedLabel] = HeadFi } } -object StandardTreeProcessor { - - -} +object StandardTreeProcessor diff --git a/src/main/scala/epic/trees/SubsampledTreebank.scala b/src/main/scala/epic/trees/SubsampledTreebank.scala index ddaf983c..8e63af08 100644 --- a/src/main/scala/epic/trees/SubsampledTreebank.scala +++ b/src/main/scala/epic/trees/SubsampledTreebank.scala @@ -36,7 +36,7 @@ class SubsampledTreebank(base: Treebank[String], numTrain: Int, numDev:Int, numT } private def downSample[K](trees: Iterator[K], num: Int) = { - if(num < 0) trees + if (num < 0) trees else { // TODO: maybe randomly sample trees.take(num) diff --git a/src/main/scala/epic/trees/SupervisedHeadFinder.scala b/src/main/scala/epic/trees/SupervisedHeadFinder.scala index 70155aa1..9905f0c8 100644 --- a/src/main/scala/epic/trees/SupervisedHeadFinder.scala +++ b/src/main/scala/epic/trees/SupervisedHeadFinder.scala @@ -8,33 +8,33 @@ import java.io.InputStreamReader class SupervisedHeadFinder[L](innards: SupervisedHeadFinderInnards[L,_]) extends HeadFinder[L] { - def findHeadChild(l: L, children: L*):Int = { - val head = innards.findHeadChild(l, children.toSeq); - head; + def findHeadChild(l: L, children: L*): Int = { + val head = innards.findHeadChild(l, children.toSeq) + head } def projected[U](f: U => L): HeadFinder[U] = { - new SupervisedHeadFinder[U](innards.projected(f)); + new SupervisedHeadFinder[U](innards.projected(f)) } } trait SupervisedHeadFinderInnards[L,B] extends Serializable { outer => - protected def proj(l: L): B; + protected def proj(l: L): B - protected def getHeadDB: HeadDB[B]; + protected def getHeadDB: HeadDB[B] def findHeadChild(l: L, children: Seq[L]) = { - val b = proj(l); - val bChildren = children.map(child => proj(child)); - getHeadDB.findHeadChild(b, bChildren); + val b = proj(l) + val bChildren = children.map(child => proj(child)) + getHeadDB.findHeadChild(b, bChildren) } def projected[U](f: U => L): SupervisedHeadFinderInnards[U,B] = new SupervisedHeadFinderInnards[U,B] { - protected def proj(l: U): B = outer.proj(f(l)); + protected def proj(l: U): B = outer.proj(f(l)) - protected def getHeadDB = outer.getHeadDB; + protected def getHeadDB = outer.getHeadDB } } @@ 
-44,117 +44,114 @@ object SupervisedHeadFinderInnards extends Serializable { protected def proj(l: B) = l - protected def getHeadDB: HeadDB[B] = db; + protected def getHeadDB: HeadDB[B] = db } } case class HeadDB[B](symbolArityHeadChildCounts: Counter2[(B,Int),Int,Int], ruleHeadChildCounts: Counter2[(B,Seq[B]),Int,Int], defaultToLeft: Boolean = true) { - def findHeadChild(l: B, children: Seq[B]):Int = { + def findHeadChild(l: B, children: Seq[B]): Int = { // Manual arg-max because I suck at using Counter2 - var best = -1; - var bestCount = 0; - for (i <- 0 until children.size) { + var best = -1 + var bestCount = 0 + children.indices.foreach { i => if (ruleHeadChildCounts((l,children),i) > bestCount) { - best = i; - bestCount = ruleHeadChildCounts((l,children),i); + best = i + bestCount = ruleHeadChildCounts((l,children),i) } } if (best == -1) { // Else, the rule has never been seen before, so try just the symbol+arity - for (i <- 0 until children.size) { + children.indices.foreach { i => if (symbolArityHeadChildCounts((l,children.size),i) > bestCount) { - best = i; - bestCount = ruleHeadChildCounts((l,children),i); + best = i + bestCount = symbolArityHeadChildCounts((l,children.size),i) } } } if (best == -1) { - best = if (defaultToLeft) 0 else children.size - 1; + best = if (defaultToLeft) 0 else children.size - 1 } - best; + best } } object SupervisedHeadFinder { def trainHeadFinderFromFiles(ptbPath: String, conllPath: String): HeadFinder[String] = { - println("Training supervised head finder from PTB trees at " + ptbPath + " and CoNLL trees at " + conllPath); - val treebank = new SimpleTreebank(new File(ptbPath), new File(ptbPath), new File(ptbPath)); - val process = PartialTreeProcessor(); - val processedTrees = treebank.train.trees.toSeq.map(treeWordsPair => process(treeWordsPair._1)); - val conllTrees = readDepTrees(conllPath); + println("Training supervised head finder from PTB trees at " + ptbPath + " and CoNLL trees at " + conllPath) + val treebank = new SimpleTreebank(new File(ptbPath), new File(ptbPath), new File(ptbPath)) + val process = PartialTreeProcessor() + val processedTrees = treebank.train.trees.toSeq.map(treeWordsPair => process(treeWordsPair._1)) + val conllTrees = readDepTrees(conllPath) if (processedTrees.size != conllTrees.size) { throw new RuntimeException("Error in training the supervised head finder: dep and const trees don't match: " + - processedTrees.size + " const but " + conllTrees.size + " dep"); + processedTrees.size + " const but " + conllTrees.size + " dep") } - val symbolArityHeadChildCounts = Counter2[(String,Int),Int,Int](); - val ruleHeadChildCounts = Counter2[(String,Seq[String]),Int,Int](); - - + val symbolArityHeadChildCounts = Counter2[(String,Int),Int,Int]() + val ruleHeadChildCounts = Counter2[(String,Seq[String]),Int,Int]() + def rec(tree: Tree[String], conllTree: Seq[Int]) { if (!tree.isLeaf) { - val label = tree.label; + val label = tree.label // Find the head under this span which has its label outside the span - var headIdx = -1; + var headIdx = -1 for (idx <- tree.span.begin until tree.span.end) { if (conllTree(idx) < tree.span.begin || conllTree(idx) >= tree.span.end) { - headIdx = idx; + headIdx = idx } } if (headIdx != -1) { // Now identify which child contains the head and make that the head child - var childIdx = 0; + var childIdx = 0 while (tree.children(childIdx).span.end <= headIdx) { - childIdx += 1; + childIdx += 1 } - symbolArityHeadChildCounts(label -> tree.children.size, childIdx) += 1; - ruleHeadChildCounts(label ->
tree.children.map(_.label), childIdx) += 1; + symbolArityHeadChildCounts(label -> tree.children.size, childIdx) += 1 + ruleHeadChildCounts(label -> tree.children.map(_.label), childIdx) += 1 } - tree.children.foreach(rec(_, conllTree)); + tree.children.foreach(rec(_, conllTree)) } } - var numMatched = 0; - for (i <- 0 until conllTrees.size) { - val conllTree = conllTrees(i); - val constTree = processedTrees(i); + var numMatched = 0 + (conllTrees zip processedTrees).foreach { case (conllTree, constTree) => if (conllTree.size == constTree.span.length) { - rec(constTree, conllTree); - numMatched += 1; + rec(constTree, conllTree) + numMatched += 1 } } - println("Head finder trained; lengths matched on " + numMatched + " / " + conllTrees.size + " trees"); - new SupervisedHeadFinder[String](SupervisedHeadFinderInnards.fromHeadDB(new HeadDB(symbolArityHeadChildCounts, ruleHeadChildCounts))); -// HeadFinder.collins; + println("Head finder trained; lengths matched on " + numMatched + " / " + conllTrees.size + " trees") + new SupervisedHeadFinder[String](SupervisedHeadFinderInnards.fromHeadDB(new HeadDB(symbolArityHeadChildCounts, ruleHeadChildCounts))) + // HeadFinder.collins } // Reads in a vector of parents, 0-indexed, with the root being -1 def readDepTrees(conllPath: String): Seq[Seq[Int]] = { val in = breeze.io.FileStreams.input(new File(conllPath)) - val br = new BufferedReader(new InputStreamReader(in, "UTF-8")); -// val sents = new ArrayBuffer[Seq[Seq[String]]](); - val trees = new ArrayBuffer[Seq[Int]](); - var currSent = new ArrayBuffer[Seq[String]]; - var i = 0; + val br = new BufferedReader(new InputStreamReader(in, "UTF-8")) + // val sents = new ArrayBuffer[Seq[Seq[String]]]() + val trees = new ArrayBuffer[Seq[Int]]() + var currSent = new ArrayBuffer[Seq[String]] + var i = 0 while (br.ready()) { - val line = br.readLine(); + val line = br.readLine() if (line.trim.isEmpty) { - if (!currSent.isEmpty) { - trees += conllToTree(currSent); + if (currSent.nonEmpty) { + trees += conllToTree(currSent) } - currSent = new ArrayBuffer[Seq[String]]; + currSent = new ArrayBuffer[Seq[String]] } else { - currSent += line.split("\\s+"); + currSent += line.split("\\s+") } - i += 1; + i += 1 } - if (!currSent.isEmpty) { - trees += conllToTree(currSent); + if (currSent.nonEmpty) { + trees += conllToTree(currSent) } - trees; + trees } - def conllToTree(sent: Seq[Seq[String]]) = sent.map(_(6).toInt - 1); + def conllToTree(sent: Seq[Seq[String]]) = sent.map(_(6).toInt - 1) } \ No newline at end of file diff --git a/src/main/scala/epic/trees/TestTreebank.scala b/src/main/scala/epic/trees/TestTreebank.scala index acdce1d8..dca56c42 100644 --- a/src/main/scala/epic/trees/TestTreebank.scala +++ b/src/main/scala/epic/trees/TestTreebank.scala @@ -21,10 +21,9 @@ package epic.trees */ object TstTreebank { val treebank = { - val train = TstTreebank.getClass.getClassLoader.getResource("smallbank/train"); - val test = TstTreebank.getClass.getClassLoader.getResource("smallbank/test"); - val dev = TstTreebank.getClass.getClassLoader.getResource("smallbank/dev"); - - new SimpleTreebank(Map("train"->train),Map("dev"->dev),Map("test"->test)); + val train = TstTreebank.getClass.getClassLoader.getResource("smallbank/train") + val test = TstTreebank.getClass.getClassLoader.getResource("smallbank/test") + val dev = TstTreebank.getClass.getClassLoader.getResource("smallbank/dev") + new SimpleTreebank(Map("train"->train),Map("dev"->dev),Map("test"->test)) } } \ No newline at end of file diff --git 
a/src/main/scala/epic/trees/TraceRemover.scala b/src/main/scala/epic/trees/TraceRemover.scala index a9a36b12..a893dbcd 100644 --- a/src/main/scala/epic/trees/TraceRemover.scala +++ b/src/main/scala/epic/trees/TraceRemover.scala @@ -10,19 +10,17 @@ class TraceRemover[T, W](emptyCategory: T=>Boolean) extends (Tree[T] =>Tree[T]) def rec(tree: Tree[T]):Option[Tree[T]] = { if (emptyCategory(tree.label) || tree.span.begin == tree.span.end) { None - } else if (tree.children.length == 0) { + } else if (tree.children.isEmpty) { Some(tree) } else { val newChildren = tree.children.map(rec).collect{ case Some(t) => t } - if (newChildren.length == 0 && !tree.isLeaf) { + if (newChildren.isEmpty && !tree.isLeaf) { None } else { Some(Tree(tree.label,newChildren, tree.span)) } } } - rec(tree).get } - } diff --git a/src/main/scala/epic/trees/TraceToSlashCategoryConverter.scala b/src/main/scala/epic/trees/TraceToSlashCategoryConverter.scala index 5f713e81..c77bf7bf 100644 --- a/src/main/scala/epic/trees/TraceToSlashCategoryConverter.scala +++ b/src/main/scala/epic/trees/TraceToSlashCategoryConverter.scala @@ -37,18 +37,12 @@ class TraceToSlashCategoryConverter extends (Tree[AnnotatedLabel] =>Tree[Annotat val (newChildren, gapsList) = tree.children.filterNot(_.label.label == "-NONE-").map(recursive(_, resolvedIndices ++ cCommandIndices)).unzip val gaps: IndexedSeq[(String, Int)] = gapsList.flatten.distinct val unresolvedGaps = gaps.filterNot(pair => resolvedIndices(pair._2)) - - val newLabel = label.copy(siblings = label.siblings ++ unresolvedGaps.map(pair => Left(pair._1))) - - -// if(unresolvedGaps.nonEmpty) { -// println(unresolvedGaps, newLabel) -// } - + // if (unresolvedGaps.nonEmpty) { + // println(unresolvedGaps, newLabel) + // } Tree(newLabel, newChildren, span) -> unresolvedGaps } - } val (newTree, gaps) = recursive(tree, Set.empty) @@ -57,7 +51,6 @@ class TraceToSlashCategoryConverter extends (Tree[AnnotatedLabel] =>Tree[Annotat } } - object TraceToSlashCategoryConverter { def main(args: Array[String]):Unit = { diff --git a/src/main/scala/epic/trees/Tree.scala b/src/main/scala/epic/trees/Tree.scala index 55b73b21..05fa2146 100644 --- a/src/main/scala/epic/trees/Tree.scala +++ b/src/main/scala/epic/trees/Tree.scala @@ -16,7 +16,6 @@ package epic.trees limitations under the License. */ - import java.io.StringReader import breeze.util.Lens @@ -34,7 +33,7 @@ trait Tree[+L] extends Serializable { def begin = span.begin def end = span.end - def isLeaf = children.size == 0 + def isLeaf = children.isEmpty /** * A tree is valid if this' span contains all children's spans * and each child abuts the next one. 
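// A minimal sketch of the validity invariant just described (illustrative
// only, not part of the patch; it assumes Tree.fromString, shown below,
// accepts a bare Penn-style bracketing):
//
//   val (tree, words) = Tree.fromString("(S (NP (DT the) (NN dog)) (VP (VBD ran)))")
//   // words == IndexedSeq("the", "dog", "ran")
//   // S spans [0,3), NP spans [0,2), VP spans [2,3): every child span is
//   // contained in its parent's span, and NP.end == VP.begin (they abut).
//   assert(tree.children.forall(c => tree.span contains c.span))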
@@ -48,7 +47,7 @@ trait Tree[+L] extends Serializable { children.last.span.end == this.span.end } - def leaves:Iterable[Tree[L]] = if(isLeaf) { + def leaves: Iterable[Tree[L]] = if (isLeaf) { IndexedSeq(this).view } else { children.map(_.leaves).foldLeft[Stream[Tree[L]]](Stream.empty){_ append _} @@ -58,54 +57,53 @@ trait Tree[+L] extends Serializable { * Useful for stripping the words out of a tree * Returns (tree without leaves, leaves) */ - def cutLeaves: (Tree[L],IndexedSeq[L]) = { - def recCutLeaves(tree: Tree[L]): (Option[Tree[L]],IndexedSeq[L]) = { - if(tree.isLeaf) (None,IndexedSeq(tree.label)) + def cutLeaves: (Tree[L], IndexedSeq[L]) = { + def recCutLeaves(tree: Tree[L]): (Option[Tree[L]], IndexedSeq[L]) = { + if (tree.isLeaf) (None,IndexedSeq(tree.label)) else { val fromChildren = tree.children.map(recCutLeaves _) - Some(Tree(tree.label,fromChildren.flatMap(_._1), tree.span)) -> fromChildren.flatMap(_._2) + Some(Tree(tree.label, fromChildren.flatMap(_._1), tree.span)) -> fromChildren.flatMap(_._2) } } - val (treeOpt,leaves) = recCutLeaves(this) + val (treeOpt, leaves) = recCutLeaves(this) treeOpt.get -> leaves } - - def map[M](f: L=>M):Tree[M] = Tree( f(label), children map { _ map f}, span) - def extend[B](f: Tree[L]=>B):Tree[B] = Tree(f(this),children map { _ extend f}, span) - def relabelRoot[B>:L](f: L=>B):Tree[B] + def map[M](f: L => M): Tree[M] = Tree(f(label), children map (_ map f), span) + def extend[B](f: Tree[L] => B): Tree[B] = Tree(f(this),children map (_ extend f), span) + def relabelRoot[B >: L](f: L => B): Tree[B] def allChildren = preorder def preorder: Iterator[Tree[L]] = { - children.map(_.preorder).foldLeft( Iterator(this)) { _ ++ _ } + children.map(_.preorder).foldLeft( Iterator(this))(_ ++ _) } def postorder: Iterator[Tree[L]] = { - children.map(_.postorder).foldRight(Iterator(this)){_ ++ _} + children.map(_.postorder).foldRight(Iterator(this))(_ ++ _) } - def leftHeight:Int = if(isLeaf) 0 else 1 + children(0).leftHeight + def leftHeight: Int = if (isLeaf) 0 else 1 + children(0).leftHeight import epic.trees.Tree._ override def toString = toString(false) - def toString(newline: Boolean) = recursiveToString(this,0, newline, new StringBuilder).toString + def toString(newline: Boolean) = recursiveToString(this, 0, newline, new StringBuilder).toString - def render[W](words: Seq[W], newline: Boolean = true) = recursiveRender(this,1,words, newline, new StringBuilder).toString + def render[W](words: Seq[W], newline: Boolean = true) = recursiveRender(this, 1, words, newline, new StringBuilder).toString } object Tree { def apply[L](label: L, children: IndexedSeq[Tree[L]], span: Span): NaryTree[L] = NaryTree(label,children, span) - def unapply[L](t: Tree[L]): Option[(L,IndexedSeq[Tree[L]], Span)] = Some((t.label,t.children, t.span)) - def fromString(input: String):(Tree[String],IndexedSeq[String]) = new PennTreeReader(new StringReader(input)).next + def unapply[L](t: Tree[L]): Option[(L, IndexedSeq[Tree[L]], Span)] = Some((t.label, t.children, t.span)) + def fromString(input: String): (Tree[String], IndexedSeq[String]) = new PennTreeReader(new StringReader(input)).next - private def recursiveToString[L](tree: Tree[L], depth: Int, newline: Boolean, sb: StringBuilder):StringBuilder = { + private def recursiveToString[L](tree: Tree[L], depth: Int, newline: Boolean, sb: StringBuilder): StringBuilder = { import tree._ sb append "( " append tree.label append " [" append span.begin append "," append span.end append "] " for( c <- tree.children ) { - if(newline && 
(c.children.nonEmpty)) sb append "\n" append " " * depth + if (newline && c.children.nonEmpty) sb append "\n" append " " * depth else sb.append(' ') recursiveToString(c,depth+1,newline, sb) } @@ -113,49 +111,45 @@ object Tree { sb } - - private def recursiveRender[L,W](tree: Tree[L], depth: Int, words: Seq[W], newline: Boolean, sb: StringBuilder): StringBuilder = { + private def recursiveRender[L, W](tree: Tree[L], depth: Int, words: Seq[W], newline: Boolean, sb: StringBuilder): StringBuilder = { import tree._ sb append "(" append tree.label - if(isLeaf) { + if (isLeaf) { sb append TreebankTokenizer.tokensToTreebankTokens(span.map(words).map(_.toString)).mkString(" "," ","") } else { val anyNonTerminals = children.exists(!_.isLeaf) //sb append "\n" for( c <- children ) { - if(newline && (c.span.length != words.length) && anyNonTerminals) sb append "\n" append " " * depth + if (newline && (c.span.length != words.length) && anyNonTerminals) sb append "\n" append " " * depth else sb.append(' ') recursiveRender(c,depth+1, words, newline, sb) } } sb append ')' - if(sb.length > 1 && sb(sb.length-2) != ')' && sb(sb.length-2) != ' ') + if (sb.length > 1 && sb(sb.length-2) != ')' && sb(sb.length-2) != ' ') sb append ' ' sb } - - - } case class NaryTree[L](label: L, children: IndexedSeq[Tree[L]], span: Span) extends Tree[L] { - def relabelRoot[B >: L](f: (L) => B): Tree[B] = copy(f(label)) + def relabelRoot[B >: L](f: L => B): Tree[B] = copy(f(label)) } sealed trait BinarizedTree[+L] extends Tree[L] { def findSpan(begin: Int, end: Int): Option[Tree[L]] = this match { case t if t.span == Span(begin, end) => Some(t) - case t@BinaryTree(a, b, c, span) if end <= t.splitPoint => if(t.begin <= begin) b.findSpan(begin, end) else None - case t@BinaryTree(a, b, c, span) if t.splitPoint <= begin => if(t.end <= end) c.findSpan(begin, end) else None - case t@UnaryTree(a, b, chain, span) => if(span.contains(Span(begin, end))) b.findSpan(begin, end) else None + case t@BinaryTree(a, b, c, span) if end <= t.splitPoint => if (t.begin <= begin) b.findSpan(begin, end) else None + case t@BinaryTree(a, b, c, span) if t.splitPoint <= begin => if (t.end <= end) c.findSpan(begin, end) else None + case t@UnaryTree(a, b, chain, span) => if (span.contains(Span(begin, end))) b.findSpan(begin, end) else None case _ => None } - override def map[M](f: L=>M): BinarizedTree[M] = null + override def map[M](f: L => M): BinarizedTree[M] = null // have to override to trick scala to refine the type - override def extend[B](f: Tree[L]=>B):BinarizedTree[B] = {sys.error("...")} - def relabelRoot[B>:L](f: L=>B):BinarizedTree[B] + override def extend[B](f: Tree[L] => B): BinarizedTree[B] = { sys.error("...") } + def relabelRoot[B >: L](f: L => B): BinarizedTree[B] } case class BinaryTree[+L](label: L, @@ -164,9 +158,9 @@ case class BinaryTree[+L](label: L, span: Span) extends BinarizedTree[L] { def children = IndexedSeq(leftChild, rightChild) - override def map[M](f: L=>M):BinaryTree[M] = BinaryTree( f(label), leftChild map f, rightChild map f, span) - override def extend[B](f: Tree[L]=>B) = BinaryTree( f(this), leftChild extend f, rightChild extend f, span) - def relabelRoot[B>:L](f: L=>B):BinarizedTree[B] = BinaryTree(f(label), leftChild, rightChild, span) + override def map[M](f: L => M): BinaryTree[M] = BinaryTree(f(label), leftChild map f, rightChild map f, span) + override def extend[B](f: Tree[L] => B) = BinaryTree(f(this), leftChild extend f, rightChild extend f, span) + def relabelRoot[B >: L](f: L => B): BinarizedTree[B] = 
BinaryTree(f(label), leftChild, rightChild, span) def splitPoint = leftChild.span.end override def allChildren: Iterator[BinaryTree[L]] = super.allChildren.asInstanceOf[Iterator[BinaryTree[L]]] @@ -178,25 +172,24 @@ case class BinaryTree[+L](label: L, case class UnaryTree[+L](label: L, child: BinarizedTree[L], chain: IndexedSeq[String], span: Span) extends BinarizedTree[L] { def children = IndexedSeq(child) - override def map[M](f: L=>M): UnaryTree[M] = UnaryTree( f(label), child map f, chain, span) - override def extend[B](f: Tree[L]=>B) = UnaryTree( f(this), child extend f, chain, span) - def relabelRoot[B>:L](f: L=>B):BinarizedTree[B] = UnaryTree(f(label), child, chain, span) + override def map[M](f: L => M): UnaryTree[M] = UnaryTree( f(label), child map f, chain, span) + override def extend[B](f: Tree[L] => B) = UnaryTree( f(this), child extend f, chain, span) + def relabelRoot[B >: L](f: L => B): BinarizedTree[B] = UnaryTree(f(label), child, chain, span) } case class NullaryTree[+L](label: L, span: Span) extends BinarizedTree[L] { def children = IndexedSeq.empty - - override def map[M](f: L=>M): NullaryTree[M] = NullaryTree( f(label), span) - override def extend[B](f: Tree[L]=>B) = NullaryTree( f(this), span) - def relabelRoot[B>:L](f: L=>B):BinarizedTree[B] = NullaryTree(f(label), span) + override def map[M](f: L => M): NullaryTree[M] = NullaryTree( f(label), span) + override def extend[B](f: Tree[L] => B) = NullaryTree( f(this), span) + def relabelRoot[B >: L](f: L => B): BinarizedTree[B] = NullaryTree(f(label), span) } object Trees { def binarize[L](tree: Tree[L], - makeIntermediate: (L, L)=>L, - extendIntermediate: (L, Either[L,L])=>L, - headFinder: HeadFinder[L]):BinarizedTree[L] = tree match { + makeIntermediate: (L, L) => L, + extendIntermediate: (L, Either[L,L]) => L, + headFinder: HeadFinder[L]): BinarizedTree[L] = tree match { case Tree(l, Seq(), span) => NullaryTree(l, span) case Tree(l, Seq(oneChild), span) => UnaryTree(l,binarize(oneChild, makeIntermediate, extendIntermediate, headFinder), IndexedSeq.empty, tree.span) case Tree(l, Seq(leftChild,rightChild), span) => @@ -211,7 +204,7 @@ object Trees { val right = binarized.drop(headChildIndex+1).foldLeft(headChild){ (tree,newArg) => // TODO ugh val intermediate = { - if(tree eq headChild) + if (tree eq headChild) makeIntermediate(l, headChild.label) else tree.label @@ -222,7 +215,7 @@ object Trees { // now fold in left args val fullyBinarized = binarized.take(headChildIndex).foldRight(right){(newArg,tree) => val intermediate = { - if(tree eq headChild) + if (tree eq headChild) makeIntermediate(l, headChild.label) else tree.label @@ -234,13 +227,13 @@ object Trees { fullyBinarized.relabelRoot(_ => l) } - def binarize(tree: Tree[String], headFinder: HeadFinder[String] = HeadFinder.collins):BinarizedTree[String] = { + def binarize(tree: Tree[String], headFinder: HeadFinder[String] = HeadFinder.collins): BinarizedTree[String] = { def stringBinarizer(currentLabel: String, headTag: String) = { - if(currentLabel.startsWith("@")) currentLabel + if (currentLabel.startsWith("@")) currentLabel else s"@$currentLabel[$headTag]" } - def extendIntermediate(currentLabel: String, sib:Either[String, String]) = { + def extendIntermediate(currentLabel: String, sib: Either[String, String]) = { sib match { case Left(s) => s"$currentLabel<$s" @@ -253,9 +246,8 @@ object Trees { binarize[String](tree, stringBinarizer, extendIntermediate, headFinder) } - - def deannotate(tree: Tree[String]):Tree[String] = tree.map(deannotateLabel _) - def 
deannotate(tree: BinarizedTree[String]):BinarizedTree[String] = tree.map(deannotateLabel _) + def deannotate(tree: Tree[String]): Tree[String] = tree.map(deannotateLabel _) + def deannotate(tree: BinarizedTree[String]): BinarizedTree[String] = tree.map(deannotateLabel _) def deannotateLabel(l: String) = l.takeWhile(c => c != '^' && c != '>') /** @@ -269,9 +261,9 @@ object Trees { */ def addHorizontalMarkovization[T](tree: BinarizedTree[T], order: Int, - join: (T,IndexedSeq[Either[T,T]])=>T, - isIntermediate: T=>Boolean):BinarizedTree[T] = { - def rec(tree: BinarizedTree[T]):(BinarizedTree[T], IndexedSeq[Either[T,T]]) = { + join: (T, IndexedSeq[Either[T, T]]) => T, + isIntermediate: T => Boolean): BinarizedTree[T] = { + def rec(tree: BinarizedTree[T]): (BinarizedTree[T], IndexedSeq[Either[T, T]]) = { tree match { case BinaryTree(label, t1, t2, span) if isIntermediate(t1.label) => val (newt1, newhist) = rec(t1) @@ -288,13 +280,13 @@ object Trees { case BinaryTree(label, t1, t2, span) => val (newt1, _) = rec(t1) val (newt2, _) = rec(t2) - val newHistory = if(isIntermediate(label) && order > 0) IndexedSeq(Right(t2.label)) else IndexedSeq.empty - val newLabel = if(isIntermediate(label)) join(label, newHistory) else label + val newHistory = if (isIntermediate(label) && order > 0) IndexedSeq(Right(t2.label)) else IndexedSeq.empty + val newLabel = if (isIntermediate(label)) join(label, newHistory) else label BinaryTree(newLabel, newt1, newt2, tree.span) -> newHistory case UnaryTree(label, child, chain, span) => val (newt1, hist) = rec(child) - val newHistory = if(isIntermediate(label)) hist else IndexedSeq.empty - val newLabel = if(isIntermediate(label)) join(label, newHistory) else label + val newHistory = if (isIntermediate(label)) hist else IndexedSeq.empty + val newLabel = if (isIntermediate(label)) join(label, newHistory) else label UnaryTree(newLabel, newt1, chain, tree.span) -> newHistory case tree@NullaryTree(_, span) => tree -> IndexedSeq.empty } @@ -304,18 +296,17 @@ object Trees { rec(tree)._1 } - def addHorizontalMarkovization(tree: BinarizedTree[String], order: Int):BinarizedTree[String] = { - def join(t: String, chain: IndexedSeq[Either[String,String]]) = chain.map{ case Left(l) => "\\" + l case Right(r) => "/" + r}.mkString(t +">","_","") - addHorizontalMarkovization(tree,order,join,(_:String).startsWith("@")) + def addHorizontalMarkovization(tree: BinarizedTree[String], order: Int): BinarizedTree[String] = { + def join(t: String, chain: IndexedSeq[Either[String, String]]) = chain.map{ case Left(l) => "\\" + l case Right(r) => "/" + r}.mkString(t +">","_","") + addHorizontalMarkovization(tree,order,join,(_: String).startsWith("@")) } - - def debinarize[L](tree: Tree[L], isBinarized: L=>Boolean):Tree[L] = { + def debinarize[L](tree: Tree[L], isBinarized: L => Boolean): Tree[L] = { val l = tree.label val children = tree.children val buf = new ArrayBuffer[Tree[L]] for(c <- children) { - if(isBinarized(c.label)) { + if (isBinarized(c.label)) { buf ++= debinarize(c,isBinarized).children } else { buf += debinarize(c,isBinarized) @@ -324,17 +315,17 @@ object Trees { Tree(l,buf, tree.span) } - def debinarize(tree: Tree[String]):Tree[String] = debinarize(tree, (x:String) => x.startsWith("@")) + def debinarize(tree: Tree[String]): Tree[String] = debinarize(tree, (x: String) => x.startsWith("@")) - def annotateParents[L](tree: Tree[L], join: (L,L)=>L, depth: Int, history: List[L] = List.empty):Tree[L] = { - if(depth == 0) tree + def annotateParents[L](tree: Tree[L], join: (L, L) => L, depth: 
Int, history: List[L] = List.empty):Tree[L] = { + if (depth == 0) tree else { val newLabel = (tree.label :: history).iterator.take(depth).reduceLeft(join) Tree(newLabel,tree.children.map(c => annotateParents[L](c,join,depth,tree.label :: history.take(depth-1 max 0))), tree.span) } } - def annotateParents(tree: Tree[String], depth: Int):Tree[String] = annotateParents(tree,{(x:String,b:String)=>x + '^' + b},depth) + def annotateParents(tree: Tree[String], depth: Int):Tree[String] = annotateParents(tree, { (x: String, b: String) => x + '^' + b },depth) /** * Adds parent-markovization to an already binarized tree. Also handles the unary layering we do by ignoring @@ -346,13 +337,13 @@ object Trees { * @tparam L type of the tree * @return */ - def annotateParentsBinarized[L](tree: BinarizedTree[L], join: (L,Seq[L])=>L, isIntermediate: L=>Boolean, dontAnnotate: Tree[L]=>Boolean, depth: Int):BinarizedTree[L] = { + def annotateParentsBinarized[L](tree: BinarizedTree[L], join: (L, Seq[L]) => L, isIntermediate: L => Boolean, dontAnnotate: Tree[L] => Boolean, depth: Int): BinarizedTree[L] = { val ot = tree - def rec(tree: BinarizedTree[L], history: List[L] = List.empty):BinarizedTree[L] = { + def rec(tree: BinarizedTree[L], history: List[L] = List.empty): BinarizedTree[L] = { import tree._ - val newLabel = if(dontAnnotate(tree)) { + val newLabel = if (dontAnnotate(tree)) { label - } else if(isIntermediate(label)) { + } else if (isIntermediate(label)) { assert(history.length > 1, label + " " + history + "\n\n\n" + tree + "\n\n\n" + ot) join(label, history drop 1 take depth) } else { @@ -362,14 +353,14 @@ object Trees { tree match { //invariant: history is the (depth) non-intermediate symbols, where we remove unary-identity transitions case BinaryTree(label, t1, t2, span) => - val newHistory = if(!isIntermediate(label)) (label :: history) else history + val newHistory = if (!isIntermediate(label)) label :: history else history val lchild = rec(t1,newHistory) val rchild = rec(t2,newHistory) BinaryTree(newLabel, lchild, rchild, span) case u@UnaryTree(label, child, chain, span) => - if(isIntermediate(label)) assert(history.nonEmpty, ot.toString(true) + "\n" + u.toString(true) ) - //if(isIntermediate(label)) assert(label != newLabel, label + " " + newLabel + " " + u + " " + history) - val newHistory = if(!isIntermediate(label) && label != child.label) (label :: history) else history + if (isIntermediate(label)) assert(history.nonEmpty, ot.toString(true) + "\n" + u.toString(true) ) + //if (isIntermediate(label)) assert(label != newLabel, label + " " + newLabel + " " + u + " " + history) + val newHistory = if (!isIntermediate(label) && label != child.label) label :: history else history UnaryTree(newLabel,rec(child,newHistory), chain, span) case NullaryTree(label, span) => NullaryTree(newLabel, span) @@ -379,28 +370,28 @@ object Trees { } - def annotateParentsBinarized(tree: BinarizedTree[String], depth: Int):BinarizedTree[String] = { - annotateParentsBinarized(tree,{(x:String,b:Seq[String])=>b.foldLeft(x)(_ + '^' + _)},(_:String).startsWith("@"), {(l: Tree[String]) => l.label.nonEmpty && l != "$" && !l.label.head.isLetterOrDigit && l.label != "."}, depth) + def annotateParentsBinarized(tree: BinarizedTree[String], depth: Int): BinarizedTree[String] = { + annotateParentsBinarized(tree, { (x: String, b: Seq[String]) => b.foldLeft(x)(_ + '^' + _) },(_: String).startsWith("@"), { (l: Tree[String]) => l.label.nonEmpty && l != "$" && !l.label.head.isLetterOrDigit && l.label != "." 
}, depth) } object Transforms { @SerialVersionUID(1L) - class EmptyNodeStripper[T](implicit lens: Lens[T,String]) extends (Tree[T]=>Option[Tree[T]]) with Serializable { - def apply(tree: Tree[T]):Option[Tree[T]] = { - if(lens.get(tree.label) == "-NONE-") None - else if(tree.span.begin == tree.span.end) None // screw stupid spans + class EmptyNodeStripper[T](implicit lens: Lens[T, String]) extends (Tree[T] => Option[Tree[T]]) with Serializable { + def apply(tree: Tree[T]): Option[Tree[T]] = { + if (lens.get(tree.label) == "-NONE-") None + else if (tree.span.begin == tree.span.end) None // screw stupid spans else { val newC = tree.children map this filter (None!=) - if(newC.length == 0 && !tree.isLeaf) None + if (newC.isEmpty && !tree.isLeaf) None else Some(Tree(tree.label,newC map (_.get), tree.span)) } } } - class XOverXRemover[L] extends (Tree[L]=>Tree[L]) { - def apply(tree: Tree[L]):Tree[L] = { - if(tree.children.size == 1 && tree.label == tree.children(0).label) { + class XOverXRemover[L] extends (Tree[L] => Tree[L]) { + def apply(tree: Tree[L]): Tree[L] = { + if (tree.children.size == 1 && tree.label == tree.children(0).label) { this(tree.children(0)) } else { Tree(tree.label,tree.children.map(this), tree.span) @@ -408,51 +399,49 @@ object Trees { } } - class FunctionNodeStripper[T](implicit lens: Lens[T,String]) extends (Tree[T]=>Tree[T]) { + class FunctionNodeStripper[T](implicit lens: Lens[T, String]) extends (Tree[T] => Tree[T]) { def apply(tree: Tree[T]): Tree[T] = { - tree.map{ label => + tree.map { label => lens.get(label) match { case "-RCB-" | "-RRB-" | "-LRB-" | "-LCB-" => label case "PRT|ADVP" => lens.set(label, "PRT") case x => - if(x.startsWith("--")) lens.set(label,x.replaceAll("---.*","--")) + if (x.startsWith("--")) lens.set(label,x.replaceAll("---.*","--")) else lens.set(label,x.replaceAll("[-|=].*","")) } } } } - class StripLabels[L](labels: String*)(implicit lens: Lens[L,String]) extends (Tree[L]=>Tree[L]) { + class StripLabels[L](labels: String*)(implicit lens: Lens[L, String]) extends (Tree[L] => Tree[L]) { private val badLabels = labels.toSet - def apply(tree: Tree[L]):Tree[L] = { + def apply(tree: Tree[L]): Tree[L] = { def rec(t: Tree[L]): IndexedSeq[Tree[L]] = { - if (badLabels(lens.get (t.label) ) ) { + if (badLabels(lens.get (t.label))) { t.children.flatMap(rec) } else { IndexedSeq(Tree (t.label, t.children.flatMap(rec), t.span)) } } - rec(tree).head } } - object StandardStringTransform extends (Tree[String]=>Tree[String]) { + object StandardStringTransform extends (Tree[String] => Tree[String]) { private val ens = new EmptyNodeStripper[String] private val xox = new XOverXRemover[String] -// private val fns = new FunctionNodeStripper[String] + // private val fns = new FunctionNodeStripper[String] def apply(tree: Tree[String]): Tree[String] = { xox(ens(tree).get) } } - class LensedStandardTransform[T](implicit lens: Lens[T,String]) extends (Tree[T]=>Tree[T]) { + class LensedStandardTransform[T](implicit lens: Lens[T, String]) extends (Tree[T] => Tree[T]) { private val ens = new EmptyNodeStripper[T] private val xox = new XOverXRemover[T] private val fns = new FunctionNodeStripper[T] - def apply(tree: Tree[T]) = { - xox(fns(ens(tree).get)) map ( l => lens.set(l,lens.get(l))) + xox(fns(ens(tree).get)) map(l => lens.set(l, lens.get(l))) } } @@ -489,7 +478,6 @@ object Trees { def begin = tree.begin def end = tree.end - /* * assertion * @@ -500,7 +488,6 @@ object Trees { case _ => } - def up: Option[Zipper[L]] = location match { case Root => None case LeftChild(pl, 
p, rightSibling) => @@ -529,9 +516,9 @@ object Trees { * @return */ def down: Option[Zipper[L]] = tree match { - case BinaryTree(l,lc,rc,span) => Some(Zipper(lc, LeftChild(tree.label, location, rc))) - case NullaryTree(_,_) => None - case UnaryTree(parent,child,chain,span) => + case BinaryTree(l, lc, rc, span) => Some(Zipper(lc, LeftChild(tree.label, location, rc))) + case NullaryTree(_, _) => None + case UnaryTree(parent, child, chain, span) => Some(Zipper(child, UnaryChild(tree.label, chain, location))) case _ => sys.error("Shouldn't be here!") } @@ -541,7 +528,7 @@ object Trees { * @return */ def downRight: Option[Zipper[L]] = tree match { - case BinaryTree(l,lc,rc,span) => Some(Zipper(rc, RightChild(tree.label, location, lc))) + case BinaryTree(l, lc, rc, span) => Some(Zipper(rc, RightChild(tree.label, location, lc))) case _ => None } @@ -554,8 +541,8 @@ object Trees { case NullaryTree(l, span) => // go up until we find a LeftChild, then go to its right child. // if we hit the root (that is, we only go up right children), there is no next. - var cur:Option[Zipper[L]] = Some(this) - while(true) { + var cur: Option[Zipper[L]] = Some(this) + while (true) { cur match { case None => return None case Some(loc@Zipper(_, LeftChild(_, _, _))) => @@ -580,5 +567,4 @@ object Trees { case class UnaryChild[+L](parentLabel: L, chain: IndexedSeq[String], parent: Location[L]) extends NotRoot[L] } - } diff --git a/src/main/scala/epic/trees/Treebank.scala b/src/main/scala/epic/trees/Treebank.scala index 057dfac8..49e889e7 100644 --- a/src/main/scala/epic/trees/Treebank.scala +++ b/src/main/scala/epic/trees/Treebank.scala @@ -16,7 +16,6 @@ package epic.trees limitations under the License. */ - import java.io._ import epic.ontonotes.ConllOntoReader @@ -111,7 +110,7 @@ object Treebank { * of the parsed Treebank. 
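 * For example, under the standard WSJ split encoded below, the numbered
 * section directories 02-21 form the training portion.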
diff --git a/src/main/scala/epic/trees/Treebank.scala b/src/main/scala/epic/trees/Treebank.scala
index 057dfac8..49e889e7 100644
--- a/src/main/scala/epic/trees/Treebank.scala
+++ b/src/main/scala/epic/trees/Treebank.scala
@@ -16,7 +16,6 @@ package epic.trees
  limitations under the License.
 */
-
 import java.io._
 import epic.ontonotes.ConllOntoReader
@@ -111,7 +110,7 @@ object Treebank {
    * of the parsed Treebank.
    */
   def fromPennTreebankDir(dir: File):Treebank[String] = new Treebank[String] {
-    if(!dir.exists) throw new FileNotFoundException(dir.toString)
+    if (!dir.exists) throw new FileNotFoundException(dir.toString)
     def sections = dir.listFiles.filter(_.isDirectory).map(_.getName)
     val train = Portion("train", IndexedSeq.range(2,10).map("0" + _) ++ IndexedSeq.range(10,22).map(""+_))
@@ -135,9 +134,9 @@ object Treebank {
   def fromChineseTreebankDir(dir: File):Treebank[String] = new Treebank[String] {
     def sections = dir.listFiles.map(_.getName)

-    private def id_to_name(id: Int) = s"chtb_${if(id < 100) "0" + id else id}.mrg"
+    private def id_to_name(id: Int) = s"chtb_${if (id < 100) "0" + id else id}.mrg"

-    val train = Portion("train",{(1 to 270) ++ (400 to 1151)} map(id_to_name))
+    val train = Portion("train", {(1 to 270) ++ (400 to 1151)} map id_to_name)

     val test = Portion("test", 271 to 300 map id_to_name)
     val dev = Portion("dev",301 to 325 map id_to_name)
diff --git a/src/main/scala/epic/trees/UnaryChainCollapser.scala b/src/main/scala/epic/trees/UnaryChainCollapser.scala
index 7282dda2..f5034748 100644
--- a/src/main/scala/epic/trees/UnaryChainCollapser.scala
+++ b/src/main/scala/epic/trees/UnaryChainCollapser.scala
@@ -30,12 +30,12 @@ object UnaryChainCollapser {
     def transform(t: BinarizedTree[AnnotatedLabel],parentWasUnary:Boolean):BinarizedTree[AnnotatedLabel] = t match {
       case UnaryTree(l,c, _chain, span) =>
         val (chain,cn) = stripChain(c)
-        UnaryTree(l,transform(cn,true), if(keepChains) _chain ++ chain.toIndexedSeq else IndexedSeq.empty, t.span)
+        UnaryTree(l,transform(cn,true), if (keepChains) _chain ++ chain.toIndexedSeq else IndexedSeq.empty, t.span)
       case BinaryTree(l,lchild,rchild, span) =>
-        if(parentWasUnary) BinaryTree(l,transform(lchild,false),transform(rchild,false), t.span)
+        if (parentWasUnary) BinaryTree(l,transform(lchild,false),transform(rchild,false), t.span)
         else UnaryTree(l,BinaryTree(l,transform(lchild,false),transform(rchild,false), t.span), IndexedSeq.empty, t.span)
       case NullaryTree(l, span) =>
-        if(parentWasUnary) NullaryTree(l, t.span)
+        if (parentWasUnary) NullaryTree(l, t.span)
         else UnaryTree(l,NullaryTree(l, t.span), IndexedSeq.empty, t.span)
       case t => t
     }
diff --git a/src/main/scala/epic/trees/annotations/KMAnnotator.scala b/src/main/scala/epic/trees/annotations/KMAnnotator.scala
index 2a002d4e..56b76f38 100644
--- a/src/main/scala/epic/trees/annotations/KMAnnotator.scala
+++ b/src/main/scala/epic/trees/annotations/KMAnnotator.scala
@@ -21,7 +21,8 @@ package annotations
  */
 case class KMAnnotator( horizontal: Int = 2, vertical: Int = 2) extends TreeAnnotator[AnnotatedLabel, String, AnnotatedLabel] {
-  val pipeline = (
+
+  val pipeline =
     FilterAnnotations[String](Set(FunctionalTag("TMP"))) andThen
     Markovize[String](horizontal,vertical) andThen
     SplitAuxiliary() andThen
@@ -33,10 +34,7 @@ case class KMAnnotator( horizontal: Int = 2, vertical: Int = 2) extends TreeAnno
     MarkNonIdentityUnaries[String]() andThen
     MarkExternalUnaries[String]() andThen
     DominatesV[String]()
-  )

   def apply(tree: BinarizedTree[AnnotatedLabel], words: Seq[String]) = pipeline(tree, words)
-
 }
diff --git a/src/main/scala/epic/trees/annotations/TreeAnnotations.scala b/src/main/scala/epic/trees/annotations/TreeAnnotations.scala
index 7ca6058c..da5acd08 100644
--- a/src/main/scala/epic/trees/annotations/TreeAnnotations.scala
+++ b/src/main/scala/epic/trees/annotations/TreeAnnotations.scala
@@ -22,7 +22,6 @@ object TreeAnnotations {
   case class HeadTagAnnotation(tag: String) extends Annotation
-
   // KM Annotations
   trait KMAnnotation extends Annotation
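// Reviewer sketch, not part of the patch: TreeAnnotators compose with andThen, as
// KMAnnotator's pipeline above shows. A minimal two-stage pipeline built only from
// annotators that appear in this diff (the object name is hypothetical):
import epic.trees.{AnnotatedLabel, BinarizedTree}
import epic.trees.annotations.{DominatesV, SplitAuxiliary}

object MiniPipelineSketch {
  // first mark be/have auxiliaries, then mark nodes that dominate a verb
  val pipeline = SplitAuxiliary() andThen DominatesV[String]()
  def annotate(tree: BinarizedTree[AnnotatedLabel], words: Seq[String]) =
    pipeline(tree, words)
}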
diff --git a/src/main/scala/epic/trees/annotations/TreeAnnotator.scala b/src/main/scala/epic/trees/annotations/TreeAnnotator.scala
index cb0e265e..ad5bc968 100644
--- a/src/main/scala/epic/trees/annotations/TreeAnnotator.scala
+++ b/src/main/scala/epic/trees/annotations/TreeAnnotator.scala
@@ -87,8 +87,6 @@ case class StripAnnotations[W]() extends TreeAnnotator[AnnotatedLabel, W, Annota
   }
 }
-
-
 /**
  * Removes all features from the [[epic.trees.AnnotatedLabel]]
  * @tparam W
@@ -107,10 +105,9 @@ case class Markovize[W](horizontal: Int=0, vertical: Int=2) extends TreeAnnotato
   }
 }
-
 case class ParentAnnotate[W](order: Int = 0, skipPunctTags: Boolean = true) extends TreeAnnotator[AnnotatedLabel, W, AnnotatedLabel] {
   def apply(tree: BinarizedTree[AnnotatedLabel], words: Seq[W]) = {
-    if(order == 0) {
+    if (order == 0) {
       tree
     } else {
       def join(base: AnnotatedLabel, parent: Seq[AnnotatedLabel]) = {
@@ -122,15 +119,13 @@ case class ParentAnnotate[W](order: Int = 0, skipPunctTags: Boolean = true) ext
         case ex: AssertionError =>
           throw new RuntimeException(s"While handling $words", ex)
       }
-
     }
   }
-
 }

 case class ParentAnnotatePosTags[W](order: Int = 1, skipPunctTags: Boolean = true) extends TreeAnnotator[AnnotatedLabel, W, AnnotatedLabel] {
   def apply(tree: BinarizedTree[AnnotatedLabel], words: Seq[W]) = {
-    if(order == 0) {
+    if (order == 0) {
       tree
     } else {
       def join(base: AnnotatedLabel, parent: Seq[AnnotatedLabel]) = {
@@ -139,20 +134,17 @@ case class ParentAnnotatePosTags[W](order: Int = 1, skipPunctTags: Boolean = tr
       Trees.annotateParentsBinarized(tree, join, {(_:AnnotatedLabel).isIntermediate}, {(l:Tree[AnnotatedLabel])=> !(l.isLeaf || l.children.length == 1 && l.children.head.label.label == l.label.label && l.span.length == 1) || l.label.label.isEmpty || (l.label.label.head != '@' && !l.label.label.head.isLetterOrDigit)}, order)
     }
   }
-
 }

 case class ForgetHeadTag[W]() extends TreeAnnotator[AnnotatedLabel, W, AnnotatedLabel] {
   def apply(tree: BinarizedTree[AnnotatedLabel], words: Seq[W]) = {
     tree.map(_.copy(headTag=None))
   }
-
 }
-
 case class MarkovizeSiblings[W](order: Int=0) extends TreeAnnotator[AnnotatedLabel, W, AnnotatedLabel] {
   def apply(tree: BinarizedTree[AnnotatedLabel], words: Seq[W]) = {
-    if(order == 0) tree.map {l => l.copy(siblings = IndexedSeq.empty)}
+    if (order == 0) tree.map {l => l.copy(siblings = IndexedSeq.empty)}
     else tree.map { l =>
       l.copy(siblings = l.siblings.takeRight(order))
     }
@@ -169,7 +161,6 @@ case class MarkovizeSiblings[W](order: Int=0) extends TreeAnnotator[AnnotatedLab
     }
   }
   */
-
 }
 /**
  * Marks verb tags based on the auxiliary
@@ -179,22 +170,19 @@ case class SplitAuxiliary() extends TreeAnnotator[AnnotatedLabel, String, Annota
   val hasVerbs = Set("has", "have", "had")

   def apply(tree: BinarizedTree[AnnotatedLabel], words: Seq[String]) = {
-    tree.extend { t =>
-      t match {
-        case UnaryTree(label, NullaryTree(lbl2, _), chain, span) if label.baseLabel == lbl2.baseLabel =>
-          val w = words(span.begin)
-          if (beVerbs.contains(w.toLowerCase)) label.annotate(AuxBe).annotate(Aux)
-          else if (hasVerbs.contains(w.toLowerCase)) label.annotate(AuxHave).annotate(Aux)
-          else label
-        case NullaryTree(label, span) =>
-          val w = words(span.begin)
-          if (beVerbs.contains(w.toLowerCase)) label.annotate(AuxBe).annotate(Aux)
-          else if (hasVerbs.contains(w.toLowerCase)) label.annotate(AuxHave).annotate(Aux)
-          else label
-        case _ => t.label
-      }
+    tree.extend {
+      case UnaryTree(label, NullaryTree(lbl2, _), chain, span) if label.baseLabel == lbl2.baseLabel =>
+        val w = words(span.begin)
+        if (beVerbs.contains(w.toLowerCase)) label.annotate(AuxBe).annotate(Aux)
+        else if (hasVerbs.contains(w.toLowerCase)) label.annotate(AuxHave).annotate(Aux)
+        else label
+      case NullaryTree(label, span) =>
+        val w = words(span.begin)
+        if (beVerbs.contains(w.toLowerCase)) label.annotate(AuxBe).annotate(Aux)
+        else if (hasVerbs.contains(w.toLowerCase)) label.annotate(AuxHave).annotate(Aux)
+        else label
+      case t => t.label
     }
-
   }
 }
@@ -213,7 +201,7 @@ case class SplitPunct() extends TreeAnnotator[AnnotatedLabel, String, AnnotatedL
           val w = words(span.begin)
           if (w.forall(!_.isLetterOrDigit) && label.baseLabel != w)
             label.annotate(Punct(w))
-          else if(w.matches("-[LR].B-") && label.baseLabel != w) label.annotate(Punct(w))
+          else if (w.matches("-[LR].B-") && label.baseLabel != w) label.annotate(Punct(w))
           else label
         case NullaryTree(label, span) =>
           val w = words(span.begin)
@@ -232,7 +220,7 @@ case class SplitPunct() extends TreeAnnotator[AnnotatedLabel, String, AnnotatedL
 case class SplitVP() extends TreeAnnotator[AnnotatedLabel, String, AnnotatedLabel] {
   val finiteVerbs = Set("VBZ", "VBD", "VBP", "MD")
   def apply(tree: BinarizedTree[AnnotatedLabel], words: Seq[String]) = tree.extend { t =>
-    if(t.label.baseLabel != "VP") {
+    if (t.label.baseLabel != "VP") {
       t.label
     } else {
       val headTag = HeadFinder.collins.lensed[AnnotatedLabel].findHeadTag(t)
@@ -247,7 +235,6 @@ case class SplitVP() extends TreeAnnotator[AnnotatedLabe
 }

-
 case class SplitIN[W]() extends TreeAnnotator[AnnotatedLabel, W, AnnotatedLabel] {
   def apply(tree: BinarizedTree[AnnotatedLabel], words: Seq[W]) = {
     def rec(tree: BinarizedTree[AnnotatedLabel], root: String,
@@ -256,14 +243,14 @@ case class SplitIN[W]() extends TreeAnnotator[AnnotatedLabel, W, AnnotatedLabel]
       val blbl = tree.label.baseLabel
       tree match {
         case tree@NullaryTree(lbl, span) if blbl == "IN" =>
-          if(grandParent.isEmpty || grandParent.exists(_ == root) || parent.exists(_ == root)) {
+          if (grandParent.isEmpty || grandParent.contains(root) || parent.contains(root)) {
             tree
-          } else if (grandParent.exists(_(0) == 'N') && (parent.exists(s => s(0) == 'P' || s(0) == 'A'))) {
+          } else if (grandParent.exists(_(0) == 'N') && parent.exists(s => s(0) == 'P' || s(0) == 'A')) {
             tree.copy(lbl.annotate(IN_N), span)
-          } else if (parent.exists(_(0) == 'Q') && (grandParent.exists(s => s(0) == 'N' || s.startsWith("ADJP")))) {
+          } else if (parent.exists(_(0) == 'Q') && grandParent.exists(s => s(0) == 'N' || s.startsWith("ADJP"))) {
             tree.copy(lbl.annotate(IN_Q), span)
-          } else if(grandParent.exists(_ == "S")) {
-            if(parent.exists(_ == "SBAR")) {
+          } else if (grandParent.contains("S")) {
+            if (parent.contains("SBAR")) {
               tree.copy(lbl.annotate(IN_SCC), span)
             } else {
               tree.copy(lbl.annotate(IN_SC), span)
@@ -272,8 +259,8 @@ case class SplitIN[W]() extends TreeAnnotator[AnnotatedLabel, W, AnnotatedLabel]
             tree
           }
         case u @ UnaryTree(lbl, c, chain, span) =>
-          if(blbl != "IN") {
-            if(parent.exists(_ != blbl))
+          if (blbl != "IN") {
+            if (parent.exists(_ != blbl))
               u.copy(lbl, rec(c, root, Some(blbl), parent))
             else
               u.copy(lbl, rec(c, root, parent, grandParent))
@@ -293,7 +280,7 @@ case class SplitIN[W]() extends TreeAnnotator[AnnotatedLabel, W, AnnotatedLabel]
 case class SplitPossNP[W]() extends TreeAnnotator[AnnotatedLabel, W, AnnotatedLabel] {
   def apply(tree: BinarizedTree[AnnotatedLabel], words: Seq[W]) = tree.extend { t =>
-    if(t.label.baseLabel != "NP") t.label
+    if (t.label.baseLabel != "NP") t.label
     else {
       val headTag = HeadFinder.collins.lensed[AnnotatedLabel].findHeadTag(t)
       if (headTag.baseLabel == "POS") {
@@ -320,7 +307,7 @@ case class AnnotateBaseNP[W]() extends TreeAnnotator[AnnotatedLabel, W, Annotate
         t -> true
       case t@UnaryTree(lbl1, child, chain, span) =>
         val (newchild, ok) = rec(child)
-        if(lbl1.baseLabel == "NP" && (ok || newchild.label.hasAnnotation(BaseNP))) {
+        if (lbl1.baseLabel == "NP" && (ok || newchild.label.hasAnnotation(BaseNP))) {
           UnaryTree(lbl1.annotate(BaseNP), newchild, chain, span) -> lbl1.isIntermediate
         } else {
           UnaryTree(lbl1, newchild, chain, span) -> false
@@ -328,12 +315,11 @@ case class AnnotateBaseNP[W]() extends TreeAnnotator[AnnotatedLabel, W, Annotate
       case t@BinaryTree(lbl, lc, rc, span) =>
         val (newlc, lok) = rec(lc)
         val (newrc, rok) = rec(rc)
-        if(lok && rok && lbl.baseLabel == "NP") {
+        if (lok && rok && lbl.baseLabel == "NP") {
           BinaryTree(lbl.annotate(BaseNP), newlc, newrc, span) -> lbl.isIntermediate
         } else {
           BinaryTree(lbl, newlc, newrc, span) -> false
         }
-
     }
     rec(tree)._1
@@ -354,7 +340,7 @@ case class AnnotateRightRecNP[W]() extends TreeAnnotator[AnnotatedLabel, W, Anno
     def rec(tree: BinarizedTree[AnnotatedLabel]):BinarizedTree[AnnotatedLabel] = tree match {
       case t@UnaryTree(lbl1, child, chain, span) =>
         val newchild = rec(child)
-        if(lbl1.baseLabel == "NP" && newchild.label.hasAnnotation(RightRecNP)) {
+        if (lbl1.baseLabel == "NP" && newchild.label.hasAnnotation(RightRecNP)) {
           UnaryTree(lbl1.annotate(RightRecNP), newchild, chain, span)
         } else {
           UnaryTree(lbl1, newchild, chain, span)
@@ -363,7 +349,7 @@ case class AnnotateRightRecNP[W]() extends TreeAnnotator[AnnotatedLabel, W, Anno
         val newrc = rec(rc)
         val isRightRec = lbl.baseLabel == "NP" && (newrc.label.label == "NP" || (newrc.label.label == "@NP" && newrc.label.hasAnnotation(RightRecNP)))
         val newlc = rec(lc)
-        if(isRightRec) {
+        if (isRightRec) {
           val lclc = annotateDownwards(newlc)
           BinaryTree(lbl.annotate(RightRecNP), lclc, newrc, span)
         } else {
@@ -379,7 +365,7 @@ case class AnnotateRightRecNP[W]() extends TreeAnnotator[AnnotatedLabel, W, Anno
       case UnaryTree(lbl, child, chain, span) if lbl.label == "@NP" =>
         UnaryTree(lbl.annotate(RightRecNP), annotateDownwards(child), chain, span)
       case BinaryTree(lbl, lc, rc, span) if lbl.label == "@NP" =>
-        BinaryTree(lbl.annotate(RightRecNP), if(lc.label.isIntermediate) annotateDownwards(lc) else lc, if(rc.label.isIntermediate) annotateDownwards(rc) else rc, span)
+        BinaryTree(lbl.annotate(RightRecNP), if (lc.label.isIntermediate) annotateDownwards(lc) else lc, if (rc.label.isIntermediate) annotateDownwards(rc) else rc, span)
       case _ => tree
     }
     rec(tree)
@@ -397,9 +383,9 @@ case class AnnotateDomCC[W]() extends TreeAnnotator[AnnotatedLabel, W, Annotated
     def rec(tree: BinarizedTree[AnnotatedLabel]):BinarizedTree[AnnotatedLabel] = tree match {
       case t@UnaryTree(lbl1, child, chain, span) =>
         val newchild = rec(child)
-        if(newchild.label.hasAnnotation(DomCCLeft)) {
+        if (newchild.label.hasAnnotation(DomCCLeft)) {
           UnaryTree(lbl1.annotate(DomCCLeft), newchild, chain, span)
-        } else if(newchild.label.hasAnnotation(DomCCRight)) {
+        } else if (newchild.label.hasAnnotation(DomCCRight)) {
           UnaryTree(lbl1.annotate(DomCCRight), newchild, chain, span)
         } else {
           UnaryTree(lbl1, newchild, chain, span)
@@ -409,7 +395,7 @@ case class AnnotateDomCC[W]() extends TreeAnnotator[AnnotatedLabel, W, Annotated
         val newlc = rec(lc)
         val domsCCR = newrc.label.label == "CC" || (newrc.label.isIntermediate && newrc.label.hasAnnotation(DomCCRight))
         val domsCCL = newlc.label.label == "CC" || (newlc.label.isIntermediate && newlc.label.hasAnnotation(DomCCLeft))
-        val sym = if(domsCCL) lbl.annotate(DomCCLeft) else if(domsCCR) lbl.annotate(DomCCRight) else lbl
+        val sym = if (domsCCL) lbl.annotate(DomCCLeft) else if (domsCCR) lbl.annotate(DomCCRight) else lbl
         BinaryTree(sym, newlc, newrc, span)
       case _ => tree
     }
@@ -432,7 +418,6 @@ case class MarkNonIdentityUnaries[W]() extends TreeAnnotator[AnnotatedLabel, W,
         else u.copy(child = rec(c))
     }
-
     rec(tree)
   }
 }
@@ -452,7 +437,6 @@ case class MarkExternalUnaries[W]() extends TreeAnnotator[AnnotatedLabel, W, Ann
         else u.copy(child=rec(c))
     }
-
     rec(tree)
   }
 }
@@ -468,7 +452,7 @@ case class FixRootLabelVerticalAnnotation[W]() extends TreeAnnotator[AnnotatedLa
       UnaryTree(label,
         c.relabelRoot(rootLabel => new AnnotatedLabel(rootLabel.label, rootLabel.headTag, (0 until 1).map(i => rootLabel.label), rootLabel.siblings, rootLabel.features)),
         chain,
-        span);
+        span)
     }
   }
 }
@@ -481,35 +465,34 @@ case class TagAnnotation() extends Annotation
 case class MarkPreterminals[W]() extends TreeAnnotator[AnnotatedLabel, W, AnnotatedLabel] {
   def apply(tree: BinarizedTree[AnnotatedLabel], words: Seq[W]) = {
     def rec(tree: BinarizedTree[AnnotatedLabel]):BinarizedTree[AnnotatedLabel] = tree match {
-      case b@BinaryTree(label, lc, rc, span) => BinaryTree(label, rec(lc), rec(rc), span);
-      case n@NullaryTree(label, span) => NullaryTree(label, span);
-      case u@UnaryTree(label, c, chain, span) => {
+      case b@BinaryTree(label, lc, rc, span) => BinaryTree(label, rec(lc), rec(rc), span)
+      case n@NullaryTree(label, span) => NullaryTree(label, span)
+      case u@UnaryTree(label, c, chain, span) =>
         if (c.isLeaf) {
           UnaryTree(new AnnotatedLabel(label.label, label.headTag, label.parents, label.siblings, label.features ++ Set(new PreterminalAnnotation())),
             c.relabelRoot(cLabel => new AnnotatedLabel(cLabel.label, cLabel.headTag, cLabel.parents, cLabel.siblings, cLabel.features ++ Set(new TagAnnotation()))),
             chain,
-            span);
+            span)
         } else {
-          UnaryTree(label, rec(c), chain, span);
+          UnaryTree(label, rec(c), chain, span)
         }
-      }
     }
-    rec(tree);
+    rec(tree)
   }
 }

 trait MarkDominates[W] extends TreeAnnotator[AnnotatedLabel, W, AnnotatedLabel] {
-  protected def dominates(x: Tree[AnnotatedLabel]):Boolean
+  protected def dominates(x: Tree[AnnotatedLabel]): Boolean
   protected def sym: String
   def apply(tree: BinarizedTree[AnnotatedLabel], words: Seq[W]) = tree.extend { t =>
-    if(t eq tree) t.label
-    else if(dominates(t)) t.label.annotate(Dom(sym))
+    if (t eq tree) t.label
+    else if (dominates(t)) t.label.annotate(Dom(sym))
     else t.label
   }
 }

 case class DominatesV[W]() extends MarkDominates[W] {
-  protected def dominates(x: Tree[AnnotatedLabel]):Boolean = x.leaves.exists { t => t.label.label.startsWith("V") || t.label.label.startsWith("MD")}
+  protected def dominates(x: Tree[AnnotatedLabel]): Boolean = x.leaves.exists { t => t.label.label.startsWith("V") || t.label.label.startsWith("MD")}
   def sym = "V"
 }
diff --git a/src/main/scala/epic/trees/util/FilterTreesByLength.scala b/src/main/scala/epic/trees/util/FilterTreesByLength.scala
index e1fce074..30f6b64f 100644
--- a/src/main/scala/epic/trees/util/FilterTreesByLength.scala
+++ b/src/main/scala/epic/trees/util/FilterTreesByLength.scala
@@ -40,7 +40,7 @@ object FilterTreesByLength {
     for ( (gold, guess) <- getTrees(gold) zip getTrees(params.guess)) {
       assert(gold._2 == guess._2)
-      val len = (if(ignorePunct) gold._2.count(!_.forall(!_.isLetterOrDigit)) else gold._2.length)/bucketSize * bucketSize
+      val len = (if (ignorePunct) gold._2.count(!_.forall(!_.isLetterOrDigit)) else gold._2.length)/bucketSize * bucketSize
       goldOut(len).println(gold._1.render(gold._2, newline = false))
       guessOut(len).println(guess._1.render(guess._2, newline = false))
     }
@@ -48,7 +48,6 @@ object FilterTreesByLength {
     goldOut.values.foreach(_.close())
     guessOut.values.foreach(_.close())
-
   }

   def getTrees(file: File): PennTreeReader = {
diff --git a/src/main/scala/epic/util/ArabicNormalization.scala b/src/main/scala/epic/util/ArabicNormalization.scala
index b3839568..ab44244c 100644
--- a/src/main/scala/epic/util/ArabicNormalization.scala
+++ b/src/main/scala/epic/util/ArabicNormalization.scala
@@ -13,7 +13,7 @@ import scala.annotation.switch
 object ArabicNormalization extends LazyLogging {

   def handleTreebankThings(s: String):Option[String] = {
-    if(!s.startsWith("-")) {
+    if (!s.startsWith("-")) {
       None
     } else {
       s match {
@@ -27,11 +27,10 @@ object ArabicNormalization extends LazyLogging {
         case "-MINUS-" => Some(s)
         case _ => None
       }
-
     }
   }

-  def buckwalterToUnicode(buckwalter: String):String = {
+  def buckwalterToUnicode(buckwalter: String): String = {
     handleTreebankThings(buckwalter) match {
       case Some(x) => x
       case None =>
@@ -89,11 +88,10 @@ object ArabicNormalization extends LazyLogging {
           case '{' => '\u0671'
           case '.' | '?' | '!' | ',' | '"' | '%' | '-' | '/' | ':' | ';' | '=' => buckwalter(i)
           case x =>
-            if(!x.isDigit)
+            if (!x.isDigit)
               logger.warn("Unknown buckwalter character: " + x)
             x
         }}
-
         i += 1
       }
       out.result()
diff --git a/src/main/scala/epic/util/BinarySearch.scala b/src/main/scala/epic/util/BinarySearch.scala
index 5c9176f5..131535fa 100644
--- a/src/main/scala/epic/util/BinarySearch.scala
+++ b/src/main/scala/epic/util/BinarySearch.scala
@@ -7,8 +7,8 @@ package epic.util
  **/
 object BinarySearch {

-  def interpolationSearch[T](objs: IndexedSeq[T], proj: T=>Int, toFind: Int):Int = {
-    if(objs.length == 0) return ~0
+  def interpolationSearch[T](objs: IndexedSeq[T], proj: T=>Int, toFind: Int): Int = {
+    if (objs.isEmpty) return ~0

-    // Returns index of toFind in sortedArray, or -1 if not found
+    // Returns the index of toFind, or the one's complement (~) of its insertion point if absent
     var low = 0
@@ -17,7 +17,7 @@ object BinarySearch {
     var highV = proj(objs(high))

     while (lowV <= toFind && highV >= toFind) {
-      val mid = (if(highV == lowV) low else low + ((toFind - lowV.toLong) * (high - low)) / (highV.toLong - lowV.toLong)).toInt
+      val mid = (if (highV == lowV) low else low + ((toFind - lowV.toLong) * (high - low)) / (highV.toLong - lowV.toLong)).toInt
       val midV = proj(objs(mid))

       if (midV < toFind){
@@ -31,16 +31,13 @@ object BinarySearch {
       }
     }
-
     if (lowV == toFind) {
       low
-    } else if(lowV > toFind) {
+    } else if (lowV > toFind) {
       ~low
     } else {
       ~(high + 1)
     }
   }
-
-
 }
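// Reviewer sketch, not part of the patch: interpolationSearch follows the
// java.util.Arrays.binarySearch convention spelled out in the corrected comment
// above: a hit returns the index, a miss returns the one's complement (~) of the
// insertion point, and empty input returns ~0. The object name is hypothetical:
import epic.util.BinarySearch

object InterpolationSearchSketch {
  val xs = IndexedSeq(1, 3, 7, 10)
  val hit = BinarySearch.interpolationSearch(xs, identity[Int], 7)   // 2: found at index 2
  val miss = BinarySearch.interpolationSearch(xs, identity[Int], 5)  // ~2 == -3: would insert at 2
  // recover the insertion point from a negative result
  def insertionPoint(result: Int): Int = if (result < 0) ~result else result
}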
diff --git a/src/main/scala/epic/util/Cache.scala b/src/main/scala/epic/util/Cache.scala
index 552eaecd..56603de9 100644
--- a/src/main/scala/epic/util/Cache.scala
+++ b/src/main/scala/epic/util/Cache.scala
@@ -16,17 +16,17 @@ case class CacheBroker(path: File = null, copyFrom: File = null, clearCaches: St
   @transient
   private var _actualCache:CacheBroker.ActualCache = null
   private def actualCache = synchronized {
-    lazy val dbMaker = if(path eq null) {
+    lazy val dbMaker = if (path eq null) {
       DBMaker.newMemoryDB()
     } else {
       DBMaker.newFileDB(path)
     }.closeOnJvmShutdown().cacheSoftRefEnable()

-    if(_actualCache eq null) {
+    if (_actualCache eq null) {
       _actualCache = CacheBroker.getCacheBroker(path, dbMaker, autocommit, copyFrom)
     }
-    if(disableWriteAheadLog) _actualCache.dbMaker.writeAheadLogDisable()
-    if(clearCaches != null && clearCaches.nonEmpty)
+    if (disableWriteAheadLog) _actualCache.dbMaker.writeAheadLogDisable()
+    if (clearCaches != null && clearCaches.nonEmpty)
       for(toDisable <- clearCaches.split(",")) {
         _actualCache.db.getHashMap(toDisable).clear()
       }
@@ -34,11 +34,9 @@ case class CacheBroker(path: File = null, copyFrom: File = null, clearCaches: St
     _actualCache
   }
-
   def dbMaker = actualCache.dbMaker
   def db = actualCache.db
-
   def commit() { db.commit()}
   def close() {actualCache.close()}
@@ -50,22 +48,22 @@ object CacheBroker extends LazyLogging {
   private class ActualCache private[CacheBroker] (val path: File, val dbMaker: DBMaker, val autocommit: Boolean, copyFrom: File = null) {
     lazy val db = {
       val db = dbMaker.make()
-      if(copyFrom != null) {
+      if (copyFrom != null) {
         logger.info(s"Copying database from $copyFrom to ${if (path ne null) path else "in memory database"}")
         val from = DBMaker.newFileDB(copyFrom).make()
         Pump.copy(from, db)
         from.close()
       }
-      if(autocommit) cacheThread.start()
+      if (autocommit) cacheThread.start()
       db
     }

     private lazy val cacheThread: Thread = new Thread(new Runnable {
       def run() {
         try {
-          while(!db.isClosed && !Thread.interrupted()) {
+          while (!db.isClosed && !Thread.interrupted()) {
             Thread.sleep(1000 * 60)
-            if(!db.isClosed)
+            if (!db.isClosed)
               db.commit()
           }
         } catch {
@@ -88,18 +86,17 @@ object CacheBroker extends LazyLogging {
   private val cacheCache = Collections.synchronizedMap(new util.HashMap[File, ActualCache]()).asScala
   private def getCacheBroker(path: File, dbMaker: =>DBMaker, autocommit: Boolean, copyFrom: File) = {
-    if(path eq null) new ActualCache(path, dbMaker, autocommit)
+    if (path eq null) new ActualCache(path, dbMaker, autocommit)
     else cacheCache.getOrElseUpdate(path, new ActualCache(path, dbMaker, autocommit, copyFrom))
   }
-
   @SerialVersionUID(1L)
   private class CacheMap[K, V](name: String, cache: CacheBroker)(implicit kser: Serializer[K], vser: Serializer[V]) extends Map[K, V] with Serializable {
     import cache._
     @transient
     private var _theMap : Map[K, V] = null
     def theMap = synchronized {
-      if(_theMap eq null) {
+      if (_theMap eq null) {
         _theMap = try {
           // this throws if the hash map exists, and there's no "does it exist" method
           // that takes the serializers...
@@ -111,7 +108,6 @@ object CacheBroker extends LazyLogging {
     _theMap
   }
-
   def +=(kv: (K, V)): this.type = {theMap += kv; this}
   def -=(key: K): this.type = {theMap -= key; this}
diff --git a/src/main/scala/epic/util/FIFOWorkQueue.scala b/src/main/scala/epic/util/FIFOWorkQueue.scala
index 7600f8ce..bf57bd9d 100644
--- a/src/main/scala/epic/util/FIFOWorkQueue.scala
+++ b/src/main/scala/epic/util/FIFOWorkQueue.scala
@@ -35,9 +35,9 @@ class FIFOWorkQueue[-In, Out](f: In=>Out)(implicit context: ExecutionContext) ex

   def next() = {waitUntilReady(); Await.result(queue.poll(), Duration.Inf)}

-  private def waitUntilReady():Boolean = {
+  private def waitUntilReady(): Boolean = {
     synchronized {
-      while(!done && queue.isEmpty) {
+      while (!done && queue.isEmpty) {
         wait()
       }
     }
diff --git a/src/main/scala/epic/util/Has.scala b/src/main/scala/epic/util/Has.scala
index 3193c238..c8813d3d 100644
--- a/src/main/scala/epic/util/Has.scala
+++ b/src/main/scala/epic/util/Has.scala
@@ -2,7 +2,6 @@ package epic.util

 import epic.framework.{Example, Observation}
-
 /**
  *
  * @author dlwh
@@ -16,7 +15,6 @@ trait Has2[Haver, +WhatIHave] {
   def get(h: Haver):WhatIHave
 }
-
 object Has2 {
   implicit def identityHas2[H]:Has2[H, H] = new Has2[H, H] with Serializable {
     def get(h: H): H = h
diff --git a/src/main/scala/epic/util/LRUCache.scala b/src/main/scala/epic/util/LRUCache.scala
index 4b7abd39..c7ee95ff 100644
--- a/src/main/scala/epic/util/LRUCache.scala
+++ b/src/main/scala/epic/util/LRUCache.scala
@@ -1,6 +1,5 @@
 package epic.util
-
 import scala.reflect.ClassTag
 import scala.util.hashing.MurmurHash3
@@ -11,8 +10,7 @@ import scala.util.hashing.MurmurHash3
  **/
 final class LRUCache[@specialized(Int, Long) K:ClassTag, V:ClassTag](size: Int, onEvict: (K, V)=>Unit = (k: K, v: V) => ()) {
   private val keys = new Array[K](size)
-  private val occupied = new Array[Int](size)
-  java.util.Arrays.fill(occupied, -1)
+  private val occupied = Array.fill[Int](size)(-1)
   private var nextKey = 0
   private val values = new Array[V](size)
@@ -33,7 +31,6 @@ final class LRUCache[@specialized(Int, Long) K:ClassTag, V:ClassTag](size: Int,
       occupied(pos) = -1
       onEvict(keys(pos), values(pos))
     }
-
   }

   def iterator: Iterator[(K, V)] = {
@@ -65,7 +62,7 @@ final class LRUCache[@specialized(Int, Long) K:ClassTag, V:ClassTag](size: Int,
     values(pos) = v
   }

-  private def lookup(k: K):Int = {
+  private def lookup(k: K): Int = {
     val hc : Int = k.##
     val hh = MurmurHash3.mixLast(10891, hc).abs % keys.length
     val hh2 = MurmurHash3.mixLast(10909, hc).abs % keys.length
diff --git a/src/main/scala/epic/util/LockableSeenSet.scala b/src/main/scala/epic/util/LockableSeenSet.scala
index 8bc83725..795993ca 100644
--- a/src/main/scala/epic/util/LockableSeenSet.scala
+++ b/src/main/scala/epic/util/LockableSeenSet.scala
@@ -9,7 +9,7 @@ import java.io.ObjectStreamException
  * @author dlwh
 **/
 trait LockableSeenSet[@specialized(Int, Long) -T] extends Serializable {
-  def addOrSeen(x: T):Boolean
+  def addOrSeen(x: T): Boolean
   def lock: LockableSeenSet[T]
 }
@@ -18,7 +18,6 @@ class BloomFilterSeenSet[@specialized(Int, Long) T](bf: BloomFilter[T]) extends
   override def addOrSeen(x: T): Boolean = {
     bf(x)
   }
-
   override def lock: LockableSeenSet[T] = this
 }
@@ -26,13 +25,10 @@ object LockableSeenSet {
   def always[T]:LockableSeenSet[T] = AlwaysSeenSet
 }
-
 @SerialVersionUID(1L)
 object AlwaysSeenSet extends LockableSeenSet[Any] {
   override def addOrSeen(x: Any): Boolean = true
-
   override def lock: LockableSeenSet[Any] = this
-
   @throws[ObjectStreamException]
   private def readResolve() = {
     AlwaysSeenSet
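// Reviewer sketch, not part of the patch: the LockableSeenSet contract above reads
// as "record and allow everything until lock is called; afterwards only answer
// membership", as AlwaysSeenSet and BloomFilterSeenSet illustrate. A hash-set-backed
// version under that reading (HashSetSeenSet is a hypothetical name):
import scala.collection.mutable
import epic.util.LockableSeenSet

class HashSetSeenSet[T] extends LockableSeenSet[T] {
  private val seen = mutable.HashSet.empty[T]
  // before locking: remember x and report it as allowed
  override def addOrSeen(x: T): Boolean = { seen += x; true }
  // after locking: a frozen view that only answers membership queries
  override def lock: LockableSeenSet[T] = new LockableSeenSet[T] {
    override def addOrSeen(x: T): Boolean = seen(x)
    override def lock: LockableSeenSet[T] = this
  }
}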
diff --git a/src/main/scala/epic/util/Optional.scala b/src/main/scala/epic/util/Optional.scala
index 806431c7..8f4fd244 100644
--- a/src/main/scala/epic/util/Optional.scala
+++ b/src/main/scala/epic/util/Optional.scala
@@ -18,11 +18,7 @@ case class Provided[A](value: A) extends Optional[A]
 case object NotProvided extends Optional[Nothing]

 object Optional {
-
   implicit def anyToOptional[A](x: A): Optional[A] = if (x == null) NotProvided else Provided(x)
-
   implicit def optionToOptional[A](x: Option[A]): Optional[A] = x.fold(NotProvided:Optional[A])(Provided(_))
-
   implicit def optionalToOption[A](x: Optional[A]): Option[A] = x.toOption
-
 }
diff --git a/src/main/scala/epic/util/ProcessTextMain.scala b/src/main/scala/epic/util/ProcessTextMain.scala
index e510d36a..83dfe52a 100644
--- a/src/main/scala/epic/util/ProcessTextMain.scala
+++ b/src/main/scala/epic/util/ProcessTextMain.scala
@@ -16,7 +16,7 @@ import java.util.concurrent.{LinkedBlockingDeque, TimeUnit, ThreadPoolExecutor}
 trait ProcessTextMain[Model, AnnotatedType] {
   import ProcessTextMain._

-  def render(model: Model, ann: AnnotatedType, tokens: IndexedSeq[String]):String
+  def render(model: Model, ann: AnnotatedType, tokens: IndexedSeq[String]): String

   def renderFailed(model: Model, tokens: IndexedSeq[String], reason: Throwable): String = {
     s"### Could not tag $tokens, because ${reason.getMessage}... ${reason.getStackTrace.take(2).mkString(";")}".replaceAll("\n", " ")
@@ -60,19 +60,19 @@ trait ProcessTextMain[Model, AnnotatedType] {
       case "none" | "whitespace" => new WhitespaceTokenizer
     }

-    implicit val context = if(params.threads > 0) {
+    implicit val context = if (params.threads > 0) {
       scala.concurrent.ExecutionContext.fromExecutor(new ThreadPoolExecutor(1, params.threads, 1, TimeUnit.SECONDS, new LinkedBlockingDeque[Runnable]()))
     } else {
       scala.concurrent.ExecutionContext.global
     }

-    val iter = if(files.length == 0) Iterator(System.in) else files.iterator.map(new FileInputStream(_))
+    val iter = if (files.isEmpty) Iterator(System.in) else files.iterator.map(new FileInputStream(_))

     for(src <- iter) {
       val queue = FIFOWorkQueue(sentenceSegmenter.sentences(src)){sent =>
         val tokens = tokenizer(sent).toIndexedSeq
         try {
-          if(tokens.length > params.maxLength) {
+          if (tokens.length > params.maxLength) {
             throw new SentenceTooLongException(tokens.length)
           }
           val tree = annotate(model, tokens)
diff --git a/src/main/scala/epic/util/ProgressLog.scala b/src/main/scala/epic/util/ProgressLog.scala
index e930aec8..9a82ba45 100644
--- a/src/main/scala/epic/util/ProgressLog.scala
+++ b/src/main/scala/epic/util/ProgressLog.scala
@@ -12,17 +12,16 @@ class ProgressLog(log: Logger, items: Int, frequency: Int = 100, name: String =
   val initialTime = System.currentTimeMillis()
   val item = new AtomicInteger()
-
   def reportProgress() = {
     val x = item.incrementAndGet()
-    if(x % frequency == 0 || x == items) {
+    if (x % frequency == 0 || x == items) {
       log.info(s"$name $x/$items (${(System.currentTimeMillis() - initialTime)/1000.0}s elapsed.)")
     }
   }

   def info(msg: =>String) = {
     val x = item.incrementAndGet()
-    if(x % frequency == 0 || x == items) {
+    if (x % frequency == 0 || x == items) {
       val m = msg
       log.info(s"$name $x/$items: $m (${(System.currentTimeMillis() - initialTime)/1000.0}s elapsed.)")
     }
@@ -30,7 +29,7 @@ class ProgressLog(log: Logger, items: Int, frequency: Int = 100, name: String =
   def debug(msg: =>String) = {
     val x = item.incrementAndGet()
-    if(x % frequency == 0 || x == items) {
+    if (x % frequency == 0 || x == items) {
       val m = msg
       log.debug(s"$name $x/$items: $m (${(System.currentTimeMillis() - initialTime)/1000.0}s elapsed.)")
     }
diff --git a/src/main/scala/epic/util/SafeLogging.scala b/src/main/scala/epic/util/SafeLogging.scala
index 9d3b9b92..c8072c68 100644
--- a/src/main/scala/epic/util/SafeLogging.scala
+++ b/src/main/scala/epic/util/SafeLogging.scala
@@ -14,10 +14,10 @@ trait SafeLogging {

   def logger: Logger = {
     var logger = _the_logger
-    if(logger eq null) {
+    if (logger eq null) {
       synchronized {
         logger = _the_logger
-        if(logger eq null) {
+        if (logger eq null) {
           val ll = Logger(LoggerFactory.getLogger(this.getClass))
           _the_logger = ll
           logger = ll
@@ -27,5 +27,4 @@ trait SafeLogging {
     logger
   }
-
 }
diff --git a/src/main/scala/epic/util/ThreadLocalBloomFilter.scala b/src/main/scala/epic/util/ThreadLocalBloomFilter.scala
index e55ed49f..40ade11e 100644
--- a/src/main/scala/epic/util/ThreadLocalBloomFilter.scala
+++ b/src/main/scala/epic/util/ThreadLocalBloomFilter.scala
@@ -20,7 +20,6 @@ class ThreadLocalBloomFilter[@specialized(Int, Long) T](numBuckets: Int, numHash
     }
   }
-
   override def addOrSeen(x: T): Boolean = {tl.get() += x; true}

   private val queue = new ConcurrentLinkedDeque[BloomFilter[T]]()
@@ -29,7 +28,7 @@ class ThreadLocalBloomFilter[@specialized(Int, Long) T](numBuckets: Int, numHash
     val bf = tl.get()
     var i = 0
     val len = queue.size
-    while(!queue.isEmpty && i < len) {
+    while (!queue.isEmpty && i < len) {
       bf |= queue.pop()
       i += 1
     }
@@ -41,7 +40,6 @@ class ThreadLocalBloomFilter[@specialized(Int, Long) T](numBuckets: Int, numHash
     val u = union
     val load = u.load
     val size = - u.numBuckets * math.log1p(-load)/u.numHashFunctions
-
     logger.info(f"Bloom filter has load of ${u.load}%.3f and approx size $size. Queue is ${queue.size()} elements long.")
     new BloomFilterSeenSet[T](u)
   }
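// Reviewer note, not part of the patch: the size estimate above is the standard
// Bloom-filter cardinality approximation n ~= -(m/k) * ln(1 - load), where m is the
// number of buckets, k the number of hash functions, and load the fraction of set
// bits (math.log1p(-load) == ln(1 - load)). For example, m = 1024, k = 4 and
// load = 0.5 give n ~= -(1024/4) * ln(0.5) ~= 177 distinct insertions. As a sketch:
object BloomSizeEstimateSketch {
  def approxSize(numBuckets: Int, numHashFunctions: Int, load: Double): Double =
    -numBuckets * math.log1p(-load) / numHashFunctions
}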
diff --git a/src/main/scala/epic/util/Unicode.scala b/src/main/scala/epic/util/Unicode.scala
index ffa79e78..2bd0b652 100644
--- a/src/main/scala/epic/util/Unicode.scala
+++ b/src/main/scala/epic/util/Unicode.scala
@@ -26,7 +26,6 @@ object Unicode {
     if (cp < 0) {
       return false
     }
-
     var i = 0
     while (i < rangeStarts.length && cp < rangeStarts(i)) {
       i += 1
diff --git a/src/main/scala/epic/util/WeightsCache.scala b/src/main/scala/epic/util/WeightsCache.scala
index ef8e54a8..1914f9d8 100644
--- a/src/main/scala/epic/util/WeightsCache.scala
+++ b/src/main/scala/epic/util/WeightsCache.scala
@@ -17,8 +17,8 @@ object WeightsCache {
     require(weights.length == index.size)
     val out = new PrintStream(new GZIPOutputStream(new FileOutputStream(file), 1024))
     var i = 0
-    while(i < index.size) {
-      if(weights(i).abs > threshold)
+    while (i < index.size) {
+      if (weights(i).abs > threshold)
         out.println(index.get(i) + "\t" + weights(i))
       i += 1
     }
diff --git a/src/test/scala/epic/constraints/LabeledSpanConstraintsTest.scala b/src/test/scala/epic/constraints/LabeledSpanConstraintsTest.scala
index 9aa3495d..e42a7b7a 100644
--- a/src/test/scala/epic/constraints/LabeledSpanConstraintsTest.scala
+++ b/src/test/scala/epic/constraints/LabeledSpanConstraintsTest.scala
@@ -14,9 +14,9 @@ class LabeledSpanConstraintsTest extends FunSuite {
   /*
   test("serialization") {
     val x = LabeledSpanConstraints[Int](TriangularArray.tabulate(10) { (i,j) =>
-      if(i == j || i > 5) null
+      if (i == j || i > 5) null
       else {
-        if(i < j - 1) BitSet(1,2,3,4)
+        if (i < j - 1) BitSet(1,2,3,4)
         else BitSet(i,j)
       }
@@ -37,15 +37,15 @@ class LabeledSpanConstraintsTest extends FunSuite {

   test("containsAll") {
     val x = LabeledSpanConstraints[Int](TriangularArray.tabulate(10) { (i,j) =>
-      if(i == j || i > 5) null
+      if (i == j || i > 5) null
       else {
-        if(i < j) BitSet(1,2,3,4)
+        if (i < j) BitSet(1,2,3,4)
         else BitSet(1)
       }
     })

     val z = LabeledSpanConstraints[Int](TriangularArray.tabulate(10) { (i,j) =>
-      if(i < j) BitSet(1,2,3,4)
+      if (i < j) BitSet(1,2,3,4)
       else BitSet(1)
     })
diff --git a/src/test/scala/epic/features/CrossProductIndexTest.scala b/src/test/scala/epic/features/CrossProductIndexTest.scala
index 966cd126..da348707 100644
--- a/src/test/scala/epic/features/CrossProductIndexTest.scala
+++ b/src/test/scala/epic/features/CrossProductIndexTest.scala
@@ -32,7 +32,7 @@ class CrossProductIndexTest extends FunSuite {

     for(i <- 0 until index1.size; j <- 0 until index2.size) {
       val mapped = res.mapped(i, j)
-      if(mapped >= 0)
+      if (mapped >= 0)
         assert(csc(i, j) === weights(mapped))
     }
@@ -62,7 +62,7 @@ class CrossProductIndexTest extends FunSuite {

     for(i <- 0 until index1.size; j <- 0 until index2.size) {
       val mapped = res.mapped(i, j)
-      if(mapped >= 0)
+      if (mapped >= 0)
         assert(csc(i, j) === weights(mapped))
     }
diff --git a/src/test/scala/epic/features/DistanceBinnerTest.scala b/src/test/scala/epic/features/DistanceBinnerTest.scala
index 8d83f4bd..a6cd5fc9 100644
--- a/src/test/scala/epic/features/DistanceBinnerTest.scala
+++ b/src/test/scala/epic/features/DistanceBinnerTest.scala
@@ -11,17 +11,17 @@ class DistanceBinnerTest extends FunSuite {
     val binner = new DistanceBinner(preserveDirection = true)
     val dists = Array.tabulate(20,20) { (i, j) =>
       val dist = binner.binnedDistance(i, j)
-      if(i < j)
+      if (i < j)
         assert(dist > 0, (dist, i, j))
-      else if(i == j)
+      else if (i == j)
         assert(dist === 0)
-      else if(i > j) assert(dist < 0, (dist, i, j))
+      else if (i > j) assert(dist < 0, (dist, i, j))
       dist
     }
     assert(dists.flatten.toSet.size == binner.numBins * 2 - 1, dists.flatten.toSet -> binner.numBins)

     for(i <- 1 until 19; j <- 1 until 19) {
-      if(i != j) {
+      if (i != j) {
         assert(dists(i)(j) >= dists(i +1)(j), (i,j,dists(i)(j),dists(i+1)(j)))
         assert(dists(i)(j) >= dists(i)(j-1))
         assert(dists(i)(j) <= dists(i -1)(j), (i,j,dists(i)(j),dists(i+1)(j)))
@@ -34,16 +34,16 @@ class DistanceBinnerTest extends FunSuite {
     val binner = new DistanceBinner(preserveDirection = false)
     val dists = Array.tabulate(20,20) { (i, j) =>
       val dist = binner.binnedDistance(i, j)
-      if(i != j)
+      if (i != j)
         assert(dist > 0, (dist, i, j))
-      else if(i == j)
+      else if (i == j)
         assert(dist === 0)
       dist
     }
     assert(dists.flatten.toSet.size === binner.numBins)

     for(i <- 1 until 19; j <- i until 19) {
-      if(i < j) {
+      if (i < j) {
         assert(dists(i)(j) >= dists(i +1)(j), (i,j,dists(i)(j),dists(i+1)(j)))
         assert(dists(i)(j) >= dists(i)(j-1))
         assert(dists(i)(j) <= dists(i -1)(j), (i,j,dists(i)(j),dists(i+1)(j)))
diff --git a/src/test/scala/epic/parser/InsideOutsideTest.scala b/src/test/scala/epic/parser/InsideOutsideTest.scala
index ad4995e4..7a8415ea 100644
--- a/src/test/scala/epic/parser/InsideOutsideTest.scala
+++ b/src/test/scala/epic/parser/InsideOutsideTest.scala
@@ -29,7 +29,7 @@ import repl.DSLGrammar
 class InsideOutsideTest extends FunSuite {

   implicit def near(x: Double) = new {
-    def near(y: Double) = if( (x-y).abs < 1E-4 * math.max(x+y,1E-4)/2) None else Some(x + " not near " + y)
+    def near(y: Double) = if ( (x-y).abs < 1E-4 * math.max(x+y,1E-4)/2) None else Some(x + " not near " + y)
   }

   test("Simple test from iobasics") {
diff --git a/src/test/scala/epic/preprocess/TreebankTokenizerTest.scala b/src/test/scala/epic/preprocess/TreebankTokenizerTest.scala
index 530d61a5..e58b40a7 100644
--- a/src/test/scala/epic/preprocess/TreebankTokenizerTest.scala
+++ b/src/test/scala/epic/preprocess/TreebankTokenizerTest.scala
@@ -5,7 +5,7 @@ import org.scalatest.FunSuite
 class TreebankTokenizerTest extends FunSuite {

   private def isOneToken(w: String) =
-    if(w === TreebankTokenizer(w).head) None else Some(w + " " + TreebankTokenizer(w))
+    if (w === TreebankTokenizer(w).head) None else Some(w + " " + TreebankTokenizer(w))

   test("simple words") {
     val words = List("Hi","there","pilgrim","happy","Thanksgiving","there")
diff --git a/src/test/scala/epic/sequences/SegmentationTest.scala b/src/test/scala/epic/sequences/SegmentationTest.scala
index 35daa020..188802bb 100644
--- a/src/test/scala/epic/sequences/SegmentationTest.scala
+++ b/src/test/scala/epic/sequences/SegmentationTest.scala
@@ -17,7 +17,7 @@ class SegmentationTest extends FunSuite with Checkers {
     } yield {
       val segments = segs.foldLeft((Vector((0,Span(0,0))))) { (cur, sl) =>
         val (segId, len) = sl
-        if(segId == 0) cur :+ (segId -> Span(cur.last._2.end, cur.last._2.end + 1))
+        if (segId == 0) cur :+ (segId -> Span(cur.last._2.end, cur.last._2.end + 1))
         else cur :+ (segId -> Span(cur.last._2.end, cur.last._2.end + len))
       }
       Segmentation(segments.drop(1), 0 until segments.last._2.end)
diff --git a/src/test/scala/epic/trees/annotations/KMAnnotatorTest.scala b/src/test/scala/epic/trees/annotations/KMAnnotatorTest.scala
index 36b0a4ae..ba3d03ee 100644
--- a/src/test/scala/epic/trees/annotations/KMAnnotatorTest.scala
+++ b/src/test/scala/epic/trees/annotations/KMAnnotatorTest.scala
@@ -43,7 +43,7 @@ class KMAnnotatorTest extends FunSuite {
     // make sure the S dominates a V
     assert(pipelined.allChildren.exists(t => t.label.label == "S" && t.label.hasAnnotation(Dom("V"))), "DomV2")
     // make sure the @S dominates a V and has an NP to its left
-    if(pipelined.allChildren.exists(t => t.label.label == "@S" && t.label.hasAnnotation(Dom("V")) && t.label.siblings.nonEmpty && t.label.siblings(0) == Right("."))) {
+    if (pipelined.allChildren.exists(t => t.label.label == "@S" && t.label.hasAnnotation(Dom("V")) && t.label.siblings.nonEmpty && t.label.siblings(0) == Right("."))) {
     } else {
       fail(pipelined.toString + " " + pipelined.map(label => label -> ( label.label == "@S" , label.hasAnnotation(Dom("V")) , label.siblings.map(_ == Right(".")))))
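// Reviewer note, not part of the patch: the empty if / else-fail at the end of
// KMAnnotatorTest reads more directly as a single ScalaTest assert with a clue,
// along these lines:
//
//   assert(
//     pipelined.allChildren.exists { t =>
//       t.label.label == "@S" && t.label.hasAnnotation(Dom("V")) &&
//         t.label.siblings.nonEmpty && t.label.siblings(0) == Right(".")
//     },
//     pipelined.toString)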