fixed unit tests
jkbradley committed Nov 9, 2015
commit 631d40759e88c84dfffe4805a7834c353caa341b
22 changes: 11 additions & 11 deletions mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
@@ -171,6 +171,14 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
SchemaUtils.checkColumnType(schema, $(featuresCol), new VectorUDT)
Contributor
Do we prominently document that features have to be Vectors with integer-quantized Double values? A novice user could easily assume that LDA takes text as input. Also, I'm not terribly keen on continuing to use Vectors and implicitly requiring integer tokens for words: Array[Int] using Catalyst's ArrayType would be nicer IMO. We should settle this decision before merging, since it will affect the public API.

Member Author
I agree it should be documented, but we can later support Array of Int or even Strings without breaking the API.

Contributor
I don't think we should provide the Vector API at all if the vector can only contain integer values, since we really just mean Array[Int]. Making the change now would let us drop this legacy API altogether. If we need to merge this quickly, can we make the public API support Array[Int] and do the conversion to Vector internally?

Member Author
A bigger issue might be that the relevant feature transformers (CountVectorizer) produce Vectors, so we need to support Vector. It also might be reasonable to have weighted docs (someday), which would change the Int values to Doubles.

Contributor
I don't think I completely understand how weighted docs make these Doubles, but good point about CountVectorizer (I have feelings about that one as well, but it's too late to change that >.<). Let's leave this as Vector, then.
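
For readers landing here, a minimal sketch (not part of this commit) of the input format being discussed: LDA's featuresCol takes one word-count Vector per document, typically produced by CountVectorizer. The sqlContext and column names below are illustrative assumptions.

```scala
import org.apache.spark.ml.clustering.LDA
import org.apache.spark.ml.feature.CountVectorizer

// Two toy documents as token arrays (column names are assumptions).
val docs = sqlContext.createDataFrame(Seq(
  (0L, Seq("spark", "mllib", "lda", "spark")),
  (1L, Seq("topic", "model", "lda"))
)).toDF("id", "tokens")

// "features" holds integer-valued term counts over the learned vocabulary.
val counted = new CountVectorizer()
  .setInputCol("tokens")
  .setOutputCol("features")
  .fit(docs)
  .transform(docs)

// LDA then consumes the count vectors, not raw text.
val ldaModel = new LDA().setK(2).setMaxIter(5).fit(counted)
```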

SchemaUtils.appendColumn(schema, $(topicDistributionCol), new VectorUDT)
}

override def validateParams(): Unit = {
if (getDocConcentration.length != 1) {
require(getDocConcentration.length == getK, s"LDA docConcentration was of length" +
s" ${getDocConcentration.length}, but k = $getK. docConcentration must be either" +
s" length 1 (scalar) or an array of length k.")
Contributor
nit: either a length 1 vector or a vector of length k

Member Author
Oops, I forgot to make this update. I'll fix it now.

}
}
}
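
As a quick illustration of the check above (a sketch, not from this commit, using the scalar and array setters exercised in LDASuite below): docConcentration must be either a single value or an array of length k.

```scala
val lda = new org.apache.spark.ml.clustering.LDA().setK(3)

lda.setDocConcentration(0.5)                   // length-1 (scalar) form
lda.validateParams()                           // ok

lda.setDocConcentration(Array(0.5, 0.5, 0.5))  // length == k
lda.validateParams()                           // ok

lda.setDocConcentration(Array(0.5, 0.5))       // length 2, but k = 3
lda.validateParams()                           // require(...) above throws IllegalArgumentException
```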


@@ -195,14 +203,6 @@ class LDAModel private[ml] (
@Since("1.6.0") @transient protected val sqlContext: SQLContext)
extends Model[LDAModel] with LDAParams with Logging {

override def validateParams(): Unit = {
if (getDocConcentration.length != 1) {
require(getDocConcentration.length == getK, s"LDA docConcentration was of length" +
s" ${getDocConcentration.length}, but k = $getK. docConcentration must be either" +
s" length 1 (scalar) or an array of length k.")
}
}

/** Returns underlying spark.mllib model */
@Since("1.6.0")
protected def getModel: OldLDAModel = oldLocalModel match {
@@ -328,7 +328,7 @@ class LDAModel private[ml] (
def describeTopics(maxTermsPerTopic: Int): DataFrame = {
val topics = getModel.describeTopics(maxTermsPerTopic).zipWithIndex.map {
case ((termIndices, termWeights), topic) =>
(topic, termIndices, termWeights)
(topic, termIndices.toSeq, termWeights.toSeq)
}
sqlContext.createDataFrame(topics).toDF("topic", "termIndices", "termWeights")
}
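
For context on the `.toSeq` change (my reading, not stated in the diff): when collected rows are read back, Spark SQL returns ArrayType columns as Seq (a WrappedArray), not Array, so consumers should use getAs[Seq[...]]. A small sketch, assuming `model` is a fitted LDAModel:

```scala
val topics = model.describeTopics(3)
topics.select("topic", "termIndices", "termWeights").collect().foreach { r =>
  val topic       = r.getInt(0)
  val termIndices = r.getAs[Seq[Int]](1)      // getAs[Array[Int]] would fail at runtime
  val termWeights = r.getAs[Seq[Double]](2)
  println(s"topic $topic: " + termIndices.zip(termWeights).mkString(", "))
}
```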
@@ -463,8 +463,8 @@ class LDA @Since("1.6.0") (
@Since("1.6.0")
def this() = this(Identifiable.randomUID("lda"))

setDefault(k -> 10, docConcentration -> Array(-1.0), topicConcentration -> -1.0,
optimizer -> new OnlineLDAOptimizer)
setDefault(maxIter -> 20, k -> 10, docConcentration -> Array(-1.0), topicConcentration -> -1.0,
optimizer -> new OnlineLDAOptimizer, checkpointInterval -> 10)
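
The visible effect of the widened setDefault (a small sketch, not from the commit) is that maxIter and checkpointInterval now report concrete defaults on a fresh estimator, which is what the updated assertions in LDASuite check:

```scala
val lda = new org.apache.spark.ml.clustering.LDA()
assert(lda.getMaxIter == 20)              // default added here
assert(lda.getCheckpointInterval == 10)   // default added here
assert(lda.getK == 10)
```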

/**
* The features for LDA should be a [[Vector]] representing the word counts in a document.
39 changes: 18 additions & 21 deletions mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala
@@ -28,14 +28,14 @@ object LDASuite {
def generateLDAData(
sql: SQLContext,
rows: Int,
dim: Int,
k: Int,
vocabSize: Int): DataFrame = {
val avgWC = 1 // average instances of each word in a doc
val sc = sql.sparkContext
val rng = new java.util.Random()
rng.setSeed(1)
val rdd = sc.parallelize(1 to rows).map { i =>
Vectors.dense(Array.fill(dim)(rng.nextInt(vocabSize).toDouble))
Vectors.dense(Array.fill(vocabSize)(rng.nextInt(2 * avgWC).toDouble))
}.map(v => new TestRow(v))
sql.createDataFrame(rdd)
}
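
(Aside, not part of the diff: with avgWC = 1, `rng.nextInt(2 * avgWC)` is `rng.nextInt(2)`, so each of the vocabSize entries is 0 or 1; every generated row is a dense document vector of integer word counts, matching the count-vector input format discussed above. The old `dim` parameter disappears because the vector length is now simply vocabSize.)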
@@ -44,16 +44,13 @@

class LDASuite extends SparkFunSuite with MLlibTestSparkContext {

val k = 5
val k: Int = 5
val vocabSize: Int = 30
@transient var dataset: DataFrame = _
@transient var vocabSize: Int = _

override def beforeAll(): Unit = {
super.beforeAll()

dataset = LDASuite.generateLDAData(sqlContext, 50, 3, k, 30)
vocabSize = dataset.flatMap(_.getAs[Vector](0).toArray.map(_.toInt).toSet)
.distinct().count().toInt
dataset = LDASuite.generateLDAData(sqlContext, 50, k, vocabSize)
}

test("default parameters") {
@@ -62,7 +59,7 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
assert(lda.getFeaturesCol === "features")
assert(lda.getMaxIter === 20)
assert(lda.isDefined(lda.seed))
assert(!lda.isDefined(lda.checkpointInterval))
assert(lda.getCheckpointInterval === 10)
assert(lda.getK === 10)
assert(lda.getDocConcentration === Array(-1.0))
assert(lda.getTopicConcentration === -1.0)
@@ -129,9 +126,9 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {

// validateParams()
lda.setDocConcentration(-1)
assert(lda.getDocConcentration === -1)
assert(lda.getDocConcentration === Array(-1.0))
lda.validateParams()
lda.setDocConcentration(0.1)
lda.setDocConcentration(1.1)
lda.validateParams()
lda.setDocConcentration(Range(0, lda.getK).map(_ + 2.0).toArray)
lda.validateParams()
@@ -178,7 +175,7 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
}
transformed.select(lda.getTopicDistributionCol).collect().foreach { r =>
val topicDistribution = r.getAs[Vector](0)
assert(topicDistribution.size === vocabSize)
assert(topicDistribution.size === k)
assert(topicDistribution.toArray.forall(w => w >= 0.0 && w <= 1.0))
}
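
A note on the corrected assertion (not from the diff): the appended topicDistributionCol holds each document's mixture over the k topics, so its length is k, not vocabSize. A minimal check, reusing `lda` and `transformed` from this test and assuming the mixture is normalized:

```scala
import org.apache.spark.mllib.linalg.Vector

val dist = transformed.select(lda.getTopicDistributionCol).head.getAs[Vector](0)
assert(dist.size == lda.getK)
// Entries are proportions over topics and should sum to roughly 1.
assert(math.abs(dist.toArray.sum - 1.0) < 1e-6)
```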

@@ -192,14 +189,14 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
val topics = model.describeTopics(3)
assert(topics.count() === k)
assert(topics.select("topic").map(_.getInt(0)).collect().toSet === Range(0, k).toSet)
assert(topics.select("termIndices").collect().forall { case r: Row =>
val termIndices = r.getAs[Array[Int]](0)
termIndices.length === 3 && termIndices.toSet.size === 3
})
assert(topics.select("termWeights").collect().forall { case r: Row =>
val termWeights = r.getAs[Array[Double]](0)
termWeights.length === 3 && termWeights.forall(w => w >= 0.0 && w <= 1.0)
})
topics.select("termIndices").collect().foreach { case r: Row =>
val termIndices = r.getAs[Seq[Int]](0)
assert(termIndices.length === 3 && termIndices.toSet.size === 3)
}
topics.select("termWeights").collect().foreach { case r: Row =>
val termWeights = r.getAs[Seq[Double]](0)
assert(termWeights.length === 3 && termWeights.forall(w => w >= 0.0 && w <= 1.0))
}
}

test("fit & transform with EM LDA") {
@@ -223,6 +220,6 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext {
val ll = model.trainingLogLikelihood
assert(ll <= 0.0 && ll != Double.NegativeInfinity)
val lp = model.logPrior
assert(lp >= 0.0 && lp != Double.PositiveInfinity)
assert(lp <= 0.0 && lp != Double.NegativeInfinity)
}
}
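
(A note on the flipped assertions in the last hunk, not from the diff: trainingLogLikelihood is the log of a probability of the observed word counts, and log p ≤ 0 whenever p ≤ 1, so it should be finite and non-positive; the corrected test applies the same non-positive, finite convention to logPrior, which is likewise expected to be negative under the default priors used here.)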