30 commits
e4492a6
add pic framework (model, class etc)
wangmiao1981 Jun 13, 2016
7086249
change a comment
wangmiao1981 Jun 13, 2016
b73d8a7
add missing functions fit predict load save etc.
wangmiao1981 Jun 17, 2016
022fe52
add unit test file
wangmiao1981 Jun 18, 2016
552cf54
add test cases part 1
wangmiao1981 Jun 20, 2016
0b4954d
add unit test part 2: test fit, parameters etc.
wangmiao1981 Jun 20, 2016
f22b01e
fix a type issue
wangmiao1981 Jun 20, 2016
305b194
add more unit tests
wangmiao1981 Jun 21, 2016
4b32cbf
delete unused import and add comments
wangmiao1981 Jun 21, 2016
f6eda88
change version to 2.1.0
wangmiao1981 Oct 25, 2016
45c4b1c
change PIC to a Transformer
wangmiao1981 Nov 3, 2016
e8d7ed3
add LabelCol
wangmiao1981 Nov 4, 2016
e4e1e05
change col implementation
wangmiao1981 Nov 4, 2016
8384422
address some of the comments
wangmiao1981 Feb 17, 2017
d6a199c
add additional test with dataset having more data
wangmiao1981 Feb 21, 2017
b0c3aff
change input data format
wangmiao1981 Mar 14, 2017
091225d
resolve warnings
wangmiao1981 Mar 15, 2017
8bb9956
add neighbor and weight cols
wangmiao1981 Mar 16, 2017
8ba82e8
address review comments 1
wangmiao1981 Aug 15, 2017
468a947
fix style
wangmiao1981 Aug 15, 2017
ec10f24
remove unused comments
wangmiao1981 Aug 15, 2017
5710cfc
add Since
wangmiao1981 Aug 15, 2017
88654b3
fix missing >
wangmiao1981 Aug 17, 2017
804adc6
fix doc
wangmiao1981 Aug 17, 2017
4a6dd79
address review comments
wangmiao1981 Oct 25, 2017
5cb8ed6
fix unit test
wangmiao1981 Oct 30, 2017
6abf602
cleanups to docs
jkbradley Apr 3, 2018
d927087
typo
jkbradley Apr 3, 2018
d215748
final updates for PIC PR
jkbradley Apr 17, 2018
375e150
fixed scala style
jkbradley Apr 19, 2018
add more unit tests
wangmiao1981 authored and jkbradley committed Apr 16, 2018
commit 305b194dae40eaff990c18837c3f2bc8d469e60c
mllib/src/main/scala/org/apache/spark/ml/clustering/PowerIterationClustering.scala

@@ -17,6 +17,8 @@
 
 package org.apache.spark.ml.clustering
 
+import org.apache.hadoop.fs.Path
+
 import org.apache.spark.SparkException
 import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.ml.{Estimator, Model}
@@ -30,9 +32,9 @@ import org.apache.spark.mllib.clustering.PowerIterationClustering.Assignment
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{DataFrame, Dataset, Row}
 import org.apache.spark.sql.functions.{col, udf}
-import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
+import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType}
 
-/*
+/**
  * Common params for PowerIterationClustering and PowerIterationClusteringModel
  */
 private[clustering] trait PowerIterationClusteringParams extends Params with HasMaxIter
@@ -78,7 +80,6 @@ private[clustering] trait PowerIterationClusteringParams extends Params with Has
   }
 }
 
-
 @Since("2.0.0")
 @Experimental
 class PowerIterationClusteringModel private[ml] (
@@ -126,11 +127,11 @@ class PowerIterationClusteringModel private[ml] (
       .saveInitMode($(initMode))
       .saveMaxIter($(maxIter))
     val rows: RDD[Row] = model.assignments.map {
-      case assignment: Assignment => Row(assignment.cluster)
+      case assignment: Assignment => Row(assignment.id, assignment.cluster)
     }
-    val schema = new StructType(Array(StructField("cluster", IntegerType)))
-    val predict = sparkSession.createDataFrame(rows, schema)
-    features.withColumn($(predictionCol), predict.col("cluster"))
+    val schema = new StructType(Array(StructField($(featuresCol), LongType),
+      StructField($(predictionCol), IntegerType)))
+    sparkSession.createDataFrame(rows, schema)
   }
 
   @Since("2.0.0")
@@ -178,7 +179,8 @@ object PowerIterationClusteringModel extends MLReadable[PowerIterationClustering
     override protected def saveImpl(path: String): Unit = {
       // Save metadata and Params
       DefaultParamsWriter.saveMetadata(instance, path, sc)
-      MLlibPowerIterationClusteringModel.SaveLoadV1_0.save(sc, instance.parentModel, path)
+      val dataPath = new Path(path, "data").toString
+      instance.parentModel.save(sc, dataPath)
     }
   }
 
@@ -189,10 +191,9 @@ object PowerIterationClusteringModel extends MLReadable[PowerIterationClustering
     private val className = classOf[PowerIterationClusteringModel].getName
 
     override def load(path: String): PowerIterationClusteringModel = {
-
       val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
-      val parentModel = MLlibPowerIterationClusteringModel.SaveLoadV1_0.load(sc, path)
-
+      val dataPath = new Path(path, "data").toString
+      val parentModel = MLlibPowerIterationClusteringModel.load(sc, dataPath)
       val model = new PowerIterationClusteringModel(metadata.uid, parentModel)
       DefaultParamsReader.getAndSetParams(model, metadata)
       model
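The transform hunk above changes the method's contract: instead of appending a prediction column to the input dataset via withColumn, the model now materializes a fresh DataFrame directly from model.assignments, with the vertex id stored under $(featuresCol) as LongType and the cluster under $(predictionCol) as IntegerType. A minimal driver sketch of this intermediate API follows; it is hypothetical, not code from the PR: the object name and the tiny edges graph are illustrative, and it assumes a Spark 2.x build with this commit applied plus the Vectors.dense(i, j, sim) input encoding used by the test suite below.

import org.apache.spark.ml.clustering.PowerIterationClustering
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

object PICTransformSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("pic-transform-sketch")
      .getOrCreate()
    import spark.implicits._

    // A tiny affinity graph: each row packs (src id, dst id, similarity)
    // into one dense vector, mirroring the suite's input format.
    val edges = Seq(
      Vectors.dense(1.0, 0.0, 0.9),
      Vectors.dense(2.0, 0.0, 0.9),
      Vectors.dense(2.0, 1.0, 0.9),
      Vectors.dense(4.0, 3.0, 0.9),
      Vectors.dense(5.0, 4.0, 0.9)
    ).map(Tuple1.apply).toDF("features")

    val model = new PowerIterationClustering()
      .setK(2)
      .setMaxIter(20)
      .fit(edges)

    // After this commit, transform returns a new DataFrame with schema
    // (featuresCol: Long, predictionCol: Int) built from model.assignments,
    // rather than the input with an extra column.
    model.transform(edges).show()

    spark.stop()
  }
}

Callers that relied on the old behavior, where the input columns survived transform, would now have to join the returned (id, cluster) frame back onto their data themselves.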
mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala

@@ -60,7 +60,7 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringMode
     PowerIterationClusteringModel.SaveLoadV1_0.load(sc, path)
   }
 
-  private[spark]
+  private[clustering]
   object SaveLoadV1_0 {
 
     private val thisFormatVersion = "1.0"
mllib/src/test/scala/org/apache/spark/ml/clustering/PowerIterationClusteringSuite.scala

@@ -23,24 +23,21 @@ import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.linalg.Vectors
 import org.apache.spark.ml.util.DefaultReadWriteTest
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
 
 class PowerIterationClusteringSuite extends SparkFunSuite
   with MLlibTestSparkContext with DefaultReadWriteTest {
 
   import org.apache.spark.ml.clustering.PowerIterationClustering._
+  @transient var data: Dataset[_] = _
+  final val r1 = 1.0
+  final val n1 = 10
+  final val r2 = 4.0
+  final val n2 = 40
 
-  /** Generates a circle of points. */
-  private def genCircle(r: Double, n: Int): Array[(Double, Double)] = {
-    Array.tabulate(n) { i =>
-      val theta = 2.0 * math.Pi * i / n
-      (r * math.cos(theta), r * math.sin(theta))
-    }
-  }
+  override def beforeAll(): Unit = {
+    super.beforeAll()
 
-  /** Computes Gaussian similarity. */
-  private def sim(x: (Double, Double), y: (Double, Double)): Double = {
-    val dist2 = (x._1 - y._1) * (x._1 - y._1) + (x._2 - y._2) * (x._2 - y._2)
-    math.exp(-dist2 / 2.0)
+    data = PowerIterationClusteringSuite.generatePICData(spark, r1, r2, n1, n2)
   }
 
   test("default parameters") {
@@ -78,23 +75,7 @@ class PowerIterationClusteringSuite extends SparkFunSuite
   }
 
   test("power iteration clustering") {
-    // Generate two circles following the example in the PIC paper.
-    val r1 = 1.0
-    val n1 = 10
-    val r2 = 4.0
-    val n2 = 40
-    val n = n1 + n2
-    val points = genCircle(r1, n1) ++ genCircle(r2, n2)
-    val similarities = for (i <- 1 until n; j <- 0 until i) yield {
-      (i.toLong, j.toLong, sim(points(i), points(j)))
-    }
-
-    val sc = spark.sparkContext
-    val rdd = sc.parallelize(similarities)
-      .map{case (i: Long, j: Long, sim: Double) => Vectors.dense(Array(i, j, sim))}
-      .map(v => TestRow(v))
-    val data = spark.createDataFrame(rdd)
 
     val model = new PowerIterationClustering()
       .setK(2)
       .setMaxIter(40)
@@ -116,4 +97,80 @@
     }
     assert(predictions2.toSet == Set((0 until n1).toSet, (n1 until n).toSet))
   }

test("transform") {
val predictionColName = "pic_prediction"
val model = new PowerIterationClustering()
.setK(2)
.setMaxIter(10)
.setPredictionCol(predictionColName)
.fit(data)

Review comment (Contributor): remove blank line or add blank line after line 139 for consistency?
+    val transformed = model.transform(data)
+    val expectedColumns = Array("features", predictionColName)
+    expectedColumns.foreach { column =>
+      assert(transformed.columns.contains(column))
+    }
+  }
+
+  test("read/write") {
+    def checkModelData(model: PowerIterationClusteringModel,
+        model2: PowerIterationClusteringModel): Unit = {
+      assert(model.getK === model2.getK)
+      val modelAssignments =
+        model.assignments.map(x => (x.id, x.cluster))
+      val model2Assignments =
+        model2.assignments.map(x => (x.id, x.cluster))
+      val unequalElements = modelAssignments.join(model2Assignments).filter {
+        case (id, (c1, c2)) => c1 != c2 }.count()
+      assert(unequalElements === 0L)
+    }
+    val pic = new PowerIterationClustering()
+    testEstimatorAndModelReadWrite(pic, data, PowerIterationClusteringSuite.allParamSettings,
+      checkModelData)
+  }
+}
+
+object PowerIterationClusteringSuite {
+
+  /** Generates a circle of points. */
+  private def genCircle(r: Double, n: Int): Array[(Double, Double)] = {
+    Array.tabulate(n) { i =>
+      val theta = 2.0 * math.Pi * i / n
+      (r * math.cos(theta), r * math.sin(theta))
+    }
+  }
+
+  /** Computes Gaussian similarity. */
+  private def sim(x: (Double, Double), y: (Double, Double)): Double = {
+    val dist2 = (x._1 - y._1) * (x._1 - y._1) + (x._2 - y._2) * (x._2 - y._2)
+    math.exp(-dist2 / 2.0)
+  }
+
+  def generatePICData(spark: SparkSession, r1: Double, r2: Double,
+      n1: Int, n2: Int): DataFrame = {
+    // Generate two circles following the example in the PIC paper.
+    val n = n1 + n2
+    val points = genCircle(r1, n1) ++ genCircle(r2, n2)
+    val similarities = for (i <- 1 until n; j <- 0 until i) yield {
+      (i.toLong, j.toLong, sim(points(i), points(j)))
+    }
+    val sc = spark.sparkContext
+    val rdd = sc.parallelize(similarities)
+      .map{case (i: Long, j: Long, sim: Double) => Vectors.dense(Array(i, j, sim))}
+      .map(v => TestRow(v))
+    spark.createDataFrame(rdd)
+  }
+
+  /**
+   * Mapping from all Params to valid settings which differ from the defaults.
+   * This is useful for tests which need to exercise all Params, such as save/load.
+   * This excludes input columns to simplify some tests.
+   */
+  val allParamSettings: Map[String, Any] = Map(
+    "predictionCol" -> "myPrediction",
+    "k" -> 2,
+    "maxIter" -> 10,
+    "initMode" -> "random"
+  )
+}
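As context for the fixture above: generatePICData places n1 points on a circle of radius r1 and n2 points on a circle of radius r2, then weights every pair with the Gaussian kernel exp(-dist^2 / 2). Adjacent points on the same circle end up with affinities above 0.8, while the closest cross-circle pair (distance 3.0 with the suite's radii of 1.0 and 4.0) gets about 0.011; that sharp falloff is what lets PIC recover the two circles as the two clusters. Below is a standalone sanity check of the kernel; the GaussianSimCheck object is a hypothetical helper for illustration, not part of the PR.

object GaussianSimCheck extends App {
  // Same kernel as the suite's sim(): exp(-squaredDistance / 2)
  def sim(x: (Double, Double), y: (Double, Double)): Double = {
    val dist2 = (x._1 - y._1) * (x._1 - y._1) + (x._2 - y._2) * (x._2 - y._2)
    math.exp(-dist2 / 2.0)
  }

  // Adjacent points on the inner circle (r = 1, n = 10) are ~0.62 apart.
  val theta = 2.0 * math.Pi / 10
  val a = (1.0, 0.0)
  val b = (math.cos(theta), math.sin(theta))
  println(f"same circle:  ${sim(a, b)}%.3f")               // ~0.826, strong affinity

  // The closest cross-circle pair is 3.0 apart (r = 1 vs r = 4).
  println(f"cross circle: ${sim((1.0, 0.0), (4.0, 0.0))}%.3f") // ~0.011, weak affinity
}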