From 31692676e2a6df0651ff35992aa8ab3688c113b8 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Wed, 19 Oct 2016 06:46:20 -0700
Subject: [PATCH 1/3] Refactor clustering summary.

---
 .../spark/ml/clustering/BisectingKMeans.scala | 34 ++++++-------------
 .../spark/ml/clustering/GaussianMixture.scala | 33 ++++++------------
 .../apache/spark/ml/clustering/KMeans.scala   | 32 +++++++++++++----
 3 files changed, 46 insertions(+), 53 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
index add8ee2a4ff8..e1a4606958f6 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
@@ -297,27 +297,13 @@ object BisectingKMeans extends DefaultParamsReadable[BisectingKMeans] {
 @Since("2.1.0")
 @Experimental
 class BisectingKMeansSummary private[clustering] (
-    @Since("2.1.0") @transient val predictions: DataFrame,
-    @Since("2.1.0") val predictionCol: String,
-    @Since("2.1.0") val featuresCol: String,
-    @Since("2.1.0") val k: Int) extends Serializable {
-
-  /**
-   * Cluster centers of the transformed data.
-   */
-  @Since("2.1.0")
-  @transient lazy val cluster: DataFrame = predictions.select(predictionCol)
-
-  /**
-   * Size of (number of data points in) each cluster.
-   */
-  @Since("2.1.0")
-  lazy val clusterSizes: Array[Long] = {
-    val sizes = Array.fill[Long](k)(0)
-    cluster.groupBy(predictionCol).count().select(predictionCol, "count").collect().foreach {
-      case Row(cluster: Int, count: Long) => sizes(cluster) = count
-    }
-    sizes
-  }
-
-}
+    predictions: DataFrame,
+    predictionCol: String,
+    featuresCol: String,
+    k: Int)
+  extends ClusteringSummary (
+    predictions,
+    predictionCol,
+    featuresCol,
+    k
+  )
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index 69f060ad7711..d5862c07cf64 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -365,33 +365,20 @@ object GaussianMixture extends DefaultParamsReadable[GaussianMixture] {
 @Since("2.0.0")
 @Experimental
 class GaussianMixtureSummary private[clustering] (
-    @Since("2.0.0") @transient val predictions: DataFrame,
-    @Since("2.0.0") val predictionCol: String,
-    @Since("2.0.0") val probabilityCol: String,
-    @Since("2.0.0") val featuresCol: String,
-    @Since("2.0.0") val k: Int) extends Serializable {
-
-  /**
-   * Cluster centers of the transformed data.
-   */
-  @Since("2.0.0")
-  @transient lazy val cluster: DataFrame = predictions.select(predictionCol)
+    predictions: DataFrame,
+    predictionCol: String,
+    val probabilityCol: String,
+    featuresCol: String,
+    k: Int)
+  extends ClusteringSummary (
+    predictions,
+    predictionCol,
+    featuresCol,
+    k) {
 
   /**
    * Probability of each cluster.
    */
   @Since("2.0.0")
   @transient lazy val probability: DataFrame = predictions.select(probabilityCol)
-
-  /**
-   * Size of (number of data points in) each cluster.
-   */
-  @Since("2.0.0")
-  lazy val clusterSizes: Array[Long] = {
-    val sizes = Array.fill[Long](k)(0)
-    cluster.groupBy(predictionCol).count().select(predictionCol, "count").collect().foreach {
-      case Row(cluster: Int, count: Long) => sizes(cluster) = count
-    }
-    sizes
-  }
 }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
index b04e82838e71..c345f28c394e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -354,21 +354,41 @@ object KMeans extends DefaultParamsReadable[KMeans] {
 @Since("2.0.0")
 @Experimental
 class KMeansSummary private[clustering] (
-    @Since("2.0.0") @transient val predictions: DataFrame,
-    @Since("2.0.0") val predictionCol: String,
-    @Since("2.0.0") val featuresCol: String,
-    @Since("2.0.0") val k: Int) extends Serializable {
+    predictions: DataFrame,
+    predictionCol: String,
+    featuresCol: String,
+    k: Int)
+  extends ClusteringSummary (
+    predictions,
+    predictionCol,
+    featuresCol,
+    k
+  )
+
+/**
+ * :: Experimental ::
+ * Summary of clustering.
+ *
+ * @param predictions  [[DataFrame]] produced by model.transform()
+ * @param predictionCol  Name for column of predicted clusters in `predictions`
+ * @param featuresCol  Name for column of features in `predictions`
+ * @param k  Number of clusters
+ */
+@Experimental
+class ClusteringSummary private[clustering] (
+    @transient val predictions: DataFrame,
+    val predictionCol: String,
+    val featuresCol: String,
+    val k: Int) extends Serializable {
 
   /**
    * Cluster centers of the transformed data.
    */
-  @Since("2.0.0")
   @transient lazy val cluster: DataFrame = predictions.select(predictionCol)
 
   /**
    * Size of (number of data points in) each cluster.
    */
-  @Since("2.0.0")
   lazy val clusterSizes: Array[Long] = {
     val sizes = Array.fill[Long](k)(0)
     cluster.groupBy(predictionCol).count().select(predictionCol, "count").collect().foreach {

From f13f240019d38499cbf06b3cc6226b2589454f64 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Mon, 24 Oct 2016 21:45:26 -0700
Subject: [PATCH 2/3] Address comments.

---
 .../spark/ml/clustering/BisectingKMeans.scala | 16 ++++--------
 .../spark/ml/clustering/GaussianMixture.scala | 18 +++++--------
 .../apache/spark/ml/clustering/KMeans.scala   | 26 +++++++------------
 3 files changed, 22 insertions(+), 38 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
index e1a4606958f6..c25c83445ed7 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
@@ -289,10 +289,10 @@ object BisectingKMeans extends DefaultParamsReadable[BisectingKMeans] {
  * :: Experimental ::
  * Summary of BisectingKMeans.
  *
- * @param predictions  [[DataFrame]] produced by [[BisectingKMeansModel.transform()]]
- * @param predictionCol  Name for column of predicted clusters in `predictions`
- * @param featuresCol  Name for column of features in `predictions`
- * @param k  Number of clusters
+ * @param predictions  [[DataFrame]] produced by [[BisectingKMeansModel.transform()]].
+ * @param predictionCol  Name for column of predicted clusters in `predictions`.
+ * @param featuresCol  Name for column of features in `predictions`.
+ * @param k  Number of clusters.
  */
 @Since("2.1.0")
 @Experimental
@@ -300,10 +300,4 @@ class BisectingKMeansSummary private[clustering] (
     predictions: DataFrame,
     predictionCol: String,
     featuresCol: String,
-    k: Int)
-  extends ClusteringSummary (
-    predictions,
-    predictionCol,
-    featuresCol,
-    k
-  )
+    k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index d5862c07cf64..31cda5b9b6e7 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -356,11 +356,12 @@ object GaussianMixture extends DefaultParamsReadable[GaussianMixture] {
  * :: Experimental ::
  * Summary of GaussianMixture.
  *
- * @param predictions  [[DataFrame]] produced by [[GaussianMixtureModel.transform()]]
- * @param predictionCol  Name for column of predicted clusters in `predictions`
- * @param probabilityCol  Name for column of predicted probability of each cluster in `predictions`
- * @param featuresCol  Name for column of features in `predictions`
- * @param k  Number of clusters
+ * @param predictions  [[DataFrame]] produced by [[GaussianMixtureModel.transform()]].
+ * @param predictionCol  Name for column of predicted clusters in `predictions`.
+ * @param probabilityCol  Name for column of predicted probability of each cluster
+ *                        in `predictions`.
+ * @param featuresCol  Name for column of features in `predictions`.
+ * @param k  Number of clusters.
  */
 @Since("2.0.0")
 @Experimental
@@ -369,12 +370,7 @@ class GaussianMixtureSummary private[clustering] (
     predictionCol: String,
     val probabilityCol: String,
     featuresCol: String,
-    k: Int)
-  extends ClusteringSummary (
-    predictions,
-    predictionCol,
-    featuresCol,
-    k) {
+    k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k) {
 
   /**
    * Probability of each cluster.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
index c345f28c394e..52970c187e69 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -346,10 +346,10 @@ object KMeans extends DefaultParamsReadable[KMeans] {
  * :: Experimental ::
  * Summary of KMeans.
  *
- * @param predictions  [[DataFrame]] produced by [[KMeansModel.transform()]]
- * @param predictionCol  Name for column of predicted clusters in `predictions`
- * @param featuresCol  Name for column of features in `predictions`
- * @param k  Number of clusters
+ * @param predictions  [[DataFrame]] produced by [[KMeansModel.transform()]].
+ * @param predictionCol  Name for column of predicted clusters in `predictions`.
+ * @param featuresCol  Name for column of features in `predictions`.
+ * @param k  Number of clusters.
  */
 @Since("2.0.0")
 @Experimental
@@ -357,22 +357,16 @@ class KMeansSummary private[clustering] (
     predictions: DataFrame,
     predictionCol: String,
     featuresCol: String,
-    k: Int)
-  extends ClusteringSummary (
-    predictions,
-    predictionCol,
-    featuresCol,
-    k
-  )
+    k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k)
 
 /**
  * :: Experimental ::
- * Summary of clustering.
+ * Summary of clustering algorithms.
  *
- * @param predictions  [[DataFrame]] produced by model.transform()
- * @param predictionCol  Name for column of predicted clusters in `predictions`
- * @param featuresCol  Name for column of features in `predictions`
- * @param k  Number of clusters
+ * @param predictions  [[DataFrame]] produced by model.transform().
+ * @param predictionCol  Name for column of predicted clusters in `predictions`.
+ * @param featuresCol  Name for column of features in `predictions`.
+ * @param k  Number of clusters.
  */
 @Experimental
 class ClusteringSummary private[clustering] (

From 946ee7325185e512b5ea359a443712cfdcaf2cb0 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Wed, 26 Oct 2016 02:51:15 -0700
Subject: [PATCH 3/3] Move ClusteringSummary to a separate file.

---
 .../ml/clustering/ClusteringSummary.scala     | 54 +++++++++++++++++++
 .../spark/ml/clustering/GaussianMixture.scala |  2 +-
 .../apache/spark/ml/clustering/KMeans.scala   | 34 ------------
 3 files changed, 55 insertions(+), 35 deletions(-)
 create mode 100644 mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala

diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala
new file mode 100644
index 000000000000..8b5f525194f2
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.clustering
+
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.sql.{DataFrame, Row}
+
+/**
+ * :: Experimental ::
+ * Summary of clustering algorithms.
+ *
+ * @param predictions  [[DataFrame]] produced by model.transform().
+ * @param predictionCol  Name for column of predicted clusters in `predictions`.
+ * @param featuresCol  Name for column of features in `predictions`.
+ * @param k  Number of clusters.
+ */
+@Experimental
+class ClusteringSummary private[clustering] (
+    @transient val predictions: DataFrame,
+    val predictionCol: String,
+    val featuresCol: String,
+    val k: Int) extends Serializable {
+
+  /**
+   * Predicted cluster for each row of `predictions` (i.e. its `predictionCol` column).
+   */
+  @transient lazy val cluster: DataFrame = predictions.select(predictionCol)
+
+  /**
+   * Size of (number of data points in) each cluster.
+   */
+  lazy val clusterSizes: Array[Long] = {
+    val sizes = Array.fill[Long](k)(0)
+    cluster.groupBy(predictionCol).count().select(predictionCol, "count").collect().foreach {
+      case Row(cluster: Int, count: Long) => sizes(cluster) = count
+    }
+    sizes
+  }
+}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index 31cda5b9b6e7..e3cb92f4f144 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -368,7 +368,7 @@ object GaussianMixture extends DefaultParamsReadable[GaussianMixture] {
 class GaussianMixtureSummary private[clustering] (
     predictions: DataFrame,
     predictionCol: String,
-    val probabilityCol: String,
+    @Since("2.0.0") val probabilityCol: String,
     featuresCol: String,
     k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k) {
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
index 52970c187e69..fee322d0c6a7 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -358,37 +358,3 @@ class KMeansSummary private[clustering] (
     predictionCol: String,
     featuresCol: String,
     k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k)
-
-/**
- * :: Experimental ::
- * Summary of clustering algorithms.
- *
- * @param predictions  [[DataFrame]] produced by model.transform().
- * @param predictionCol  Name for column of predicted clusters in `predictions`.
- * @param featuresCol  Name for column of features in `predictions`.
- * @param k  Number of clusters.
- */
-@Experimental
-class ClusteringSummary private[clustering] (
-    @transient val predictions: DataFrame,
-    val predictionCol: String,
-    val featuresCol: String,
-    val k: Int) extends Serializable {
-
-  /**
-   * Cluster centers of the transformed data.
-   */
-  @transient lazy val cluster: DataFrame = predictions.select(predictionCol)
-
-  /**
-   * Size of (number of data points in) each cluster.
-   */
-  lazy val clusterSizes: Array[Long] = {
-    val sizes = Array.fill[Long](k)(0)
-    cluster.groupBy(predictionCol).count().select(predictionCol, "count").collect().foreach {
-      case Row(cluster: Int, count: Long) => sizes(cluster) = count
-    }
-    sizes
-  }
-
-}