From 31692676e2a6df0651ff35992aa8ab3688c113b8 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Wed, 19 Oct 2016 06:46:20 -0700 Subject: [PATCH 1/3] Refactor clustering summary. --- .../spark/ml/clustering/BisectingKMeans.scala | 34 ++++++------------- .../spark/ml/clustering/GaussianMixture.scala | 33 ++++++------------ .../apache/spark/ml/clustering/KMeans.scala | 32 +++++++++++++---- 3 files changed, 46 insertions(+), 53 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala index add8ee2a4ff8..e1a4606958f6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala @@ -297,27 +297,13 @@ object BisectingKMeans extends DefaultParamsReadable[BisectingKMeans] { @Since("2.1.0") @Experimental class BisectingKMeansSummary private[clustering] ( - @Since("2.1.0") @transient val predictions: DataFrame, - @Since("2.1.0") val predictionCol: String, - @Since("2.1.0") val featuresCol: String, - @Since("2.1.0") val k: Int) extends Serializable { - - /** - * Cluster centers of the transformed data. - */ - @Since("2.1.0") - @transient lazy val cluster: DataFrame = predictions.select(predictionCol) - - /** - * Size of (number of data points in) each cluster. - */ - @Since("2.1.0") - lazy val clusterSizes: Array[Long] = { - val sizes = Array.fill[Long](k)(0) - cluster.groupBy(predictionCol).count().select(predictionCol, "count").collect().foreach { - case Row(cluster: Int, count: Long) => sizes(cluster) = count - } - sizes - } - -} + predictions: DataFrame, + predictionCol: String, + featuresCol: String, + k: Int) + extends ClusteringSummary ( + predictions, + predictionCol, + featuresCol, + k + ) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala index 69f060ad7711..d5862c07cf64 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala @@ -365,33 +365,20 @@ object GaussianMixture extends DefaultParamsReadable[GaussianMixture] { @Since("2.0.0") @Experimental class GaussianMixtureSummary private[clustering] ( - @Since("2.0.0") @transient val predictions: DataFrame, - @Since("2.0.0") val predictionCol: String, - @Since("2.0.0") val probabilityCol: String, - @Since("2.0.0") val featuresCol: String, - @Since("2.0.0") val k: Int) extends Serializable { - - /** - * Cluster centers of the transformed data. - */ - @Since("2.0.0") - @transient lazy val cluster: DataFrame = predictions.select(predictionCol) + predictions: DataFrame, + predictionCol: String, + val probabilityCol: String, + featuresCol: String, + k: Int) + extends ClusteringSummary ( + predictions, + predictionCol, + featuresCol, + k) { /** * Probability of each cluster. */ @Since("2.0.0") @transient lazy val probability: DataFrame = predictions.select(probabilityCol) - - /** - * Size of (number of data points in) each cluster. - */ - @Since("2.0.0") - lazy val clusterSizes: Array[Long] = { - val sizes = Array.fill[Long](k)(0) - cluster.groupBy(predictionCol).count().select(predictionCol, "count").collect().foreach { - case Row(cluster: Int, count: Long) => sizes(cluster) = count - } - sizes - } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala index b04e82838e71..c345f28c394e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala @@ -354,21 +354,41 @@ object KMeans extends DefaultParamsReadable[KMeans] { @Since("2.0.0") @Experimental class KMeansSummary private[clustering] ( - @Since("2.0.0") @transient val predictions: DataFrame, - @Since("2.0.0") val predictionCol: String, - @Since("2.0.0") val featuresCol: String, - @Since("2.0.0") val k: Int) extends Serializable { + predictions: DataFrame, + predictionCol: String, + featuresCol: String, + k: Int) + extends ClusteringSummary ( + predictions, + predictionCol, + featuresCol, + k + ) + +/** + * :: Experimental :: + * Summary of clustering. + * + * @param predictions [[DataFrame]] produced by model.transform() + * @param predictionCol Name for column of predicted clusters in `predictions` + * @param featuresCol Name for column of features in `predictions` + * @param k Number of clusters + */ +@Experimental +class ClusteringSummary private[clustering] ( + @transient val predictions: DataFrame, + val predictionCol: String, + val featuresCol: String, + val k: Int) extends Serializable { /** * Cluster centers of the transformed data. */ - @Since("2.0.0") @transient lazy val cluster: DataFrame = predictions.select(predictionCol) /** * Size of (number of data points in) each cluster. */ - @Since("2.0.0") lazy val clusterSizes: Array[Long] = { val sizes = Array.fill[Long](k)(0) cluster.groupBy(predictionCol).count().select(predictionCol, "count").collect().foreach { From f13f240019d38499cbf06b3cc6226b2589454f64 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Mon, 24 Oct 2016 21:45:26 -0700 Subject: [PATCH 2/3] Address comments. --- .../spark/ml/clustering/BisectingKMeans.scala | 16 ++++-------- .../spark/ml/clustering/GaussianMixture.scala | 18 +++++-------- .../apache/spark/ml/clustering/KMeans.scala | 26 +++++++------------ 3 files changed, 22 insertions(+), 38 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala index e1a4606958f6..c25c83445ed7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala @@ -289,10 +289,10 @@ object BisectingKMeans extends DefaultParamsReadable[BisectingKMeans] { * :: Experimental :: * Summary of BisectingKMeans. * - * @param predictions [[DataFrame]] produced by [[BisectingKMeansModel.transform()]] - * @param predictionCol Name for column of predicted clusters in `predictions` - * @param featuresCol Name for column of features in `predictions` - * @param k Number of clusters + * @param predictions [[DataFrame]] produced by [[BisectingKMeansModel.transform()]]. + * @param predictionCol Name for column of predicted clusters in `predictions`. + * @param featuresCol Name for column of features in `predictions`. + * @param k Number of clusters. */ @Since("2.1.0") @Experimental @@ -300,10 +300,4 @@ class BisectingKMeansSummary private[clustering] ( predictions: DataFrame, predictionCol: String, featuresCol: String, - k: Int) - extends ClusteringSummary ( - predictions, - predictionCol, - featuresCol, - k - ) + k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala index d5862c07cf64..31cda5b9b6e7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala @@ -356,11 +356,12 @@ object GaussianMixture extends DefaultParamsReadable[GaussianMixture] { * :: Experimental :: * Summary of GaussianMixture. * - * @param predictions [[DataFrame]] produced by [[GaussianMixtureModel.transform()]] - * @param predictionCol Name for column of predicted clusters in `predictions` - * @param probabilityCol Name for column of predicted probability of each cluster in `predictions` - * @param featuresCol Name for column of features in `predictions` - * @param k Number of clusters + * @param predictions [[DataFrame]] produced by [[GaussianMixtureModel.transform()]]. + * @param predictionCol Name for column of predicted clusters in `predictions`. + * @param probabilityCol Name for column of predicted probability of each cluster + * in `predictions`. + * @param featuresCol Name for column of features in `predictions`. + * @param k Number of clusters. */ @Since("2.0.0") @Experimental @@ -369,12 +370,7 @@ class GaussianMixtureSummary private[clustering] ( predictionCol: String, val probabilityCol: String, featuresCol: String, - k: Int) - extends ClusteringSummary ( - predictions, - predictionCol, - featuresCol, - k) { + k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k) { /** * Probability of each cluster. diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala index c345f28c394e..52970c187e69 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala @@ -346,10 +346,10 @@ object KMeans extends DefaultParamsReadable[KMeans] { * :: Experimental :: * Summary of KMeans. * - * @param predictions [[DataFrame]] produced by [[KMeansModel.transform()]] - * @param predictionCol Name for column of predicted clusters in `predictions` - * @param featuresCol Name for column of features in `predictions` - * @param k Number of clusters + * @param predictions [[DataFrame]] produced by [[KMeansModel.transform()]]. + * @param predictionCol Name for column of predicted clusters in `predictions`. + * @param featuresCol Name for column of features in `predictions`. + * @param k Number of clusters. */ @Since("2.0.0") @Experimental @@ -357,22 +357,16 @@ class KMeansSummary private[clustering] ( predictions: DataFrame, predictionCol: String, featuresCol: String, - k: Int) - extends ClusteringSummary ( - predictions, - predictionCol, - featuresCol, - k - ) + k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k) /** * :: Experimental :: - * Summary of clustering. + * Summary of clustering algorithms. * - * @param predictions [[DataFrame]] produced by model.transform() - * @param predictionCol Name for column of predicted clusters in `predictions` - * @param featuresCol Name for column of features in `predictions` - * @param k Number of clusters + * @param predictions [[DataFrame]] produced by model.transform(). + * @param predictionCol Name for column of predicted clusters in `predictions`. + * @param featuresCol Name for column of features in `predictions`. + * @param k Number of clusters. */ @Experimental class ClusteringSummary private[clustering] ( From 946ee7325185e512b5ea359a443712cfdcaf2cb0 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Wed, 26 Oct 2016 02:51:15 -0700 Subject: [PATCH 3/3] Move ClusteringSummary to a separate file. --- .../ml/clustering/ClusteringSummary.scala | 54 +++++++++++++++++++ .../spark/ml/clustering/GaussianMixture.scala | 2 +- .../apache/spark/ml/clustering/KMeans.scala | 34 ------------ 3 files changed, 55 insertions(+), 35 deletions(-) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala new file mode 100644 index 000000000000..8b5f525194f2 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.clustering + +import org.apache.spark.annotation.Experimental +import org.apache.spark.sql.{DataFrame, Row} + +/** + * :: Experimental :: + * Summary of clustering algorithms. + * + * @param predictions [[DataFrame]] produced by model.transform(). + * @param predictionCol Name for column of predicted clusters in `predictions`. + * @param featuresCol Name for column of features in `predictions`. + * @param k Number of clusters. + */ +@Experimental +class ClusteringSummary private[clustering] ( + @transient val predictions: DataFrame, + val predictionCol: String, + val featuresCol: String, + val k: Int) extends Serializable { + + /** + * Cluster centers of the transformed data. + */ + @transient lazy val cluster: DataFrame = predictions.select(predictionCol) + + /** + * Size of (number of data points in) each cluster. + */ + lazy val clusterSizes: Array[Long] = { + val sizes = Array.fill[Long](k)(0) + cluster.groupBy(predictionCol).count().select(predictionCol, "count").collect().foreach { + case Row(cluster: Int, count: Long) => sizes(cluster) = count + } + sizes + } +} diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala index 31cda5b9b6e7..e3cb92f4f144 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala @@ -368,7 +368,7 @@ object GaussianMixture extends DefaultParamsReadable[GaussianMixture] { class GaussianMixtureSummary private[clustering] ( predictions: DataFrame, predictionCol: String, - val probabilityCol: String, + @Since("2.0.0") val probabilityCol: String, featuresCol: String, k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k) { diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala index 52970c187e69..fee322d0c6a7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala @@ -358,37 +358,3 @@ class KMeansSummary private[clustering] ( predictionCol: String, featuresCol: String, k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k) - -/** - * :: Experimental :: - * Summary of clustering algorithms. - * - * @param predictions [[DataFrame]] produced by model.transform(). - * @param predictionCol Name for column of predicted clusters in `predictions`. - * @param featuresCol Name for column of features in `predictions`. - * @param k Number of clusters. - */ -@Experimental -class ClusteringSummary private[clustering] ( - @transient val predictions: DataFrame, - val predictionCol: String, - val featuresCol: String, - val k: Int) extends Serializable { - - /** - * Cluster centers of the transformed data. - */ - @transient lazy val cluster: DataFrame = predictions.select(predictionCol) - - /** - * Size of (number of data points in) each cluster. - */ - lazy val clusterSizes: Array[Long] = { - val sizes = Array.fill[Long](k)(0) - cluster.groupBy(predictionCol).count().select(predictionCol, "count").collect().foreach { - case Row(cluster: Int, count: Long) => sizes(cluster) = count - } - sizes - } - -}