
Commit 29ba957

[SPARK-17389][ML][MLLIB] KMeans speedup with better choice of k-means|| init steps = 2
## What changes were proposed in this pull request?

Reduce default k-means|| init steps to 2 from 5. See JIRA for discussion. See also #14948

## How was this patch tested?

Existing tests.

Author: Sean Owen <sowen@cloudera.com>

Closes #14956 from srowen/SPARK-17389.2.
1 parent 71b7d42 commit 29ba957

File tree: 2 files changed (+6, -10 lines)


mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala

Lines changed: 3 additions & 3 deletions
@@ -51,10 +51,10 @@ class KMeans private (
 
   /**
    * Constructs a KMeans instance with default parameters: {k: 2, maxIterations: 20, runs: 1,
-   * initializationMode: "k-means||", initializationSteps: 5, epsilon: 1e-4, seed: random}.
+   * initializationMode: "k-means||", initializationSteps: 2, epsilon: 1e-4, seed: random}.
    */
   @Since("0.8.0")
-  def this() = this(2, 20, 1, KMeans.K_MEANS_PARALLEL, 5, 1e-4, Utils.random.nextLong())
+  def this() = this(2, 20, 1, KMeans.K_MEANS_PARALLEL, 2, 1e-4, Utils.random.nextLong())
 
   /**
    * Number of clusters to create (k).
@@ -134,7 +134,7 @@ class KMeans private (
 
   /**
    * Set the number of steps for the k-means|| initialization mode. This is an advanced
-   * setting -- the default of 5 is almost always enough. Default: 5.
+   * setting -- the default of 2 is almost always enough. Default: 2.
    */
   @Since("0.8.0")
   def setInitializationSteps(initializationSteps: Int): this.type = {
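
The lower default only changes what happens when no value is set explicitly; callers can still request more initialization steps through the existing setter shown above. A minimal sketch against the RDD-based MLlib API, assuming a spark-shell style `SparkContext` named `sc` and illustrative toy data:

```scala
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

// Illustrative toy data: four 2-dimensional points forming two obvious clusters.
val data = sc.parallelize(Seq(
  Vectors.dense(0.0, 0.0), Vectors.dense(0.1, 0.1),
  Vectors.dense(9.0, 9.0), Vectors.dense(9.1, 9.1)))

// Uses the new default of 2 k-means|| initialization steps.
val defaultModel = new KMeans().setK(2).run(data)

// Explicitly restores the previous behavior of 5 initialization steps.
val fiveStepModel = new KMeans()
  .setK(2)
  .setInitializationSteps(5)
  .run(data)
```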

mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala

Lines changed: 3 additions & 7 deletions
@@ -49,7 +49,7 @@ class PowerIterationClusteringSuite extends SparkFunSuite with MLlibTestSparkCon
     val r1 = 1.0
     val n1 = 10
     val r2 = 4.0
-    val n2 = 40
+    val n2 = 10
     val n = n1 + n2
     val points = genCircle(r1, n1) ++ genCircle(r2, n2)
     val similarities = for (i <- 1 until n; j <- 0 until i) yield {
@@ -83,19 +83,15 @@ class PowerIterationClusteringSuite extends SparkFunSuite with MLlibTestSparkCon
     val r1 = 1.0
     val n1 = 10
     val r2 = 4.0
-    val n2 = 40
+    val n2 = 10
     val n = n1 + n2
     val points = genCircle(r1, n1) ++ genCircle(r2, n2)
     val similarities = for (i <- 1 until n; j <- 0 until i) yield {
       (i.toLong, j.toLong, sim(points(i), points(j)))
     }
 
     val edges = similarities.flatMap { case (i, j, s) =>
-      if (i != j) {
-        Seq(Edge(i, j, s), Edge(j, i, s))
-      } else {
-        None
-      }
+      Seq(Edge(i, j, s), Edge(j, i, s))
     }
     val graph = Graph.fromEdges(sc.parallelize(edges, 2), 0.0)
 
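
The removed branch in the test was effectively dead code: the similarity pairs are generated with `j <- 0 until i`, so `i` and `j` can never be equal and every pair simply contributes both directed edges. A small self-contained sketch of the same pattern, using hypothetical similarity triples and the GraphX `Edge` type:

```scala
import org.apache.spark.graphx.Edge

// Hypothetical similarity triples (i, j, s); by construction j < i, so no self-pairs occur.
val similarities = Seq((1L, 0L, 0.9), (2L, 0L, 0.1), (2L, 1L, 0.2))

// Every undirected similarity becomes two directed edges; no i != j guard is needed.
val edges = similarities.flatMap { case (i, j, s) =>
  Seq(Edge(i, j, s), Edge(j, i, s))
}
// edges now holds 6 Edge[Double] instances.
```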
