
Commit 38926e3

[SPARK-3850] Trim trailing spaces for MLlib.
1 parent f7fe9e4

30 files changed: +189 −189 lines changed
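The change itself is mechanical: every hunk below only removes trailing whitespace. As an illustration only (not the tooling actually used for this commit), a self-contained Scala sketch that performs the same trim over a source tree:

import java.io.{File, PrintWriter}
import scala.io.Source

object TrimTrailingSpaces {
  // Rewrite one file with trailing whitespace stripped from every line.
  def trimFile(f: File): Unit = {
    val src = Source.fromFile(f, "UTF-8")
    val cleaned =
      try src.getLines().map(_.replaceAll("""\s+$""", "")).mkString("\n") + "\n"
      finally src.close()
    val out = new PrintWriter(f, "UTF-8")
    try out.write(cleaned) finally out.close()
  }

  // Recursively visit every .scala file under a directory.
  def walk(dir: File): Unit =
    Option(dir.listFiles()).getOrElse(Array.empty[File]).foreach { f =>
      if (f.isDirectory) walk(f)
      else if (f.getName.endsWith(".scala")) trimFile(f)
    }

  // The default path is illustrative; pass a different root as the first argument.
  def main(args: Array[String]): Unit =
    walk(new File(args.headOption.getOrElse("mllib/src")))
}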

mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala

Lines changed: 5 additions & 5 deletions
@@ -35,13 +35,13 @@ private[feature] trait StandardScalerParams extends Params with HasInputCol with
 
   /**
    * Centers the data with mean before scaling.
-   * It will build a dense output, so this does not work on sparse input 
+   * It will build a dense output, so this does not work on sparse input
    * and will raise an exception.
    * Default: false
    * @group param
    */
   val withMean: BooleanParam = new BooleanParam(this, "withMean", "Center data with mean")
- 
+
   /**
    * Scales the data to unit standard deviation.
    * Default: true
@@ -68,13 +68,13 @@ class StandardScaler(override val uid: String) extends Estimator[StandardScalerM
 
   /** @group setParam */
   def setOutputCol(value: String): this.type = set(outputCol, value)
- 
+
   /** @group setParam */
   def setWithMean(value: Boolean): this.type = set(withMean, value)
- 
+
   /** @group setParam */
   def setWithStd(value: Boolean): this.type = set(withStd, value)
- 
+
   override def fit(dataset: DataFrame): StandardScalerModel = {
     transformSchema(dataset.schema, logging = true)
     val input = dataset.select($(inputCol)).map { case Row(v: Vector) => v }
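The setters touched here belong to the public ml Pipeline API. A minimal usage sketch, assuming a DataFrame df with a Vector column named "features" (the column names are illustrative):

import org.apache.spark.ml.feature.StandardScaler

val scaler = new StandardScaler()
  .setInputCol("features")
  .setOutputCol("scaledFeatures")
  .setWithMean(false) // mean-centering builds dense output; keep false for sparse input
  .setWithStd(true)
val scalerModel = scaler.fit(df)
val scaled = scalerModel.transform(df)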

mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala

Lines changed: 1 addition & 1 deletion
@@ -321,7 +321,7 @@ private class LeastSquaresAggregator(
     }
     (weightsArray, -sum + labelMean / labelStd, weightsArray.length)
   }
- 
+
   private val effectiveWeightsVector = Vectors.dense(effectiveWeightsArray)
 
   private val gradientSumArray = Array.ofDim[Double](dim)
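The aggregator above is internal to ml's LinearRegression. For context, a minimal sketch of the public estimator it serves, assuming a DataFrame training with "label" and "features" columns (illustrative names):

import org.apache.spark.ml.regression.LinearRegression

val lr = new LinearRegression()
  .setMaxIter(100)
  .setRegParam(0.1)
val lrModel = lr.fit(training)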

mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala

Lines changed: 2 additions & 2 deletions
@@ -399,7 +399,7 @@ private[python] class PythonMLLibAPI extends Serializable {
       val sigma = si.map(_.asInstanceOf[DenseMatrix])
       val gaussians = Array.tabulate(weight.length){
         i => new MultivariateGaussian(mean(i), sigma(i))
-      } 
+      }
       val model = new GaussianMixtureModel(weight, gaussians)
       model.predictSoft(data).map(Vectors.dense)
     }
@@ -494,7 +494,7 @@ private[python] class PythonMLLibAPI extends Serializable {
   def normalizeVector(p: Double, rdd: JavaRDD[Vector]): JavaRDD[Vector] = {
     new Normalizer(p).transform(rdd)
   }
- 
+
   /**
    * Java stub for StandardScaler.fit(). This stub returns a
    * handle to the Java object instead of the content of the Java object.
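The normalizeVector stub simply forwards to the Scala-side Normalizer. The equivalent direct call, as a minimal sketch assuming an RDD[Vector] named vectors:

import org.apache.spark.mllib.feature.Normalizer
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

// L2 normalization of each vector (p = 2.0), mirroring normalizeVector above
def normalizeL2(vectors: RDD[Vector]): RDD[Vector] =
  new Normalizer(2.0).transform(vectors)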

mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala

Lines changed: 43 additions & 43 deletions
@@ -36,11 +36,11 @@ import org.apache.spark.util.Utils
  * independent Gaussian distributions with associated "mixing" weights
  * specifying each's contribution to the composite.
  *
- * Given a set of sample points, this class will maximize the log-likelihood 
- * for a mixture of k Gaussians, iterating until the log-likelihood changes by 
+ * Given a set of sample points, this class will maximize the log-likelihood
+ * for a mixture of k Gaussians, iterating until the log-likelihood changes by
  * less than convergenceTol, or until it has reached the max number of iterations.
  * While this process is generally guaranteed to converge, it is not guaranteed
- * to find a global optimum. 
+ * to find a global optimum.
  *
  * Note: For high-dimensional data (with many features), this algorithm may perform poorly.
  * This is due to high-dimensional data (a) making it difficult to cluster at all (based
@@ -53,24 +53,24 @@ import org.apache.spark.util.Utils
  */
 @Experimental
 class GaussianMixture private (
-    private var k: Int, 
-    private var convergenceTol: Double, 
+    private var k: Int,
+    private var convergenceTol: Double,
     private var maxIterations: Int,
     private var seed: Long) extends Serializable {
- 
+
   /**
    * Constructs a default instance. The default parameters are {k: 2, convergenceTol: 0.01,
    * maxIterations: 100, seed: random}.
    */
   def this() = this(2, 0.01, 100, Utils.random.nextLong())
- 
+
   // number of samples per cluster to use when initializing Gaussians
   private val nSamples = 5
- 
-  // an initializing GMM can be provided rather than using the 
+
+  // an initializing GMM can be provided rather than using the
   // default random starting point
   private var initialModel: Option[GaussianMixtureModel] = None
- 
+
   /** Set the initial GMM starting point, bypassing the random initialization.
    * You must call setK() prior to calling this method, and the condition
    * (model.k == this.k) must be met; failure will result in an IllegalArgumentException
@@ -83,37 +83,37 @@ class GaussianMixture private (
     }
     this
   }
- 
+
   /** Return the user supplied initial GMM, if supplied */
   def getInitialModel: Option[GaussianMixtureModel] = initialModel
- 
+
   /** Set the number of Gaussians in the mixture model. Default: 2 */
   def setK(k: Int): this.type = {
     this.k = k
     this
   }
- 
+
   /** Return the number of Gaussians in the mixture model */
   def getK: Int = k
- 
+
   /** Set the maximum number of iterations to run. Default: 100 */
   def setMaxIterations(maxIterations: Int): this.type = {
     this.maxIterations = maxIterations
     this
   }
- 
+
   /** Return the maximum number of iterations to run */
   def getMaxIterations: Int = maxIterations
- 
+
   /**
-   * Set the largest change in log-likelihood at which convergence is 
+   * Set the largest change in log-likelihood at which convergence is
    * considered to have occurred.
    */
   def setConvergenceTol(convergenceTol: Double): this.type = {
     this.convergenceTol = convergenceTol
     this
   }
- 
+
   /**
    * Return the largest change in log-likelihood at which convergence is
    * considered to have occurred.
@@ -132,41 +132,41 @@ class GaussianMixture private (
   /** Perform expectation maximization */
   def run(data: RDD[Vector]): GaussianMixtureModel = {
     val sc = data.sparkContext
- 
+
     // we will operate on the data as breeze data
     val breezeData = data.map(_.toBreeze).cache()
- 
+
     // Get length of the input vectors
     val d = breezeData.first().length
- 
+
     // Determine initial weights and corresponding Gaussians.
     // If the user supplied an initial GMM, we use those values, otherwise
     // we start with uniform weights, a random mean from the data, and
     // diagonal covariance matrices using component variances
-    // derived from the samples 
+    // derived from the samples
     val (weights, gaussians) = initialModel match {
       case Some(gmm) => (gmm.weights, gmm.gaussians)
- 
+
       case None => {
         val samples = breezeData.takeSample(withReplacement = true, k * nSamples, seed)
-        (Array.fill(k)(1.0 / k), Array.tabulate(k) { i => 
+        (Array.fill(k)(1.0 / k), Array.tabulate(k) { i =>
           val slice = samples.view(i * nSamples, (i + 1) * nSamples)
-          new MultivariateGaussian(vectorMean(slice), initCovariance(slice)) 
+          new MultivariateGaussian(vectorMean(slice), initCovariance(slice))
         })
       }
     }
- 
-    var llh = Double.MinValue // current log-likelihood 
+
+    var llh = Double.MinValue // current log-likelihood
     var llhp = 0.0 // previous log-likelihood
- 
+
     var iter = 0
     while (iter < maxIterations && math.abs(llh-llhp) > convergenceTol) {
       // create and broadcast curried cluster contribution function
       val compute = sc.broadcast(ExpectationSum.add(weights, gaussians)_)
- 
+
       // aggregate the cluster contribution for all sample points
       val sums = breezeData.aggregate(ExpectationSum.zero(k, d))(compute.value, _ += _)
- 
+
       // Create new distributions based on the partial assignments
       // (often referred to as the "M" step in literature)
       val sumWeights = sums.weights.sum
@@ -179,22 +179,22 @@ class GaussianMixture private (
         gaussians(i) = new MultivariateGaussian(mu, sums.sigmas(i) / sums.weights(i))
         i = i + 1
       }
- 
+
       llhp = llh // current becomes previous
       llh = sums.logLikelihood // this is the freshly computed log-likelihood
       iter += 1
-    } 
- 
+    }
+
     new GaussianMixtureModel(weights, gaussians)
   }
- 
+
   /** Average of dense breeze vectors */
   private def vectorMean(x: IndexedSeq[BV[Double]]): BDV[Double] = {
     val v = BDV.zeros[Double](x(0).length)
     x.foreach(xi => v += xi)
-    v / x.length.toDouble 
+    v / x.length.toDouble
   }
- 
+
   /**
    * Construct matrix where diagonal entries are element-wise
    * variance of input vectors (computes biased variance)
@@ -210,14 +210,14 @@ class GaussianMixture private (
 // companion class to provide zero constructor for ExpectationSum
 private object ExpectationSum {
   def zero(k: Int, d: Int): ExpectationSum = {
-    new ExpectationSum(0.0, Array.fill(k)(0.0), 
+    new ExpectationSum(0.0, Array.fill(k)(0.0),
       Array.fill(k)(BDV.zeros(d)), Array.fill(k)(BreezeMatrix.zeros(d, d)))
   }
- 
+
   // compute cluster contributions for each input point
   // (U, T) => U for aggregation
   def add(
-      weights: Array[Double], 
+      weights: Array[Double],
       dists: Array[MultivariateGaussian])
       (sums: ExpectationSum, x: BV[Double]): ExpectationSum = {
     val p = weights.zip(dists).map {
@@ -235,7 +235,7 @@ private object ExpectationSum {
       i = i + 1
     }
     sums
-  } 
+  }
 }
 
 // Aggregation class for partial expectation results
@@ -244,9 +244,9 @@ private class ExpectationSum(
     val weights: Array[Double],
     val means: Array[BDV[Double]],
     val sigmas: Array[BreezeMatrix[Double]]) extends Serializable {
- 
+
   val k = weights.length
- 
+
   def +=(x: ExpectationSum): ExpectationSum = {
     var i = 0
     while (i < k) {
@@ -257,5 +257,5 @@ private class ExpectationSum(
     }
     logLikelihood += x.logLikelihood
     this
-  } 
+  }
 }
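Most hunks above touch the public GaussianMixture surface (setK, setConvergenceTol, setMaxIterations, run). A minimal end-to-end sketch, assuming an existing SparkContext sc and toy data:

import org.apache.spark.mllib.clustering.GaussianMixture
import org.apache.spark.mllib.linalg.Vectors

// Toy 2-D data; EM fits k Gaussians until the log-likelihood moves
// by less than convergenceTol or maxIterations is reached.
val data = sc.parallelize(Seq(
  Vectors.dense(-5.0, -4.8), Vectors.dense(-4.9, -5.1),
  Vectors.dense(5.0, 5.2), Vectors.dense(4.8, 4.9)))

val gmm = new GaussianMixture()
  .setK(2)
  .setConvergenceTol(0.01)
  .setMaxIterations(100)
  .run(data)

gmm.gaussians.zip(gmm.weights).foreach { case (g, w) =>
  println(s"weight=$w mu=${g.mu} sigma=${g.sigma}")
}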

mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala

Lines changed: 11 additions & 11 deletions
@@ -34,20 +34,20 @@ import org.apache.spark.sql.{SQLContext, Row}
 /**
  * :: Experimental ::
  *
- * Multivariate Gaussian Mixture Model (GMM) consisting of k Gaussians, where points 
- * are drawn from each Gaussian i=1..k with probability w(i); mu(i) and sigma(i) are 
- * the respective mean and covariance for each Gaussian distribution i=1..k. 
- * 
+ * Multivariate Gaussian Mixture Model (GMM) consisting of k Gaussians, where points
+ * are drawn from each Gaussian i=1..k with probability w(i); mu(i) and sigma(i) are
+ * the respective mean and covariance for each Gaussian distribution i=1..k.
+ *
  * @param weights Weights for each Gaussian distribution in the mixture, where weights(i) is
  *                the weight for Gaussian i, and weights.sum == 1
  * @param gaussians Array of MultivariateGaussian where gaussians(i) represents
  *                  the Multivariate Gaussian (Normal) Distribution for Gaussian i
  */
 @Experimental
 class GaussianMixtureModel(
-  val weights: Array[Double], 
+  val weights: Array[Double],
   val gaussians: Array[MultivariateGaussian]) extends Serializable with Saveable{
- 
+
   require(weights.length == gaussians.length, "Length of weight and Gaussian arrays must match")
 
   override protected def formatVersion = "1.0"
@@ -64,20 +64,20 @@ class GaussianMixtureModel(
     val responsibilityMatrix = predictSoft(points)
     responsibilityMatrix.map(r => r.indexOf(r.max))
   }
- 
+
   /**
    * Given the input vectors, return the membership value of each vector
-   * to all mixture components. 
+   * to all mixture components.
    */
   def predictSoft(points: RDD[Vector]): RDD[Array[Double]] = {
     val sc = points.sparkContext
     val bcDists = sc.broadcast(gaussians)
     val bcWeights = sc.broadcast(weights)
-    points.map { x => 
+    points.map { x =>
       computeSoftAssignments(x.toBreeze.toDenseVector, bcDists.value, bcWeights.value, k)
     }
   }
- 
+
   /**
    * Compute the partial assignments for each vector
    */
@@ -89,7 +89,7 @@ class GaussianMixtureModel(
     val p = weights.zip(dists).map {
       case (weight, dist) => MLUtils.EPSILON + weight * dist.pdf(pt)
     }
-    val pSum = p.sum 
+    val pSum = p.sum
     for (i <- 0 until k) {
       p(i) /= pSum
     }
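predict and predictSoft are the model's two entry points; a minimal sketch of the difference, reusing the gmm model and data RDD from the previous example:

val hard = gmm.predict(data)     // RDD[Int]: most probable component per point
val soft = gmm.predictSoft(data) // RDD[Array[Double]]: membership in every component
soft.take(2).foreach(p => println(p.mkString(", ")))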

mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala

Lines changed: 4 additions & 4 deletions
@@ -121,7 +121,7 @@ class PowerIterationClustering private[clustering] (
   import org.apache.spark.mllib.clustering.PowerIterationClustering._
 
   /** Constructs a PIC instance with default parameters: {k: 2, maxIterations: 100,
-   * initMode: "random"}. 
+   * initMode: "random"}.
    */
   def this() = this(k = 2, maxIterations = 100, initMode = "random")
 
@@ -243,7 +243,7 @@ object PowerIterationClustering extends Logging {
 
   /**
    * Generates random vertex properties (v0) to start power iteration.
-   * 
+   *
    * @param g a graph representing the normalized affinity matrix (W)
    * @return a graph with edges representing W and vertices representing a random vector
    *         with unit 1-norm
@@ -266,7 +266,7 @@ object PowerIterationClustering extends Logging {
    * Generates the degree vector as the vertex properties (v0) to start power iteration.
    * It is not exactly the node degrees but just the normalized sum similarities. Call it
    * as degree vector because it is used in the PIC paper.
-   * 
+   *
    * @param g a graph representing the normalized affinity matrix (W)
    * @return a graph with edges representing W and vertices representing the degree vector
    */
@@ -276,7 +276,7 @@ object PowerIterationClustering extends Logging {
     val v0 = g.vertices.mapValues(_ / sum)
     GraphImpl.fromExistingRDDs(VertexRDD(v0), g.edges)
   }
- 
+
   /**
    * Runs power iteration.
    * @param g input graph with edges representing the normalized affinity matrix (W) and vertices
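The class above is driven through its builder-style setters (the 1.x MLlib names are setK, setMaxIterations, setInitializationMode). A minimal sketch, assuming a SparkContext sc; the (srcId, dstId, similarity) triples are illustrative data:

import org.apache.spark.mllib.clustering.PowerIterationClustering

// Pairwise similarities as (srcId, dstId, similarity); similarities must be non-negative.
val similarities = sc.parallelize(Seq(
  (0L, 1L, 0.9), (1L, 2L, 0.9), (2L, 3L, 0.1), (3L, 4L, 0.9)))

val model = new PowerIterationClustering()
  .setK(2)
  .setMaxIterations(100)
  .setInitializationMode("random")
  .run(similarities)

model.assignments.collect().foreach { a =>
  println(s"id=${a.id} cluster=${a.cluster}")
}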

0 commit comments
