[SPARK-11207] Improve test case with many feature datasets
Lewuathe committed Oct 21, 2015
commit f85bca6667dcebbfccbd50cde46b11f6855d1974
@@ -124,6 +124,59 @@ object LinearDataGenerator {
    y.zip(x).map(p => LabeledPoint(p._1, Vectors.dense(p._2)))
  }

  /**
   *

Member: extra line.

   * @param intercept Data intercept.
   * @param weights Weights to be applied.
   * @param xMean The mean of the generated features. Often, if the features are not
   *              properly standardized, a poorly implemented algorithm will have
   *              difficulty converging.
   * @param xVariance The variance of the generated features.
   * @param nPoints Number of points in the sample.
   * @param seed Random seed.
   * @param eps Epsilon scaling factor.
   * @return Seq of LabeledPoint containing sparse vectors.
   */
Member: How about consolidating this with LinearDataGenerator and adding sparsity = 1.0
as a parameter to control whether the features are sparse?

Contributor Author: Yes, I also thought that would be a good idea. But LinearDataGenerator
is used as a static object, so we would have to pass sparsity as a parameter to
generateLinearInput. That method seems to be used in a lot of suites, so a lot of call
sites would have to change. Therefore it might be better to do this in a separate JIRA.
What do you think?

Member: Let's modify the JIRA and do it here. Basically, you can keep a
LinearDataGenerator method with the old signature calling the new API for compatibility.
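For concreteness, a minimal sketch of that consolidation, assuming a new sparsity
parameter (expected fraction of zeroed features) and the same netlib BLAS import that
LinearDataGenerator already uses; this illustrates the reviewer's proposal and is not
the merged code:

import scala.util.Random

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

object LinearDataGeneratorSketch {

  // New API: sparsity is the expected fraction of zeroed-out features.
  def generateLinearInput(
      intercept: Double,
      weights: Array[Double],
      xMean: Array[Double],
      xVariance: Array[Double],
      nPoints: Int,
      seed: Int,
      eps: Double,
      sparsity: Double): Seq[LabeledPoint] = {
    require(0.0 <= sparsity && sparsity <= 1.0)
    val rnd = new Random(seed)
    val x = Array.fill(nPoints)(Array.fill(weights.length)(rnd.nextDouble()))
    x.foreach { v =>
      var i = 0
      while (i < v.length) {
        if (rnd.nextDouble() < sparsity) {
          v(i) = 0.0  // zeroed with probability sparsity
        } else {
          v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)
        }
        i += 1
      }
    }
    val y = x.map { xi =>
      blas.ddot(weights.length, xi, 1, weights, 1) + intercept + eps * rnd.nextGaussian()
    }
    y.zip(x).map { p =>
      // Dense output when no sparsity was requested, sparse otherwise.
      if (sparsity == 0.0) LabeledPoint(p._1, Vectors.dense(p._2))
      else LabeledPoint(p._1, Vectors.dense(p._2).toSparse)
    }
  }

  // Old signature preserved so existing suites keep compiling unchanged.
  def generateLinearInput(
      intercept: Double,
      weights: Array[Double],
      xMean: Array[Double],
      xVariance: Array[Double],
      nPoints: Int,
      seed: Int,
      eps: Double): Seq[LabeledPoint] =
    generateLinearInput(intercept, weights, xMean, xVariance, nPoints, seed, eps,
      sparsity = 0.0)
}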

  @Since("1.6.0")
  def generateLinearSparseInput(
      intercept: Double,
      weights: Array[Double],
      xMean: Array[Double],
      xVariance: Array[Double],
      nPoints: Int,
      seed: Int,
      eps: Double): Seq[LabeledPoint] = {
    val rnd = new Random(seed)
    val x = Array.fill[Array[Double]](nPoints)(
      Array.fill[Double](weights.length)(rnd.nextDouble()))

    x.foreach { v =>
Member: Once you have sparsity, randomly choose n = numFeatures * (1 - sparsity)
features as non-zero, and zero the rest out.

Member: You can also add variance to the sparsity so that the number of non-zeros
is not constant.
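A small sketch of that exact-count variant; the helper name and the shuffle-based
index selection are assumptions, not code from this PR:

import scala.util.Random

// Keep exactly n = numFeatures * (1 - sparsity) features non-zero and zero out
// the rest. To make the non-zero count non-constant, n itself could be jittered.
def zeroOutRandomFeatures(v: Array[Double], sparsity: Double, rnd: Random): Unit = {
  val numNonZero = math.round(v.length * (1.0 - sparsity)).toInt
  // Randomly pick which indices survive.
  val keep = rnd.shuffle(v.indices.toList).take(numNonZero).toSet
  var i = 0
  while (i < v.length) {
    if (!keep.contains(i)) v(i) = 0.0
    i += 1
  }
}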

      var i = 0
      val len = v.length
      while (i < len) {
        if (rnd.nextDouble() < 0.7) {
          // Zero out each feature with probability 0.7 to make the vector sparse.
          v(i) = 0.0
        } else {
          // A uniform [0, 1) draw has mean 0.5 and variance 1/12, so this shift
          // and scale gives the feature mean xMean(i) and variance xVariance(i).
          v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)
        }
        i += 1
      }
    }

    val y = x.map { xi =>
      // Label = weights . x + intercept + Gaussian noise scaled by eps.
      blas.ddot(weights.length, xi, 1, weights, 1) + intercept + eps * rnd.nextGaussian()
    }

Member: To simplify the following code, do

y.zip(x).map { p =>
  if (sparsity == 0.0) {
    LabeledPoint(p._1, Vectors.dense(p._2))
  } else {
    LabeledPoint(p._1, Vectors.dense(p._2).toSparse)
  }
}
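A note on this suggestion: Vectors.dense(p._2).toSparse keeps only the non-zero
entries, so it should produce the same vectors as the manual zipWithIndex/filter
construction below while letting one code path serve both the dense and sparse cases.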

    val sparseX = x.map { (v: Array[Double]) =>
      v.zipWithIndex.filter {
        case (d: Double, i: Int) => d != 0.0
      }.map {
        case (d: Double, i: Int) => (i, d)
      }
    }
    y.zip(sparseX).map(p => LabeledPoint(p._1, Vectors.sparse(weights.length, p._2)))
  }
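For illustration, a hedged usage sketch of the generator as committed here; the
parameter values are made up:

import org.apache.spark.mllib.util.LinearDataGenerator

// Illustrative only: 100 points over 5 features with unit variance; roughly 70%
// of the feature entries will be zero, per the hard-coded threshold above.
val points = LinearDataGenerator.generateLinearSparseInput(
  intercept = 1.0,
  weights = Array(0.5, -0.3, 0.0, 1.2, 2.0),
  xMean = Array.fill(5)(0.0),
  xVariance = Array.fill(5)(1.0),
  nPoints = 100,
  seed = 42,
  eps = 0.1)
// Each element is a LabeledPoint whose features are stored as a sparse vector.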

  /**
   * Generate an RDD containing sample data for Linear Regression models - including
   * Ridge, Lasso, and unregularized variants.
@@ -34,7 +34,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
  private val seed: Int = 42
  @transient var dataset: DataFrame = _
  @transient var datasetWithoutIntercept: DataFrame = _
- @transient var datasetWithBigFeature: DataFrame = _
+ @transient var datasetWithManyFeature: DataFrame = _

Member: Let's call it datasetWithSparseFeature.

Member: Also, change dataset to datasetWithDenseFeature, and datasetWithoutIntercept
to datasetWithDenseFeatureWithoutIntercept.

  /*
     In `LinearRegressionSuite`, we will make sure that the model trained by SparkML
@@ -52,22 +52,27 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
    super.beforeAll()
    dataset = sqlContext.createDataFrame(
      sc.parallelize(LinearDataGenerator.generateLinearInput(
-       6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, seed, 0.1), 2))
+       intercept = 6.3, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3),
+       xVariance = Array(0.7, 1.2), nPoints = 10000, seed = seed, eps = 0.1), 2))
Member: seed = seed is not necessary; it's self-explanatory.

Member: Make seed = seed into just seed.

    /*
       datasetWithoutIntercept is not needed for correctness testing, but is useful
       for illustrating training a model without an intercept.
     */
    datasetWithoutIntercept = sqlContext.createDataFrame(
      sc.parallelize(LinearDataGenerator.generateLinearInput(
-       0.0, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, seed, 0.1), 2))
+       intercept = 0.0, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3),
+       xVariance = Array(0.7, 1.2), nPoints = 10000, seed = seed, eps = 0.1), 2))
Member: Ditto.


    val r = new Random(seed)
    // When the feature size is larger than 4096, L-BFGS is chosen as the solver of
    // linear regression in "auto" mode, rather than the normal-equation solver.
    val featureSize = 4100
Contributor: Leave a comment explaining this value of 4100.
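One way to address this, assuming the 4096 cutoff corresponds to
WeightedLeastSquares.MAX_NUM_FEATURES in the "auto" solver selection:

// 4100 is deliberately just above the 4096-feature limit of the normal-equation
// solver (WeightedLeastSquares.MAX_NUM_FEATURES), so "auto" falls back to L-BFGS.
val featureSize = 4100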

-   datasetWithBigFeature = sqlContext.createDataFrame(
-     sc.parallelize(LinearDataGenerator.generateLinearInput(
-       0.0, Seq.fill(featureSize)(r.nextDouble).toArray,
-       Seq.fill(featureSize)(r.nextDouble).toArray,
-       Seq.fill(featureSize)(r.nextDouble).toArray, 200, seed, 0.1
+   datasetWithManyFeature = sqlContext.createDataFrame(
+     sc.parallelize(LinearDataGenerator.generateLinearSparseInput(
+       intercept = 0.0, weights = Seq.fill(featureSize)(r.nextDouble).toArray,
+       xMean = Seq.fill(featureSize)(r.nextDouble).toArray,
+       xVariance = Seq.fill(featureSize)(r.nextDouble).toArray, nPoints = 200,
+       seed = seed, eps = 0.1
      ), 2))
}

@@ -696,7 +701,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {

test("linear regression model with l-bfgs with big feature datasets") {
val trainer = new LinearRegression().setSolver("auto")
val model = trainer.fit(datasetWithBigFeature)
val model = trainer.fit(datasetWithManyFeature)

    // Training results for the model should be available
    assert(model.hasSummary)