@@ -83,7 +83,6 @@ object LinearDataGenerator {
nPoints, seed, eps)}
Member:

The formatting in the previous method

  def generateLinearInput(
      intercept: Double,
      weights: Array[Double],
      nPoints: Int,
      seed: Int,
      eps: Double = 0.1): Seq[LabeledPoint] = {
    generateLinearInput(intercept, weights,
      Array.fill[Double](weights.length)(0.0),
      Array.fill[Double](weights.length)(1.0 / 3.0),
      nPoints, seed, eps)}

looks weird to me. Can you fix it in this PR? Thanks.


/**
*
* @param intercept Data intercept
* @param weights Weights to be applied.
* @param xMean the mean of the generated features. Lots of time, if the features are not properly
@@ -104,24 +103,71 @@ object LinearDataGenerator {
      nPoints: Int,
      seed: Int,
      eps: Double): Seq[LabeledPoint] = {
    generateLinearInputInternal(intercept, weights, xMean, xVariance, nPoints, seed, eps, 0.0)
  }


/**
* @param intercept Data intercept
* @param weights Weights to be applied.
* @param xMean the mean of the generated features. Lots of time, if the features are not properly
* standardized, the algorithm with poor implementation will have difficulty
* to converge.
* @param xVariance the variance of the generated features.
* @param nPoints Number of points in sample.
* @param seed Random seed
* @param eps Epsilon scaling factor.
* @param sparcity The ratio of zero elements. If it is 0.0, LabeledPoints with
Member:

Typo: sparsity

* DenseVector is returned.
* @return Seq of input.
*/
Member:

How about consolidating with LinearDataGenerator, and adding sparsity = 1.0 as a param to control whether it's a sparse feature?

Contributor Author:

Yes, I also thought it was a good idea. But LinearDataGenerator is used as a static object, so we would have to pass sparsity as a parameter to generateLinearInput. That method is used in a lot of suites, so many call sites would need to change.
Therefore it might be better to do this in a separate JIRA. What do you think?

Member:

Let's modify the JIRA and do it here. Basically, you can keep a generateLinearInput with the old signature that calls the new API, for compatibility.
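A minimal sketch of that compatibility shim (it mirrors the delegation already visible in the diff above; nothing here is new API):

  // The old signature delegates to the extended method; sparsity 0.0 keeps the dense behavior.
  def generateLinearInput(
      intercept: Double,
      weights: Array[Double],
      xMean: Array[Double],
      xVariance: Array[Double],
      nPoints: Int,
      seed: Int,
      eps: Double): Seq[LabeledPoint] = {
    generateLinearInputInternal(intercept, weights, xMean, xVariance, nPoints, seed, eps, 0.0)
  }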

@Since("1.6.0")
def generateLinearInputInternal(
Member:

Just call it generateLinearInput, without Internal.

      intercept: Double,
      weights: Array[Double],
      xMean: Array[Double],
      xVariance: Array[Double],
      nPoints: Int,
      seed: Int,
      eps: Double,
      sparcity: Double): Seq[LabeledPoint] = {
Member:

Ditto, typo.

    require(sparcity <= 1.0)
Member:

Okay, I think it's okay to have sparsity == 1.0. Just have everything zeros:

    require(0.0 <= sparsity && sparsity <= 1.0)

    val rnd = new Random(seed)
    val x = Array.fill[Array[Double]](nPoints)(
      Array.fill[Double](weights.length)(rnd.nextDouble()))

    x.foreach { v =>
Member:

Once you have sparsity, randomly choose n = numFeatures * (1 - sparsity) non-zero features, and zero the rest out.
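A minimal sketch of that suggestion (the helper name and shuffle-based selection are illustrative, not from the PR): keep exactly numFeatures * (1 - sparsity) entries per row and zero out the rest, so the non-zero count is fixed rather than binomial.

  import scala.util.Random

  // Keep exactly (1 - sparsity) of the entries, chosen uniformly at random; zero the rest.
  def sparsifyExact(v: Array[Double], sparsity: Double, rnd: Random): Array[Double] = {
    val numNonZero = (v.length * (1.0 - sparsity)).toInt
    val keep = rnd.shuffle(v.indices.toList).take(numNonZero).toSet
    v.indices.map(i => if (keep.contains(i)) v(i) else 0.0).toArray
  }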

Member:

You can also add variance to the sparsity, so that the number of non-zeros is not constant.
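A sketch of that refinement (the jitter parameter and clamping are assumptions, not from the PR): draw a per-row sparsity around the target before zeroing.

  import scala.util.Random

  // Per-row sparsity jittered uniformly around the target and clamped to [0, 1],
  // so the number of zeroed entries varies from row to row.
  def rowSparsity(target: Double, jitter: Double, rnd: Random): Double =
    math.min(1.0, math.max(0.0, target + (rnd.nextDouble() * 2.0 - 1.0) * jitter))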

      var i = 0
      val len = v.length
      val sparceRnd = new Random(seed)
Member:

Since you seed rnd and sparceRnd with the same seed, both of them will generate the same sequence of random numbers, which is not what you want. You should be able to use the same random number generator, which will give you uncorrelated random numbers, for both creating the features and choosing which columns to zero out.
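A sketch of the single-generator approach (the literal values are illustrative): interleave the feature draw and the zero-out draw on one stream, so the two are uncorrelated without a second Random.

  import scala.util.Random

  val rnd = new Random(42)
  val sparsity = 0.7
  val xMean = 0.9
  val xVariance = 0.7

  val row = Array.fill(10) {
    val u = rnd.nextDouble()            // feature draw
    if (rnd.nextDouble() < sparsity) {  // zero-out draw from the same stream
      0.0
    } else {
      (u - 0.5) * math.sqrt(12.0 * xVariance) + xMean
    }
  }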

Contributor Author:

If we use the same random generator for both creating the features and choosing which columns to zero out, x differs from the current values. This causes unit test failures. Can we change the assertion tolerances or the target values written in LinearRegressionSuite?

      while (i < len) {
-       v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)
+       if (sparceRnd.nextDouble() < sparcity) {
+         v(i) = 0.0
+       } else {
+         v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)
+       }
        i += 1
      }
    }

    val y = x.map { xi =>
      blas.ddot(weights.length, xi, 1, weights, 1) + intercept + eps * rnd.nextGaussian()
    }
-   y.zip(x).map(p => LabeledPoint(p._1, Vectors.dense(p._2)))

Member:

To simplify the following code, do:

y.zip(x).map { p => 
  if (sparsity == 0.0) {
    LabeledPoint(p._1, Vectors.dense(p._2))
  } else {
    LabeledPoint(p._1, Vectors.dense(p._2).toSparse)
  }
}
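For context: Vectors.dense(p._2).toSparse keeps only the non-zero entries, so with this suggestion the separate sparseX index-pair construction below would no longer be needed.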

    val sparseX = x.map { (v: Array[Double]) =>
      v.zipWithIndex.filter {
        case (d: Double, i: Int) => d != 0.0
      }.map {
        case (d: Double, i: Int) => (i, d)
      }
    }
    if (sparcity == 0.0) {
      // Return LabeledPoints with DenseVector
      y.zip(x).map(p => LabeledPoint(p._1, Vectors.dense(p._2)))
    } else {
      // Return LabeledPoints with SparseVector
      y.zip(sparseX).map(p => LabeledPoint(p._1, Vectors.sparse(weights.length, p._2)))
    }
  }
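For reference, the scaling in the loop above is the standard uniform-to-moments transform: a draw $u_i \sim \mathrm{Uniform}(0, 1)$ has variance $1/12$, so

    $x_i = (u_i - 0.5)\sqrt{12\,\sigma_i^2} + \mu_i$

has mean $\mu_i$ (xMean(i)) and variance $\sigma_i^2$ (xVariance(i)).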

/**
@@ -34,6 +34,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
  private val seed: Int = 42
  @transient var dataset: DataFrame = _
  @transient var datasetWithoutIntercept: DataFrame = _
  @transient var datasetWithManyFeature: DataFrame = _

Member:

Let's call it datasetWithSparseFeature.

Member:

Also, change dataset into datasetWithDenseFeature, and datasetWithoutIntercept into datasetWithDenseFeatureWithoutIntercept.

/*
In `LinearRegressionSuite`, we will make sure that the model trained by SparkML
@@ -51,14 +52,27 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
    super.beforeAll()
    dataset = sqlContext.createDataFrame(
      sc.parallelize(LinearDataGenerator.generateLinearInput(
-       6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, seed, 0.1), 2))
+       intercept = 6.3, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3),
+       xVariance = Array(0.7, 1.2), nPoints = 10000, seed = seed, eps = 0.1), 2))
Member:

seed = seed is not necessary; it's self-explanatory.

Member:

Make seed = seed into just seed.

    /*
       datasetWithoutIntercept is not needed for correctness testing, but is useful for illustrating
       training a model without an intercept.
     */
    datasetWithoutIntercept = sqlContext.createDataFrame(
      sc.parallelize(LinearDataGenerator.generateLinearInput(
-       0.0, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, seed, 0.1), 2))
+       intercept = 0.0, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3),
+       xVariance = Array(0.7, 1.2), nPoints = 10000, seed = seed, eps = 0.1), 2))
Member:

Ditto.


    val r = new Random(seed)
    // When the feature size is larger than 4096, l-bfgs is chosen as the solver
    // of linear regression in the case of "auto" mode.
    val featureSize = 4100
Contributor:

Leave a comment about this value, 4100.

    datasetWithManyFeature = sqlContext.createDataFrame(
      sc.parallelize(LinearDataGenerator.generateLinearInputInternal(
        intercept = 0.0, weights = Seq.fill(featureSize)(r.nextDouble).toArray,
        xMean = Seq.fill(featureSize)(r.nextDouble).toArray,
        xVariance = Seq.fill(featureSize)(r.nextDouble).toArray, nPoints = 200,
        seed = seed, eps = 0.1, sparcity = 0.7), 2))
  }

test("params") {
Expand Down Expand Up @@ -186,19 +200,15 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
      val trainer2 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57)
        .setSolver(solver).setStandardization(false)

-     var model1: LinearRegressionModel = null
-     var model2: LinearRegressionModel = null
-
      // Normal optimizer is not supported with only L1 regularization case.
      if (solver == "normal") {
        intercept[IllegalArgumentException] {
          trainer1.fit(dataset)
          trainer2.fit(dataset)
        }
      } else {
-       model1 = trainer1.fit(dataset)
-       model2 = trainer2.fit(dataset)
-
+       val model1 = trainer1.fit(dataset)
+       val model2 = trainer2.fit(dataset)

/*
weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57))
@@ -247,18 +257,15 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
      val trainer2 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57)
        .setFitIntercept(false).setStandardization(false).setSolver(solver)

-     var model1: LinearRegressionModel = null
-     var model2: LinearRegressionModel = null
-
      // Normal optimizer is not supported with only L1 regularization case.
      if (solver == "normal") {
        intercept[IllegalArgumentException] {
          trainer1.fit(dataset)
          trainer2.fit(dataset)
        }
      } else {
-       model1 = trainer1.fit(dataset)
-       model2 = trainer2.fit(dataset)
+       val model1 = trainer1.fit(dataset)
+       val model2 = trainer2.fit(dataset)

/*
weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57,
@@ -408,18 +415,15 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
      val trainer2 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6)
        .setStandardization(false).setSolver(solver)

-     var model1: LinearRegressionModel = null
-     var model2: LinearRegressionModel = null
-
      // Normal optimizer is not supported with non-zero elasticnet parameter.
      if (solver == "normal") {
        intercept[IllegalArgumentException] {
          trainer1.fit(dataset)
          trainer2.fit(dataset)
        }
      } else {
-       model1 = trainer1.fit(dataset)
-       model2 = trainer2.fit(dataset)
+       val model1 = trainer1.fit(dataset)
+       val model2 = trainer2.fit(dataset)

/*
weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6))
@@ -469,18 +473,15 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
      val trainer2 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6)
        .setFitIntercept(false).setStandardization(false).setSolver(solver)

-     var model1: LinearRegressionModel = null
-     var model2: LinearRegressionModel = null
-
      // Normal optimizer is not supported with non-zero elasticnet parameter.
      if (solver == "normal") {
        intercept[IllegalArgumentException] {
          trainer1.fit(dataset)
          trainer2.fit(dataset)
        }
      } else {
-       model1 = trainer1.fit(dataset)
-       model2 = trainer2.fit(dataset)
+       val model1 = trainer1.fit(dataset)
+       val model2 = trainer2.fit(dataset)

/*
weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6,
@@ -531,7 +532,6 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
    val trainerNoPredictionCol = trainer.setPredictionCol("")
    val modelNoPredictionCol = trainerNoPredictionCol.fit(dataset)

    // Training results for the model should be available
    assert(model.hasSummary)
    assert(modelNoPredictionCol.hasSummary)
@@ -585,6 +585,10 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
          .objectiveHistory
          .sliding(2)
          .forall(x => x(0) >= x(1)))
+     } else {
+       // To verify that the normal solver is used here.
+       assert(model.summary.objectiveHistory.length == 1)
+       assert(model.summary.objectiveHistory(0) == 0.0)
      }
    }
  }
@@ -693,4 +697,18 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
      assert(model4a0.weights ~== model4b.weights absTol 1E-3)
    }
  }

test("linear regression model with l-bfgs with big feature datasets") {
val trainer = new LinearRegression().setSolver("auto")
val model = trainer.fit(datasetWithManyFeature)

// Training results for the model should be available
assert(model.hasSummary)
// When LBFGS is used as optimizer, objective history can be restored.
assert(
model.summary
.objectiveHistory
.sliding(2)
.forall(x => x(0) >= x(1)))
}
}