Added more comments

apache · dbtsai · Aug 8, 2014 · Aug 13, 2014 · Aug 13, 2014 · f19fc02418b7790103406bfb8f1c0e982abe17f2
commit f19fc02418b7790103406bfb8f1c0e982abe17f2
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
@@ -155,11 +155,11 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
     }
 
     /**
-     * Scaling to minimize the condition number:
+     * Scaling columns to unit variance as a heuristic to reduce the condition number:
      *
      * During the optimization process, the convergence (rate) depends on the condition number of
-     * the training dataset. Scaling the variables often reduces this condition number, thus
-     * improving the convergence rate dramatically. Without reducing the condition number,
+     * the training dataset. As a heuristic, scaling the variables often reduces this condition
+     * number, thus improving the convergence rate. Without reducing the condition number,
      * some training datasets mixing the columns with different scales may not be able to converge.
      *
      * GLMNET and LIBSVM packages perform the scaling to reduce the condition number, and return

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala
@@ -229,10 +229,15 @@ class LogisticRegressionSuite extends FunSuite with LocalSparkContext with Match
     val modelB2 = lrB.run(testRDD2, initialWeights)
     val modelB3 = lrB.run(testRDD3, initialWeights)
 
-    // Test the weights
+    // For models trained with feature standardization, the weights should
+    // be the same in the scaled space. Note that the weights here are already
+    // in the original space, so we transform them back to the scaled space to compare.
     assert(modelA1.weights(0) ~== modelA2.weights(0) * 1.0E3 absTol 0.01)
     assert(modelA1.weights(0) ~== modelA3.weights(0) * 1.0E6 absTol 0.01)
 
+    // Training data with different scales and without feature standardization
+    // will not yield the same result in the scaled space due to the poor
+    // convergence rate.
     assert(modelB1.weights(0) !~== modelB2.weights(0) * 1.0E3 absTol 0.1)
     assert(modelB1.weights(0) !~== modelB3.weights(0) * 1.0E6 absTol 0.1)
   }