Added functionality for tuning parallelism in the Scala implementatio…

…n of the one vs. rest algorithm.
apache · WeichenXu123 · Jun 12, 2017 · Jun 12, 2017 · Jun 13, 2017 · Jun 13, 2017
commit 81d458be99cf4f195b761eaa9bcb48ea086cdf61
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
@@ -21,6 +21,8 @@ import java.util.{List => JList}
 import java.util.UUID
 
 import scala.collection.JavaConverters._
+import scala.collection.parallel.ForkJoinTaskSupport
+import scala.concurrent.forkjoin.ForkJoinPool
 import scala.language.existentials
 
 import org.apache.hadoop.fs.Path
@@ -67,7 +69,7 @@ private[ml] trait OneVsRestParams extends PredictorParams with ClassifierTypeTra
   def getClassifier: ClassifierType = $(classifier)
 
   val parallelism = new IntParam(this, "parallelism",
-    "parallelism parameter for tuning amount of parallelism", ParamValidators.gt(1))
+    "parallelism parameter for tuning amount of parallelism", ParamValidators.gtEq(1))
 
   /** @group getParam */
   def getParallelism: Int = $(parallelism)
@@ -279,6 +281,10 @@ final class OneVsRest @Since("1.4.0") (
     @Since("1.4.0") override val uid: String)
   extends Estimator[OneVsRestModel] with OneVsRestParams with MLWritable {
 
+  setDefault(
+    parallelism -> 4
+  )
+
   @Since("1.4.0")
   def this() = this(Identifiable.randomUID("oneVsRest"))
 
@@ -337,8 +343,13 @@ final class OneVsRest @Since("1.4.0") (
       multiclassLabeled.persist(StorageLevel.MEMORY_AND_DISK)
     }
 
+    val iters = Range(0, numClasses).par
+    iters.tasksupport = new ForkJoinTaskSupport(
+      new ForkJoinPool(getParallelism)
+    )
+
     // create k columns, one for each binary classifier.
-    val models = Range(0, numClasses).par.map { index =>
+    val models = iters.map { index =>
       // generate new label metadata for the binary problem.
       val newLabelMeta = BinaryAttribute.defaultAttr.withName("label").toMetadata()
       val labelColName = "mc2b$" + index

diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
@@ -101,6 +101,40 @@ class OneVsRestSuite extends SparkFunSuite with MLlibTestSparkContext with Defau
     assert(expectedMetrics.confusionMatrix ~== ovaMetrics.confusionMatrix absTol 400)
   }
 
+  test("one-vs-rest: tuning parallelism produces correct output") {
+    val numClasses = 3
+    val ova = new OneVsRest()
+      .setClassifier(new LogisticRegression)
+      .setParallelism(8)
+    assert(ova.getLabelCol === "label")
+    assert(ova.getPredictionCol === "prediction")
+    val ovaModel = ova.fit(dataset)
+
+    MLTestingUtils.checkCopyAndUids(ova, ovaModel)
+
+    assert(ovaModel.models.length === numClasses)
+    val transformedDataset = ovaModel.transform(dataset)
+
+    // the metadata of the prediction column must report the number of classes
+    val predictionColSchema = transformedDataset.schema(ovaModel.getPredictionCol)
+    assert(MetadataUtils.getNumClasses(predictionColSchema) === Some(3))
+
+    val ovaResults = transformedDataset.select("prediction", "label").rdd.map {
+      row => (row.getDouble(0), row.getDouble(1))
+    }
+
+    val lr = new LogisticRegressionWithLBFGS().setIntercept(true).setNumClasses(numClasses)
+    lr.optimizer.setRegParam(0.1).setNumIterations(100)
+
+    val model = lr.run(rdd.map(OldLabeledPoint.fromML))
+    val results = model.predict(rdd.map(p => OldVectors.fromML(p.features))).zip(rdd.map(_.label))
+    // build multiclass metrics for the multinomial LR baseline and for the one-vs-rest model.
+    // the two confusion matrices must agree within an absolute tolerance of 400.
+    val expectedMetrics = new MulticlassMetrics(results)
+    val ovaMetrics = new MulticlassMetrics(ovaResults)
+    assert(expectedMetrics.confusionMatrix ~== ovaMetrics.confusionMatrix absTol 400)
+  }
+
   test("one-vs-rest: pass label metadata correctly during train") {
     val numClasses = 3
     val ova = new OneVsRest()

diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
@@ -1517,13 +1517,13 @@ class OneVsRest(Estimator, OneVsRestParams, MLReadable, MLWritable):
 
     @keyword_only
     def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
-                 classifier=None, parallelism=8):
+                 classifier=None, parallelism=4):
         """
         __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
                  classifier=None)
         """
         super(OneVsRest, self).__init__()
-        self._setDefault(parallelism=8)
+        self._setDefault(parallelism=4)
         kwargs = self._input_kwargs
         self._set(**kwargs)