apache · WeichenXu123 · Sep 11, 2017 · Sep 12, 2017 · Sep 14, 2017 · Sep 27, 2017
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
@@ -82,7 +82,13 @@ private[shared] object SharedParamsCodeGen {
         "all instance weights as 1.0"),
       ParamDesc[String]("solver", "the solver algorithm for optimization", finalFields = false),
       ParamDesc[Int]("aggregationDepth", "suggested depth for treeAggregate (>= 2)", Some("2"),
-        isValid = "ParamValidators.gtEq(2)", isExpertParam = true))
+        isValid = "ParamValidators.gtEq(2)", isExpertParam = true),
+      ParamDesc[Boolean]("collectSubModels", "If set to false, then only the single best " +
+        "sub-model will be available after fitting. If set to true, then all sub-models will be " +
+        "available. Warning: For large models, collecting all sub-models can cause OOMs on the " +
+        "Spark driver.",
+        Some("false"), isExpertParam = true)
+    )
 
     val code = genSharedParams(params)
     val file = "src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala"

diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
@@ -402,4 +402,21 @@ private[ml] trait HasAggregationDepth extends Params {
   /** @group expertGetParam */
   final def getAggregationDepth: Int = $(aggregationDepth)
 }
+
+/**
+ * Trait for shared param collectSubModels (default: false).
+ */
+private[ml] trait HasCollectSubModels extends Params {
+
+  /**
+   * Param for whether to collect a list of sub-models trained during tuning.
+   * @group expertParam
+   */
+  final val collectSubModels: BooleanParam = new BooleanParam(this, "collectSubModels", "whether to collect a list of sub-models trained during tuning")
+
+  setDefault(collectSubModels, false)
+
+  /** @group expertGetParam */
+  final def getCollectSubModels: Boolean = $(collectSubModels)
+}
 // scalastyle:on
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.ml.tuning
 
-import java.util.{List => JList}
+import java.util.{List => JList, Locale}
 
 import scala.collection.JavaConverters._
 import scala.concurrent.Future
@@ -31,7 +31,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.evaluation.Evaluator
 import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
-import org.apache.spark.ml.param.shared.HasParallelism
+import org.apache.spark.ml.param.shared.{HasCollectSubModels, HasParallelism}
 import org.apache.spark.ml.util._
 import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.sql.{DataFrame, Dataset}
@@ -67,7 +67,8 @@ private[ml] trait CrossValidatorParams extends ValidatorParams {
 @Since("1.2.0")
 class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String)
   extends Estimator[CrossValidatorModel]
-  with CrossValidatorParams with HasParallelism with MLWritable with Logging {
+  with CrossValidatorParams with HasParallelism with HasCollectSubModels
+  with MLWritable with Logging {
 
   @Since("1.2.0")
   def this() = this(Identifiable.randomUID("cv"))
@@ -101,6 +102,10 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String)
   @Since("2.3.0")
   def setParallelism(value: Int): this.type = set(parallelism, value)
 
+  /** @group expertSetParam */
+  @Since("2.3.0")
+  def setCollectSubModels(value: Boolean): this.type = set(collectSubModels, value)
+
   @Since("2.0.0")
   override def fit(dataset: Dataset[_]): CrossValidatorModel = {
     val schema = dataset.schema
@@ -117,6 +122,12 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String)
     instr.logParams(numFolds, seed, parallelism)
     logTuningParams(instr)
 
+    val collectSubModelsParam = $(collectSubModels)
+
+    var subModels: Option[Array[Array[Model[_]]]] = if (collectSubModelsParam) {
+      Some(Array.fill($(numFolds))(Array.fill[Model[_]](epm.length)(null)))
+    } else None
+
     // Compute metrics for each model over each split
     val splits = MLUtils.kFold(dataset.toDF.rdd, $(numFolds), $(seed))
     val metrics = splits.zipWithIndex.map { case ((training, validation), splitIndex) =>
@@ -125,10 +136,14 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String)
       logDebug(s"Train split $splitIndex with multiple sets of parameters.")
 
       // Fit models in a Future for training in parallel
-      val modelFutures = epm.map { paramMap =>
+      val modelFutures = epm.zipWithIndex.map { case (paramMap, paramIndex) =>
         Future[Model[_]] {
-          val model = est.fit(trainingDataset, paramMap)
-          model.asInstanceOf[Model[_]]
+          val model = est.fit(trainingDataset, paramMap).asInstanceOf[Model[_]]
+
+          if (collectSubModelsParam) {
+            subModels.get(splitIndex)(paramIndex) = model
+          }
+          model
         } (executionContext)
       }
 
@@ -160,7 +175,8 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String)
     logInfo(s"Best cross-validation metric: $bestMetric.")
     val bestModel = est.fit(dataset, epm(bestIndex)).asInstanceOf[Model[_]]
     instr.logSuccess(bestModel)
-    copyValues(new CrossValidatorModel(uid, bestModel, metrics).setParent(this))
+    copyValues(new CrossValidatorModel(uid, bestModel, metrics)
+      .setSubModels(subModels).setParent(this))
   }
 
   @Since("1.4.0")
@@ -244,6 +260,21 @@ class CrossValidatorModel private[ml] (
     this(uid, bestModel, avgMetrics.asScala.toArray)
   }
 
+  private var _subModels: Option[Array[Array[Model[_]]]] = None
+
+  @Since("2.3.0")
+  private[tuning] def setSubModels(subModels: Option[Array[Array[Model[_]]]])
+    : CrossValidatorModel = {
+    _subModels = subModels
+    this
+  }
+
+  @Since("2.3.0")
+  def subModels: Array[Array[Model[_]]] = _subModels.get
+
+  @Since("2.3.0")
+  def hasSubModels: Boolean = _subModels.isDefined
+
   @Since("2.0.0")
   override def transform(dataset: Dataset[_]): DataFrame = {
     transformSchema(dataset.schema, logging = true)
@@ -260,7 +291,8 @@ class CrossValidatorModel private[ml] (
     val copied = new CrossValidatorModel(
       uid,
       bestModel.copy(extra).asInstanceOf[Model[_]],
-      avgMetrics.clone())
+      avgMetrics.clone()
+    ).setSubModels(CrossValidatorModel.copySubModels(_subModels))
     copyValues(copied, extra).setParent(parent)
   }
 
@@ -271,6 +303,20 @@ class CrossValidatorModel private[ml] (
 @Since("1.6.0")
 object CrossValidatorModel extends MLReadable[CrossValidatorModel] {
 
+  private[CrossValidatorModel] def copySubModels(subModels: Option[Array[Array[Model[_]]]]) = {
+    subModels.map { subModels =>
+      val numFolds = subModels.length
+      val numParamMaps = subModels(0).length
+      val copiedSubModels = Array.fill(numFolds)(Array.fill[Model[_]](numParamMaps)(null))
+      for (i <- 0 until numFolds) {
+        for (j <- 0 until numParamMaps) {
+          copiedSubModels(i)(j) = subModels(i)(j).copy(ParamMap.empty).asInstanceOf[Model[_]]
+        }
+      }
+      copiedSubModels
+    }
+  }
+
   @Since("1.6.0")
   override def read: MLReader[CrossValidatorModel] = new CrossValidatorModelReader
 
@@ -282,12 +328,40 @@ object CrossValidatorModel extends MLReadable[CrossValidatorModel] {
 
     ValidatorParams.validateParams(instance)
 
+    protected var shouldPersistSubModels: Boolean = if (instance.hasSubModels) true else false
+
+    /**
+     * Extra options for CrossValidatorModelWriter, current support "persistSubModels".
+     * if sub models exsit, the default value for option "persistSubModels" is "true".
+     */
+    @Since("2.3.0")
+    override def option(key: String, value: String): this.type = {
+      key.toLowerCase(Locale.ROOT) match {
+        case "persistsubmodels" => shouldPersistSubModels = value.toBoolean
+        case _ => throw new IllegalArgumentException(
+          s"Illegal option ${key} for CrossValidatorModelWriter")
+      }
+      this
+    }
+
     override protected def saveImpl(path: String): Unit = {
       import org.json4s.JsonDSL._
-      val extraMetadata = "avgMetrics" -> instance.avgMetrics.toSeq
+      val extraMetadata = ("avgMetrics" -> instance.avgMetrics.toSeq) ~
+        ("shouldPersistSubModels" -> shouldPersistSubModels)
       ValidatorParams.saveImpl(path, instance, sc, Some(extraMetadata))
       val bestModelPath = new Path(path, "bestModel").toString
       instance.bestModel.asInstanceOf[MLWritable].save(bestModelPath)
+      if (shouldPersistSubModels) {
+        require(instance.hasSubModels, "Cannot get sub models to persist.")
+        val subModelsPath = new Path(path, "subModels")
+        for (splitIndex <- 0 until instance.getNumFolds) {
+          val splitPath = new Path(subModelsPath, splitIndex.toString)
+          for (paramIndex <- 0 until instance.getEstimatorParamMaps.length) {
+            val modelPath = new Path(splitPath, paramIndex.toString).toString
+            instance.subModels(splitIndex)(paramIndex).asInstanceOf[MLWritable].save(modelPath)
+          }
+        }
+      }
     }
   }
 
@@ -301,11 +375,29 @@ object CrossValidatorModel extends MLReadable[CrossValidatorModel] {
 
       val (metadata, estimator, evaluator, estimatorParamMaps) =
         ValidatorParams.loadImpl(path, sc, className)
+      val numFolds = (metadata.params \ "numFolds").extract[Int]
       val bestModelPath = new Path(path, "bestModel").toString
       val bestModel = DefaultParamsReader.loadParamsInstance[Model[_]](bestModelPath, sc)
       val avgMetrics = (metadata.metadata \ "avgMetrics").extract[Seq[Double]].toArray
+      val shouldPersistSubModels = (metadata.metadata \ "shouldPersistSubModels").extract[Boolean]
+
+      val subModels: Option[Array[Array[Model[_]]]] = if (shouldPersistSubModels) {
+        val subModelsPath = new Path(path, "subModels")
+        val _subModels = Array.fill(numFolds)(Array.fill[Model[_]](
+          estimatorParamMaps.length)(null))
+        for (splitIndex <- 0 until numFolds) {
+          val splitPath = new Path(subModelsPath, splitIndex.toString)
+          for (paramIndex <- 0 until estimatorParamMaps.length) {
+            val modelPath = new Path(splitPath, paramIndex.toString).toString
+            _subModels(splitIndex)(paramIndex) =
+              DefaultParamsReader.loadParamsInstance(modelPath, sc)
+          }
+        }
+        Some(_subModels)
+      } else None
 
       val model = new CrossValidatorModel(metadata.uid, bestModel, avgMetrics)
+        .setSubModels(subModels)
       model.set(model.estimator, estimator)
         .set(model.evaluator, evaluator)
         .set(model.estimatorParamMaps, estimatorParamMaps)