rename to modelPreservePath

YY-OnCall · YY-OnCall · commit 39c025f53b08 · 2017-06-26T12:39:25.000-07:00
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
@@ -92,13 +92,13 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String)
   def setSeed(value: Long): this.type = set(seed, value)
 
   /**
-   * If set, all the models fitted during the cross validation will be preserved
-   * under the specific directory path. By default the models will not be saved.
+   * Optional parameter. If set, all the models trained during cross validation will be
+   * saved under the specified path. By default the models will not be preserved.
    *
    * @group expertSetParam
    */
   @Since("2.3.0")
-  def setModelPath(value: String): this.type = set(modelPath, value)
+  def setModelPreservePath(value: String): this.type = set(modelPreservePath, value)
 
   @Since("2.0.0")
   override def fit(dataset: Dataset[_]): CrossValidatorModel = {
@@ -128,13 +128,15 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String)
         // TODO: duplicate evaluator to take extra params from input
         val metric = eval.evaluate(models(i).transform(validationDataset, epm(i)))
         logDebug(s"Got metric $metric for model trained with ${epm(i)}.")
-        if (isDefined(modelPath)) {
+        if (isDefined(modelPreservePath)) {
           models(i) match {
             case w: MLWritable =>
-              val path = new Path($(modelPath), epm(i).toSeq.map(p => p.param.name + "-" + p.value)
-                .mkString("-") + s"-split$splitIndex-${math.rint(metric * 1000) / 1000}")
-              w.save(path.toString)
+              // e.g. maxIter-5-regParam-0.001-split0-0.859
+              val fileName = epm(i).toSeq.map(p => p.param.name + "-" + p.value).sorted
+                .mkString("-") + s"-split$splitIndex-${math.rint(metric * 1000) / 1000}"
+              w.save(new Path($(modelPreservePath), fileName).toString)
             case _ =>
+              // e.g. third-party models that do not implement MLWritable
               logWarning(models(i).uid + " did not implement MLWritable. Serialization omitted.")
           }
         }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala
@@ -88,13 +88,13 @@ class TrainValidationSplit @Since("1.5.0") (@Since("1.5.0") override val uid: St
   def setSeed(value: Long): this.type = set(seed, value)
 
   /**
-   * If set, all the models fitted during the training will be preserved
+   * Optional parameter. If set, all the models fitted during the training will be saved
    * under the specific directory path. By default the models will not be saved.
    *
    * @group expertSetParam
    */
   @Since("2.3.0")
-  def setModelPath(value: String): this.type = set(modelPath, value)
+  def setModelPreservePath(value: String): this.type = set(modelPreservePath, value)
 
   @Since("2.0.0")
   override def fit(dataset: Dataset[_]): TrainValidationSplitModel = {
@@ -124,12 +124,13 @@ class TrainValidationSplit @Since("1.5.0") (@Since("1.5.0") override val uid: St
       // TODO: duplicate evaluator to take extra params from input
       val metric = eval.evaluate(models(i).transform(validationDataset, epm(i)))
       logDebug(s"Got metric $metric for model trained with ${epm(i)}.")
-      if (isDefined(modelPath)) {
+      if (isDefined(modelPreservePath)) {
         models(i) match {
           case w: MLWritable =>
-            val path = new Path($(modelPath), epm(i).toSeq.map(p => p.param.name + "-" + p.value)
-              .mkString("-") + s"-${math.rint(metric * 1000) / 1000}")
-            w.save(path.toString)
+            // e.g. maxIter-5-regParam-0.001-0.859
+            val fileName = epm(i).toSeq.map(p => p.param.name + "-" + p.value).sorted
+              .mkString("-") + s"-${math.rint(metric * 1000) / 1000}"
+            w.save(new Path($(modelPreservePath), fileName).toString)
           case _ =>
             logWarning(models(i).uid + " did not implement MLWritable. Serialization omitted.")
         }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/ValidatorParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/ValidatorParams.scala
@@ -70,18 +70,18 @@ private[ml] trait ValidatorParams extends HasSeed with Params {
 
 
   /**
-   * Optional parameter. If set, all the models fitted during the cross validation will be
-   * saved in the specific path. By default the models will not be saved.
+   * Optional parameter. If set, all the models trained during the tuning grid search will be
+   * saved in the specified path. By default the models will not be preserved.
    *
    * @group expertParam
    */
-  val modelPath: Param[String] = new Param(this, "modelPath",
+  val modelPreservePath: Param[String] = new Param(this, "modelPreservePath",
     "Optional parameter. If set, all the models fitted during the cross validation will be" +
       " saved in the path")
 
   /** @group expertGetParam */
   @Since("2.3.0")
-  def getModelPath: String = $(modelPath)
+  def getModelPreservePath: String = $(modelPreservePath)
 
   protected def transformSchemaImpl(schema: StructType): StructType = {
     require($(estimatorParamMaps).nonEmpty, s"Validator requires non-empty estimatorParamMaps")
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
@@ -57,7 +57,7 @@ class CrossValidatorSuite
       .setEstimatorParamMaps(lrParamMaps)
       .setEvaluator(eval)
       .setNumFolds(3)
-    assert(!cv.isDefined(cv.modelPath))
+    assert(!cv.isDefined(cv.modelPreservePath))
     val cvModel = cv.fit(dataset)
 
     MLTestingUtils.checkCopyAndUids(cv, cvModel)
@@ -258,7 +258,7 @@ class CrossValidatorSuite
       .setEstimatorParamMaps(lrParamMaps)
       .setEvaluator(eval)
       .setNumFolds(3)
-      .setModelPath(path)
+      .setModelPreservePath(path)
     try {
       cv.fit(dataset)
       assert(tempDir.list().length === 3 * 2 * 2)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala
@@ -54,7 +54,7 @@ class TrainValidationSplitSuite
       .setSeed(42L)
     val tvsModel = tvs.fit(dataset)
     val parent = tvsModel.bestModel.parent.asInstanceOf[LogisticRegression]
-    assert(!tvs.isDefined(tvs.modelPath))
+    assert(!tvs.isDefined(tvs.modelPreservePath))
     assert(tvs.getTrainRatio === 0.5)
     assert(parent.getRegParam === 0.001)
     assert(parent.getMaxIter === 10)
@@ -136,7 +136,7 @@ class TrainValidationSplitSuite
       .setEvaluator(eval)
       .setTrainRatio(0.5)
       .setSeed(42L)
-      .setModelPath(path)
+      .setModelPreservePath(path)
     try {
       tvs.fit(dataset)
       assert(tempDir.list().length === 2 * 2)