```diff
@@ -490,7 +490,7 @@ class LogisticRegression @Since("1.2.0") (
 
   protected[spark] def train(
       dataset: Dataset[_],
-      handlePersistence: Boolean): LogisticRegressionModel = {
+      handlePersistence: Boolean): LogisticRegressionModel = Instrumentation.instrumented { instr =>
```
**Contributor:** To keep this line from running too wide, we might want to import `instrumented` so the `Instrumentation.` prefix can be dropped from this line.
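A minimal sketch of what that suggestion would look like (assuming `instrumented` is the helper added in this PR; the method body is elided):

```scala
import org.apache.spark.ml.util.Instrumentation.instrumented

// With the helper imported directly, the signature line stays within the width limit:
protected[spark] def train(
    dataset: Dataset[_],
    handlePersistence: Boolean): LogisticRegressionModel = instrumented { instr =>
  // ... training body unchanged ...
}
```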

```diff
   val w = if (!isDefined(weightCol) || $(weightCol).isEmpty) lit(1.0) else col($(weightCol))
   val instances: RDD[Instance] =
     dataset.select(col($(labelCol)), w, col($(featuresCol))).rdd.map {
@@ -500,7 +500,7 @@ class LogisticRegression @Since("1.2.0") (
 
   if (handlePersistence) instances.persist(StorageLevel.MEMORY_AND_DISK)
 
-  val instr = Instrumentation.create(this, dataset)
+  instr.logContext(this, dataset)
```
**Contributor:** It doesn't log anything. I think we should auto-generate the prefix and keep it as a constant, so logs would appear as:

```
[PREFIX]: instrumentation started
[PREFIX]: using estimator logReg-abc128
[PREFIX]: using dataset some hashcode
[PREFIX]: param maxIter=10
[PREFIX]: ...
[PREFIX]: run succeeded/failed
[PREFIX]: instrumentation ended
```

We can generate 8 random chars as the PREFIX; that is sufficient to correlate metrics from the same run. The issue with making the prefix mutable is that we have no way to guarantee `logContext` is always called. So I would suggest replacing `logContext` with the following (sketched below):

- `logEstimator` or `logPipelineStage`
- `logDataset`

Btw, we could log the call site by default; it provides more info for instrumentation, though that is not necessary in this PR.
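A rough sketch of the API shape this comment proposes. The method names `logPipelineStage` and `logDataset` come from the comment itself; the bodies and the exact log format are illustrative assumptions, not code from this PR:

```scala
import java.util.UUID

import org.apache.spark.internal.Logging
import org.apache.spark.ml.PipelineStage
import org.apache.spark.sql.Dataset

private[spark] class Instrumentation extends Logging {

  // Constant prefix: 8 random chars generated once per training session,
  // so every line from the same run can be correlated.
  private val prefix = UUID.randomUUID().toString.take(8)

  logInfo(s"[$prefix]: instrumentation started")

  // No mutable context: callers log the stage and dataset explicitly.
  def logPipelineStage(stage: PipelineStage): Unit =
    logInfo(s"[$prefix]: using estimator ${stage.uid}")

  def logDataset(dataset: Dataset[_]): Unit =
    logInfo(s"[$prefix]: using dataset ${dataset.hashCode()}")
}
```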

```diff
   instr.logParams(regParam, elasticNetParam, standardization, threshold,
     maxIter, tol, fitIntercept)
@@ -905,8 +905,6 @@ class LogisticRegression @Since("1.2.0") (
         objectiveHistory)
     }
     model.setSummary(Some(logRegSummary))
-    instr.logSuccess(model)
-    model
   }
 
   @Since("1.4.0")
```
```diff
@@ -91,7 +91,7 @@ private[spark] object RandomForest extends Logging {
       numTrees: Int,
       featureSubsetStrategy: String,
       seed: Long,
-      instr: Option[Instrumentation[_]],
+      instr: Option[Instrumentation],
       prune: Boolean = true, // exposed for testing only, real trees are always pruned
       parentUID: Option[String] = None): Array[DecisionTreeModel] = {
```
```diff
@@ -80,7 +80,7 @@ private[ml] trait ValidatorParams extends HasSeed with Params {
   /**
    * Instrumentation logging for tuning params including the inner estimator and evaluator info.
    */
-  protected def logTuningParams(instrumentation: Instrumentation[_]): Unit = {
+  protected def logTuningParams(instrumentation: Instrumentation): Unit = {
     instrumentation.logNamedValue("estimator", $(estimator).getClass.getCanonicalName)
     instrumentation.logNamedValue("evaluator", $(evaluator).getClass.getCanonicalName)
     instrumentation.logNamedValue("estimatorParamMapsLength", $(estimatorParamMaps).length)
```
**mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala** (73 additions, 27 deletions)

```diff
@@ -19,7 +19,8 @@ package org.apache.spark.ml.util
 
 import java.util.UUID
 
-import scala.reflect.ClassTag
+import scala.util.{Failure, Success, Try}
+import scala.util.control.NonFatal
 
 import org.json4s._
 import org.json4s.JsonDSL._
```
```diff
@@ -35,32 +36,47 @@ import org.apache.spark.util.Utils
 /**
  * A small wrapper that defines a training session for an estimator, and some methods to log
  * useful information during this session.
- *
- * A new instance is expected to be created within fit().
- *
- * @param estimator the estimator that is being fit
- * @param dataset the training dataset
- * @tparam E the type of the estimator
  */
-private[spark] class Instrumentation[E <: Estimator[_]] private (
-    val estimator: E,
-    val dataset: RDD[_]) extends Logging {
+private[spark] class Instrumentation extends Logging {
 
   private val id = UUID.randomUUID()
-  private val prefix = {
-    // estimator.getClass.getSimpleName can cause Malformed class name error,
-    // call safer `Utils.getSimpleName` instead
-    val className = Utils.getSimpleName(estimator.getClass)
-    s"$className-${estimator.uid}-${dataset.hashCode()}-$id: "
-  }
-
-  init()
+  private val shortId = id.toString.take(8)
+  private var prefix = s"$shortId:"
 
+  // TODO: update spark.ml to use new Instrumentation APIs and remove this constructor
+  var estimator: Estimator[_] = _
+  private def this(estimator: Estimator[_], dataset: RDD[_]) = {
+    this()
+    logContext(estimator, dataset)
+  }
 
+  /**
+   * Log info about the estimator and dataset being fit.
+   *
+   * @param estimator the estimator that is being fit
+   * @param dataset the training dataset
+   */
+  def logContext(estimator: Estimator[_], dataset: RDD[_]): Unit = {
```
**Contributor:** See my comment above.

```diff
+    this.estimator = estimator
+    prefix = {
+      // estimator.getClass.getSimpleName can cause Malformed class name error,
+      // call safer `Utils.getSimpleName` instead
+      val className = Utils.getSimpleName(estimator.getClass)
+      s"$shortId-$className-${estimator.uid}-${dataset.hashCode()}:"
+    }
 
-  private def init(): Unit = {
     log(s"training: numPartitions=${dataset.partitions.length}" +
       s" storageLevel=${dataset.getStorageLevel}")
   }
 
+  /**
+   * Log info about the estimator and dataset being fit.
+   *
+   * @param e the estimator that is being fit
+   * @param dataset the training dataset
+   */
+  def logContext(e: Estimator[_], dataset: Dataset[_]): Unit = logContext(e, dataset.rdd)
 
 /**
  * Logs a debug message with a prefix that uniquely identifies the training session.
  */
```
```diff
@@ -97,7 +113,7 @@ private[spark] class Instrumentation[E <: Estimator[_]] private (
   /**
    * Logs the value of the given parameters for the estimator being used in this session.
    */
-  def logParams(params: Param[_]*): Unit = {
+  def logParams(estimator: Estimator[_], params: Param[_]*): Unit = {
     val pairs: Seq[(String, JValue)] = for {
       p <- params
       value <- estimator.get(p)
@@ -108,6 +124,12 @@ private[spark] class Instrumentation[E <: Estimator[_]] private (
     log(compact(render(map2jvalue(pairs.toMap))))
   }
 
+  // TODO: remove this
+  def logParams(params: Param[_]*): Unit = {
+    require(estimator != null, "`logContext` must be called before `logParams`.")
+    logParams(estimator, params: _*)
+  }
+
   def logNumFeatures(num: Long): Unit = {
     logNamedValue(Instrumentation.loggerTags.numFeatures, num)
   }
```
```diff
@@ -148,12 +170,25 @@ private[spark] class Instrumentation[E <: Estimator[_]] private (
   }
 
+  // TODO: Remove this (possibly replace with logModel?)
   /**
    * Logs the successful completion of the training session.
    */
   def logSuccess(model: Model[_]): Unit = {
     log(s"training finished")
   }
 
+  def logSuccess(): Unit = {
+    log("training finished")
```
**Contributor:** We shouldn't have this `log` alias. I was wondering which log level it uses. Just use `logInfo` and remove `log(`.
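A minimal sketch of that suggestion (illustrative only):

```scala
def logSuccess(): Unit = {
  // Log directly at INFO level instead of going through the `log` alias.
  logInfo("training finished")
}
```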

```diff
+  }
 
+  /**
+   * Logs an exception raised during a training session.
+   */
+  def logFailure(e: Throwable): Unit = {
+    val msg = e.getStackTrace.mkString("\n")
+    super.logInfo(msg)
```
**Contributor:** Failures should go to the ERROR level.
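A minimal sketch of the suggested change (illustrative only):

```scala
def logFailure(e: Throwable): Unit = {
  // Surface failures at ERROR level rather than INFO.
  val msg = e.getStackTrace.mkString("\n")
  super.logError(msg)
}
```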

```diff
+  }
 }
 
@@ -169,22 +204,33 @@ private[spark] object Instrumentation {
     val varianceOfLabels = "varianceOfLabels"
   }
 
+  // TODO: Remove these
   /**
    * Creates an instrumentation object for a training session.
    */
-  def create[E <: Estimator[_]](
-      estimator: E, dataset: Dataset[_]): Instrumentation[E] = {
-    create[E](estimator, dataset.rdd)
+  def create(estimator: Estimator[_], dataset: Dataset[_]): Instrumentation = {
+    create(estimator, dataset.rdd)
   }
 
   /**
    * Creates an instrumentation object for a training session.
    */
-  def create[E <: Estimator[_]](
-      estimator: E, dataset: RDD[_]): Instrumentation[E] = {
-    new Instrumentation[E](estimator, dataset)
+  def create(estimator: Estimator[_], dataset: RDD[_]): Instrumentation = {
+    new Instrumentation(estimator, dataset)
   }
+  // end remove
 
+  def instrumented[T](body: (Instrumentation => T)): T = {
+    val instr = new Instrumentation()
+    Try(body(new Instrumentation())) match {
```
**Contributor:** Use the already constructed `instr`.

```diff
+      case Failure(NonFatal(e)) =>
+        instr.logFailure(e)
+        throw e
+      case Success(model) =>
```
**Contributor:** `model` -> `result`; it doesn't need to be a model.

```diff
+        instr.logSuccess()
+        model
+    }
+  }
 
 }
```
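Taken together, the two review comments above point at a version of the helper like this (a sketch of the suggested follow-up, not the code as committed):

```scala
def instrumented[T](body: (Instrumentation => T)): T = {
  val instr = new Instrumentation()
  Try(body(instr)) match {              // reuse the instance constructed above
    case Failure(NonFatal(e)) =>
      instr.logFailure(e)
      throw e
    case Success(result) =>             // renamed from `model`: any result type works
      instr.logSuccess()
      result
  }
}
```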

```diff
@@ -193,7 +239,7 @@ private[spark] object Instrumentation {
  * will log via it, otherwise will log via common logger.
  */
 private[spark] class OptionalInstrumentation private(
-    val instrumentation: Option[Instrumentation[_ <: Estimator[_]]],
+    val instrumentation: Option[Instrumentation],
     val className: String) extends Logging {
 
   protected override def logName: String = className
```
```diff
@@ -225,7 +271,7 @@ private[spark] object OptionalInstrumentation {
   /**
    * Creates an `OptionalInstrumentation` object from an existing `Instrumentation` object.
    */
-  def create[E <: Estimator[_]](instr: Instrumentation[E]): OptionalInstrumentation = {
+  def create(instr: Instrumentation): OptionalInstrumentation = {
     new OptionalInstrumentation(Some(instr),
       instr.estimator.getClass.getName.stripSuffix("$"))
   }
```
```diff
@@ -235,7 +235,7 @@ class KMeans private (
 
   private[spark] def run(
       data: RDD[Vector],
-      instr: Option[Instrumentation[NewKMeans]]): KMeansModel = {
+      instr: Option[Instrumentation]): KMeansModel = {
 
     if (data.getStorageLevel == StorageLevel.NONE) {
       logWarning("The input data is not directly cached, which may hurt performance if its"
@@ -264,7 +264,7 @@ class KMeans private (
    */
   private def runAlgorithm(
       data: RDD[VectorWithNorm],
-      instr: Option[Instrumentation[NewKMeans]]): KMeansModel = {
+      instr: Option[Instrumentation]): KMeansModel = {
 
     val sc = data.sparkContext
```