From bef1084283f66fcd3ec5be3d14f30b206b7c45a5 Mon Sep 17 00:00:00 2001
From: Bago Amirbekian
Date: Thu, 29 Mar 2018 14:48:48 -0700
Subject: [PATCH 1/3] Have instrumentation use uuid & standard logger tags.

---
 .../org/apache/spark/ml/util/Instrumentation.scala | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala b/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
index 7c46f45c5971..be56f36cd20b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.ml.util
 
+import java.util.UUID
 import java.util.concurrent.atomic.AtomicLong
 
 import org.json4s._
@@ -42,7 +43,7 @@ import org.apache.spark.sql.Dataset
 private[spark] class Instrumentation[E <: Estimator[_]] private (
     estimator: E, dataset: RDD[_]) extends Logging {
 
-  private val id = Instrumentation.counter.incrementAndGet()
+  private val id = UUID.randomUUID()
   private val prefix = {
     val className = estimator.getClass.getSimpleName
     s"$className-${estimator.uid}-${dataset.hashCode()}-$id: "
@@ -77,11 +78,11 @@ private[spark] class Instrumentation[E <: Estimator[_]] private (
   }
 
   def logNumFeatures(num: Long): Unit = {
-    log(compact(render("numFeatures" -> num)))
+    logNamedValue(Instrumentation.loggerTags.numFeatures, num)
   }
 
   def logNumClasses(num: Long): Unit = {
-    log(compact(render("numClasses" -> num)))
+    logNamedValue(Instrumentation.loggerTags.numClasses, num)
   }
 
   /**
@@ -107,7 +108,11 @@ private[spark] class Instrumentation[E <: Estimator[_]] private (
  * Some common methods for logging information about a training session.
  */
 private[spark] object Instrumentation {
-  private val counter = new AtomicLong(0)
+
+  object loggerTags {
+    val numFeatures = "numFeatures"
+    val numClasses = "numClasses"
+  }
 
   /**
    * Creates an instrumentation object for a training session.

From 42c3eca78a58ddca059b150681a0deaf462981f8 Mon Sep 17 00:00:00 2001
From: Bago Amirbekian
Date: Thu, 15 Mar 2018 11:14:48 -0700
Subject: [PATCH 2/3] Log data about logistic regression.
---
 .../apache/spark/ml/classification/LogisticRegression.scala | 6 ++++++
 .../scala/org/apache/spark/ml/util/Instrumentation.scala    | 1 +
 2 files changed, 7 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 3ae4db3f3f96..e10c683cbf63 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -517,6 +517,12 @@ class LogisticRegression @Since("1.2.0") (
         (new MultivariateOnlineSummarizer, new MultiClassSummarizer)
       )(seqOp, combOp, $(aggregationDepth))
     }
+    instr.logNamedValue(Instrumentation.loggerTags.numExamples, summarizer.count)
+    if (labelSummarizer.numClasses == 2) {
+      val b = labelSummarizer.histogram(0) / summarizer.count
+      instr.logNamedValue("lowestLabelWeight", labelSummarizer.histogram.min.toString)
+      instr.logNamedValue("highestLabelWeight", labelSummarizer.histogram.max.toString)
+    }
 
     val histogram = labelSummarizer.histogram
     val numInvalid = labelSummarizer.countInvalid
diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala b/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
index be56f36cd20b..aae12429c823 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
@@ -112,6 +112,7 @@ private[spark] object Instrumentation {
   object loggerTags {
     val numFeatures = "numFeatures"
     val numClasses = "numClasses"
+    val numExamples = "numExamples"
   }
 
   /**

From 8a3ce3e4958420dc5e92e7c1d31d5eab1735bb9e Mon Sep 17 00:00:00 2001
From: Bago Amirbekian
Date: Fri, 30 Mar 2018 16:34:15 -0700
Subject: [PATCH 3/3] Add logWarning to instrumentation class.

---
 .../classification/LogisticRegression.scala | 17 +++++++----------
 .../spark/ml/util/Instrumentation.scala     | 19 +++++++++++++++----
 2 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index e10c683cbf63..a8401de685b8 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -518,11 +518,8 @@ class LogisticRegression @Since("1.2.0") (
       )(seqOp, combOp, $(aggregationDepth))
     }
     instr.logNamedValue(Instrumentation.loggerTags.numExamples, summarizer.count)
-    if (labelSummarizer.numClasses == 2) {
-      val b = labelSummarizer.histogram(0) / summarizer.count
-      instr.logNamedValue("lowestLabelWeight", labelSummarizer.histogram.min.toString)
-      instr.logNamedValue("highestLabelWeight", labelSummarizer.histogram.max.toString)
-    }
+    instr.logNamedValue("lowestLabelWeight", labelSummarizer.histogram.min.toString)
+    instr.logNamedValue("highestLabelWeight", labelSummarizer.histogram.max.toString)
 
     val histogram = labelSummarizer.histogram
     val numInvalid = labelSummarizer.countInvalid
@@ -573,8 +570,8 @@ class LogisticRegression @Since("1.2.0") (
     val isConstantLabel = histogram.count(_ != 0.0) == 1
 
     if ($(fitIntercept) && isConstantLabel && !usingBoundConstrainedOptimization) {
-      logWarning(s"All labels are the same value and fitIntercept=true, so the coefficients " +
-        s"will be zeros. Training is not needed.")
+      instr.logWarning(s"All labels are the same value and fitIntercept=true, so the " +
+        s"coefficients will be zeros. Training is not needed.")
       val constantLabelIndex = Vectors.dense(histogram).argmax
       val coefMatrix = new SparseMatrix(numCoefficientSets, numFeatures,
         new Array[Int](numCoefficientSets + 1), Array.empty[Int], Array.empty[Double],
@@ -587,7 +584,7 @@ class LogisticRegression @Since("1.2.0") (
       (coefMatrix, interceptVec, Array.empty[Double])
     } else {
       if (!$(fitIntercept) && isConstantLabel) {
-        logWarning(s"All labels belong to a single class and fitIntercept=false. It's a " +
+        instr.logWarning(s"All labels belong to a single class and fitIntercept=false. It's a " +
           s"dangerous ground, so the algorithm may not converge.")
       }
 
@@ -596,7 +593,7 @@ class LogisticRegression @Since("1.2.0") (
 
       if (!$(fitIntercept) && (0 until numFeatures).exists { i =>
         featuresStd(i) == 0.0 && featuresMean(i) != 0.0 }) {
-        logWarning("Fitting LogisticRegressionModel without intercept on dataset with " +
+        instr.logWarning("Fitting LogisticRegressionModel without intercept on dataset with " +
           "constant nonzero column, Spark MLlib outputs zero coefficients for constant " +
           "nonzero columns. This behavior is the same as R glmnet but different from LIBSVM.")
       }
@@ -714,7 +711,7 @@ class LogisticRegression @Since("1.2.0") (
             (_initialModel.interceptVector.size == numCoefficientSets) &&
             (_initialModel.getFitIntercept == $(fitIntercept))
           if (!modelIsValid) {
-            logWarning(s"Initial coefficients will be ignored! Its dimensions " +
+            instr.logWarning(s"Initial coefficients will be ignored! Its dimensions " +
              s"(${providedCoefs.numRows}, ${providedCoefs.numCols}) did not match the " +
              s"expected size ($numCoefficientSets, $numFeatures)")
          }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala b/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
index aae12429c823..25ff8e0bbda7 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
@@ -18,7 +18,6 @@
 package org.apache.spark.ml.util
 
 import java.util.UUID
-import java.util.concurrent.atomic.AtomicLong
 
 import org.json4s._
 import org.json4s.JsonDSL._
@@ -57,12 +56,24 @@ private[spark] class Instrumentation[E <: Estimator[_]] private (
   }
 
   /**
-   * Logs a message with a prefix that uniquely identifies the training session.
+   * Logs a warning message with a prefix that uniquely identifies the training session.
    */
-  def log(msg: String): Unit = {
-    logInfo(prefix + msg)
+  override def logWarning(msg: => String): Unit = {
+    super.logWarning(prefix + msg)
   }
 
+  /**
+   * Logs an info message with a prefix that uniquely identifies the training session.
+   */
+  override def logInfo(msg: => String): Unit = {
+    super.logInfo(prefix + msg)
+  }
+
+  /**
+   * Alias for logInfo, see above.
+   */
+  def log(msg: String): Unit = logInfo(msg)
+
   /**
    * Logs the value of the given parameters for the estimator being used in this session.
    */
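
Illustrative usage (a sketch, not part of the patch series): after all three patches, code
inside a Spark ML estimator's train() could exercise the updated Instrumentation API roughly
as below. Instrumentation is private[spark], so this only applies inside Spark ML itself.
Instrumentation.create and logParams are the existing helpers referenced by the comments in
Instrumentation.scala; the surrounding estimator and the dataset, maxIter, regParam,
numExamples, numFeatures and numClasses values are hypothetical placeholders.

  // Sketch only: assumes `this` is a Spark ML Estimator and that dataset, maxIter,
  // regParam, numExamples, numFeatures and numClasses are defined by the surrounding code.
  val instr = Instrumentation.create(this, dataset)  // log prefix now carries a UUID instead of a counter
  instr.logParams(maxIter, regParam)
  instr.logNamedValue(Instrumentation.loggerTags.numExamples, numExamples)
  instr.logNumFeatures(numFeatures)                  // now routed through logNamedValue + loggerTags
  instr.logNumClasses(numClasses)
  if (numClasses == 1) {
    // With PATCH 3/3, warnings also carry the per-session prefix.
    instr.logWarning("All instances have the same label; the fitted model is trivial.")
  }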