From bef1084283f66fcd3ec5be3d14f30b206b7c45a5 Mon Sep 17 00:00:00 2001
From: Bago Amirbekian
Date: Thu, 29 Mar 2018 14:48:48 -0700
Subject: [PATCH 1/3] Have instrumentation use uuid & standard logger tags.

---
 .../org/apache/spark/ml/util/Instrumentation.scala | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala b/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
index 7c46f45c5971..be56f36cd20b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.ml.util
 
+import java.util.UUID
 import java.util.concurrent.atomic.AtomicLong
 
 import org.json4s._
@@ -42,7 +43,7 @@ import org.apache.spark.sql.Dataset
 private[spark] class Instrumentation[E <: Estimator[_]] private (
     estimator: E, dataset: RDD[_]) extends Logging {
 
-  private val id = Instrumentation.counter.incrementAndGet()
+  private val id = UUID.randomUUID()
   private val prefix = {
     val className = estimator.getClass.getSimpleName
     s"$className-${estimator.uid}-${dataset.hashCode()}-$id: "
@@ -77,11 +78,11 @@ private[spark] class Instrumentation[E <: Estimator[_]] private (
   }
 
   def logNumFeatures(num: Long): Unit = {
-    log(compact(render("numFeatures" -> num)))
+    logNamedValue(Instrumentation.loggerTags.numFeatures, num)
   }
 
   def logNumClasses(num: Long): Unit = {
-    log(compact(render("numClasses" -> num)))
+    logNamedValue(Instrumentation.loggerTags.numClasses, num)
   }
 
   /**
@@ -107,7 +108,11 @@ private[spark] class Instrumentation[E <: Estimator[_]] private (
  * Some common methods for logging information about a training session.
  */
 private[spark] object Instrumentation {
-  private val counter = new AtomicLong(0)
+
+  object loggerTags {
+    val numFeatures = "numFeatures"
+    val numClasses = "numClasses"
+  }
 
   /**
    * Creates an instrumentation object for a training session.

From 42c3eca78a58ddca059b150681a0deaf462981f8 Mon Sep 17 00:00:00 2001
From: Bago Amirbekian
Date: Thu, 15 Mar 2018 11:14:48 -0700
Subject: [PATCH 2/3] Log data about logistic regression.
---
 .../apache/spark/ml/classification/LogisticRegression.scala | 6 ++++++
 .../scala/org/apache/spark/ml/util/Instrumentation.scala    | 1 +
 2 files changed, 7 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 3ae4db3f3f96..e10c683cbf63 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -517,6 +517,12 @@ class LogisticRegression @Since("1.2.0") (
         (new MultivariateOnlineSummarizer, new MultiClassSummarizer)
       )(seqOp, combOp, $(aggregationDepth))
     }
+    instr.logNamedValue(Instrumentation.loggerTags.numExamples, summarizer.count)
+    if (labelSummarizer.numClasses == 2) {
+      val b = labelSummarizer.histogram(0) / summarizer.count
+      instr.logNamedValue("lowestLabelWeight", labelSummarizer.histogram.min.toString)
+      instr.logNamedValue("highestLabelWeight", labelSummarizer.histogram.max.toString)
+    }
 
     val histogram = labelSummarizer.histogram
     val numInvalid = labelSummarizer.countInvalid
diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala b/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
index be56f36cd20b..aae12429c823 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
@@ -112,6 +112,7 @@ private[spark] object Instrumentation {
   object loggerTags {
     val numFeatures = "numFeatures"
     val numClasses = "numClasses"
+    val numExamples = "numExamples"
   }
 
   /**

From 8a3ce3e4958420dc5e92e7c1d31d5eab1735bb9e Mon Sep 17 00:00:00 2001
From: Bago Amirbekian
Date: Fri, 30 Mar 2018 16:34:15 -0700
Subject: [PATCH 3/3] Add logWarning to instrumentation class.

---
 .../classification/LogisticRegression.scala | 17 +++++++----------
 .../spark/ml/util/Instrumentation.scala     | 19 +++++++++++++++----
 2 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index e10c683cbf63..a8401de685b8 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -518,11 +518,8 @@ class LogisticRegression @Since("1.2.0") (
       )(seqOp, combOp, $(aggregationDepth))
     }
     instr.logNamedValue(Instrumentation.loggerTags.numExamples, summarizer.count)
-    if (labelSummarizer.numClasses == 2) {
-      val b = labelSummarizer.histogram(0) / summarizer.count
-      instr.logNamedValue("lowestLabelWeight", labelSummarizer.histogram.min.toString)
-      instr.logNamedValue("highestLabelWeight", labelSummarizer.histogram.max.toString)
-    }
+    instr.logNamedValue("lowestLabelWeight", labelSummarizer.histogram.min.toString)
+    instr.logNamedValue("highestLabelWeight", labelSummarizer.histogram.max.toString)
 
     val histogram = labelSummarizer.histogram
     val numInvalid = labelSummarizer.countInvalid
@@ -573,8 +570,8 @@ class LogisticRegression @Since("1.2.0") (
     val isConstantLabel = histogram.count(_ != 0.0) == 1
 
     if ($(fitIntercept) && isConstantLabel && !usingBoundConstrainedOptimization) {
-      logWarning(s"All labels are the same value and fitIntercept=true, so the coefficients " +
-        s"will be zeros. Training is not needed.")
+      instr.logWarning(s"All labels are the same value and fitIntercept=true, so the " +
+        s"coefficients will be zeros. Training is not needed.")
       val constantLabelIndex = Vectors.dense(histogram).argmax
       val coefMatrix = new SparseMatrix(numCoefficientSets, numFeatures,
         new Array[Int](numCoefficientSets + 1), Array.empty[Int], Array.empty[Double],
@@ -587,7 +584,7 @@ class LogisticRegression @Since("1.2.0") (
       (coefMatrix, interceptVec, Array.empty[Double])
     } else {
       if (!$(fitIntercept) && isConstantLabel) {
-        logWarning(s"All labels belong to a single class and fitIntercept=false. It's a " +
+        instr.logWarning(s"All labels belong to a single class and fitIntercept=false. It's a " +
           s"dangerous ground, so the algorithm may not converge.")
       }
 
@@ -596,7 +593,7 @@ class LogisticRegression @Since("1.2.0") (
 
       if (!$(fitIntercept) && (0 until numFeatures).exists { i =>
         featuresStd(i) == 0.0 && featuresMean(i) != 0.0 }) {
-        logWarning("Fitting LogisticRegressionModel without intercept on dataset with " +
+        instr.logWarning("Fitting LogisticRegressionModel without intercept on dataset with " +
           "constant nonzero column, Spark MLlib outputs zero coefficients for constant " +
           "nonzero columns. This behavior is the same as R glmnet but different from LIBSVM.")
       }
@@ -714,7 +711,7 @@ class LogisticRegression @Since("1.2.0") (
             (_initialModel.interceptVector.size == numCoefficientSets) &&
             (_initialModel.getFitIntercept == $(fitIntercept))
           if (!modelIsValid) {
-            logWarning(s"Initial coefficients will be ignored! Its dimensions " +
+            instr.logWarning(s"Initial coefficients will be ignored! Its dimensions " +
              s"(${providedCoefs.numRows}, ${providedCoefs.numCols}) did not match the " +
              s"expected size ($numCoefficientSets, $numFeatures)")
          }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala b/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
index aae12429c823..25ff8e0bbda7 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
@@ -18,7 +18,6 @@
 package org.apache.spark.ml.util
 
 import java.util.UUID
-import java.util.concurrent.atomic.AtomicLong
 
 import org.json4s._
 import org.json4s.JsonDSL._
@@ -57,12 +56,24 @@ private[spark] class Instrumentation[E <: Estimator[_]] private (
   }
 
   /**
-   * Logs a message with a prefix that uniquely identifies the training session.
+   * Logs a warning message with a prefix that uniquely identifies the training session.
    */
-  def log(msg: String): Unit = {
-    logInfo(prefix + msg)
+  override def logWarning(msg: => String): Unit = {
+    super.logWarning(prefix + msg)
   }
 
+  /**
+   * Logs an info message with a prefix that uniquely identifies the training session.
+   */
+  override def logInfo(msg: => String): Unit = {
+    super.logInfo(prefix + msg)
+  }
+
+  /**
+   * Alias for logInfo, see above.
+   */
+  def log(msg: String): Unit = logInfo(msg)
+
   /**
    * Logs the value of the given parameters for the estimator being used in this session.
    */
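
Illustrative usage (a sketch, not part of the patch series): after all three patches, code
inside a Spark ML estimator's train() could exercise the updated Instrumentation API roughly
as below. Instrumentation is private[spark], so this only applies inside Spark ML itself.
Instrumentation.create and logParams are the existing helpers referenced by the comments in
Instrumentation.scala; the surrounding estimator and the dataset, maxIter, regParam,
numExamples, numFeatures and numClasses values are hypothetical placeholders.

  // Sketch only: assumes `this` is a Spark ML Estimator and that dataset, maxIter,
  // regParam, numExamples, numFeatures and numClasses are defined by the surrounding code.
  val instr = Instrumentation.create(this, dataset)  // log prefix now carries a UUID instead of a counter
  instr.logParams(maxIter, regParam)
  instr.logNamedValue(Instrumentation.loggerTags.numExamples, numExamples)
  instr.logNumFeatures(numFeatures)                  // now routed through logNamedValue + loggerTags
  instr.logNumClasses(numClasses)
  if (numClasses == 1) {
    // With PATCH 3/3, warnings also carry the per-session prefix.
    instr.logWarning("All instances have the same label; the fitted model is trivial.")
  }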