apache · panbingkun · Apr 3, 2024 · Apr 3, 2024 · Apr 3, 2024 · Apr 3, 2024
diff --git a/common/utils/src/main/scala/org/apache/spark/internal/LogKey.scala b/common/utils/src/main/scala/org/apache/spark/internal/LogKey.scala
@@ -32,6 +32,16 @@ object LogKey extends Enumeration {
   val MIN_SIZE = Value
   val REMOTE_ADDRESS = Value
   val POD_ID = Value
+  val NUM_ITERATIONS = Value
+  val LEARNING_RATE = Value
+  val SUBSAMPLING_RATE = Value
+
+  val MAX_CATEGORIES = Value
+  val CATEGORICAL_FEATURES = Value
+
+  val RANGE_CLASSIFICATION_LABELS = Value
+  val NUM_CLASSIFICATION_LABELS = Value
+  val OPTIMIZER_CLASS_NAME = Value
 
   type LogKey = Value
 }
diff --git a/common/utils/src/main/scala/org/apache/spark/internal/Logging.scala b/common/utils/src/main/scala/org/apache/spark/internal/Logging.scala
@@ -49,6 +49,8 @@ case class MessageWithContext(message: String, context: java.util.HashMap[String
     resultMap.putAll(mdc.context)
     MessageWithContext(message + mdc.message, resultMap)
   }
+
+  override def toString: String = message
 }
 
 /**
@@ -117,7 +119,7 @@ trait Logging {
     }
   }
 
-  private def withLogContext(context: java.util.HashMap[String, String])(body: => Unit): Unit = {
+  protected def withLogContext(context: java.util.HashMap[String, String])(body: => Unit): Unit = {
     val threadContext = CloseableThreadContext.putAll(context)
     try {
       body

diff --git a/common/utils/src/test/scala/org/apache/spark/util/MDCSuite.scala b/common/utils/src/test/scala/org/apache/spark/util/MDCSuite.scala
@@ -32,13 +32,15 @@ class MDCSuite
     val log = log"This is a log, exitcode ${MDC(EXIT_CODE, 10086)}"
     assert(log.message === "This is a log, exitcode 10086")
     assert(log.context === Map("exit_code" -> "10086").asJava)
+    assert(log.toString === "This is a log, exitcode 10086")
   }
 
   test("custom object as MDC value") {
     val cov = CustomObjectValue("spark", 10086)
     val log = log"This is a log, exitcode ${MDC(EXIT_CODE, cov)}"
     assert(log.message === "This is a log, exitcode CustomObjectValue: spark, 10086")
     assert(log.context === Map("exit_code" -> "CustomObjectValue: spark, 10086").asJava)
+    assert(log.toString === "This is a log, exitcode CustomObjectValue: spark, 10086")
   }
 
   case class CustomObjectValue(key: String, value: Int) {

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala
@@ -25,7 +25,8 @@ import org.apache.hadoop.fs.Path
 
 import org.apache.spark.SparkException
 import org.apache.spark.annotation.Since
-import org.apache.spark.internal.Logging
+import org.apache.spark.internal.{Logging, MDC}
+import org.apache.spark.internal.LogKey.{NUM_CLASSIFICATION_LABELS, OPTIMIZER_CLASS_NAME, RANGE_CLASSIFICATION_LABELS}
 import org.apache.spark.ml.feature._
 import org.apache.spark.ml.linalg._
 import org.apache.spark.ml.optim.aggregator._
@@ -220,10 +221,11 @@ class LinearSVC @Since("2.2.0") (
     instr.logNumFeatures(numFeatures)
 
     if (numInvalid != 0) {
-      val msg = s"Classification labels should be in [0 to ${numClasses - 1}]. " +
-        s"Found $numInvalid invalid labels."
+      val msg = log"Classification labels should be in " +
+        log"${MDC(RANGE_CLASSIFICATION_LABELS, s"[0 to ${numClasses - 1}]")}. " +
+        log"Found ${MDC(NUM_CLASSIFICATION_LABELS, numInvalid)} invalid labels."
       instr.logError(msg)
-      throw new SparkException(msg)
+      throw new SparkException(msg.message)
     }
 
     val featuresStd = summarizer.std.toArray
@@ -249,9 +251,9 @@ class LinearSVC @Since("2.2.0") (
         regularization, optimizer)
 
     if (rawCoefficients == null) {
-      val msg = s"${optimizer.getClass.getName} failed."
+      val msg = log"${MDC(OPTIMIZER_CLASS_NAME, optimizer.getClass.getName)} failed."
       instr.logError(msg)
-      throw new SparkException(msg)
+      throw new SparkException(msg.message)
     }
 
     val coefficientArray = Array.tabulate(numFeatures) { i =>

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -27,7 +27,8 @@ import org.apache.hadoop.fs.Path
 
 import org.apache.spark.SparkException
 import org.apache.spark.annotation.Since
-import org.apache.spark.internal.Logging
+import org.apache.spark.internal.{Logging, MDC}
+import org.apache.spark.internal.LogKey.{NUM_CLASSIFICATION_LABELS, OPTIMIZER_CLASS_NAME, RANGE_CLASSIFICATION_LABELS}
 import org.apache.spark.ml.feature._
 import org.apache.spark.ml.impl.Utils
 import org.apache.spark.ml.linalg._
@@ -530,10 +531,11 @@ class LogisticRegression @Since("1.2.0") (
     }
 
     if (numInvalid != 0) {
-      val msg = s"Classification labels should be in [0 to ${numClasses - 1}]. " +
-        s"Found $numInvalid invalid labels."
+      val msg = log"Classification labels should be in " +
+        log"${MDC(RANGE_CLASSIFICATION_LABELS, s"[0 to ${numClasses - 1}]")}. " +
+        log"Found ${MDC(NUM_CLASSIFICATION_LABELS, numInvalid)} invalid labels."
       instr.logError(msg)
-      throw new SparkException(msg)
+      throw new SparkException(msg.message)
     }
 
     instr.logNumClasses(numClasses)
@@ -634,9 +636,9 @@ class LogisticRegression @Since("1.2.0") (
         initialSolution.toArray, regularization, optimizer)
 
     if (allCoefficients == null) {
-      val msg = s"${optimizer.getClass.getName} failed."
+      val msg = log"${MDC(OPTIMIZER_CLASS_NAME, optimizer.getClass.getName)} failed."
       instr.logError(msg)
-      throw new SparkException(msg)
+      throw new SparkException(msg.message)
     }
 
     val allCoefMatrix = new DenseMatrix(numCoefficientSets, numFeaturesPlusIntercept,

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
@@ -25,7 +25,8 @@ import org.apache.hadoop.fs.Path
 
 import org.apache.spark.SparkException
 import org.apache.spark.annotation.Since
-import org.apache.spark.internal.Logging
+import org.apache.spark.internal.{Logging, MDC}
+import org.apache.spark.internal.LogKey.OPTIMIZER_CLASS_NAME
 import org.apache.spark.ml.PredictorParams
 import org.apache.spark.ml.feature._
 import org.apache.spark.ml.linalg._
@@ -271,9 +272,9 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S
         optimizer, initialSolution)
 
     if (rawCoefficients == null) {
-      val msg = s"${optimizer.getClass.getName} failed."
+      val msg = log"${MDC(OPTIMIZER_CLASS_NAME, optimizer.getClass.getName)} failed."
       instr.logError(msg)
-      throw new SparkException(msg)
+      throw new SparkException(msg.message)
     }
 
     val coefficientArray = Array.tabulate(numFeatures) { i =>

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -27,7 +27,8 @@ import org.apache.hadoop.fs.Path
 
 import org.apache.spark.SparkException
 import org.apache.spark.annotation.Since
-import org.apache.spark.internal.Logging
+import org.apache.spark.internal.{Logging, MDC}
+import org.apache.spark.internal.LogKey.OPTIMIZER_CLASS_NAME
 import org.apache.spark.ml.{PipelineStage, PredictorParams}
 import org.apache.spark.ml.feature._
 import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors}
@@ -428,9 +429,9 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
         featuresMean, featuresStd, initialSolution, regularization, optimizer)
 
     if (parameters == null) {
-      val msg = s"${optimizer.getClass.getName} failed."
+      val msg = log"${MDC(OPTIMIZER_CLASS_NAME, optimizer.getClass.getName)} failed."
       instr.logError(msg)
-      throw new SparkException(msg)
+      throw new SparkException(msg.message)
     }
 
     val model = createModel(parameters, yMean, yStd, featuresMean, featuresStd)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala b/mllib/src/main/scala/org/apache/spark/ml/util/Instrumentation.scala
@@ -27,7 +27,7 @@ import org.json4s._
 import org.json4s.JsonDSL._
 import org.json4s.jackson.JsonMethods._
 
-import org.apache.spark.internal.Logging
+import org.apache.spark.internal.{LogEntry, Logging, MessageWithContext}
 import org.apache.spark.ml.{MLEvents, PipelineStage}
 import org.apache.spark.ml.param.{Param, Params}
 import org.apache.spark.rdd.RDD
@@ -84,20 +84,53 @@ private[spark] class Instrumentation private () extends Logging with MLEvents {
     super.logWarning(prefix + msg)
   }
 
+  /**
+   * Logs a LogEntry which message with a prefix that uniquely identifies the training session.
+   */
+  override def logWarning(entry: LogEntry): Unit = {
+    if (log.isWarnEnabled) {
+      withLogContext(entry.context) {
+        log.warn(prefix + entry.message)
+      }
+    }
+  }
+
   /**
    * Logs a error message with a prefix that uniquely identifies the training session.
    */
   override def logError(msg: => String): Unit = {
     super.logError(prefix + msg)
   }
 
+  /**
+   * Logs a LogEntry which message with a prefix that uniquely identifies the training session.
+   */
+  override def logError(entry: LogEntry): Unit = {
+    if (log.isErrorEnabled) {
+      withLogContext(entry.context) {
+        log.error(prefix + entry.message)
+      }
+    }
+  }
+
   /**
    * Logs an info message with a prefix that uniquely identifies the training session.
    */
   override def logInfo(msg: => String): Unit = {
     super.logInfo(prefix + msg)
   }
 
+  /**
+   * Logs a LogEntry which message with a prefix that uniquely identifies the training session.
+   */
+  override def logInfo(entry: LogEntry): Unit = {
+    if (log.isInfoEnabled) {
+      withLogContext(entry.context) {
+        log.info(prefix + entry.message)
+      }
+    }
+  }
+
   /**
    * Logs the value of the given parameters for the estimator being used in this session.
    */

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala
@@ -18,7 +18,8 @@
 package org.apache.spark.mllib.util
 
 import org.apache.spark.annotation.Since
-import org.apache.spark.internal.Logging
+import org.apache.spark.internal.{Logging, MDC}
+import org.apache.spark.internal.LogKey.{NUM_CLASSIFICATION_LABELS, RANGE_CLASSIFICATION_LABELS}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.rdd.RDD
 
@@ -37,7 +38,8 @@ object DataValidators extends Logging {
   val binaryLabelValidator: RDD[LabeledPoint] => Boolean = { data =>
     val numInvalid = data.filter(x => x.label != 1.0 && x.label != 0.0).count()
     if (numInvalid != 0) {
-      logError("Classification labels should be 0 or 1. Found " + numInvalid + " invalid labels")
+      logError(log"Classification labels should be 0 or 1. " +
+        log"Found ${MDC(NUM_CLASSIFICATION_LABELS, numInvalid)} invalid labels")
     }
     numInvalid == 0
   }
@@ -53,8 +55,9 @@ object DataValidators extends Logging {
     val numInvalid = data.filter(x =>
       x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count()
     if (numInvalid != 0) {
-      logError("Classification labels should be in {0 to " + (k - 1) + "}. " +
-        "Found " + numInvalid + " invalid labels")
+      logError(log"Classification labels should be in " +
+        log"${MDC(RANGE_CLASSIFICATION_LABELS, s"[0 to ${k - 1}]")}. " +
+        log"Found ${MDC(NUM_CLASSIFICATION_LABELS, numInvalid)} invalid labels")
     }
     numInvalid == 0
   }

diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorIndexerSuite.scala
@@ -18,7 +18,8 @@
 package org.apache.spark.ml.feature
 
 import org.apache.spark.SparkException
-import org.apache.spark.internal.Logging
+import org.apache.spark.internal.{Logging, MDC}
+import org.apache.spark.internal.LogKey.{CATEGORICAL_FEATURES, MAX_CATEGORIES}
 import org.apache.spark.ml.attribute._
 import org.apache.spark.ml.linalg.{SparseVector, Vector, Vectors}
 import org.apache.spark.ml.param.ParamsSuite
@@ -175,8 +176,10 @@ class VectorIndexerSuite extends MLTest with DefaultReadWriteTest with Logging {
         maxCategories: Int,
         categoricalFeatures: Set[Int]): Unit = {
       val collectedData = data.collect().map(_.getAs[Vector](0))
-      val errMsg = s"checkCategoryMaps failed for input with maxCategories=$maxCategories," +
-        s" categoricalFeatures=${categoricalFeatures.mkString(", ")}"
+
+      val errMsg = log"checkCategoryMaps failed for input with " +
+        log"maxCategories=${MDC(MAX_CATEGORIES, maxCategories)} " +
+        log"categoricalFeatures=${MDC(CATEGORICAL_FEATURES, categoricalFeatures.mkString(", "))}"
       try {
         val vectorIndexer = getIndexer.setMaxCategories(maxCategories)
         val model = vectorIndexer.fit(data)

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala
@@ -18,6 +18,8 @@
 package org.apache.spark.mllib.tree
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.internal.LogKey.{LEARNING_RATE, NUM_ITERATIONS, SUBSAMPLING_RATE}
+import org.apache.spark.internal.MDC
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, Strategy}
 import org.apache.spark.mllib.tree.configuration.Algo._
@@ -51,8 +53,9 @@ class GradientBoostedTreesSuite extends SparkFunSuite with MLlibTestSparkContext
             gbt, GradientBoostedTreesSuite.data.toImmutableArraySeq, 0.06)
         } catch {
           case e: java.lang.AssertionError =>
-            logError(s"FAILED for numIterations=$numIterations, learningRate=$learningRate," +
-              s" subsamplingRate=$subsamplingRate")
+            logError(log"FAILED for numIterations=${MDC(NUM_ITERATIONS, numIterations)}, " +
+              log"learningRate=${MDC(LEARNING_RATE, learningRate)}, " +
+              log"subsamplingRate=${MDC(SUBSAMPLING_RATE, subsamplingRate)}")
             throw e
         }
 
@@ -82,8 +85,9 @@ class GradientBoostedTreesSuite extends SparkFunSuite with MLlibTestSparkContext
             gbt, GradientBoostedTreesSuite.data.toImmutableArraySeq, 0.85, "mae")
         } catch {
           case e: java.lang.AssertionError =>
-            logError(s"FAILED for numIterations=$numIterations, learningRate=$learningRate," +
-              s" subsamplingRate=$subsamplingRate")
+            logError(log"FAILED for numIterations=${MDC(NUM_ITERATIONS, numIterations)}, " +
+              log"learningRate=${MDC(LEARNING_RATE, learningRate)}, " +
+              log"subsamplingRate=${MDC(SUBSAMPLING_RATE, subsamplingRate)}")
             throw e
         }
 
@@ -114,8 +118,9 @@ class GradientBoostedTreesSuite extends SparkFunSuite with MLlibTestSparkContext
             gbt, GradientBoostedTreesSuite.data.toImmutableArraySeq, 0.9)
         } catch {
           case e: java.lang.AssertionError =>
-            logError(s"FAILED for numIterations=$numIterations, learningRate=$learningRate," +
-              s" subsamplingRate=$subsamplingRate")
+            logError(log"FAILED for numIterations=${MDC(NUM_ITERATIONS, numIterations)}, " +
+              log"learningRate=${MDC(LEARNING_RATE, learningRate)}, " +
+              log"subsamplingRate=${MDC(SUBSAMPLING_RATE, subsamplingRate)}")
             throw e
         }