74 commits
7dec5eb
[SPARK-47705][INFRA] Sort LogKey alphabetically and build a test to e…
dtenedor Apr 3, 2024
6a0555c
[SPARK-47700][SQL] Fix formatting of error messages with treeNode
jchen5 Apr 3, 2024
49eefc5
[SPARK-47722][SS] Wait until RocksDB background work finish before cl…
WweiL Apr 3, 2024
fbe6b1d
[SPARK-47721][DOC] Guidelines for the Structured Logging Framework
gengliangwang Apr 3, 2024
e3aab8c
[SPARK-47210][SQL] Addition of implicit casting without indeterminate…
mihailomilosevic2001 Apr 3, 2024
d87ac8e
[SPARK-47708][CONNECT] Do not log gRPC exception to stderr in PySpark
nemanja-boric-databricks Apr 4, 2024
447f8af
[SPARK-47720][CORE] Update `spark.speculation.multiplier` to 3 and `s…
dongjoon-hyun Apr 4, 2024
678aeb7
[SPARK-47683][PYTHON][BUILD] Decouple PySpark core API to pyspark.cor…
HyukjinKwon Apr 4, 2024
c25fd93
[SPARK-47705][INFRA][FOLLOWUP] Sort LogKey alphabetically and build a…
panbingkun Apr 4, 2024
d272a1b
[SPARK-47724][PYTHON][TESTS] Add an environment variable for testing …
HyukjinKwon Apr 4, 2024
d75c775
[SPARK-46812][PYTHON][TESTS][FOLLOWUP] Skip `pandas`-required tests i…
dongjoon-hyun Apr 4, 2024
3f6ac60
[SPARK-47577][CORE][PART1] Migrate logError with variables to structu…
gengliangwang Apr 4, 2024
f6999df
[SPARK-47081][CONNECT] Support Query Execution Progress
grundprinzip Apr 4, 2024
bffb02d
[SPARK-47565][PYTHON] PySpark worker pool crash resilience
Apr 4, 2024
3b8aea3
Revert "[SPARK-47708][CONNECT] Do not log gRPC exception to stderr in…
nemanja-boric-databricks Apr 4, 2024
5f9f5db
[SPARK-47689][SQL][FOLLOWUP] More accurate file path in TASK_WRITE_FA…
cloud-fan Apr 4, 2024
5ca3467
[SPARK-47729][PYTHON][TESTS] Get the proper default port for pyspark-…
HyukjinKwon Apr 4, 2024
25fc67f
[SPARK-47728][DOC] Document G1 Concurrent GC metrics
LucaCanali Apr 4, 2024
e3405c1
[SPARK-47610][CONNECT][FOLLOWUP] Add -Dio.netty.tryReflectionSetAcces…
pan3793 Apr 4, 2024
3fd0cd6
[SPARK-47598][CORE] MLLib: Migrate logError with variables to structu…
panbingkun Apr 4, 2024
240923c
[SPARK-46812][PYTHON][TESTS][FOLLOWUP] Check should_test_connect and …
dongjoon-hyun Apr 4, 2024
fb96b1a
[SPARK-47723][CORE][TESTS] Introduce a tool that can sort alphabetica…
panbingkun Apr 5, 2024
404d58c
[SPARK-47081][CONNECT][FOLLOW-UP] Add the `shell` module into PyPI pa…
HyukjinKwon Apr 5, 2024
b9ca91d
[SPARK-47712][CONNECT] Allow connect plugins to create and process Da…
tomvanbussel Apr 5, 2024
0107435
[SPARK-47734][PYTHON][TESTS] Fix flaky DataFrame.writeStream doctest …
JoshRosen Apr 5, 2024
d5620cb
[SPARK-47289][SQL] Allow extensions to log extended information in ex…
parthchandra Apr 5, 2024
aeb082e
[SPARK-47081][CONNECT][TESTS][FOLLOW-UP] Skip the flaky doctests for now
HyukjinKwon Apr 5, 2024
97e63ff
[SPARK-47735][PYTHON][TESTS] Make pyspark.testing.connectutils compat…
HyukjinKwon Apr 5, 2024
12d0367
[SPARK-47724][PYTHON][TESTS][FOLLOW-UP] Make testing script to inheri…
HyukjinKwon Apr 5, 2024
6bd0ccf
[SPARK-47511][SQL][FOLLOWUP] Rename the config REPLACE_NULLIF_USING_W…
cloud-fan Apr 5, 2024
c34baeb
[SPARK-47719][SQL] Change spark.sql.legacy.timeParserPolicy default t…
srielau Apr 5, 2024
18072b5
[SPARK-47577][CORE][PART2] Migrate logError with variables to structu…
gengliangwang Apr 5, 2024
1efbf43
[SPARK-47310][SS] Add micro-benchmark for merge operations for multip…
anishshri-db Apr 5, 2024
d1ace24
[SPARK-47582][SQL] Migrate Catalyst logInfo with variables to structu…
dtenedor Apr 5, 2024
11abc64
[SPARK-47094][SQL] SPJ : Dynamically rebalance number of buckets when…
szehon-ho Apr 6, 2024
42dc815
[SPARK-47743][CORE] Use milliseconds as the time unit in logging
gengliangwang Apr 6, 2024
7385f19
[SPARK-47592][CORE] Connector module: Migrate logError with variables…
panbingkun Apr 6, 2024
d69df59
[SPARK-47738][BUILD] Upgrade Kafka to 3.7.0
panbingkun Apr 6, 2024
60a3fbc
[SPARK-47727][PYTHON] Make SparkConf to root level to for both SparkS…
HyukjinKwon Apr 6, 2024
644687b
[SPARK-47709][BUILD] Upgrade tink to 1.13.0
LuciferYang Apr 6, 2024
4d9dbb3
[SPARK-46722][CONNECT][SS][TESTS][FOLLOW-UP] Drop the tables after te…
HyukjinKwon Apr 7, 2024
c11585a
[SPARK-47751][PYTHON][CONNECT] Make pyspark.worker_utils compatible w…
HyukjinKwon Apr 7, 2024
d743012
[SPARK-47753][PYTHON][CONNECT][TESTS] Make pyspark.testing compatible…
HyukjinKwon Apr 7, 2024
f7dff4a
[SPARK-47752][PS][CONNECT] Make pyspark.pandas compatible with pyspar…
HyukjinKwon Apr 7, 2024
e92e8f5
[SPARK-47744] Add support for negative-valued bytes in range encoder
neilramaswamy Apr 7, 2024
0c992b2
[SPARK-47755][CONNECT] Pivot should fail when the number of distinct …
zhengruifeng Apr 7, 2024
b299b2b
[SPARK-47299][PYTHON][DOCS] Use the same `versions.json` in the dropd…
panbingkun Apr 8, 2024
cc6c0eb
[MINOR][TESTS] Deduplicate test cases `test_parse_datatype_string`
HyukjinKwon Apr 8, 2024
ad2367c
[MINOR][PYTHON][SS][TESTS] Drop the tables after being used at `test_…
HyukjinKwon Apr 8, 2024
f576b85
[SPARK-47541][SQL] Collated strings in complex types supporting opera…
nikolamand-db Apr 8, 2024
d55bb61
[SPARK-47558][SS] State TTL support for ValueState
sahnib Apr 8, 2024
3a39ac2
[SPARK-47713][SQL][CONNECT] Fix a self-join failure
zhengruifeng Apr 8, 2024
eb8e997
[SPARK-47657][SQL] Implement collation filter push down support per f…
stefankandic Apr 8, 2024
f0d8f82
[SPARK-47750][DOCS][SQL] Postgres: Document Mapping Spark SQL Data Ty…
yaooqinn Apr 8, 2024
211afd4
[MINOR][PYTHON][CONNECT][TESTS] Enable `MapInPandasParityTests.test_d…
zhengruifeng Apr 8, 2024
f94d95d
[SPARK-47762][PYTHON][CONNECT] Add pyspark.sql.connect.protobuf into …
HyukjinKwon Apr 8, 2024
29d077f
[SPARK-47748][BUILD] Upgrade `zstd-jni` to 1.5.6-2
panbingkun Apr 8, 2024
60806c6
[SPARK-47746] Implement ordinal-based range encoding in the RocksDBSt…
neilramaswamy Apr 8, 2024
134a139
[SPARK-47681][SQL] Add schema_of_variant expression
chenhao-db Apr 8, 2024
abb7b04
[SPARK-47504][SQL] Resolve AbstractDataType simpleStrings for StringT…
mihailomilosevic2001 Apr 8, 2024
91b2331
[WIP] ListStateTTL implementation
ericm-db Apr 8, 2024
479392a
adding log lines
ericm-db Apr 8, 2024
7aab43e
test cases pass
ericm-db Apr 8, 2024
71f960d
spacing
ericm-db Apr 8, 2024
998764c
using NextIterator instead
ericm-db Apr 8, 2024
1dcb7d8
refactor feedback
ericm-db Apr 9, 2024
47867e7
undoing unnecessary change
ericm-db Apr 9, 2024
cfd30c3
refactor get_ttl_value
ericm-db Apr 9, 2024
4a19cb7
refactor test case
ericm-db Apr 9, 2024
993125c
specific doc for clearIfExpired
ericm-db Apr 9, 2024
fd5200f
moving isExpired to common place
ericm-db Apr 9, 2024
d43ffb1
refactoring to use common utils
ericm-db Apr 9, 2024
30f6094
updating interface header
ericm-db Apr 9, 2024
e9376d9
Map State TTL, Initial Commit
ericm-db Apr 9, 2024
66 changes: 54 additions & 12 deletions common/utils/src/main/resources/error/error-classes.json
@@ -467,6 +467,24 @@
],
"sqlState" : "42704"
},
"COLLATION_MISMATCH" : {
"message" : [
"Could not determine which collation to use for string functions and operators."
],
"subClass" : {
"EXPLICIT" : {
"message" : [
"Error occurred due to the mismatch between explicit collations: <explicitTypes>. Decide on a single explicit collation and remove others."
]
},
"IMPLICIT" : {
"message" : [
"Error occurred due to the mismatch between multiple implicit non-default collations. Use COLLATE function to set the collation explicitly."
]
}
},
"sqlState" : "42P21"
},
"COLLECTION_SIZE_LIMIT_EXCEEDED" : {
"message" : [
"Can't create array with <numberOfElements> elements which exceeding the array size limit <maxRoundedArrayLength>,"
@@ -688,11 +706,6 @@
"To convert values from <srcType> to <targetType>, you can use the functions <functionNames> instead."
]
},
"COLLATION_MISMATCH" : {
"message" : [
"Collations <collationNameLeft> and <collationNameRight> are not compatible. Please use the same collation for both strings."
]
},
"CREATE_MAP_KEY_DIFF_TYPES" : {
"message" : [
"The given keys of function <functionName> should all be the same type, but they are <dataType>."
@@ -1604,6 +1617,12 @@
],
"sqlState" : "22003"
},
"INDETERMINATE_COLLATION" : {
"message" : [
"Function called requires knowledge of the collation it should apply, but indeterminate collation was found. Use COLLATE function to set the collation explicitly."
],
"sqlState" : "42P22"
},
"INDEX_ALREADY_EXISTS" : {
"message" : [
"Cannot create the index <indexName> on table <tableName> because it already exists."
@@ -3560,6 +3579,12 @@
],
"sqlState" : "0A000"
},
"STATEFUL_PROCESSOR_CANNOT_ASSIGN_TTL_IN_NO_TTL_MODE" : {
"message" : [
"Cannot use TTL for state=<stateName> in NoTTL() mode."
],
"sqlState" : "42802"
},
"STATEFUL_PROCESSOR_CANNOT_PERFORM_OPERATION_WITH_INVALID_HANDLE_STATE" : {
"message" : [
"Failed to perform stateful processor operation=<operationType> with invalid handle state=<handleState>."
@@ -3578,6 +3603,12 @@
],
"sqlState" : "42802"
},
"STATEFUL_PROCESSOR_TTL_DURATION_MUST_BE_POSITIVE" : {
"message" : [
"TTL duration must be greater than zero for State store operation=<operationType> on state=<stateName>."
],
"sqlState" : "42802"
},
"STATE_STORE_CANNOT_CREATE_COLUMN_FAMILY_WITH_RESERVED_CHARS" : {
"message" : [
"Failed to create column family with unsupported starting character and name=<colFamilyName>."
@@ -3599,7 +3630,7 @@
},
"STATE_STORE_INCORRECT_NUM_ORDERING_COLS_FOR_RANGE_SCAN" : {
"message" : [
"Incorrect number of ordering columns=<numOrderingCols> for range scan encoder. Ordering columns cannot be zero or greater than num of schema columns."
"Incorrect number of ordering ordinals=<numOrderingCols> for range scan encoder. The number of ordering ordinals cannot be zero or greater than number of schema columns."
],
"sqlState" : "42802"
},
@@ -4372,6 +4403,11 @@
"Removing column families with <stateStoreProvider> is not supported."
]
},
"STATE_STORE_TTL" : {
"message" : [
"State TTL with <stateStoreProvider> is not supported. Please use RocksDBStateStoreProvider."
]
},
"TABLE_OPERATION" : {
"message" : [
"Table <tableName> does not support <operation>. Please check the current catalog and namespace to make sure the qualified table name is expected, and also check the catalog implementation which is configured by \"spark.sql.catalog\"."
@@ -4533,7 +4569,8 @@
"subClass" : {
"ACCESSING_OUTER_QUERY_COLUMN_IS_NOT_ALLOWED" : {
"message" : [
"Accessing outer query column is not allowed in this location<treeNode>."
"Accessing outer query column is not allowed in this location:",
"<treeNode>"
]
},
"AGGREGATE_FUNCTION_MIXED_OUTER_LOCAL_REFERENCES" : {
@@ -4543,7 +4580,8 @@
},
"CORRELATED_COLUMN_IS_NOT_ALLOWED_IN_PREDICATE" : {
"message" : [
"Correlated column is not allowed in predicate: <treeNode>."
"Correlated column is not allowed in predicate:",
"<treeNode>"
]
},
"CORRELATED_COLUMN_NOT_FOUND" : {
@@ -4578,7 +4616,8 @@
},
"NON_DETERMINISTIC_LATERAL_SUBQUERIES" : {
"message" : [
"Non-deterministic lateral subqueries are not supported when joining with outer relations that produce more than one row<treeNode>."
"Non-deterministic lateral subqueries are not supported when joining with outer relations that produce more than one row:",
"<treeNode>"
]
},
"UNSUPPORTED_CORRELATED_REFERENCE_DATA_TYPE" : {
@@ -4588,17 +4627,20 @@
},
"UNSUPPORTED_CORRELATED_SCALAR_SUBQUERY" : {
"message" : [
"Correlated scalar subqueries can only be used in filters, aggregations, projections, and UPDATE/MERGE/DELETE commands<treeNode>."
"Correlated scalar subqueries can only be used in filters, aggregations, projections, and UPDATE/MERGE/DELETE commands:",
"<treeNode>"
]
},
"UNSUPPORTED_IN_EXISTS_SUBQUERY" : {
"message" : [
"IN/EXISTS predicate subqueries can only be used in filters, joins, aggregations, window functions, projections, and UPDATE/MERGE/DELETE commands<treeNode>."
"IN/EXISTS predicate subqueries can only be used in filters, joins, aggregations, window functions, projections, and UPDATE/MERGE/DELETE commands:",
"<treeNode>"
]
},
"UNSUPPORTED_TABLE_ARGUMENT" : {
"message" : [
"Table arguments are used in a function where they are not supported<treeNode>."
"Table arguments are used in a function where they are not supported:",
"<treeNode>"
]
}
},
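A note on the collation entries above: the reworked `COLLATION_MISMATCH` entry (now with `EXPLICIT` and `IMPLICIT` sub-classes) and the new `INDETERMINATE_COLLATION` entry back the implicit-casting work from SPARK-47210. Below is a minimal sketch, not taken from this PR, of a query that should hit the `EXPLICIT` branch; it assumes a local `SparkSession`, the `collate()` SQL function, and collation names such as `UTF8_BINARY` and `UNICODE_CI`.

```scala
import org.apache.spark.sql.SparkSession

// Sketch only: illustrates the kind of query the EXPLICIT sub-class is meant for.
object CollationMismatchSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").getOrCreate()
    try {
      // Two different explicit collations on the operands of `=` cannot be
      // reconciled, so analysis is expected to fail with
      // COLLATION_MISMATCH.EXPLICIT instead of silently picking one side.
      spark.sql("SELECT collate('a', 'UTF8_BINARY') = collate('A', 'UNICODE_CI')").show()
    } finally {
      spark.stop()
    }
  }
}
```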
89 changes: 86 additions & 3 deletions common/utils/src/main/scala/org/apache/spark/internal/LogKey.scala
@@ -21,17 +21,100 @@ package org.apache.spark.internal
* All structured logging keys should be defined here for standardization.
*/
object LogKey extends Enumeration {
val APPLICATION_ID = Value
val APPLICATION_STATE = Value
val ACCUMULATOR_ID = Value
val APP_DESC = Value
val APP_ID = Value
val APP_STATE = Value
val BLOCK_ID = Value
val BLOCK_MANAGER_ID = Value
val BROADCAST_ID = Value
val BUCKET = Value
val BYTECODE_SIZE = Value
val CATEGORICAL_FEATURES = Value
val CLASS_LOADER = Value
val CLASS_NAME = Value
val COMMAND = Value
val COMMAND_OUTPUT = Value
val COMPONENT = Value
val CONFIG = Value
val CONFIG2 = Value
val CONTAINER_ID = Value
val COUNT = Value
val DRIVER_ID = Value
val END_POINT = Value
val ERROR = Value
val EVENT_LOOP = Value
val EVENT_QUEUE = Value
val EXECUTOR_ID = Value
val EXECUTOR_STATE = Value
val EXIT_CODE = Value
val FAILURES = Value
val GROUP_ID = Value
val HOST = Value
val JOB_ID = Value
val JOIN_CONDITION = Value
val LEARNING_RATE = Value
val LINE = Value
val LINE_NUM = Value
val LISTENER = Value
val LOG_TYPE = Value
val MASTER_URL = Value
val MAX_ATTEMPTS = Value
val MAX_CATEGORIES = Value
val MAX_EXECUTOR_FAILURES = Value
val MAX_SIZE = Value
val MERGE_DIR_NAME = Value
val METHOD_NAME = Value
val MIN_SIZE = Value
val REMOTE_ADDRESS = Value
val NUM_ITERATIONS = Value
val OBJECT_ID = Value
val OLD_BLOCK_MANAGER_ID = Value
val OPTIMIZER_CLASS_NAME = Value
val OP_TYPE = Value
val PARTITION_ID = Value
val PATH = Value
val PATHS = Value
val POD_ID = Value
val PORT = Value
val QUERY_PLAN = Value
val RANGE = Value
val RDD_ID = Value
val REASON = Value
val REDUCE_ID = Value
val REMOTE_ADDRESS = Value
val RETRY_COUNT = Value
val RETRY_INTERVAL = Value
val RPC_ADDRESS = Value
val RULE_BATCH_NAME = Value
val RULE_NAME = Value
val RULE_NUMBER_OF_RUNS = Value
val SESSION_ID = Value
val SHARD_ID = Value
val SHUFFLE_BLOCK_INFO = Value
val SHUFFLE_ID = Value
val SHUFFLE_MERGE_ID = Value
val SIZE = Value
val SLEEP_TIME = Value
val STAGE_ID = Value
val SUBMISSION_ID = Value
val SUBSAMPLING_RATE = Value
val TASK_ATTEMPT_ID = Value
val TASK_ID = Value
val TASK_NAME = Value
val TASK_SET_NAME = Value
val TASK_STATE = Value
val THREAD = Value
val THREAD_NAME = Value
val TID = Value
val TIMEOUT = Value
val TOTAL_EFFECTIVE_TIME = Value
val TOTAL_TIME = Value
val URI = Value
val USER_ID = Value
val USER_NAME = Value
val WATERMARK_CONSTRAINT = Value
val WORKER_URL = Value
val XSD_PATH = Value

type LogKey = Value
}
@@ -49,6 +49,8 @@ case class MessageWithContext(message: String, context: java.util.HashMap[String
resultMap.putAll(mdc.context)
MessageWithContext(message + mdc.message, resultMap)
}

def stripMargin: MessageWithContext = copy(message = message.stripMargin)
}

/**
@@ -117,7 +119,7 @@ trait Logging {
}
}

private def withLogContext(context: java.util.HashMap[String, String])(body: => Unit): Unit = {
protected def withLogContext(context: java.util.HashMap[String, String])(body: => Unit): Unit = {
val threadContext = CloseableThreadContext.putAll(context)
try {
body
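The two changes above are small but used throughout the structured-logging migration commits in this PR: `MessageWithContext.stripMargin` lets multi-line interpolated messages keep their MDC context map, and `withLogContext` becomes `protected` so classes that mix in `Logging` can wrap their own code in an MDC context. A minimal sketch of the first, assuming the framework's `log"..."` interpolator and `MDC` wrapper are in scope for classes mixing in `Logging`:

```scala
import org.apache.spark.internal.{Logging, MDC}
import org.apache.spark.internal.LogKey.{QUERY_PLAN, RETRY_INTERVAL}

// Sketch only: the exact log"..." and logWarning signatures are assumed from the framework.
class RetryingExecutor extends Logging {
  def warnAndRetry(planText: String, intervalMs: Long): Unit = {
    // stripMargin (added above) removes the margin from the combined message
    // while the MDC map built by the interpolator is carried along unchanged.
    logWarning(
      log"""Query ${MDC(QUERY_PLAN, planText)} failed;
           |retrying in ${MDC(RETRY_INTERVAL, intervalMs.toString)} ms.""".stripMargin)
  }
}
```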
14 changes: 14 additions & 0 deletions common/utils/src/main/scala/org/apache/spark/internal/README.md
@@ -0,0 +1,14 @@
# Guidelines for the Structured Logging Framework

## LogKey

LogKeys serve as identifiers for mapped diagnostic contexts (MDC) within logs. Follow these guidelines when adding new LogKeys:
* Define all structured logging keys in `LogKey.scala`, and sort them alphabetically for ease of search.
* Use `UPPER_SNAKE_CASE` for key names.
* Key names should be both simple and broad, yet include specific identifiers like `STAGE_ID`, `TASK_ID`, and `JOB_ID` when needed for clarity. For instance, use `MAX_ATTEMPTS` as a general key instead of creating separate keys for each scenario such as `EXECUTOR_STATE_SYNC_MAX_ATTEMPTS` and `MAX_TASK_FAILURES`. This balances simplicity with the detail needed for effective logging.
* Use abbreviations in names if they are widely understood, such as `APP_ID` for APPLICATION_ID, and `K8S` for KUBERNETES.
* For time-related keys, use milliseconds as the unit of time.

## Exceptions

To ensure logs are compatible with Spark SQL and log analysis tools, avoid `Exception.printStackTrace()`. Use `logError`, `logWarning`, and `logInfo` methods from the `Logging` trait to log exceptions, maintaining structured and parsable logs.
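A minimal usage sketch tying the two sections above together (not part of this diff; it assumes the framework's `log"..."` interpolator and `MDC` wrapper, available to classes that mix in `Logging`):

```scala
import org.apache.spark.internal.{Logging, MDC}
import org.apache.spark.internal.LogKey.{MAX_ATTEMPTS, TASK_ID}

// Sketch only: keys are the broad, UPPER_SNAKE_CASE LogKeys recommended above,
// and the exception is passed to logError rather than printStackTrace().
class ExampleTaskReporter extends Logging {
  def reportFailure(taskId: Long, maxAttempts: Int, error: Throwable): Unit = {
    logError(log"Task ${MDC(TASK_ID, taskId.toString)} failed after " +
      log"${MDC(MAX_ATTEMPTS, maxAttempts.toString)} attempts", error)
  }
}
```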
@@ -0,0 +1,96 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.util

import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Path}
import java.util.{ArrayList => JList}

import scala.jdk.CollectionConverters._

import org.apache.commons.io.FileUtils
import org.scalatest.funsuite.AnyFunSuite // scalastyle:ignore funsuite

import org.apache.spark.internal.{Logging, LogKey}
import org.apache.spark.internal.LogKey.LogKey

// scalastyle:off line.size.limit
/**
* To re-generate the LogKey class file, run:
* {{{
* SPARK_GENERATE_GOLDEN_FILES=1 build/sbt "common-utils/testOnly org.apache.spark.util.LogKeySuite"
* }}}
*/
// scalastyle:on line.size.limit
class LogKeySuite
extends AnyFunSuite // scalastyle:ignore funsuite
with Logging {

/**
* Get a Path relative to the root project. It is assumed that a spark home is set.
*/
protected final def getWorkspaceFilePath(first: String, more: String*): Path = {
if (!(sys.props.contains("spark.test.home") || sys.env.contains("SPARK_HOME"))) {
fail("spark.test.home or SPARK_HOME is not set.")
}
val sparkHome = sys.props.getOrElse("spark.test.home", sys.env("SPARK_HOME"))
java.nio.file.Paths.get(sparkHome, first +: more: _*)
}

private val regenerateGoldenFiles: Boolean = System.getenv("SPARK_GENERATE_GOLDEN_FILES") == "1"

private val logKeyFilePath = getWorkspaceFilePath("common", "utils", "src", "main", "scala",
"org", "apache", "spark", "internal", "LogKey.scala")

// regenerate the file `LogKey.scala` with its enumeration fields sorted alphabetically
private def regenerateLogKeyFile(
originalKeys: Seq[LogKey], sortedKeys: Seq[LogKey]): Unit = {
if (originalKeys != sortedKeys) {
val logKeyFile = logKeyFilePath.toFile
logInfo(s"Regenerating LogKey file $logKeyFile")
val originalContents = FileUtils.readLines(logKeyFile, StandardCharsets.UTF_8)
val sortedContents = new JList[String]()
var firstMatch = false
originalContents.asScala.foreach { line =>
if (line.trim.startsWith("val ") && line.trim.endsWith(" = Value")) {
if (!firstMatch) {
sortedKeys.foreach { logKey =>
sortedContents.add(s" val ${logKey.toString} = Value")
}
firstMatch = true
}
} else {
sortedContents.add(line)
}
}
Files.delete(logKeyFile.toPath)
FileUtils.writeLines(logKeyFile, StandardCharsets.UTF_8.name(), sortedContents)
}
}

test("LogKey enumeration fields are correctly sorted") {
val originalKeys = LogKey.values.toSeq
val sortedKeys = originalKeys.sortBy(_.toString)
if (regenerateGoldenFiles) {
regenerateLogKeyFile(originalKeys, sortedKeys)
} else {
assert(originalKeys === sortedKeys,
"LogKey enumeration fields must be sorted alphabetically")
}
}
}
3 changes: 2 additions & 1 deletion connector/connect/bin/spark-connect-scala-client
@@ -68,6 +68,7 @@ JVM_ARGS="-XX:+IgnoreUnrecognizedVMOptions \
--add-opens=java.base/sun.security.action=ALL-UNNAMED \
--add-opens=java.base/sun.util.calendar=ALL-UNNAMED \
--add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED \
-Djdk.reflect.useDirectMethodHandle=false "
-Djdk.reflect.useDirectMethodHandle=false \
-Dio.netty.tryReflectionSetAccessible=true"

exec java $JVM_ARGS -cp "$SCCLASSPATH" org.apache.spark.sql.application.ConnectRepl "$@"