additional mdc migration
asl3 committed Jul 8, 2024
commit a2de740966a6a51fee850675e94810e33a407d9a
2 changes: 1 addition & 1 deletion core/src/main/scala/org/apache/spark/SparkEnv.scala
@@ -366,7 +366,7 @@ object SparkEnv extends Logging {
        name: String, endpointCreator: => RpcEndpoint):
      RpcEndpointRef = {
      if (isDriver) {
-       logInfo("Registering " + name)
+       logInfo(log"Registering ${MDC(LogKeys.NAME, name)}")
        rpcEnv.setupEndpoint(name, endpointCreator)
      } else {
        RpcUtils.makeDriverRef(name, conf, rpcEnv)
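The hunk above is the migration pattern this PR applies throughout: plain string concatenation is replaced by the log"..." interpolator, with each interpolated value wrapped in an MDC tagged by a LogKey so it is also emitted as a structured field. A minimal sketch of the pattern, assuming the framework's types live under org.apache.spark.internal as the diff's usage suggests (the RegistrationExample class is hypothetical, for illustration only):

import org.apache.spark.internal.{Logging, LogKeys, MDC}

// Hypothetical class, not part of this PR.
class RegistrationExample extends Logging {
  def register(name: String): Unit = {
    // Before: the value is flattened into the rendered message string.
    // logInfo("Registering " + name)

    // After: log"..." builds a message whose MDC-wrapped values are
    // emitted as structured fields keyed by LogKeys.NAME.
    logInfo(log"Registering ${MDC(LogKeys.NAME, name)}")
  }
}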
104 changes: 104 additions & 0 deletions dev/structured-logging-style.py
@@ -0,0 +1,104 @@
#!/usr/bin/env python3

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import re
import sys
import glob

from sparktestsupport import SPARK_HOME


def main():
    # Captures the argument of logInfo/logWarning/logError calls.
    log_pattern = r'log(?:Info|Warning|Error)\((.*?)\)'
    # Matches arguments still built with "...".format(...), the s"..."
    # interpolator, or string concatenation, i.e. not yet migrated to
    # the structured log"..." interpolator with MDC-wrapped variables.
    inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\+(?!.*?[ |\t].*s?")).*'
    compiled_inner_log_pattern = re.compile(inner_log_pattern, flags=re.DOTALL)

    # Regex patterns for file paths to exclude from the Structured Logging style check
    excluded_file_patterns = [
        '[Tt]est',
        '/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala',
        '/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala'
    ]

    nonmigrated_files = {}

    scala_files = glob.glob(os.path.join(SPARK_HOME, '**', '*.scala'), recursive=True)

    for file in scala_files:
        if any(re.search(pattern, file) for pattern in excluded_file_patterns):
            continue

        with open(file, 'r') as f:
            content = f.read()

        for log_statement in re.finditer(log_pattern, content, re.DOTALL):
            if compiled_inner_log_pattern.fullmatch(log_statement.group(1)):
                # Convert the match offset into a 1-based line number and
                # 0-based column for error reporting.
                start_pos = log_statement.start()
                preceding_content = content[:start_pos]
                line_number = preceding_content.count('\n') + 1
                start_char = start_pos - preceding_content.rfind('\n') - 1
                nonmigrated_files.setdefault(file, []).append(
                    (line_number, start_char, log_statement.group(1)))

    if not nonmigrated_files:
        print("Structured logging style check passed.")
        sys.exit(0)

    for file_path, issues in nonmigrated_files.items():
        for line_number, start_char, _ in issues:
            print(f"[error] {file_path}:{line_number}:{start_char}")
    print('[error] Logging messages should use log"..." instead of s"..." and '
          'variables should be wrapped in `MDC`s. Refer to the Structured Logging '
          'Framework guidelines in `internal/Logging.scala`.')
    sys.exit(-1)


if __name__ == "__main__":
    main()
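For concreteness, here is the kind of Scala the check above is meant to catch and to allow. This is an illustrative, hypothetical snippet, not code from this PR; note that the script's pattern only covers logInfo, logWarning, and logError, so logDebug calls pass through unchecked:

import org.apache.spark.internal.{Logging, LogKeys, MDC}

// Hypothetical object showing what the checker would and would not flag.
object StyleCheckExamples extends Logging {
  def demo(name: String, blockId: String): Unit = {
    logInfo("Registering " + name)                        // flagged: concatenation
    logWarning(s"Block $blockId went missing")            // flagged: s-interpolator
    logInfo(log"Registering ${MDC(LogKeys.NAME, name)}")  // passes: log"..." with MDC
  }
}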
streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala
@@ -276,10 +276,11 @@ private[streaming] class BlockGenerator(
      }

      // At this point, state is StoppedGeneratingBlock. So drain the queue of to-be-pushed blocks.
-     logInfo("Pushing out the last " + blocksForPushing.size() + " blocks")
+     logInfo(log"Pushing out the last " +
+       log"${MDC(LogKeys.NUM_BLOCK_IDS, blocksForPushing.size())} blocks")
      while (!blocksForPushing.isEmpty) {
        val block = blocksForPushing.take()
-       logDebug(s"Pushing block $block")
+       logDebug(log"Pushing block ${MDC(LogKeys.BLOCK, block)}")
        pushBlock(block)
        logInfo(log"Blocks left to push ${MDC(LogKeys.NUM_BLOCK_IDS, blocksForPushing.size())}")
      }
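One detail worth noting in this hunk: a message spanning two source lines is migrated by concatenating log"..." parts, which works because the interpolator's result supports +, and each variable still gets its own MDC. A hedged sketch of the same shape (MultiPartExample and its parameter are hypothetical):

import org.apache.spark.internal.{Logging, LogKeys, MDC}

object MultiPartExample extends Logging {
  def logDrain(remaining: Int): Unit = {
    // Two log"..." parts concatenated into one structured message.
    logInfo(log"Pushing out the last " +
      log"${MDC(LogKeys.NUM_BLOCK_IDS, remaining)} blocks")
  }
}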