From 797f999ade6acd6f8e769fc1f86092f0de03aa7c Mon Sep 17 00:00:00 2001
From: Jiaheng Tang
Date: Sun, 3 Nov 2024 10:39:07 -0800
Subject: [PATCH 1/4] initial commit

---
 .github/workflows/spark_test.yaml |   5 ++
 dev/structured_logging_style.py   | 113 ++++++++++++++++++++++++++++++
 2 files changed, 118 insertions(+)
 create mode 100644 dev/structured_logging_style.py

diff --git a/.github/workflows/spark_test.yaml b/.github/workflows/spark_test.yaml
index eaf4aec9bf1..881cbe36eba 100644
--- a/.github/workflows/spark_test.yaml
+++ b/.github/workflows/spark_test.yaml
@@ -80,6 +80,11 @@ jobs:
           pipenv run pip install pyarrow==8.0.0
           pipenv run pip install numpy==1.20.3
         if: steps.git-diff.outputs.diff
+      - name: Scala structured logging check
+        run: |
+        if [ -f ./dev/structured_logging_style.py ]; then
+          python ./dev/structured_logging_style.py
+        fi
       - name: Run Scala/Java and Python tests
         # when changing TEST_PARALLELISM_COUNT make sure to also change it in spark_master_test.yaml
         run: |
diff --git a/dev/structured_logging_style.py b/dev/structured_logging_style.py
new file mode 100644
index 00000000000..bbf080d52e2
--- /dev/null
+++ b/dev/structured_logging_style.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Copyright (2021) The Delta Lake Project Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+import sys
+import re
+import glob
+
+
+def main():
+    log_pattern = r"log(?:Info|Warning|Error)\(.*?\)\n"
+    inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\"\+(?!s?")).*|[^"]+\+\s*".*?"'
+    compiled_inner_log_pattern = re.compile(inner_log_pattern)
+
+    # Regex patterns for file paths to exclude from the Structured Logging style check
+    excluded_file_patterns = [
+        "[Tt]est",
+        "sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen"
+        "/CodeGenerator.scala",
+        "streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala",
+        "sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver"
+        "/SparkSQLCLIService.scala",
+        "core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala",
+    ]
+
+    nonmigrated_files = {}
+
+    target_directories = ["../spark", "../iceberg", "../hudi"]
+    scala_files = []
+    for directory in target_directories:
+        scala_files.extend(glob.glob(os.path.join(directory, "**", "*.scala"), recursive=True))
+
+    for file in scala_files:
+        skip_file = False
+        for exclude_pattern in excluded_file_patterns:
+            if re.search(exclude_pattern, file):
+                skip_file = True
+                break
+
+        if not skip_file and not os.path.isdir(file):
+            with open(file, "r") as f:
+                content = f.read()
+
+                log_statements = re.finditer(log_pattern, content, re.DOTALL)
+
+                if log_statements:
+                    nonmigrated_files[file] = []
+                    for log_statement in log_statements:
+                        log_statement_str = log_statement.group(0).strip()
+                        # trim first ( and last )
+                        first_paren_index = log_statement_str.find("(")
+                        inner_log_statement = re.sub(
+                            r"\s+", "", log_statement_str[first_paren_index + 1 : -1]
+                        )
+
+                        if compiled_inner_log_pattern.fullmatch(inner_log_statement):
+                            start_pos = log_statement.start()
+                            preceding_content = content[:start_pos]
+                            line_number = preceding_content.count("\n") + 1
+                            start_char = start_pos - preceding_content.rfind("\n") - 1
+                            nonmigrated_files[file].append((line_number, start_char))
+
+    if all(len(issues) == 0 for issues in nonmigrated_files.values()):
+        print("Structured logging style check passed.", file=sys.stderr)
+        sys.exit(0)
+    else:
+        for file_path, issues in nonmigrated_files.items():
+            for line_number, start_char in issues:
+                print(f"[error] {file_path}:{line_number}:{start_char}", file=sys.stderr)
+        print(
+            "[error]\tPlease use the Structured Logging Framework for logging messages "
+            'with variables. For example: log"...${{MDC(TASK_ID, taskId)}}..."'
+            "\n\tRefer to the guidelines in the file `shims/LoggingShims.scala`.",
+            file=sys.stderr,
+        )
+
+        sys.exit(-1)
+
+
+if __name__ == "__main__":
+    main()

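As a quick, standalone illustration (not part of the patch itself): the `inner_log_pattern` above flags `.format()` calls, `s"..."` interpolations with raw variables, and string concatenation, while accepting constant messages and `log"..."` interpolations that wrap variables in `MDC`. The Scala call bodies below are hypothetical, with whitespace already stripped, mirroring what the script does before matching:

    import re

    # Same pattern as in dev/structured_logging_style.py above.
    inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\"\+(?!s?")).*|[^"]+\+\s*".*?"'
    pattern = re.compile(inner_log_pattern)

    cases = {
        's"TaskID:$taskId"': True,                    # interpolation with a raw variable -> flagged
        '"TaskID:{}".format(taskId)': True,           # .format() on a string literal -> flagged
        'taskId+"failed"': True,                      # concatenation onto a literal -> flagged
        'log"TaskID:${MDC(TASK_ID,taskId)}"': False,  # structured logging with MDC -> accepted
        '"somemessage"': False,                       # constant message, no variables -> accepted
    }
    for body, flagged in cases.items():
        assert bool(pattern.fullmatch(body)) == flagged, body
    print("pattern behaves as expected on all sample bodies")
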
From db6cd7ed6618c316cf192cfca40ded1c36de6b8d Mon Sep 17 00:00:00 2001
From: Jiaheng Tang
Date: Mon, 4 Nov 2024 14:17:16 -0800
Subject: [PATCH 2/4] address comment

---
 .github/workflows/spark_test.yaml                          | 7 ++++---
 ..._logging_style.py => spark_structured_logging_style.py} | 0
 2 files changed, 4 insertions(+), 3 deletions(-)
 rename dev/{structured_logging_style.py => spark_structured_logging_style.py} (100%)

diff --git a/.github/workflows/spark_test.yaml b/.github/workflows/spark_test.yaml
index 881cbe36eba..2329a1a250f 100644
--- a/.github/workflows/spark_test.yaml
+++ b/.github/workflows/spark_test.yaml
@@ -80,11 +80,12 @@ jobs:
           pipenv run pip install pyarrow==8.0.0
           pipenv run pip install numpy==1.20.3
         if: steps.git-diff.outputs.diff
-      - name: Scala structured logging check
+      - name: Scala structured logging style check
         run: |
-        if [ -f ./dev/structured_logging_style.py ]; then
-          python ./dev/structured_logging_style.py
+        if [ -f ./dev/spark_structured_logging_style.py ]; then
+          python3 ./dev/spark_structured_logging_style.py
         fi
+        if: steps.git-diff.outputs.diff
       - name: Run Scala/Java and Python tests
         # when changing TEST_PARALLELISM_COUNT make sure to also change it in spark_master_test.yaml
         run: |
diff --git a/dev/structured_logging_style.py b/dev/spark_structured_logging_style.py
similarity index 100%
rename from dev/structured_logging_style.py
rename to dev/spark_structured_logging_style.py

From d4600ddd9a69f1161d5cfeac10c6f01dbec0e3bb Mon Sep 17 00:00:00 2001
From: Jiaheng Tang
Date: Mon, 4 Nov 2024 14:26:03 -0800
Subject: [PATCH 3/4] update

---
 .github/workflows/spark_master_test.yaml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/spark_master_test.yaml b/.github/workflows/spark_master_test.yaml
index 3906c31f221..8732cb6f3df 100644
--- a/.github/workflows/spark_master_test.yaml
+++ b/.github/workflows/spark_master_test.yaml
@@ -48,6 +48,12 @@ jobs:
           sudo apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev xz-utils tk-dev libffi-dev liblzma-dev python-openssl git
           sudo apt install libedit-dev
         if: steps.git-diff.outputs.diff
+      - name: Scala structured logging style check
+        run: |
+        if [ -f ./dev/spark_structured_logging_style.py ]; then
+          python3 ./dev/spark_structured_logging_style.py
+        fi
+        if: steps.git-diff.outputs.diff
       - name: Run Spark Master tests
         # when changing TEST_PARALLELISM_COUNT make sure to also change it in spark_test.yaml
         run: |

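For local use, the step wired up above can be approximated outside CI — a minimal sketch, not part of any patch, assuming it is run from the repository root; it mirrors the workflow's `[ -f ... ]` guard and relies only on the checker's exit status (0 on pass, non-zero on failure):

    import os
    import subprocess
    import sys

    script = "./dev/spark_structured_logging_style.py"
    if os.path.isfile(script):
        # Same semantics as the workflow step: run the checker and surface its exit code.
        result = subprocess.run([sys.executable, script])
        sys.exit(result.returncode)
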
From 2c21a8f3daedffaaa45978f311b6832e200e29b0 Mon Sep 17 00:00:00 2001
From: Jiaheng Tang
Date: Mon, 4 Nov 2024 14:29:03 -0800
Subject: [PATCH 4/4] fix

---
 .github/workflows/spark_master_test.yaml | 6 ------
 .github/workflows/spark_test.yaml        | 6 +++---
 2 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/spark_master_test.yaml b/.github/workflows/spark_master_test.yaml
index 8732cb6f3df..3906c31f221 100644
--- a/.github/workflows/spark_master_test.yaml
+++ b/.github/workflows/spark_master_test.yaml
@@ -48,12 +48,6 @@ jobs:
           sudo apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev xz-utils tk-dev libffi-dev liblzma-dev python-openssl git
           sudo apt install libedit-dev
         if: steps.git-diff.outputs.diff
-      - name: Scala structured logging style check
-        run: |
-        if [ -f ./dev/spark_structured_logging_style.py ]; then
-          python3 ./dev/spark_structured_logging_style.py
-        fi
-        if: steps.git-diff.outputs.diff
       - name: Run Spark Master tests
         # when changing TEST_PARALLELISM_COUNT make sure to also change it in spark_test.yaml
         run: |
diff --git a/.github/workflows/spark_test.yaml b/.github/workflows/spark_test.yaml
index 2329a1a250f..5ba7c624ef4 100644
--- a/.github/workflows/spark_test.yaml
+++ b/.github/workflows/spark_test.yaml
@@ -82,9 +82,9 @@ jobs:
         if: steps.git-diff.outputs.diff
       - name: Scala structured logging style check
         run: |
-        if [ -f ./dev/spark_structured_logging_style.py ]; then
-          python3 ./dev/spark_structured_logging_style.py
-        fi
+          if [ -f ./dev/spark_structured_logging_style.py ]; then
+            python3 ./dev/spark_structured_logging_style.py
+          fi
         if: steps.git-diff.outputs.diff
       - name: Run Scala/Java and Python tests
         # when changing TEST_PARALLELISM_COUNT make sure to also change it in spark_master_test.yaml