From bcde7602dcad5789b20272ea597dcec0209a8aff Mon Sep 17 00:00:00 2001 From: panbingkun Date: Wed, 19 Jun 2024 13:49:47 +0800 Subject: [PATCH 1/4] Address r windows --- .github/workflows/build_and_test.yml | 858 ++------------------------- 1 file changed, 52 insertions(+), 806 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 881fb8cb0674..082cc801267b 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -350,822 +350,68 @@ jobs: # Use the infra image cache to speed up cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-cache:${{ inputs.branch }} - pyspark: - needs: [precondition, infra-image] - # always run if pyspark == 'true', even infra-image is skip (such as non-master job) - if: (!cancelled()) && fromJson(needs.precondition.outputs.required).pyspark == 'true' - name: "Build modules: ${{ matrix.modules }}" - runs-on: ubuntu-latest - timeout-minutes: 180 - container: - image: ${{ needs.precondition.outputs.image_url }} - strategy: - fail-fast: false - matrix: - java: - - ${{ inputs.java }} - modules: - - >- - pyspark-sql, pyspark-resource, pyspark-testing - - >- - pyspark-core, pyspark-errors, pyspark-streaming - - >- - pyspark-mllib, pyspark-ml, pyspark-ml-connect - - >- - pyspark-connect - - >- - pyspark-pandas - - >- - pyspark-pandas-slow - - >- - pyspark-pandas-connect-part0 - - >- - pyspark-pandas-connect-part1 - - >- - pyspark-pandas-connect-part2 - - >- - pyspark-pandas-connect-part3 - exclude: - # Always run if pyspark-pandas == 'true', even infra-image is skip (such as non-master job) - # In practice, the build will run in individual PR, but not against the individual commit - # in Apache Spark repository. 
- - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas' }} - - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-slow' }} - - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part0' }} - - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part1' }} - - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part2' }} - - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part3' }} - env: - MODULES_TO_TEST: ${{ matrix.modules }} - PYTHON_TO_TEST: 'python3.11' - HADOOP_PROFILE: ${{ inputs.hadoop }} - HIVE_PROFILE: hive2.3 - GITHUB_PREV_SHA: ${{ github.event.before }} - SPARK_LOCAL_IP: localhost - SKIP_UNIDOC: true - SKIP_MIMA: true - SKIP_PACKAGING: true - METASPACE_SIZE: 1g - BRANCH: ${{ inputs.branch }} - steps: - - name: Checkout Spark repository - uses: actions/checkout@v4 - # In order to fetch changed files - with: - fetch-depth: 0 - repository: apache/spark - ref: ${{ inputs.branch }} - - name: Add GITHUB_WORKSPACE to git trust safe.directory - run: | - git config --global --add safe.directory ${GITHUB_WORKSPACE} - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - run: | - echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - # Cache local repositories. Note that GitHub Actions cache has a 10G limit. - - name: Cache SBT and Maven - uses: actions/cache@v4 - with: - path: | - build/apache-maven-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Coursier local repository - uses: actions/cache@v4 - with: - path: ~/.cache/coursier - key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} - restore-keys: | - pyspark-coursier- - - name: Free up disk space - shell: 'script -q -e -c "bash {0}"' - run: | - if [ -f ./dev/free_disk_space_container ]; then - ./dev/free_disk_space_container - fi - - name: Install Java ${{ matrix.java }} - uses: actions/setup-java@v4 - with: - distribution: zulu - java-version: ${{ matrix.java }} - - name: List Python packages (${{ env.PYTHON_TO_TEST }}) - env: ${{ fromJSON(inputs.envs) }} - shell: 'script -q -e -c "bash {0}"' - run: | - for py in $(echo $PYTHON_TO_TEST | tr "," "\n") - do - echo $py - $py -m pip list - done - - name: Install Conda for pip packaging test - if: contains(matrix.modules, 'pyspark-errors') - run: | - curl -s https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh > miniconda.sh - bash miniconda.sh -b -p $HOME/miniconda - rm miniconda.sh - # Run the tests. 
- - name: Run tests - env: ${{ fromJSON(inputs.envs) }} - shell: 'script -q -e -c "bash {0}"' - run: | - if [[ "$MODULES_TO_TEST" == *"pyspark-errors"* ]]; then - export PATH=$PATH:$HOME/miniconda/bin - export SKIP_PACKAGING=false - echo "Python Packaging Tests Enabled!" - fi - if [ ! -z "$PYTHON_TO_TEST" ]; then - ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --python-executables "$PYTHON_TO_TEST" - else - # For branch-3.5 and below, it uses the default Python versions. - ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" - fi - - name: Upload coverage to Codecov - if: fromJSON(inputs.envs).PYSPARK_CODECOV == 'true' - uses: codecov/codecov-action@v4 - with: - files: ./python/coverage.xml - flags: unittests - name: PySpark - - name: Upload test results to report - env: ${{ fromJSON(inputs.envs) }} - if: always() - uses: actions/upload-artifact@v4 - with: - name: test-results-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }} - path: "**/target/test-reports/*.xml" - - name: Upload unit tests log files - env: ${{ fromJSON(inputs.envs) }} - if: ${{ !success() }} - uses: actions/upload-artifact@v4 - with: - name: unit-tests-log-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }} - path: "**/target/unit-tests.log" - - sparkr: - needs: [precondition, infra-image] - # always run if sparkr == 'true', even infra-image is skip (such as non-master job) - if: (!cancelled()) && fromJson(needs.precondition.outputs.required).sparkr == 'true' - name: "Build modules: sparkr" - runs-on: ubuntu-latest - timeout-minutes: 180 - container: - image: ${{ needs.precondition.outputs.image_url }} - env: - HADOOP_PROFILE: ${{ inputs.hadoop }} - HIVE_PROFILE: hive2.3 - GITHUB_PREV_SHA: ${{ github.event.before }} - SPARK_LOCAL_IP: localhost - SKIP_UNIDOC: true - SKIP_MIMA: true - SKIP_PACKAGING: true - steps: - - name: Checkout Spark repository - uses: actions/checkout@v4 - # In order to fetch changed files - with: - fetch-depth: 0 - repository: apache/spark - ref: ${{ inputs.branch }} - - name: Add GITHUB_WORKSPACE to git trust safe.directory - run: | - git config --global --add safe.directory ${GITHUB_WORKSPACE} - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - run: | - echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - # Cache local repositories. Note that GitHub Actions cache has a 10G limit. 
- - name: Cache SBT and Maven - uses: actions/cache@v4 - with: - path: | - build/apache-maven-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Coursier local repository - uses: actions/cache@v4 - with: - path: ~/.cache/coursier - key: sparkr-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} - restore-keys: | - sparkr-coursier- - - name: Free up disk space - run: | - if [ -f ./dev/free_disk_space_container ]; then - ./dev/free_disk_space_container - fi - - name: Install Java ${{ inputs.java }} - uses: actions/setup-java@v4 - with: - distribution: zulu - java-version: ${{ inputs.java }} - - name: Run tests - env: ${{ fromJSON(inputs.envs) }} - run: | - # The followings are also used by `r-lib/actions/setup-r` to avoid - # R issues at docker environment - export TZ=UTC - export _R_CHECK_SYSTEM_CLOCK_=FALSE - ./dev/run-tests --parallelism 1 --modules sparkr - - name: Upload test results to report - if: always() - uses: actions/upload-artifact@v4 - with: - name: test-results-sparkr--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3 - path: "**/target/test-reports/*.xml" - - buf: - needs: [precondition] - if: (!cancelled()) && fromJson(needs.precondition.outputs.required).buf == 'true' - name: Protobuf breaking change detection and Python CodeGen check - runs-on: ubuntu-latest - steps: - - name: Checkout Spark repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - repository: apache/spark - ref: ${{ inputs.branch }} - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - run: | - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - - name: Install Buf - uses: bufbuild/buf-setup-action@v1 - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - - name: Protocol Buffers Linter - uses: bufbuild/buf-lint-action@v1 - with: - input: core/src/main/protobuf - # Change 'branch-3.5' to 'branch-4.0' in master branch after cutting branch-4.0 branch. 
- - name: Breaking change detection against branch-3.5 - uses: bufbuild/buf-breaking-action@v1 - with: - input: connector/connect/common/src/main - against: 'https://github.com/apache/spark.git#branch=branch-3.5,subdir=connector/connect/common/src/main' - - name: Install Python 3.9 - uses: actions/setup-python@v5 - with: - python-version: '3.9' - - name: Install dependencies for Python CodeGen check - run: | - python3.9 -m pip install 'black==23.9.1' 'protobuf==4.25.1' 'mypy==1.8.0' 'mypy-protobuf==3.3.0' - python3.9 -m pip list - - name: Python CodeGen check - run: ./dev/connect-check-protos.py - - # Static analysis - lint: - needs: [precondition, infra-image] - # always run if lint == 'true', even infra-image is skip (such as non-master job) - if: (!cancelled()) && fromJson(needs.precondition.outputs.required).lint == 'true' - name: Linters, licenses, and dependencies - runs-on: ubuntu-latest - timeout-minutes: 180 - env: - LC_ALL: C.UTF-8 - LANG: C.UTF-8 - NOLINT_ON_COMPILE: false - PYSPARK_DRIVER_PYTHON: python3.9 - PYSPARK_PYTHON: python3.9 - GITHUB_PREV_SHA: ${{ github.event.before }} - container: - image: ${{ needs.precondition.outputs.image_url }} - steps: - - name: Checkout Spark repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - repository: apache/spark - ref: ${{ inputs.branch }} - - name: Add GITHUB_WORKSPACE to git trust safe.directory - run: | - git config --global --add safe.directory ${GITHUB_WORKSPACE} - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - run: | - echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - # Cache local repositories. Note that GitHub Actions cache has a 10G limit. 
- - name: Cache SBT and Maven - uses: actions/cache@v4 - with: - path: | - build/apache-maven-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Coursier local repository - uses: actions/cache@v4 - with: - path: ~/.cache/coursier - key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} - restore-keys: | - docs-coursier- - - name: Cache Maven local repository - uses: actions/cache@v4 - with: - path: ~/.m2/repository - key: docs-maven-${{ hashFiles('**/pom.xml') }} - restore-keys: | - docs-maven- - - name: Free up disk space - run: | - if [ -f ./dev/free_disk_space_container ]; then - ./dev/free_disk_space_container - fi - - name: Install Java ${{ inputs.java }} - uses: actions/setup-java@v4 - with: - distribution: zulu - java-version: ${{ inputs.java }} - - name: License test - run: ./dev/check-license - - name: Dependencies test - run: ./dev/test-dependencies.sh - - name: MIMA test - run: ./dev/mima - - name: Scala linter - run: ./dev/lint-scala - - name: Java linter - run: ./dev/lint-java - - name: Spark connect jvm client mima check - run: ./dev/connect-jvm-client-mima-check - - name: Install Python linter dependencies for branch-3.4 - if: inputs.branch == 'branch-3.4' - run: | - # SPARK-44554: Copy from https://github.com/apache/spark/blob/a05c27e85829fe742c1828507a1fd180cdc84b54/.github/workflows/build_and_test.yml#L571-L578 - # Should delete this section after SPARK 3.4 EOL. - python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.920' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==22.6.0' - python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.48.1' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' - - name: Install Python linter dependencies for branch-3.5 - if: inputs.branch == 'branch-3.5' - run: | - # SPARK-45212: Copy from https://github.com/apache/spark/blob/555c8def51e5951c7bf5165a332795e9e330ec9d/.github/workflows/build_and_test.yml#L631-L638 - # Should delete this section after SPARK 3.5 EOL. 
- python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.982' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==22.6.0' - python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.56.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' - - name: Install Python dependencies for python linter and documentation generation - if: inputs.branch != 'branch-3.4' && inputs.branch != 'branch-3.5' - run: | - # Should unpin 'sphinxcontrib-*' after upgrading sphinx>5 - # See 'ipython_genutils' in SPARK-38517 - # See 'docutils<0.18.0' in SPARK-39421 - python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \ - ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \ - 'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \ - 'pandas-stubs==1.2.0.53' 'grpcio==1.62.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \ - 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' - python3.9 -m pip list - - name: Python linter - run: PYTHON_EXECUTABLE=python3.9 ./dev/lint-python - # Should delete this section after SPARK 3.5 EOL. - - name: Install dependencies for Python code generation check for branch-3.5 - if: inputs.branch == 'branch-3.5' - run: | - # See more in "Installation" https://docs.buf.build/installation#tarball - curl -LO https://github.com/bufbuild/buf/releases/download/v1.28.1/buf-Linux-x86_64.tar.gz - mkdir -p $HOME/buf - tar -xvzf buf-Linux-x86_64.tar.gz -C $HOME/buf --strip-components 1 - rm buf-Linux-x86_64.tar.gz - python3.9 -m pip install 'protobuf==4.25.1' 'mypy-protobuf==3.3.0' - # Should delete this section after SPARK 3.5 EOL. - - name: Python code generation check for branch-3.5 - if: inputs.branch == 'branch-3.5' - run: if test -f ./dev/connect-check-protos.py; then PATH=$PATH:$HOME/buf/bin PYTHON_EXECUTABLE=python3.9 ./dev/connect-check-protos.py; fi - # Should delete this section after SPARK 3.5 EOL. - - name: Install JavaScript linter dependencies for branch-3.4, branch-3.5 - if: inputs.branch == 'branch-3.4' || inputs.branch == 'branch-3.5' - run: | - apt update - apt-get install -y nodejs npm - - name: JS linter - run: ./dev/lint-js - # Should delete this section after SPARK 3.5 EOL. 
- - name: Install R linter dependencies for branch-3.4, branch-3.5 - if: inputs.branch == 'branch-3.4' || inputs.branch == 'branch-3.5' - run: | - apt update - apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev \ - libfontconfig1-dev libharfbuzz-dev libfribidi-dev libfreetype6-dev libpng-dev \ - libtiff5-dev libjpeg-dev - Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')" - Rscript -e "devtools::install_version('lintr', version='2.0.1', repos='https://cloud.r-project.org')" - - name: Install R linter dependencies and SparkR - run: ./R/install-dev.sh - - name: R linter - run: ./dev/lint-r - - # Documentation build - docs: - needs: [precondition, infra-image] - # always run if lint == 'true', even infra-image is skip (such as non-master job) - if: (!cancelled()) && fromJson(needs.precondition.outputs.required).docs == 'true' - name: Documentation generation - runs-on: ubuntu-latest - timeout-minutes: 180 - env: - LC_ALL: C.UTF-8 - LANG: C.UTF-8 - NOLINT_ON_COMPILE: false - PYSPARK_DRIVER_PYTHON: python3.9 - PYSPARK_PYTHON: python3.9 - GITHUB_PREV_SHA: ${{ github.event.before }} - container: - image: ${{ needs.precondition.outputs.image_url }} + sparkr-window: + name: "Build Sparkr Window" + runs-on: windows-2019 + timeout-minutes: 300 steps: - - name: Checkout Spark repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - repository: apache/spark - ref: ${{ inputs.branch }} - - name: Add GITHUB_WORKSPACE to git trust safe.directory - run: | - git config --global --add safe.directory ${GITHUB_WORKSPACE} - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - run: | - echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - # Cache local repositories. Note that GitHub Actions cache has a 10G limit. 
- - name: Cache SBT and Maven - uses: actions/cache@v4 - with: - path: | - build/apache-maven-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Coursier local repository - uses: actions/cache@v4 - with: - path: ~/.cache/coursier - key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} - restore-keys: | - docs-coursier- - - name: Cache Maven local repository - uses: actions/cache@v4 - with: - path: ~/.m2/repository - key: docs-maven-${{ hashFiles('**/pom.xml') }} - restore-keys: | - docs-maven- - - name: Free up disk space - run: | - if [ -f ./dev/free_disk_space_container ]; then - ./dev/free_disk_space_container - fi - - name: Install Java ${{ inputs.java }} - uses: actions/setup-java@v4 - with: - distribution: zulu - java-version: ${{ inputs.java }} - - name: Install Python dependencies for python linter and documentation generation - if: inputs.branch != 'branch-3.4' && inputs.branch != 'branch-3.5' - run: | - # Should unpin 'sphinxcontrib-*' after upgrading sphinx>5 - # See 'ipython_genutils' in SPARK-38517 - # See 'docutils<0.18.0' in SPARK-39421 - python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \ - ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \ - 'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \ - 'pandas-stubs==1.2.0.53' 'grpcio==1.62.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \ - 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' - python3.9 -m pip list - - name: Install dependencies for documentation generation for branch-3.4, branch-3.5 - if: inputs.branch == 'branch-3.4' || inputs.branch == 'branch-3.5' - run: | - # pandoc is required to generate PySpark APIs as well in nbsphinx. 
- apt-get update -y - apt-get install -y libcurl4-openssl-dev pandoc - apt-get install -y ruby ruby-dev - Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'markdown', 'e1071', 'roxygen2', 'ggplot2', 'mvtnorm', 'statmod'), repos='https://cloud.r-project.org/')" - Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')" - Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')" - # Should unpin 'sphinxcontrib-*' after upgrading sphinx>5 - python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' - python3.9 -m pip install ipython_genutils # See SPARK-38517 - python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' - python3.9 -m pip install 'docutils<0.18.0' # See SPARK-39421 - - name: Install dependencies for documentation generation - run: | - # Keep the version of Bundler here in sync with the following locations: - # - dev/create-release/spark-rm/Dockerfile - # - docs/README.md - gem install bundler -v 2.4.22 - cd docs - bundle install - - name: Run documentation build - run: | - # We need this link because the jekyll build calls `python`. - ln -s "$(which python3.9)" "/usr/local/bin/python" - # Build docs first with SKIP_API to ensure they are buildable without requiring any - # language docs to be built beforehand. - cd docs; SKIP_API=1 bundle exec jekyll build; cd .. - if [ -f "./dev/is-changed.py" ]; then - # Skip PySpark and SparkR docs while keeping Scala/Java/SQL docs - pyspark_modules=`cd dev && python3.9 -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"` - if [ `./dev/is-changed.py -m $pyspark_modules` = false ]; then export SKIP_PYTHONDOC=1; fi - if [ `./dev/is-changed.py -m sparkr` = false ]; then export SKIP_RDOC=1; fi - fi - cd docs - bundle exec jekyll build - - name: Tar documentation - if: github.repository != 'apache/spark' - run: tar cjf site.tar.bz2 docs/_site - - name: Upload documentation - if: github.repository != 'apache/spark' - uses: actions/upload-artifact@v4 - with: - name: site - path: site.tar.bz2 - retention-days: 1 - - # Any TPC-DS related updates on this job need to be applied to tpcds-1g-gen job of benchmark.yml as well - tpcds-1g: - needs: precondition - if: fromJson(needs.precondition.outputs.required).tpcds-1g == 'true' - name: Run TPC-DS queries with SF=1 - # Pin to 'Ubuntu 20.04' due to 'databricks/tpcds-kit' compilation - runs-on: ubuntu-20.04 - timeout-minutes: 180 - env: - SPARK_LOCAL_IP: localhost - steps: - - name: Checkout Spark repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - repository: apache/spark - ref: ${{ inputs.branch }} - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - run: | - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - - name: Cache SBT and Maven - uses: actions/cache@v4 - with: 
- path: | - build/apache-maven-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Coursier local repository - uses: actions/cache@v4 - with: - path: ~/.cache/coursier - key: tpcds-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} - restore-keys: | - tpcds-coursier- - - name: Install Java ${{ inputs.java }} - uses: actions/setup-java@v4 - with: - distribution: zulu - java-version: ${{ inputs.java }} - - name: Cache TPC-DS generated data - id: cache-tpcds-sf-1 - uses: actions/cache@v4 - with: - path: ./tpcds-sf-1 - key: tpcds-${{ hashFiles('.github/workflows/build_and_test.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }} - - name: Checkout tpcds-kit repository - if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' - uses: actions/checkout@v4 - with: - repository: databricks/tpcds-kit - ref: 2a5078a782192ddb6efbcead8de9973d6ab4f069 - path: ./tpcds-kit - - name: Build tpcds-kit - if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' - run: cd tpcds-kit/tools && make OS=LINUX - - name: Generate TPC-DS (SF=1) table data - if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' - run: build/sbt "sql/Test/runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir `pwd`/tpcds-kit/tools --location `pwd`/tpcds-sf-1 --scaleFactor 1 --numPartitions 1 --overwrite" - - name: Run TPC-DS queries (Sort merge join) - run: | - SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite" - env: - SPARK_ANSI_SQL_MODE: ${{ fromJSON(inputs.envs).SPARK_ANSI_SQL_MODE }} - SPARK_TPCDS_JOIN_CONF: | - spark.sql.autoBroadcastJoinThreshold=-1 - spark.sql.join.preferSortMergeJoin=true - - name: Run TPC-DS queries (Broadcast hash join) - run: | - SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite" - env: - SPARK_ANSI_SQL_MODE: ${{ fromJSON(inputs.envs).SPARK_ANSI_SQL_MODE }} - SPARK_TPCDS_JOIN_CONF: | - spark.sql.autoBroadcastJoinThreshold=10485760 - - name: Run TPC-DS queries (Shuffled hash join) - run: | - SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite" - env: - SPARK_ANSI_SQL_MODE: ${{ fromJSON(inputs.envs).SPARK_ANSI_SQL_MODE }} - SPARK_TPCDS_JOIN_CONF: | - spark.sql.autoBroadcastJoinThreshold=-1 - spark.sql.join.forceApplyShuffledHashJoin=true - - name: Run TPC-DS queries on collated data - run: | - SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSCollationQueryTestSuite" - - name: Upload test results to report - if: always() - uses: actions/upload-artifact@v4 - with: - name: test-results-tpcds--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3 - path: "**/target/test-reports/*.xml" - - name: Upload unit tests log files - if: ${{ !success() }} - uses: actions/upload-artifact@v4 - with: - name: unit-tests-log-tpcds--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3 - path: "**/target/unit-tests.log" - - docker-integration-tests: - needs: precondition - if: fromJson(needs.precondition.outputs.required).docker-integration-tests == 'true' - name: Run Docker integration tests - runs-on: ubuntu-latest - timeout-minutes: 180 - env: - HADOOP_PROFILE: ${{ inputs.hadoop }} - HIVE_PROFILE: hive2.3 - GITHUB_PREV_SHA: ${{ github.event.before }} - SPARK_LOCAL_IP: localhost - SKIP_UNIDOC: true - SKIP_MIMA: true - SKIP_PACKAGING: true - steps: - - name: Checkout Spark 
repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - repository: apache/spark - ref: ${{ inputs.branch }} - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - run: | - echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - - name: Cache SBT and Maven - uses: actions/cache@v4 - with: - path: | - build/apache-maven-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Coursier local repository - uses: actions/cache@v4 - with: - path: ~/.cache/coursier - key: docker-integration-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} - restore-keys: | - docker-integration-coursier- - - name: Install Java ${{ inputs.java }} - uses: actions/setup-java@v4 - with: - distribution: zulu - java-version: ${{ inputs.java }} - - name: Run tests - env: ${{ fromJSON(inputs.envs) }} - run: | - ./dev/run-tests --parallelism 1 --modules docker-integration-tests --included-tags org.apache.spark.tags.DockerTest - - name: Upload test results to report - if: always() - uses: actions/upload-artifact@v4 - with: - name: test-results-docker-integration--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3 - path: "**/target/test-reports/*.xml" - - name: Upload unit tests log files - if: ${{ !success() }} - uses: actions/upload-artifact@v4 - with: - name: unit-tests-log-docker-integration--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3 - path: "**/target/unit-tests.log" - - k8s-integration-tests: - needs: precondition - if: fromJson(needs.precondition.outputs.required).k8s-integration-tests == 'true' - name: Run Spark on Kubernetes Integration test - runs-on: ubuntu-latest - timeout-minutes: 180 - steps: - - name: Checkout Spark repository + - name: Download winutils Hadoop binary uses: actions/checkout@v4 with: - fetch-depth: 0 - repository: apache/spark - ref: ${{ inputs.branch }} - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' + repository: cdarlint/winutils + - name: Move Hadoop winutil into home directory run: | - echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - - name: Cache SBT and Maven - uses: actions/cache@v4 - with: - path: | - build/apache-maven-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Coursier local repository + Move-Item -Path hadoop-3.3.6 -Destination ~\ + - name: Checkout Spark repository + uses: actions/checkout@v4 + - name: Cache Maven local repository uses: actions/cache@v4 with: - path: ~/.cache/coursier - key: 
k8s-integration-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} + path: ~/.m2/repository + key: build-sparkr-windows-maven-${{ hashFiles('**/pom.xml') }} restore-keys: | - k8s-integration-coursier- - - name: Install Java ${{ inputs.java }} + build-sparkr-windows-maven- + - name: Install Java 17 uses: actions/setup-java@v4 with: distribution: zulu - java-version: ${{ inputs.java }} - - name: start minikube + java-version: 17 + - name: Install R 4.3.3 + uses: r-lib/actions/setup-r@v2 + with: + r-version: 4.3.3 + - name: Install R dependencies run: | - # See more in "Installation" https://minikube.sigs.k8s.io/docs/start/ - curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64 - sudo install minikube-linux-amd64 /usr/local/bin/minikube - rm minikube-linux-amd64 - # Github Action limit cpu:2, memory: 6947MB, limit to 2U6G for better resource statistic - minikube start --cpus 2 --memory 6144 - - name: Print K8S pods and nodes info + Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow', 'xml2'), repos='https://cloud.r-project.org/')" + Rscript -e "pkg_list <- as.data.frame(installed.packages()[,c(1, 3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]" + shell: cmd + # SparkR build does not need Python. However, it shows warnings when the Python version is too low during + # the attempt to look up Python Data Sources for session initialization. The Windows 2019 runner + # includes Python 3.7, which Spark does not support. Therefore, we simply install the proper Python + # for simplicity, see SPARK-47116. + - name: Install Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: '3.11' + architecture: x64 + - name: Build Spark run: | - kubectl get pods -A - kubectl describe node - - name: Run Spark on K8S integration test + rem 1. '-Djna.nosys=true' is required to avoid kernel32.dll load failure. + rem See SPARK-28759. + rem 2. Ideally we should check the tests related to Hive in SparkR as well (SPARK-31745). + rem 3. setup-java installs Maven 3.8.7 but does not allow changing its version, so overwrite + rem Maven version as a workaround. 
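+        rem Editorial note (assumption): '-Dmaven.version=3.8.7' below is understood to align the
+        rem Maven version declared in Spark's pom.xml with the Maven 3.8.7 that setup-java provides,
+        rem so the build's version check passes; it does not install a different Maven binary.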
+ mvn -DskipTests -Psparkr -Djna.nosys=true package -Dmaven.version=3.8.7 + shell: cmd + - name: Run SparkR tests run: | - # Prepare PV test - PVC_TMP_DIR=$(mktemp -d) - export PVC_TESTS_HOST_PATH=$PVC_TMP_DIR - export PVC_TESTS_VM_PATH=$PVC_TMP_DIR - minikube mount ${PVC_TESTS_HOST_PATH}:${PVC_TESTS_VM_PATH} --gid=0 --uid=185 & - kubectl create clusterrolebinding serviceaccounts-cluster-admin --clusterrole=cluster-admin --group=system:serviceaccounts || true - kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.8.2/installer/volcano-development.yaml || true - if [[ "${{ inputs.branch }}" == 'branch-3.5' || "${{ inputs.branch }}" == 'branch-3.4' ]]; then - kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.7.0/installer/volcano-development.yaml || true - else - kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.8.2/installer/volcano-development.yaml || true - fi - eval $(minikube docker-env) - build/sbt -Phadoop-3 -Psparkr -Pkubernetes -Pvolcano -Pkubernetes-integration-tests -Dspark.kubernetes.test.volcanoMaxConcurrencyJobNum=1 -Dtest.exclude.tags=local "kubernetes-integration-tests/test" - - name: Upload Spark on K8S integration tests log files - if: ${{ !success() }} - uses: actions/upload-artifact@v4 - with: - name: spark-on-kubernetes-it-log - path: "**/target/integration-tests.log" - - ui: - needs: [precondition] - if: fromJson(needs.precondition.outputs.required).ui == 'true' - name: Run Spark UI tests - runs-on: ubuntu-latest - timeout-minutes: 180 - steps: - - uses: actions/checkout@v4 - - name: Use Node.js - uses: actions/setup-node@v4 - with: - node-version: 20 - cache: 'npm' - cache-dependency-path: ui-test/package-lock.json - - run: | - cd ui-test - npm install --save-dev - node --experimental-vm-modules node_modules/.bin/jest + set HADOOP_HOME=%USERPROFILE%\hadoop-3.3.6 + set PATH=%HADOOP_HOME%\bin;%PATH% + .\bin\spark-submit2.cmd --driver-java-options "-Dlog4j.configurationFile=file:///%CD:\=/%/R/log4j2.properties" --conf spark.hadoop.fs.defaultFS="file:///" R\pkg\tests\run-all.R + shell: cmd + env: + NOT_CRAN: true + # See SPARK-27848. Currently installing some dependent packages causes + # "(converted from warning) unable to identify current timezone 'C':" for an unknown reason. + # This environment variable works around to test SparkR against a higher version. + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true From 2be28bc47109892f510920e3515deaaab6c02391 Mon Sep 17 00:00:00 2001 From: panbingkun Date: Wed, 19 Jun 2024 14:00:42 +0800 Subject: [PATCH 2/4] update quickly --- .github/workflows/build_and_test.yml | 215 ------------------ .../scala/org/apache/spark/util/Utils.scala | 4 + 2 files changed, 4 insertions(+), 215 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 082cc801267b..a93eefff0aa2 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -135,221 +135,6 @@ jobs: IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME" echo "image_url=$IMG_URL" >> $GITHUB_OUTPUT - # Build: build Spark and run the tests for specified modules. 
- build: - name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }}" - needs: precondition - if: fromJson(needs.precondition.outputs.required).build == 'true' - runs-on: ubuntu-latest - timeout-minutes: 180 - strategy: - fail-fast: false - matrix: - java: - - ${{ inputs.java }} - hadoop: - - ${{ inputs.hadoop }} - hive: - - hive2.3 - # Note that the modules below are from sparktestsupport/modules.py. - modules: - - >- - core, unsafe, kvstore, avro, utils, - network-common, network-shuffle, repl, launcher, - examples, sketch, variant - - >- - api, catalyst, hive-thriftserver - - >- - mllib-local, mllib, graphx - - >- - streaming, sql-kafka-0-10, streaming-kafka-0-10, streaming-kinesis-asl, - kubernetes, hadoop-cloud, spark-ganglia-lgpl, protobuf, connect - - yarn - # Here, we split Hive and SQL tests into some of slow ones and the rest of them. - included-tags: [""] - excluded-tags: [""] - comment: [""] - include: - # Hive tests - - modules: hive - java: ${{ inputs.java }} - hadoop: ${{ inputs.hadoop }} - hive: hive2.3 - included-tags: org.apache.spark.tags.SlowHiveTest - comment: "- slow tests" - - modules: hive - java: ${{ inputs.java }} - hadoop: ${{ inputs.hadoop }} - hive: hive2.3 - excluded-tags: org.apache.spark.tags.SlowHiveTest - comment: "- other tests" - # SQL tests - - modules: sql - java: ${{ inputs.java }} - hadoop: ${{ inputs.hadoop }} - hive: hive2.3 - included-tags: org.apache.spark.tags.ExtendedSQLTest - comment: "- extended tests" - - modules: sql - java: ${{ inputs.java }} - hadoop: ${{ inputs.hadoop }} - hive: hive2.3 - included-tags: org.apache.spark.tags.SlowSQLTest - comment: "- slow tests" - - modules: sql - java: ${{ inputs.java }} - hadoop: ${{ inputs.hadoop }} - hive: hive2.3 - excluded-tags: org.apache.spark.tags.ExtendedSQLTest,org.apache.spark.tags.SlowSQLTest - comment: "- other tests" - exclude: - # Always run if yarn == 'true', even infra-image is skip (such as non-master job) - # In practice, the build will run in individual PR, but not against the individual commit - # in Apache Spark repository. - - modules: ${{ fromJson(needs.precondition.outputs.required).yarn != 'true' && 'yarn' }} - env: - MODULES_TO_TEST: ${{ matrix.modules }} - EXCLUDED_TAGS: ${{ matrix.excluded-tags }} - INCLUDED_TAGS: ${{ matrix.included-tags }} - HADOOP_PROFILE: ${{ matrix.hadoop }} - HIVE_PROFILE: ${{ matrix.hive }} - GITHUB_PREV_SHA: ${{ github.event.before }} - SPARK_LOCAL_IP: localhost - NOLINT_ON_COMPILE: true - SKIP_UNIDOC: true - SKIP_MIMA: true - SKIP_PACKAGING: true - steps: - - name: Checkout Spark repository - uses: actions/checkout@v4 - # In order to fetch changed files - with: - fetch-depth: 0 - repository: apache/spark - ref: ${{ inputs.branch }} - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - run: | - echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - # Cache local repositories. Note that GitHub Actions cache has a 10G limit. 
- - name: Cache SBT and Maven - uses: actions/cache@v4 - with: - path: | - build/apache-maven-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Coursier local repository - uses: actions/cache@v4 - with: - path: ~/.cache/coursier - key: ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} - restore-keys: | - ${{ matrix.java }}-${{ matrix.hadoop }}-coursier- - - name: Free up disk space - run: | - if [ -f ./dev/free_disk_space ]; then - ./dev/free_disk_space - fi - - name: Install Java ${{ matrix.java }} - uses: actions/setup-java@v4 - with: - distribution: zulu - java-version: ${{ matrix.java }} - - name: Install Python 3.9 - uses: actions/setup-python@v5 - # We should install one Python that is higher than 3+ for SQL and Yarn because: - # - SQL component also has Python related tests, for example, IntegratedUDFTestUtils. - # - Yarn has a Python specific test too, for example, YarnClusterSuite. - if: contains(matrix.modules, 'yarn') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) || contains(matrix.modules, 'connect') - with: - python-version: '3.9' - architecture: x64 - - name: Install Python packages (Python 3.9) - if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) || contains(matrix.modules, 'connect') - run: | - python3.9 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'lxml==4.9.4' 'grpcio==1.62.0' 'grpcio-status==1.62.0' 'protobuf==4.25.1' - python3.9 -m pip list - # Run the tests. - - name: Run tests - env: ${{ fromJSON(inputs.envs) }} - shell: 'script -q -e -c "bash {0}"' - run: | - # Fix for TTY related issues when launching the Ammonite REPL in tests. - export TERM=vt100 - # Hive "other tests" test needs larger metaspace size based on experiment. 
- if [[ "$MODULES_TO_TEST" == "hive" ]] && [[ "$EXCLUDED_TAGS" == "org.apache.spark.tags.SlowHiveTest" ]]; then export METASPACE_SIZE=2g; fi - # SPARK-46283: should delete the following env replacement after SPARK 3.x EOL - if [[ "$MODULES_TO_TEST" == *"streaming-kinesis-asl"* ]] && [[ "${{ inputs.branch }}" =~ ^branch-3 ]]; then - MODULES_TO_TEST=${MODULES_TO_TEST//streaming-kinesis-asl, /} - fi - export SERIAL_SBT_TESTS=1 - ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS" - - name: Upload test results to report - if: always() - uses: actions/upload-artifact@v4 - with: - name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }} - path: "**/target/test-reports/*.xml" - - name: Upload unit tests log files - if: ${{ !success() }} - uses: actions/upload-artifact@v4 - with: - name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }} - path: "**/target/unit-tests.log" - - infra-image: - name: "Base image build" - needs: precondition - if: >- - fromJson(needs.precondition.outputs.required).pyspark == 'true' || - fromJson(needs.precondition.outputs.required).lint == 'true' || - fromJson(needs.precondition.outputs.required).sparkr == 'true' - runs-on: ubuntu-latest - permissions: - packages: write - steps: - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout Spark repository - uses: actions/checkout@v4 - # In order to fetch changed files - with: - fetch-depth: 0 - repository: apache/spark - ref: ${{ inputs.branch }} - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - run: | - echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - name: Build and push - id: docker_build - uses: docker/build-push-action@v5 - with: - context: ./dev/infra/ - push: true - tags: | - ${{ needs.precondition.outputs.image_url }} - # Use the infra image cache to speed up - cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-cache:${{ inputs.branch }} - sparkr-window: name: "Build Sparkr Window" runs-on: windows-2019 diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index a37aedfcb635..360e0f0e611b 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -575,6 +575,10 @@ private[spark] object Utils if (removeSourceFile) { Files.move(sourceFile.toPath, destFile.toPath) } else { + // scalastyle:off println + println(s"source path: ${sourceFile.getAbsolutePath}") + println(s"dest path: ${destFile.getAbsolutePath}") + // scalastyle:on println logInfo(log"Copying ${MDC(SOURCE_PATH, sourceFile.getAbsolutePath)}" + log" to ${MDC(DESTINATION_PATH, 
destFile.getAbsolutePath)}") copyRecursive(sourceFile, destFile) From 61a9c4b5c314bb15e6c083a24206dcb8fed8b90e Mon Sep 17 00:00:00 2001 From: panbingkun Date: Fri, 21 Jun 2024 18:47:19 +0800 Subject: [PATCH 3/4] R 4.4.0 --- .github/workflows/build_and_test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a93eefff0aa2..a2d721022e7c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -161,10 +161,10 @@ jobs: with: distribution: zulu java-version: 17 - - name: Install R 4.3.3 + - name: Install R 4.4.0 uses: r-lib/actions/setup-r@v2 with: - r-version: 4.3.3 + r-version: 4.4.0 - name: Install R dependencies run: | Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow', 'xml2'), repos='https://cloud.r-project.org/')" From 3ec3ef2c4730b85001539afa40fdf32f172d4ae8 Mon Sep 17 00:00:00 2001 From: panbingkun Date: Tue, 25 Jun 2024 10:45:46 +0800 Subject: [PATCH 4/4] test windows-2022 --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a2d721022e7c..dce5ddde0c96 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -137,7 +137,7 @@ jobs: sparkr-window: name: "Build Sparkr Window" - runs-on: windows-2019 + runs-on: windows-2022 timeout-minutes: 300 steps: - name: Download winutils Hadoop binary