From bcde7602dcad5789b20272ea597dcec0209a8aff Mon Sep 17 00:00:00 2001 From: panbingkun Date: Wed, 19 Jun 2024 13:49:47 +0800 Subject: [PATCH 1/4] Address r windows --- .github/workflows/build_and_test.yml | 858 ++------------------------- 1 file changed, 52 insertions(+), 806 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 881fb8cb0674..082cc801267b 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -350,822 +350,68 @@ jobs: # Use the infra image cache to speed up cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-cache:${{ inputs.branch }} - pyspark: - needs: [precondition, infra-image] - # always run if pyspark == 'true', even infra-image is skip (such as non-master job) - if: (!cancelled()) && fromJson(needs.precondition.outputs.required).pyspark == 'true' - name: "Build modules: ${{ matrix.modules }}" - runs-on: ubuntu-latest - timeout-minutes: 180 - container: - image: ${{ needs.precondition.outputs.image_url }} - strategy: - fail-fast: false - matrix: - java: - - ${{ inputs.java }} - modules: - - >- - pyspark-sql, pyspark-resource, pyspark-testing - - >- - pyspark-core, pyspark-errors, pyspark-streaming - - >- - pyspark-mllib, pyspark-ml, pyspark-ml-connect - - >- - pyspark-connect - - >- - pyspark-pandas - - >- - pyspark-pandas-slow - - >- - pyspark-pandas-connect-part0 - - >- - pyspark-pandas-connect-part1 - - >- - pyspark-pandas-connect-part2 - - >- - pyspark-pandas-connect-part3 - exclude: - # Always run if pyspark-pandas == 'true', even infra-image is skip (such as non-master job) - # In practice, the build will run in individual PR, but not against the individual commit - # in Apache Spark repository. 
- - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas' }} - - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-slow' }} - - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part0' }} - - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part1' }} - - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part2' }} - - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part3' }} - env: - MODULES_TO_TEST: ${{ matrix.modules }} - PYTHON_TO_TEST: 'python3.11' - HADOOP_PROFILE: ${{ inputs.hadoop }} - HIVE_PROFILE: hive2.3 - GITHUB_PREV_SHA: ${{ github.event.before }} - SPARK_LOCAL_IP: localhost - SKIP_UNIDOC: true - SKIP_MIMA: true - SKIP_PACKAGING: true - METASPACE_SIZE: 1g - BRANCH: ${{ inputs.branch }} - steps: - - name: Checkout Spark repository - uses: actions/checkout@v4 - # In order to fetch changed files - with: - fetch-depth: 0 - repository: apache/spark - ref: ${{ inputs.branch }} - - name: Add GITHUB_WORKSPACE to git trust safe.directory - run: | - git config --global --add safe.directory ${GITHUB_WORKSPACE} - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - run: | - echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - # Cache local repositories. Note that GitHub Actions cache has a 10G limit. - - name: Cache SBT and Maven - uses: actions/cache@v4 - with: - path: | - build/apache-maven-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Coursier local repository - uses: actions/cache@v4 - with: - path: ~/.cache/coursier - key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} - restore-keys: | - pyspark-coursier- - - name: Free up disk space - shell: 'script -q -e -c "bash {0}"' - run: | - if [ -f ./dev/free_disk_space_container ]; then - ./dev/free_disk_space_container - fi - - name: Install Java ${{ matrix.java }} - uses: actions/setup-java@v4 - with: - distribution: zulu - java-version: ${{ matrix.java }} - - name: List Python packages (${{ env.PYTHON_TO_TEST }}) - env: ${{ fromJSON(inputs.envs) }} - shell: 'script -q -e -c "bash {0}"' - run: | - for py in $(echo $PYTHON_TO_TEST | tr "," "\n") - do - echo $py - $py -m pip list - done - - name: Install Conda for pip packaging test - if: contains(matrix.modules, 'pyspark-errors') - run: | - curl -s https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh > miniconda.sh - bash miniconda.sh -b -p $HOME/miniconda - rm miniconda.sh - # Run the tests. 
- - name: Run tests - env: ${{ fromJSON(inputs.envs) }} - shell: 'script -q -e -c "bash {0}"' - run: | - if [[ "$MODULES_TO_TEST" == *"pyspark-errors"* ]]; then - export PATH=$PATH:$HOME/miniconda/bin - export SKIP_PACKAGING=false - echo "Python Packaging Tests Enabled!" - fi - if [ ! -z "$PYTHON_TO_TEST" ]; then - ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --python-executables "$PYTHON_TO_TEST" - else - # For branch-3.5 and below, it uses the default Python versions. - ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" - fi - - name: Upload coverage to Codecov - if: fromJSON(inputs.envs).PYSPARK_CODECOV == 'true' - uses: codecov/codecov-action@v4 - with: - files: ./python/coverage.xml - flags: unittests - name: PySpark - - name: Upload test results to report - env: ${{ fromJSON(inputs.envs) }} - if: always() - uses: actions/upload-artifact@v4 - with: - name: test-results-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }} - path: "**/target/test-reports/*.xml" - - name: Upload unit tests log files - env: ${{ fromJSON(inputs.envs) }} - if: ${{ !success() }} - uses: actions/upload-artifact@v4 - with: - name: unit-tests-log-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }} - path: "**/target/unit-tests.log" - - sparkr: - needs: [precondition, infra-image] - # always run if sparkr == 'true', even infra-image is skip (such as non-master job) - if: (!cancelled()) && fromJson(needs.precondition.outputs.required).sparkr == 'true' - name: "Build modules: sparkr" - runs-on: ubuntu-latest - timeout-minutes: 180 - container: - image: ${{ needs.precondition.outputs.image_url }} - env: - HADOOP_PROFILE: ${{ inputs.hadoop }} - HIVE_PROFILE: hive2.3 - GITHUB_PREV_SHA: ${{ github.event.before }} - SPARK_LOCAL_IP: localhost - SKIP_UNIDOC: true - SKIP_MIMA: true - SKIP_PACKAGING: true - steps: - - name: Checkout Spark repository - uses: actions/checkout@v4 - # In order to fetch changed files - with: - fetch-depth: 0 - repository: apache/spark - ref: ${{ inputs.branch }} - - name: Add GITHUB_WORKSPACE to git trust safe.directory - run: | - git config --global --add safe.directory ${GITHUB_WORKSPACE} - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - run: | - echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - # Cache local repositories. Note that GitHub Actions cache has a 10G limit. 
- - name: Cache SBT and Maven - uses: actions/cache@v4 - with: - path: | - build/apache-maven-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Coursier local repository - uses: actions/cache@v4 - with: - path: ~/.cache/coursier - key: sparkr-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} - restore-keys: | - sparkr-coursier- - - name: Free up disk space - run: | - if [ -f ./dev/free_disk_space_container ]; then - ./dev/free_disk_space_container - fi - - name: Install Java ${{ inputs.java }} - uses: actions/setup-java@v4 - with: - distribution: zulu - java-version: ${{ inputs.java }} - - name: Run tests - env: ${{ fromJSON(inputs.envs) }} - run: | - # The followings are also used by `r-lib/actions/setup-r` to avoid - # R issues at docker environment - export TZ=UTC - export _R_CHECK_SYSTEM_CLOCK_=FALSE - ./dev/run-tests --parallelism 1 --modules sparkr - - name: Upload test results to report - if: always() - uses: actions/upload-artifact@v4 - with: - name: test-results-sparkr--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3 - path: "**/target/test-reports/*.xml" - - buf: - needs: [precondition] - if: (!cancelled()) && fromJson(needs.precondition.outputs.required).buf == 'true' - name: Protobuf breaking change detection and Python CodeGen check - runs-on: ubuntu-latest - steps: - - name: Checkout Spark repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - repository: apache/spark - ref: ${{ inputs.branch }} - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - run: | - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - - name: Install Buf - uses: bufbuild/buf-setup-action@v1 - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - - name: Protocol Buffers Linter - uses: bufbuild/buf-lint-action@v1 - with: - input: core/src/main/protobuf - # Change 'branch-3.5' to 'branch-4.0' in master branch after cutting branch-4.0 branch. 
- - name: Breaking change detection against branch-3.5 - uses: bufbuild/buf-breaking-action@v1 - with: - input: connector/connect/common/src/main - against: 'https://github.com/apache/spark.git#branch=branch-3.5,subdir=connector/connect/common/src/main' - - name: Install Python 3.9 - uses: actions/setup-python@v5 - with: - python-version: '3.9' - - name: Install dependencies for Python CodeGen check - run: | - python3.9 -m pip install 'black==23.9.1' 'protobuf==4.25.1' 'mypy==1.8.0' 'mypy-protobuf==3.3.0' - python3.9 -m pip list - - name: Python CodeGen check - run: ./dev/connect-check-protos.py - - # Static analysis - lint: - needs: [precondition, infra-image] - # always run if lint == 'true', even infra-image is skip (such as non-master job) - if: (!cancelled()) && fromJson(needs.precondition.outputs.required).lint == 'true' - name: Linters, licenses, and dependencies - runs-on: ubuntu-latest - timeout-minutes: 180 - env: - LC_ALL: C.UTF-8 - LANG: C.UTF-8 - NOLINT_ON_COMPILE: false - PYSPARK_DRIVER_PYTHON: python3.9 - PYSPARK_PYTHON: python3.9 - GITHUB_PREV_SHA: ${{ github.event.before }} - container: - image: ${{ needs.precondition.outputs.image_url }} - steps: - - name: Checkout Spark repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - repository: apache/spark - ref: ${{ inputs.branch }} - - name: Add GITHUB_WORKSPACE to git trust safe.directory - run: | - git config --global --add safe.directory ${GITHUB_WORKSPACE} - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - run: | - echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - # Cache local repositories. Note that GitHub Actions cache has a 10G limit. 
- - name: Cache SBT and Maven - uses: actions/cache@v4 - with: - path: | - build/apache-maven-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Coursier local repository - uses: actions/cache@v4 - with: - path: ~/.cache/coursier - key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} - restore-keys: | - docs-coursier- - - name: Cache Maven local repository - uses: actions/cache@v4 - with: - path: ~/.m2/repository - key: docs-maven-${{ hashFiles('**/pom.xml') }} - restore-keys: | - docs-maven- - - name: Free up disk space - run: | - if [ -f ./dev/free_disk_space_container ]; then - ./dev/free_disk_space_container - fi - - name: Install Java ${{ inputs.java }} - uses: actions/setup-java@v4 - with: - distribution: zulu - java-version: ${{ inputs.java }} - - name: License test - run: ./dev/check-license - - name: Dependencies test - run: ./dev/test-dependencies.sh - - name: MIMA test - run: ./dev/mima - - name: Scala linter - run: ./dev/lint-scala - - name: Java linter - run: ./dev/lint-java - - name: Spark connect jvm client mima check - run: ./dev/connect-jvm-client-mima-check - - name: Install Python linter dependencies for branch-3.4 - if: inputs.branch == 'branch-3.4' - run: | - # SPARK-44554: Copy from https://github.com/apache/spark/blob/a05c27e85829fe742c1828507a1fd180cdc84b54/.github/workflows/build_and_test.yml#L571-L578 - # Should delete this section after SPARK 3.4 EOL. - python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.920' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==22.6.0' - python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.48.1' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' - - name: Install Python linter dependencies for branch-3.5 - if: inputs.branch == 'branch-3.5' - run: | - # SPARK-45212: Copy from https://github.com/apache/spark/blob/555c8def51e5951c7bf5165a332795e9e330ec9d/.github/workflows/build_and_test.yml#L631-L638 - # Should delete this section after SPARK 3.5 EOL. 
- python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.982' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==22.6.0' - python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.56.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' - - name: Install Python dependencies for python linter and documentation generation - if: inputs.branch != 'branch-3.4' && inputs.branch != 'branch-3.5' - run: | - # Should unpin 'sphinxcontrib-*' after upgrading sphinx>5 - # See 'ipython_genutils' in SPARK-38517 - # See 'docutils<0.18.0' in SPARK-39421 - python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \ - ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \ - 'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \ - 'pandas-stubs==1.2.0.53' 'grpcio==1.62.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \ - 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' - python3.9 -m pip list - - name: Python linter - run: PYTHON_EXECUTABLE=python3.9 ./dev/lint-python - # Should delete this section after SPARK 3.5 EOL. - - name: Install dependencies for Python code generation check for branch-3.5 - if: inputs.branch == 'branch-3.5' - run: | - # See more in "Installation" https://docs.buf.build/installation#tarball - curl -LO https://github.com/bufbuild/buf/releases/download/v1.28.1/buf-Linux-x86_64.tar.gz - mkdir -p $HOME/buf - tar -xvzf buf-Linux-x86_64.tar.gz -C $HOME/buf --strip-components 1 - rm buf-Linux-x86_64.tar.gz - python3.9 -m pip install 'protobuf==4.25.1' 'mypy-protobuf==3.3.0' - # Should delete this section after SPARK 3.5 EOL. - - name: Python code generation check for branch-3.5 - if: inputs.branch == 'branch-3.5' - run: if test -f ./dev/connect-check-protos.py; then PATH=$PATH:$HOME/buf/bin PYTHON_EXECUTABLE=python3.9 ./dev/connect-check-protos.py; fi - # Should delete this section after SPARK 3.5 EOL. - - name: Install JavaScript linter dependencies for branch-3.4, branch-3.5 - if: inputs.branch == 'branch-3.4' || inputs.branch == 'branch-3.5' - run: | - apt update - apt-get install -y nodejs npm - - name: JS linter - run: ./dev/lint-js - # Should delete this section after SPARK 3.5 EOL. 
- - name: Install R linter dependencies for branch-3.4, branch-3.5 - if: inputs.branch == 'branch-3.4' || inputs.branch == 'branch-3.5' - run: | - apt update - apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev \ - libfontconfig1-dev libharfbuzz-dev libfribidi-dev libfreetype6-dev libpng-dev \ - libtiff5-dev libjpeg-dev - Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')" - Rscript -e "devtools::install_version('lintr', version='2.0.1', repos='https://cloud.r-project.org')" - - name: Install R linter dependencies and SparkR - run: ./R/install-dev.sh - - name: R linter - run: ./dev/lint-r - - # Documentation build - docs: - needs: [precondition, infra-image] - # always run if lint == 'true', even infra-image is skip (such as non-master job) - if: (!cancelled()) && fromJson(needs.precondition.outputs.required).docs == 'true' - name: Documentation generation - runs-on: ubuntu-latest - timeout-minutes: 180 - env: - LC_ALL: C.UTF-8 - LANG: C.UTF-8 - NOLINT_ON_COMPILE: false - PYSPARK_DRIVER_PYTHON: python3.9 - PYSPARK_PYTHON: python3.9 - GITHUB_PREV_SHA: ${{ github.event.before }} - container: - image: ${{ needs.precondition.outputs.image_url }} + sparkr-window: + name: "Build Sparkr Window" + runs-on: windows-2019 + timeout-minutes: 300 steps: - - name: Checkout Spark repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - repository: apache/spark - ref: ${{ inputs.branch }} - - name: Add GITHUB_WORKSPACE to git trust safe.directory - run: | - git config --global --add safe.directory ${GITHUB_WORKSPACE} - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - run: | - echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - # Cache local repositories. Note that GitHub Actions cache has a 10G limit. 
- - name: Cache SBT and Maven - uses: actions/cache@v4 - with: - path: | - build/apache-maven-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Coursier local repository - uses: actions/cache@v4 - with: - path: ~/.cache/coursier - key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} - restore-keys: | - docs-coursier- - - name: Cache Maven local repository - uses: actions/cache@v4 - with: - path: ~/.m2/repository - key: docs-maven-${{ hashFiles('**/pom.xml') }} - restore-keys: | - docs-maven- - - name: Free up disk space - run: | - if [ -f ./dev/free_disk_space_container ]; then - ./dev/free_disk_space_container - fi - - name: Install Java ${{ inputs.java }} - uses: actions/setup-java@v4 - with: - distribution: zulu - java-version: ${{ inputs.java }} - - name: Install Python dependencies for python linter and documentation generation - if: inputs.branch != 'branch-3.4' && inputs.branch != 'branch-3.5' - run: | - # Should unpin 'sphinxcontrib-*' after upgrading sphinx>5 - # See 'ipython_genutils' in SPARK-38517 - # See 'docutils<0.18.0' in SPARK-39421 - python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \ - ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \ - 'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \ - 'pandas-stubs==1.2.0.53' 'grpcio==1.62.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \ - 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' - python3.9 -m pip list - - name: Install dependencies for documentation generation for branch-3.4, branch-3.5 - if: inputs.branch == 'branch-3.4' || inputs.branch == 'branch-3.5' - run: | - # pandoc is required to generate PySpark APIs as well in nbsphinx. 
- apt-get update -y - apt-get install -y libcurl4-openssl-dev pandoc - apt-get install -y ruby ruby-dev - Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'markdown', 'e1071', 'roxygen2', 'ggplot2', 'mvtnorm', 'statmod'), repos='https://cloud.r-project.org/')" - Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')" - Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')" - # Should unpin 'sphinxcontrib-*' after upgrading sphinx>5 - python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' - python3.9 -m pip install ipython_genutils # See SPARK-38517 - python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' - python3.9 -m pip install 'docutils<0.18.0' # See SPARK-39421 - - name: Install dependencies for documentation generation - run: | - # Keep the version of Bundler here in sync with the following locations: - # - dev/create-release/spark-rm/Dockerfile - # - docs/README.md - gem install bundler -v 2.4.22 - cd docs - bundle install - - name: Run documentation build - run: | - # We need this link because the jekyll build calls `python`. - ln -s "$(which python3.9)" "/usr/local/bin/python" - # Build docs first with SKIP_API to ensure they are buildable without requiring any - # language docs to be built beforehand. - cd docs; SKIP_API=1 bundle exec jekyll build; cd .. - if [ -f "./dev/is-changed.py" ]; then - # Skip PySpark and SparkR docs while keeping Scala/Java/SQL docs - pyspark_modules=`cd dev && python3.9 -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"` - if [ `./dev/is-changed.py -m $pyspark_modules` = false ]; then export SKIP_PYTHONDOC=1; fi - if [ `./dev/is-changed.py -m sparkr` = false ]; then export SKIP_RDOC=1; fi - fi - cd docs - bundle exec jekyll build - - name: Tar documentation - if: github.repository != 'apache/spark' - run: tar cjf site.tar.bz2 docs/_site - - name: Upload documentation - if: github.repository != 'apache/spark' - uses: actions/upload-artifact@v4 - with: - name: site - path: site.tar.bz2 - retention-days: 1 - - # Any TPC-DS related updates on this job need to be applied to tpcds-1g-gen job of benchmark.yml as well - tpcds-1g: - needs: precondition - if: fromJson(needs.precondition.outputs.required).tpcds-1g == 'true' - name: Run TPC-DS queries with SF=1 - # Pin to 'Ubuntu 20.04' due to 'databricks/tpcds-kit' compilation - runs-on: ubuntu-20.04 - timeout-minutes: 180 - env: - SPARK_LOCAL_IP: localhost - steps: - - name: Checkout Spark repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - repository: apache/spark - ref: ${{ inputs.branch }} - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - run: | - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - - name: Cache SBT and Maven - uses: actions/cache@v4 - with: 
- path: | - build/apache-maven-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Coursier local repository - uses: actions/cache@v4 - with: - path: ~/.cache/coursier - key: tpcds-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} - restore-keys: | - tpcds-coursier- - - name: Install Java ${{ inputs.java }} - uses: actions/setup-java@v4 - with: - distribution: zulu - java-version: ${{ inputs.java }} - - name: Cache TPC-DS generated data - id: cache-tpcds-sf-1 - uses: actions/cache@v4 - with: - path: ./tpcds-sf-1 - key: tpcds-${{ hashFiles('.github/workflows/build_and_test.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }} - - name: Checkout tpcds-kit repository - if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' - uses: actions/checkout@v4 - with: - repository: databricks/tpcds-kit - ref: 2a5078a782192ddb6efbcead8de9973d6ab4f069 - path: ./tpcds-kit - - name: Build tpcds-kit - if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' - run: cd tpcds-kit/tools && make OS=LINUX - - name: Generate TPC-DS (SF=1) table data - if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' - run: build/sbt "sql/Test/runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir `pwd`/tpcds-kit/tools --location `pwd`/tpcds-sf-1 --scaleFactor 1 --numPartitions 1 --overwrite" - - name: Run TPC-DS queries (Sort merge join) - run: | - SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite" - env: - SPARK_ANSI_SQL_MODE: ${{ fromJSON(inputs.envs).SPARK_ANSI_SQL_MODE }} - SPARK_TPCDS_JOIN_CONF: | - spark.sql.autoBroadcastJoinThreshold=-1 - spark.sql.join.preferSortMergeJoin=true - - name: Run TPC-DS queries (Broadcast hash join) - run: | - SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite" - env: - SPARK_ANSI_SQL_MODE: ${{ fromJSON(inputs.envs).SPARK_ANSI_SQL_MODE }} - SPARK_TPCDS_JOIN_CONF: | - spark.sql.autoBroadcastJoinThreshold=10485760 - - name: Run TPC-DS queries (Shuffled hash join) - run: | - SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite" - env: - SPARK_ANSI_SQL_MODE: ${{ fromJSON(inputs.envs).SPARK_ANSI_SQL_MODE }} - SPARK_TPCDS_JOIN_CONF: | - spark.sql.autoBroadcastJoinThreshold=-1 - spark.sql.join.forceApplyShuffledHashJoin=true - - name: Run TPC-DS queries on collated data - run: | - SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSCollationQueryTestSuite" - - name: Upload test results to report - if: always() - uses: actions/upload-artifact@v4 - with: - name: test-results-tpcds--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3 - path: "**/target/test-reports/*.xml" - - name: Upload unit tests log files - if: ${{ !success() }} - uses: actions/upload-artifact@v4 - with: - name: unit-tests-log-tpcds--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3 - path: "**/target/unit-tests.log" - - docker-integration-tests: - needs: precondition - if: fromJson(needs.precondition.outputs.required).docker-integration-tests == 'true' - name: Run Docker integration tests - runs-on: ubuntu-latest - timeout-minutes: 180 - env: - HADOOP_PROFILE: ${{ inputs.hadoop }} - HIVE_PROFILE: hive2.3 - GITHUB_PREV_SHA: ${{ github.event.before }} - SPARK_LOCAL_IP: localhost - SKIP_UNIDOC: true - SKIP_MIMA: true - SKIP_PACKAGING: true - steps: - - name: Checkout Spark 
repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - repository: apache/spark - ref: ${{ inputs.branch }} - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - run: | - echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - - name: Cache SBT and Maven - uses: actions/cache@v4 - with: - path: | - build/apache-maven-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Coursier local repository - uses: actions/cache@v4 - with: - path: ~/.cache/coursier - key: docker-integration-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} - restore-keys: | - docker-integration-coursier- - - name: Install Java ${{ inputs.java }} - uses: actions/setup-java@v4 - with: - distribution: zulu - java-version: ${{ inputs.java }} - - name: Run tests - env: ${{ fromJSON(inputs.envs) }} - run: | - ./dev/run-tests --parallelism 1 --modules docker-integration-tests --included-tags org.apache.spark.tags.DockerTest - - name: Upload test results to report - if: always() - uses: actions/upload-artifact@v4 - with: - name: test-results-docker-integration--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3 - path: "**/target/test-reports/*.xml" - - name: Upload unit tests log files - if: ${{ !success() }} - uses: actions/upload-artifact@v4 - with: - name: unit-tests-log-docker-integration--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3 - path: "**/target/unit-tests.log" - - k8s-integration-tests: - needs: precondition - if: fromJson(needs.precondition.outputs.required).k8s-integration-tests == 'true' - name: Run Spark on Kubernetes Integration test - runs-on: ubuntu-latest - timeout-minutes: 180 - steps: - - name: Checkout Spark repository + - name: Download winutils Hadoop binary uses: actions/checkout@v4 with: - fetch-depth: 0 - repository: apache/spark - ref: ${{ inputs.branch }} - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' + repository: cdarlint/winutils + - name: Move Hadoop winutil into home directory run: | - echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - - name: Cache SBT and Maven - uses: actions/cache@v4 - with: - path: | - build/apache-maven-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Coursier local repository + Move-Item -Path hadoop-3.3.6 -Destination ~\ + - name: Checkout Spark repository + uses: actions/checkout@v4 + - name: Cache Maven local repository uses: actions/cache@v4 with: - path: ~/.cache/coursier - key: 
k8s-integration-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} + path: ~/.m2/repository + key: build-sparkr-windows-maven-${{ hashFiles('**/pom.xml') }} restore-keys: | - k8s-integration-coursier- - - name: Install Java ${{ inputs.java }} + build-sparkr-windows-maven- + - name: Install Java 17 uses: actions/setup-java@v4 with: distribution: zulu - java-version: ${{ inputs.java }} - - name: start minikube + java-version: 17 + - name: Install R 4.3.3 + uses: r-lib/actions/setup-r@v2 + with: + r-version: 4.3.3 + - name: Install R dependencies run: | - # See more in "Installation" https://minikube.sigs.k8s.io/docs/start/ - curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64 - sudo install minikube-linux-amd64 /usr/local/bin/minikube - rm minikube-linux-amd64 - # Github Action limit cpu:2, memory: 6947MB, limit to 2U6G for better resource statistic - minikube start --cpus 2 --memory 6144 - - name: Print K8S pods and nodes info + Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow', 'xml2'), repos='https://cloud.r-project.org/')" + Rscript -e "pkg_list <- as.data.frame(installed.packages()[,c(1, 3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]" + shell: cmd + # SparkR build does not need Python. However, it shows warnings when the Python version is too low during + # the attempt to look up Python Data Sources for session initialization. The Windows 2019 runner + # includes Python 3.7, which Spark does not support. Therefore, we simply install the proper Python + # for simplicity, see SPARK-47116. + - name: Install Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: '3.11' + architecture: x64 + - name: Build Spark run: | - kubectl get pods -A - kubectl describe node - - name: Run Spark on K8S integration test + rem 1. '-Djna.nosys=true' is required to avoid kernel32.dll load failure. + rem See SPARK-28759. + rem 2. Ideally we should check the tests related to Hive in SparkR as well (SPARK-31745). + rem 3. setup-java installs Maven 3.8.7 but does not allow changing its version, so overwrite + rem Maven version as a workaround. 
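+        rem Editorial note (assumption): '-Dmaven.version=3.8.7' below is understood to align the
+        rem Maven version declared in Spark's pom.xml with the Maven 3.8.7 that setup-java provides,
+        rem so the build's version check passes; it does not install a different Maven binary.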
+ mvn -DskipTests -Psparkr -Djna.nosys=true package -Dmaven.version=3.8.7 + shell: cmd + - name: Run SparkR tests run: | - # Prepare PV test - PVC_TMP_DIR=$(mktemp -d) - export PVC_TESTS_HOST_PATH=$PVC_TMP_DIR - export PVC_TESTS_VM_PATH=$PVC_TMP_DIR - minikube mount ${PVC_TESTS_HOST_PATH}:${PVC_TESTS_VM_PATH} --gid=0 --uid=185 & - kubectl create clusterrolebinding serviceaccounts-cluster-admin --clusterrole=cluster-admin --group=system:serviceaccounts || true - kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.8.2/installer/volcano-development.yaml || true - if [[ "${{ inputs.branch }}" == 'branch-3.5' || "${{ inputs.branch }}" == 'branch-3.4' ]]; then - kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.7.0/installer/volcano-development.yaml || true - else - kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.8.2/installer/volcano-development.yaml || true - fi - eval $(minikube docker-env) - build/sbt -Phadoop-3 -Psparkr -Pkubernetes -Pvolcano -Pkubernetes-integration-tests -Dspark.kubernetes.test.volcanoMaxConcurrencyJobNum=1 -Dtest.exclude.tags=local "kubernetes-integration-tests/test" - - name: Upload Spark on K8S integration tests log files - if: ${{ !success() }} - uses: actions/upload-artifact@v4 - with: - name: spark-on-kubernetes-it-log - path: "**/target/integration-tests.log" - - ui: - needs: [precondition] - if: fromJson(needs.precondition.outputs.required).ui == 'true' - name: Run Spark UI tests - runs-on: ubuntu-latest - timeout-minutes: 180 - steps: - - uses: actions/checkout@v4 - - name: Use Node.js - uses: actions/setup-node@v4 - with: - node-version: 20 - cache: 'npm' - cache-dependency-path: ui-test/package-lock.json - - run: | - cd ui-test - npm install --save-dev - node --experimental-vm-modules node_modules/.bin/jest + set HADOOP_HOME=%USERPROFILE%\hadoop-3.3.6 + set PATH=%HADOOP_HOME%\bin;%PATH% + .\bin\spark-submit2.cmd --driver-java-options "-Dlog4j.configurationFile=file:///%CD:\=/%/R/log4j2.properties" --conf spark.hadoop.fs.defaultFS="file:///" R\pkg\tests\run-all.R + shell: cmd + env: + NOT_CRAN: true + # See SPARK-27848. Currently installing some dependent packages causes + # "(converted from warning) unable to identify current timezone 'C':" for an unknown reason. + # This environment variable works around to test SparkR against a higher version. + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true From 2be28bc47109892f510920e3515deaaab6c02391 Mon Sep 17 00:00:00 2001 From: panbingkun Date: Wed, 19 Jun 2024 14:00:42 +0800 Subject: [PATCH 2/4] update quickly --- .github/workflows/build_and_test.yml | 215 ------------------ .../scala/org/apache/spark/util/Utils.scala | 4 + 2 files changed, 4 insertions(+), 215 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 082cc801267b..a93eefff0aa2 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -135,221 +135,6 @@ jobs: IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME" echo "image_url=$IMG_URL" >> $GITHUB_OUTPUT - # Build: build Spark and run the tests for specified modules. 
- build: - name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }}" - needs: precondition - if: fromJson(needs.precondition.outputs.required).build == 'true' - runs-on: ubuntu-latest - timeout-minutes: 180 - strategy: - fail-fast: false - matrix: - java: - - ${{ inputs.java }} - hadoop: - - ${{ inputs.hadoop }} - hive: - - hive2.3 - # Note that the modules below are from sparktestsupport/modules.py. - modules: - - >- - core, unsafe, kvstore, avro, utils, - network-common, network-shuffle, repl, launcher, - examples, sketch, variant - - >- - api, catalyst, hive-thriftserver - - >- - mllib-local, mllib, graphx - - >- - streaming, sql-kafka-0-10, streaming-kafka-0-10, streaming-kinesis-asl, - kubernetes, hadoop-cloud, spark-ganglia-lgpl, protobuf, connect - - yarn - # Here, we split Hive and SQL tests into some of slow ones and the rest of them. - included-tags: [""] - excluded-tags: [""] - comment: [""] - include: - # Hive tests - - modules: hive - java: ${{ inputs.java }} - hadoop: ${{ inputs.hadoop }} - hive: hive2.3 - included-tags: org.apache.spark.tags.SlowHiveTest - comment: "- slow tests" - - modules: hive - java: ${{ inputs.java }} - hadoop: ${{ inputs.hadoop }} - hive: hive2.3 - excluded-tags: org.apache.spark.tags.SlowHiveTest - comment: "- other tests" - # SQL tests - - modules: sql - java: ${{ inputs.java }} - hadoop: ${{ inputs.hadoop }} - hive: hive2.3 - included-tags: org.apache.spark.tags.ExtendedSQLTest - comment: "- extended tests" - - modules: sql - java: ${{ inputs.java }} - hadoop: ${{ inputs.hadoop }} - hive: hive2.3 - included-tags: org.apache.spark.tags.SlowSQLTest - comment: "- slow tests" - - modules: sql - java: ${{ inputs.java }} - hadoop: ${{ inputs.hadoop }} - hive: hive2.3 - excluded-tags: org.apache.spark.tags.ExtendedSQLTest,org.apache.spark.tags.SlowSQLTest - comment: "- other tests" - exclude: - # Always run if yarn == 'true', even infra-image is skip (such as non-master job) - # In practice, the build will run in individual PR, but not against the individual commit - # in Apache Spark repository. - - modules: ${{ fromJson(needs.precondition.outputs.required).yarn != 'true' && 'yarn' }} - env: - MODULES_TO_TEST: ${{ matrix.modules }} - EXCLUDED_TAGS: ${{ matrix.excluded-tags }} - INCLUDED_TAGS: ${{ matrix.included-tags }} - HADOOP_PROFILE: ${{ matrix.hadoop }} - HIVE_PROFILE: ${{ matrix.hive }} - GITHUB_PREV_SHA: ${{ github.event.before }} - SPARK_LOCAL_IP: localhost - NOLINT_ON_COMPILE: true - SKIP_UNIDOC: true - SKIP_MIMA: true - SKIP_PACKAGING: true - steps: - - name: Checkout Spark repository - uses: actions/checkout@v4 - # In order to fetch changed files - with: - fetch-depth: 0 - repository: apache/spark - ref: ${{ inputs.branch }} - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - run: | - echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - # Cache local repositories. Note that GitHub Actions cache has a 10G limit. 
- - name: Cache SBT and Maven - uses: actions/cache@v4 - with: - path: | - build/apache-maven-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Coursier local repository - uses: actions/cache@v4 - with: - path: ~/.cache/coursier - key: ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} - restore-keys: | - ${{ matrix.java }}-${{ matrix.hadoop }}-coursier- - - name: Free up disk space - run: | - if [ -f ./dev/free_disk_space ]; then - ./dev/free_disk_space - fi - - name: Install Java ${{ matrix.java }} - uses: actions/setup-java@v4 - with: - distribution: zulu - java-version: ${{ matrix.java }} - - name: Install Python 3.9 - uses: actions/setup-python@v5 - # We should install one Python that is higher than 3+ for SQL and Yarn because: - # - SQL component also has Python related tests, for example, IntegratedUDFTestUtils. - # - Yarn has a Python specific test too, for example, YarnClusterSuite. - if: contains(matrix.modules, 'yarn') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) || contains(matrix.modules, 'connect') - with: - python-version: '3.9' - architecture: x64 - - name: Install Python packages (Python 3.9) - if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) || contains(matrix.modules, 'connect') - run: | - python3.9 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'lxml==4.9.4' 'grpcio==1.62.0' 'grpcio-status==1.62.0' 'protobuf==4.25.1' - python3.9 -m pip list - # Run the tests. - - name: Run tests - env: ${{ fromJSON(inputs.envs) }} - shell: 'script -q -e -c "bash {0}"' - run: | - # Fix for TTY related issues when launching the Ammonite REPL in tests. - export TERM=vt100 - # Hive "other tests" test needs larger metaspace size based on experiment. 
- if [[ "$MODULES_TO_TEST" == "hive" ]] && [[ "$EXCLUDED_TAGS" == "org.apache.spark.tags.SlowHiveTest" ]]; then export METASPACE_SIZE=2g; fi - # SPARK-46283: should delete the following env replacement after SPARK 3.x EOL - if [[ "$MODULES_TO_TEST" == *"streaming-kinesis-asl"* ]] && [[ "${{ inputs.branch }}" =~ ^branch-3 ]]; then - MODULES_TO_TEST=${MODULES_TO_TEST//streaming-kinesis-asl, /} - fi - export SERIAL_SBT_TESTS=1 - ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS" - - name: Upload test results to report - if: always() - uses: actions/upload-artifact@v4 - with: - name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }} - path: "**/target/test-reports/*.xml" - - name: Upload unit tests log files - if: ${{ !success() }} - uses: actions/upload-artifact@v4 - with: - name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }} - path: "**/target/unit-tests.log" - - infra-image: - name: "Base image build" - needs: precondition - if: >- - fromJson(needs.precondition.outputs.required).pyspark == 'true' || - fromJson(needs.precondition.outputs.required).lint == 'true' || - fromJson(needs.precondition.outputs.required).sparkr == 'true' - runs-on: ubuntu-latest - permissions: - packages: write - steps: - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout Spark repository - uses: actions/checkout@v4 - # In order to fetch changed files - with: - fetch-depth: 0 - repository: apache/spark - ref: ${{ inputs.branch }} - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - run: | - echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - name: Build and push - id: docker_build - uses: docker/build-push-action@v5 - with: - context: ./dev/infra/ - push: true - tags: | - ${{ needs.precondition.outputs.image_url }} - # Use the infra image cache to speed up - cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-cache:${{ inputs.branch }} - sparkr-window: name: "Build Sparkr Window" runs-on: windows-2019 diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index a37aedfcb635..360e0f0e611b 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -575,6 +575,10 @@ private[spark] object Utils if (removeSourceFile) { Files.move(sourceFile.toPath, destFile.toPath) } else { + // scalastyle:off println + println(s"source path: ${sourceFile.getAbsolutePath}") + println(s"dest path: ${destFile.getAbsolutePath}") + // scalastyle:on println logInfo(log"Copying ${MDC(SOURCE_PATH, sourceFile.getAbsolutePath)}" + log" to ${MDC(DESTINATION_PATH, 
destFile.getAbsolutePath)}") copyRecursive(sourceFile, destFile) From 61a9c4b5c314bb15e6c083a24206dcb8fed8b90e Mon Sep 17 00:00:00 2001 From: panbingkun Date: Fri, 21 Jun 2024 18:47:19 +0800 Subject: [PATCH 3/4] R 4.4.0 --- .github/workflows/build_and_test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a93eefff0aa2..a2d721022e7c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -161,10 +161,10 @@ jobs: with: distribution: zulu java-version: 17 - - name: Install R 4.3.3 + - name: Install R 4.4.0 uses: r-lib/actions/setup-r@v2 with: - r-version: 4.3.3 + r-version: 4.4.0 - name: Install R dependencies run: | Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'e1071', 'survival', 'arrow', 'xml2'), repos='https://cloud.r-project.org/')" From 3ec3ef2c4730b85001539afa40fdf32f172d4ae8 Mon Sep 17 00:00:00 2001 From: panbingkun Date: Tue, 25 Jun 2024 10:45:46 +0800 Subject: [PATCH 4/4] test windows-2022 --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a2d721022e7c..dce5ddde0c96 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -137,7 +137,7 @@ jobs: sparkr-window: name: "Build Sparkr Window" - runs-on: windows-2019 + runs-on: windows-2022 timeout-minutes: 300 steps: - name: Download winutils Hadoop binary