diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml
deleted file mode 100644
index fe01b9203637..000000000000
--- a/.github/workflows/master.yml
+++ /dev/null
@@ -1,239 +0,0 @@
-name: master
-
-on:
-  push:
-    branches:
-      - master
-  pull_request:
-    branches:
-      - master
-
-jobs:
-  # TODO(SPARK-32248): Recover JDK 11 builds
-  # Build: build Spark and run the tests for specified modules.
-  build:
-    name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})"
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        java:
-          - 1.8
-        hadoop:
-          - hadoop3.2
-        hive:
-          - hive2.3
-        # TODO(SPARK-32246): We don't test 'streaming-kinesis-asl' for now.
-        # Kinesis tests depends on external Amazon kinesis service.
-        # Note that the modules below are from sparktestsupport/modules.py.
-        modules:
-          - |-
-            core, unsafe, kvstore, avro,
-            network-common, network-shuffle, repl, launcher,
-            examples, sketch, graphx
-          - |-
-            catalyst, hive-thriftserver
-          - |-
-            streaming, sql-kafka-0-10, streaming-kafka-0-10,
-            mllib-local, mllib,
-            yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl
-          - |-
-            pyspark-sql, pyspark-mllib, pyspark-resource
-          - |-
-            pyspark-core, pyspark-streaming, pyspark-ml
-          - |-
-            sparkr
-        # Here, we split Hive and SQL tests into some of slow ones and the rest of them.
-        included-tags: [""]
-        excluded-tags: [""]
-        comment: [""]
-        include:
-          # Hive tests
-          - modules: hive
-            java: 1.8
-            hadoop: hadoop3.2
-            hive: hive2.3
-            included-tags: org.apache.spark.tags.SlowHiveTest
-            comment: "- slow tests"
-          - modules: hive
-            java: 1.8
-            hadoop: hadoop3.2
-            hive: hive2.3
-            excluded-tags: org.apache.spark.tags.SlowHiveTest
-            comment: "- other tests"
-          # SQL tests
-          - modules: sql
-            java: 1.8
-            hadoop: hadoop3.2
-            hive: hive2.3
-            included-tags: org.apache.spark.tags.ExtendedSQLTest
-            comment: "- slow tests"
-          - modules: sql
-            java: 1.8
-            hadoop: hadoop3.2
-            hive: hive2.3
-            excluded-tags: org.apache.spark.tags.ExtendedSQLTest
-            comment: "- other tests"
-    env:
-      MODULES_TO_TEST: ${{ matrix.modules }}
-      EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
-      INCLUDED_TAGS: ${{ matrix.included-tags }}
-      HADOOP_PROFILE: ${{ matrix.hadoop }}
-      HIVE_PROFILE: ${{ matrix.hive }}
-      # GitHub Actions' default miniconda to use in pip packaging test.
-      CONDA_PREFIX: /usr/share/miniconda
-      GITHUB_PREV_SHA: ${{ github.event.before }}
-    steps:
-      - name: Checkout Spark repository
-        uses: actions/checkout@v2
-        # In order to fetch changed files
-        with:
-          fetch-depth: 0
-      # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
-      - name: Cache Scala, SBT, Maven and Zinc
-        uses: actions/cache@v1
-        with:
-          path: build
-          key: build-${{ hashFiles('**/pom.xml') }}
-          restore-keys: |
-            build-
-      - name: Cache Maven local repository
-        uses: actions/cache@v2
-        with:
-          path: ~/.m2/repository
-          key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-${{ hashFiles('**/pom.xml') }}
-          restore-keys: |
-            ${{ matrix.java }}-${{ matrix.hadoop }}-maven-
-      - name: Cache Ivy local repository
-        uses: actions/cache@v2
-        with:
-          path: ~/.ivy2/cache
-          key: ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-${{ hashFiles('**/pom.xml') }}-${{ hashFiles('**/plugins.sbt') }}
-          restore-keys: |
-            ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-
-      - name: Install JDK ${{ matrix.java }}
-        uses: actions/setup-java@v1
-        with:
-          java-version: ${{ matrix.java }}
-      # PySpark
-      - name: Install PyPy3
-        # Note that order of Python installations here matters because default python3 is
-        # overridden by pypy3.
-        uses: actions/setup-python@v2
-        if: contains(matrix.modules, 'pyspark')
-        with:
-          python-version: pypy3
-          architecture: x64
-      - name: Install Python 3.6
-        uses: actions/setup-python@v2
-        if: contains(matrix.modules, 'pyspark')
-        with:
-          python-version: 3.6
-          architecture: x64
-      - name: Install Python 3.8
-        uses: actions/setup-python@v2
-        # We should install one Python that is higher then 3+ for SQL and Yarn because:
-        # - SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
-        # - Yarn has a Python specific test too, for example, YarnClusterSuite.
-        if: contains(matrix.modules, 'yarn') || contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
-        with:
-          python-version: 3.8
-          architecture: x64
-      - name: Install Python packages (Python 3.6 and PyPy3)
-        if: contains(matrix.modules, 'pyspark')
-        # PyArrow is not supported in PyPy yet, see ARROW-2651.
-        # TODO(SPARK-32247): scipy installation with PyPy fails for an unknown reason.
-        run: |
-          python3.6 -m pip install numpy pyarrow pandas scipy
-          python3.6 -m pip list
-          pypy3 -m pip install numpy pandas
-          pypy3 -m pip list
-      - name: Install Python packages (Python 3.8)
-        if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
-        run: |
-          python3.8 -m pip install numpy pyarrow pandas scipy
-          python3.8 -m pip list
-      # SparkR
-      - name: Install R 3.6
-        uses: r-lib/actions/setup-r@v1
-        if: contains(matrix.modules, 'sparkr')
-        with:
-          r-version: 3.6
-      - name: Install R packages
-        if: contains(matrix.modules, 'sparkr')
-        run: |
-          sudo apt-get install -y libcurl4-openssl-dev
-          sudo Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', 'roxygen2'), repos='https://cloud.r-project.org/')"
-          # Show installed packages in R.
-          sudo Rscript -e 'pkg_list <- as.data.frame(installed.packages()[, c(1,3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]'
-      # Run the tests.
-      - name: "Run tests: ${{ matrix.modules }}"
-        run: |
-          # Hive tests become flaky when running in parallel as it's too intensive.
-          if [[ "$MODULES_TO_TEST" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
-          mkdir -p ~/.m2
-          ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
-          rm -rf ~/.m2/repository/org/apache/spark
-
-  # Static analysis, and documentation build
-  lint:
-    name: Linters, licenses, dependencies and documentation generation
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout Spark repository
-        uses: actions/checkout@v2
-      - name: Cache Maven local repository
-        uses: actions/cache@v2
-        with:
-          path: ~/.m2/repository
-          key: docs-maven-repo-${{ hashFiles('**/pom.xml') }}
-          restore-keys: |
-            docs-maven-
-      - name: Install JDK 1.8
-        uses: actions/setup-java@v1
-        with:
-          java-version: 1.8
-      - name: Install Python 3.6
-        uses: actions/setup-python@v2
-        with:
-          python-version: 3.6
-          architecture: x64
-      - name: Install Python linter dependencies
-        run: |
-          pip3 install flake8 sphinx numpy
-      - name: Install R 3.6
-        uses: r-lib/actions/setup-r@v1
-        with:
-          r-version: 3.6
-      - name: Install R linter dependencies and SparkR
-        run: |
-          sudo apt-get install -y libcurl4-openssl-dev
-          sudo Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')"
-          sudo Rscript -e "devtools::install_github('jimhester/lintr@v2.0.0')"
-          ./R/install-dev.sh
-      - name: Install Ruby 2.7 for documentation generation
-        uses: actions/setup-ruby@v1
-        with:
-          ruby-version: 2.7
-      - name: Install dependencies for documentation generation
-        run: |
-          sudo apt-get install -y libcurl4-openssl-dev pandoc
-          pip install sphinx mkdocs numpy
-          gem install jekyll jekyll-redirect-from rouge
-          sudo Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
-      - name: Scala linter
-        run: ./dev/lint-scala
-      - name: Java linter
-        run: ./dev/lint-java
-      - name: Python linter
-        run: ./dev/lint-python
-      - name: R linter
-        run: ./dev/lint-r
-      - name: License test
-        run: ./dev/check-license
-      - name: Dependencies test
-        run: ./dev/test-dependencies.sh
-      - name: Run documentation build
-        run: |
-          cd docs
-          jekyll build
diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index c5c08bd7a063..2047f0d75ca1 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -23,7 +23,7 @@ Suggests:
     testthat,
     e1071,
     survival,
-    arrow (>= 0.15.1)
+    arrow (>= 1.0.0)
 Collate:
     'schema.R'
     'generics.R'
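This DESCRIPTION change, together with the docs/sparkr.md update further down, raises SparkR's minimum supported Arrow R package version from 0.15.1 to 1.0.0. A quick pre-flight check against a local R library might look like the one-liner below; it is an illustrative sketch, not something this patch adds or runs.

```cmd
rem Hypothetical check: fail fast if the locally installed arrow R package is older than 1.0.0.
Rscript -e "stopifnot(utils::packageVersion('arrow') >= '1.0.0')"
```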
diff --git a/appveyor.yml b/appveyor.yml
index 1fd91daae901..cb843c19748d 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -57,6 +57,10 @@ environment:
   # "(converted from warning) unable to identify current timezone 'C':" for an unknown reason.
   # This environment variable works around to test SparkR against a higher version.
   R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
+  # AppVeyor does not have python3 yet which is used by default.
+  PYSPARK_PYTHON: python
+  # TODO(SPARK-32453): Remove SPARK_SCALA_VERSION environment and let load-spark-env scripts detect it.
+  SPARK_SCALA_VERSION: 2.12

 test_script:
   - cmd: .\bin\spark-submit2.cmd --driver-java-options "-Dlog4j.configuration=file:///%CD:\=/%/R/log4j.properties" --conf spark.hadoop.fs.defaultFS="file:///" R\pkg\tests\run-all.R
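The two environment variables added to appveyor.yml can be mirrored in a local Windows shell to reproduce the CI setup. The snippet below is only an illustrative smoke test and assumes a locally built Spark checkout; it is not part of the patch.

```cmd
rem Hypothetical local mirror of the AppVeyor environment block above.
set PYSPARK_PYTHON=python
set SPARK_SCALA_VERSION=2.12
rem With SPARK_SCALA_VERSION pre-set, load-spark-env.cmd skips its assembly-directory probing.
.\bin\spark-submit2.cmd --version
```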
diff --git a/bin/load-spark-env.cmd b/bin/load-spark-env.cmd
index ebbde66153e1..6cb8bf7fc7a2 100644
--- a/bin/load-spark-env.cmd
+++ b/bin/load-spark-env.cmd
@@ -22,7 +22,7 @@ rem spark-env.cmd is loaded from SPARK_CONF_DIR if set, or within the current di
 rem conf\ subdirectory.

 set SPARK_ENV_CMD=spark-env.cmd
-if [%SPARK_ENV_LOADED%] == [] (
+if not defined SPARK_ENV_LOADED (
   set SPARK_ENV_LOADED=1

   if [%SPARK_CONF_DIR%] == [] (
@@ -37,18 +37,19 @@ if [%SPARK_ENV_LOADED%] == [] (

 rem Setting SPARK_SCALA_VERSION if not already set.

-if [%SPARK_SCALA_VERSION%] == [] (
-  set SCALA_VERSION_1=2.13
-  set SCALA_VERSION_2=2.12
+set SCALA_VERSION_1=2.13
+set SCALA_VERSION_2=2.12

-  set ASSEMBLY_DIR1=%SPARK_HOME%\assembly\target\scala-%SCALA_VERSION_1%
-  set ASSEMBLY_DIR2=%SPARK_HOME%\assembly\target\scala-%SCALA_VERSION_2%
-  set ENV_VARIABLE_DOC=https://spark.apache.org/docs/latest/configuration.html#environment-variables
+set ASSEMBLY_DIR1=%SPARK_HOME%\assembly\target\scala-%SCALA_VERSION_1%
+set ASSEMBLY_DIR2=%SPARK_HOME%\assembly\target\scala-%SCALA_VERSION_2%
+set ENV_VARIABLE_DOC=https://spark.apache.org/docs/latest/configuration.html#environment-variables
+
+if not defined SPARK_SCALA_VERSION (
   if exist %ASSEMBLY_DIR2% if exist %ASSEMBLY_DIR1% (
-    echo "Presence of build for multiple Scala versions detected (%ASSEMBLY_DIR1% and %ASSEMBLY_DIR2%)."
-    echo "Remove one of them or, set SPARK_SCALA_VERSION=%SCALA_VERSION_1% in %SPARK_ENV_CMD%."
-    echo "Visit %ENV_VARIABLE_DOC% for more details about setting environment variables in spark-env.cmd."
-    echo "Either clean one of them or, set SPARK_SCALA_VERSION in spark-env.cmd."
+    echo Presence of build for multiple Scala versions detected ^(%ASSEMBLY_DIR1% and %ASSEMBLY_DIR2%^).
+    echo Remove one of them or, set SPARK_SCALA_VERSION=%SCALA_VERSION_1% in spark-env.cmd.
+    echo Visit %ENV_VARIABLE_DOC% for more details about setting environment variables in spark-env.cmd.
+    echo Either clean one of them or, set SPARK_SCALA_VERSION in spark-env.cmd.
     exit 1
   )
   if exist %ASSEMBLY_DIR1% (
diff --git a/docs/sparkr.md b/docs/sparkr.md
index d86fa86c8985..fa1bb1b85181 100644
--- a/docs/sparkr.md
+++ b/docs/sparkr.md
@@ -674,7 +674,7 @@ Rscript -e 'install.packages("arrow", repos="https://cloud.r-project.org/")'

 Please refer [the official documentation of Apache Arrow](https://arrow.apache.org/docs/r/) for more detials.
 Note that you must ensure that Arrow R package is installed and available on all cluster nodes.
-The current supported minimum version is 0.15.1; however, this might change between the minor releases since Arrow optimization in SparkR is experimental.
+The current supported minimum version is 1.0.0; however, this might change between the minor releases since Arrow optimization in SparkR is experimental.

 ## Enabling for Conversion to/from R DataFrame, `dapply` and `gapply`
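For context on the bin/load-spark-env.cmd rewrite above: `if not defined VAR` tests for an unset variable without expanding its value, so it keeps working when the value contains spaces or other characters that make the old `if [%VAR%] == []` comparison mis-parse; the quotes are dropped from the `echo` lines because cmd prints them verbatim; and the literal parentheses are caret-escaped because an unescaped `)` inside a parenthesized block would close the block early. A minimal standalone sketch of these idioms follows; the variable name and value are made up for illustration.

```cmd
@echo off
rem Standalone sketch of the batch idioms used in the load-spark-env.cmd change above.
set "ASSEMBLY_DIR=C:\spark build\assembly\target\scala-2.12"

rem The old-style bracket comparison expands the value in place and breaks on the space in
rem the path; if not defined never expands the value at all.
if not defined ASSEMBLY_DIR (
  echo ASSEMBLY_DIR is not set.
) else (
  rem Inside a parenthesized block a literal parenthesis must be caret-escaped, and the
  rem message is left unquoted because echo would print surrounding quotes verbatim.
  echo Detected build directory ^(%ASSEMBLY_DIR%^).
)
```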