Commit f6ead88
[SPARK-32245][INFRA] Run Spark tests in Github Actions
This PR aims to run the Spark tests in GitHub Actions. The main idea, briefly:

- Reuse `dev/run-tests.py` with the SBT build.
- Reuse the modules in `dev/sparktestsupport/modules.py` to test each module.
- Pass the modules to test into `dev/run-tests.py` directly via the `TEST_ONLY_MODULES` environment variable, for example `pyspark-sql,core,sql,hive`.
- `dev/run-tests.py` does _not_ take dependent modules into account; it tests solely the specified modules.

Another thing to note is the `SlowHiveTest` annotation. Running the tests in the Hive modules takes too long, so the slow tests are extracted and run as a separate job. The split was derived from the actual elapsed times in Jenkins:

![Screen Shot 2020-07-09 at 7 48 13 PM](https://user-images.githubusercontent.com/6477701/87050238-f6098e80-c238-11ea-9c4a-ab505af61381.png)

So the Hive tests are separated into two jobs: one runs the slow test cases and the other runs the remaining test cases.

_Note that_ the current GitHub Actions build virtually copies what the default PR builder on Jenkins does (without other profiles such as JDK 11, Hadoop 2, etc.). The only exception is Kinesis: https://github.com/apache/spark/pull/29057/files#diff-04eb107ee163a50b61281ca08f4e4c7bR23

From last week onwards, the Jenkins machines have become very unstable for several reasons:

- Apparently, the machines became extremely slow; almost no tests can pass.
- One machine (worker 4) started to have a corrupt `.m2`, which fails the build.
- The documentation build fails from time to time for an unknown reason, specifically on the Jenkins machines. It is disabled for now at apache#29017.
- Almost all PRs are currently blocked by this instability.

The advantages of using GitHub Actions:

- It avoids depending on the few people who can access the cluster.
- It reduces the elapsed build time: we can split the tests (e.g., SQL, ML, CORE) and run them in parallel, so the total build time is significantly reduced.
- It allows controlling the environment more flexibly.
- Other contributors can test and propose fixes to the GitHub Actions configuration, so the build-management cost is distributed.

Note that:

- The current build in Jenkins takes _more than 7 hours_. With GitHub Actions it takes _less than 2 hours_.
- We can now control the environments easily, especially for Python.
- The tests and build look more stable than Jenkins's.

This is a dev-only change with no user-facing impact. Tested at #4.

Closes apache#29057 from HyukjinKwon/migrate-to-github-actions.

Authored-by: HyukjinKwon <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
1 parent 34544d6 commit f6ead88
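As a rough illustration of the mechanism described above, each matrix cell of the new workflow exports its values as environment variables and then calls `dev/run-tests`. A local approximation of one cell might look like the sketch below (variable names are taken from the workflow added in this commit; whether `dev/run-tests` behaves identically outside GitHub Actions is an assumption, not something this diff verifies):

```bash
# Sketch: approximate a single matrix cell of the new workflow locally.
export HADOOP_PROFILE=hadoop2.6                      # Hadoop profile used by the matrix
export TEST_ONLY_MODULES=pyspark-sql,pyspark-mllib   # test only these modules; dependents are NOT pulled in
./dev/run-tests --parallelism 2                      # same invocation as the "Run tests" step below
```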

File tree: 19 files changed, +434 −159 lines


.github/workflows/branch-2.4.yml

Lines changed: 0 additions & 104 deletions
This file was deleted.

.github/workflows/master.yml

Lines changed: 221 additions & 0 deletions
```yaml
name: master

on:
  pull_request:
    branches:
    - branch-2.4

jobs:
  # TODO(SPARK-32248): Recover JDK 11 builds
  # Build: build Spark and run the tests for specified modules.
  build:
    name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }})"
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        java:
          - 1.8
        hadoop:
          - hadoop2.6
        # TODO(SPARK-32246): We don't test 'streaming-kinesis-asl' for now.
        # Kinesis tests depends on external Amazon kinesis service.
        # Note that the modules below are from sparktestsupport/modules.py.
        modules:
          - |-
            core, unsafe, kvstore, avro,
            network_common, network_shuffle, repl, launcher
            examples, sketch, graphx
          - |-
            catalyst, hive-thriftserver
          - |-
            streaming, sql-kafka-0-10, streaming-kafka-0-10,
            mllib-local, mllib,
            yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl,
            streaming-flume, streaming-flume-sink, streaming-kafka-0-8
          - |-
            pyspark-sql, pyspark-mllib
          - |-
            pyspark-core, pyspark-streaming, pyspark-ml
          - |-
            sparkr
        # Here, we split Hive and SQL tests into some of slow ones and the rest of them.
        included-tags: [""]
        excluded-tags: [""]
        comment: [""]
        include:
          # Hive tests
          - modules: hive
            java: 1.8
            hadoop: hadoop2.6
            included-tags: org.apache.spark.tags.SlowHiveTest
            comment: "- slow tests"
          - modules: hive
            java: 1.8
            hadoop: hadoop2.6
            excluded-tags: org.apache.spark.tags.SlowHiveTest
            comment: "- other tests"
          # SQL tests
          - modules: sql
            java: 1.8
            hadoop: hadoop2.6
            included-tags: org.apache.spark.tags.ExtendedSQLTest
            comment: "- slow tests"
          - modules: sql
            java: 1.8
            hadoop: hadoop2.6
            excluded-tags: org.apache.spark.tags.ExtendedSQLTest
            comment: "- other tests"
    env:
      TEST_ONLY_MODULES: ${{ matrix.modules }}
      TEST_ONLY_EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
      TEST_ONLY_INCLUDED_TAGS: ${{ matrix.included-tags }}
      HADOOP_PROFILE: ${{ matrix.hadoop }}
      # GitHub Actions' default miniconda to use in pip packaging test.
      CONDA_PREFIX: /usr/share/miniconda
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
    # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
    - name: Cache Scala, SBT, Maven and Zinc
      uses: actions/cache@v1
      with:
        path: build
        key: build-${{ hashFiles('**/pom.xml') }}
        restore-keys: |
          build-
    - name: Cache Maven local repository
      uses: actions/cache@v2
      with:
        path: ~/.m2/repository
        key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-${{ hashFiles('**/pom.xml') }}
        restore-keys: |
          ${{ matrix.java }}-${{ matrix.hadoop }}-maven-
    - name: Cache Ivy local repository
      uses: actions/cache@v2
      with:
        path: ~/.ivy2/cache
        key: ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-${{ hashFiles('**/pom.xml') }}-${{ hashFiles('**/plugins.sbt') }}
        restore-keys: |
          ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-
    - name: Install JDK ${{ matrix.java }}
      uses: actions/setup-java@v1
      with:
        java-version: ${{ matrix.java }}
    # PySpark
    - name: Install PyPy3
      # SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
      # Note that order of Python installations here matters because default python3 is
      # overridden by pypy3.
      uses: actions/setup-python@v2
      if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
      with:
        python-version: pypy3
        architecture: x64
    - name: Install Python 3.6
      uses: actions/setup-python@v2
      if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
      with:
        python-version: 3.6
        architecture: x64
    - name: Install Python 2.7
      uses: actions/setup-python@v2
      if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
      with:
        python-version: 2.7
        architecture: x64
    - name: Install Python packages
      if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
      # PyArrow is not supported in PyPy yet, see ARROW-2651.
      # TODO(SPARK-32247): scipy installation with PyPy fails for an unknown reason.
      run: |
        python3 -m pip install numpy pyarrow pandas scipy
        python3 -m pip list
        python2 -m pip install numpy pyarrow pandas scipy
        python2 -m pip list
        pypy3 -m pip install numpy pandas
        pypy3 -m pip list
    # SparkR
    - name: Install R 3.6
      uses: r-lib/actions/setup-r@v1
      if: contains(matrix.modules, 'sparkr')
      with:
        r-version: 3.6
    - name: Install R packages
      if: contains(matrix.modules, 'sparkr')
      run: |
        sudo apt-get install -y libcurl4-openssl-dev
        sudo Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', 'roxygen2'), repos='https://cloud.r-project.org/')"
        # Show installed packages in R.
        sudo Rscript -e 'pkg_list <- as.data.frame(installed.packages()[, c(1,3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]'
    # Run the tests.
    - name: "Run tests: ${{ matrix.modules }}"
      run: |
        # Hive tests become flaky when running in parallel as it's too intensive.
        if [[ "$TEST_ONLY_MODULES" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi
        mkdir -p ~/.m2
        ./dev/run-tests --parallelism 2
        rm -rf ~/.m2/repository/org/apache/spark

  # Static analysis, and documentation build
  lint:
    name: Linters, licenses, dependencies and documentation generation
    runs-on: ubuntu-latest
    steps:
    - name: Checkout Spark repository
      uses: actions/checkout@v2
    - name: Cache Maven local repository
      uses: actions/cache@v2
      with:
        path: ~/.m2/repository
        key: docs-maven-repo-${{ hashFiles('**/pom.xml') }}
        restore-keys: |
          docs-maven-
    - name: Install JDK 1.8
      uses: actions/setup-java@v1
      with:
        java-version: 1.8
    - name: Install Python 3.6
      uses: actions/setup-python@v2
      with:
        python-version: 3.6
        architecture: x64
    - name: Install Python linter dependencies
      run: |
        pip3 install flake8 sphinx numpy
    - name: Install R 3.6
      uses: r-lib/actions/setup-r@v1
      with:
        r-version: 3.6
    - name: Install R linter dependencies and SparkR
      run: |
        sudo apt-get install -y libcurl4-openssl-dev
        sudo Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')"
        sudo Rscript -e "devtools::install_github('jimhester/[email protected]')"
        ./R/install-dev.sh
    - name: Install Ruby 2.7 for documentation generation
      uses: actions/setup-ruby@v1
      with:
        ruby-version: 2.7
    - name: Install dependencies for documentation generation
      run: |
        sudo apt-get install -y libcurl4-openssl-dev pandoc
        pip install sphinx mkdocs numpy
        gem install jekyll jekyll-redirect-from rouge
        sudo Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
    - name: Scala linter
      run: ./dev/lint-scala
    - name: Java linter
      run: ./dev/lint-java
    - name: Python linter
      run: ./dev/lint-python
    - name: R linter
      run: ./dev/lint-r
    - name: License test
      run: ./dev/check-license
    - name: Dependencies test
      run: ./dev/test-dependencies.sh
    - name: Run documentation build
      run: |
        cd docs
        jekyll build
```
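For reference, the checks in the `lint` job are plain scripts in the repository, so they can be run locally in the same order, assuming the Python, R, and Ruby dependencies listed in the job are already installed:

```bash
# The same checks the `lint` job runs, invoked from the repository root.
./dev/lint-scala
./dev/lint-java
./dev/lint-python
./dev/lint-r
./dev/check-license
./dev/test-dependencies.sh
(cd docs && jekyll build)   # documentation build
```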
SlowHiveTest.java (new file)

Lines changed: 30 additions & 0 deletions

```java
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.tags;

import org.scalatest.TagAnnotation;

import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;

@TagAnnotation
@Retention(RetentionPolicy.RUNTIME)
@Target({ElementType.METHOD, ElementType.TYPE})
public @interface SlowHiveTest { }
```
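This annotation is what the two `hive` cells in the workflow matrix key on. A hedged sketch of what those two cells effectively run is below; the translation of the tag variables into ScalaTest include/exclude filters happens inside `dev/run-tests.py` and is assumed here rather than copied from it:

```bash
# "hive - slow tests" cell: only tests annotated with @SlowHiveTest.
TEST_ONLY_MODULES=hive \
TEST_ONLY_INCLUDED_TAGS=org.apache.spark.tags.SlowHiveTest \
SERIAL_SBT_TESTS=1 \
./dev/run-tests --parallelism 2

# "hive - other tests" cell: everything in the hive module except @SlowHiveTest.
TEST_ONLY_MODULES=hive \
TEST_ONLY_EXCLUDED_TAGS=org.apache.spark.tags.SlowHiveTest \
SERIAL_SBT_TESTS=1 \
./dev/run-tests --parallelism 2
```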

dev/run-pip-tests

Lines changed: 8 additions & 3 deletions
```diff
@@ -68,7 +68,7 @@ fi
 PYSPARK_VERSION=$(python3 -c "exec(open('python/pyspark/version.py').read());print(__version__)")
 PYSPARK_DIST="$FWDIR/python/dist/pyspark-$PYSPARK_VERSION.tar.gz"
 # The pip install options we use for all the pip commands
-PIP_OPTIONS="--upgrade --no-cache-dir --force-reinstall "
+PIP_OPTIONS="--user --upgrade --no-cache-dir --force-reinstall "
 # Test both regular user and edit/dev install modes.
 PIP_COMMANDS=("pip install $PIP_OPTIONS $PYSPARK_DIST"
               "pip install $PIP_OPTIONS -e python/")
@@ -81,8 +81,12 @@ for python in "${PYTHON_EXECS[@]}"; do
     VIRTUALENV_PATH="$VIRTUALENV_BASE"/$python
     rm -rf "$VIRTUALENV_PATH"
     if [ -n "$USE_CONDA" ]; then
+      if [ -f "$CONDA_PREFIX/etc/profile.d/conda.sh" ]; then
+        # See also https://github.com/conda/conda/issues/7980
+        source "$CONDA_PREFIX/etc/profile.d/conda.sh"
+      fi
       conda create -y -p "$VIRTUALENV_PATH" python=$python numpy pandas pip setuptools
-      source activate "$VIRTUALENV_PATH"
+      conda activate "$VIRTUALENV_PATH" || (echo "Falling back to 'source activate'" && source activate "$VIRTUALENV_PATH")
     else
       mkdir -p "$VIRTUALENV_PATH"
       virtualenv --python=$python "$VIRTUALENV_PATH"
@@ -115,6 +119,7 @@ for python in "${PYTHON_EXECS[@]}"; do
     cd /

     echo "Run basic sanity check on pip installed version with spark-submit"
+    export PATH="$(python3 -m site --user-base)/bin:$PATH"
     spark-submit "$FWDIR"/dev/pip-sanity-check.py
     echo "Run basic sanity check with import based"
     python "$FWDIR"/dev/pip-sanity-check.py
@@ -125,7 +130,7 @@ for python in "${PYTHON_EXECS[@]}"; do

     # conda / virtualenv environments need to be deactivated differently
     if [ -n "$USE_CONDA" ]; then
-      source deactivate
+      conda deactivate || (echo "Falling back to 'source deactivate'" && source deactivate)
     else
       deactivate
     fi
```
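One detail worth spelling out: with `pip install --user`, console scripts such as `spark-submit` are placed under the per-user base directory, which is not on `PATH` by default, hence the `export PATH=...` line added before the spark-submit sanity check. An illustrative way to see where those scripts land (not part of the script itself):

```bash
python3 -m site --user-base                              # per-user install prefix, e.g. ~/.local
ls "$(python3 -m site --user-base)/bin"                  # console scripts from `pip install --user` end up here
export PATH="$(python3 -m site --user-base)/bin:$PATH"   # same export the diff adds before spark-submit
```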
