diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 7bb5481a561b..2fc556e6a43b 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -29,7 +29,7 @@ jobs: modules: - |- core, unsafe, kvstore, avro, - network_common, network_shuffle, repl, launcher + network-common, network-shuffle, repl, launcher, examples, sketch, graphx - |- catalyst, hive-thriftserver @@ -75,156 +75,32 @@ jobs: excluded-tags: org.apache.spark.tags.ExtendedSQLTest comment: "- other tests" env: - TEST_ONLY_MODULES: ${{ matrix.modules }} - TEST_ONLY_EXCLUDED_TAGS: ${{ matrix.excluded-tags }} - TEST_ONLY_INCLUDED_TAGS: ${{ matrix.included-tags }} + MODULES_TO_TEST: ${{ matrix.modules }} + EXCLUDED_TAGS: ${{ matrix.excluded-tags }} + INCLUDED_TAGS: ${{ matrix.included-tags }} HADOOP_PROFILE: ${{ matrix.hadoop }} HIVE_PROFILE: ${{ matrix.hive }} # GitHub Actions' default miniconda to use in pip packaging test. CONDA_PREFIX: /usr/share/miniconda + GITHUB_PREV_SHA: ${{ github.event.before }} steps: - name: Checkout Spark repository uses: actions/checkout@v2 - # Cache local repositories. Note that GitHub Actions cache has a 2G limit. - - name: Cache Scala, SBT, Maven and Zinc - uses: actions/cache@v1 + # In order to fetch changed files with: - path: build - key: build-${{ hashFiles('**/pom.xml') }} - restore-keys: | - build- - - name: Cache Maven local repository - uses: actions/cache@v2 - with: - path: ~/.m2/repository - key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-${{ hashFiles('**/pom.xml') }} - restore-keys: | - ${{ matrix.java }}-${{ matrix.hadoop }}-maven- - - name: Cache Ivy local repository - uses: actions/cache@v2 - with: - path: ~/.ivy2/cache - key: ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-${{ hashFiles('**/pom.xml') }}-${{ hashFiles('**/plugins.sbt') }} - restore-keys: | - ${{ matrix.java }}-${{ matrix.hadoop }}-ivy- - - name: Install JDK ${{ matrix.java }} - uses: actions/setup-java@v1 - with: - java-version: ${{ matrix.java }} + fetch-depth: 0 # PySpark - - name: Install PyPy3 - # SQL component also has Python related tests, for example, IntegratedUDFTestUtils. - # Note that order of Python installations here matters because default python3 is - # overridden by pypy3. - uses: actions/setup-python@v2 - if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) - with: - python-version: pypy3 - architecture: x64 - - name: Install Python 2.7 - uses: actions/setup-python@v2 - if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) - with: - python-version: 2.7 - architecture: x64 - name: Install Python 3.6 uses: actions/setup-python@v2 if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) with: python-version: 3.6 architecture: x64 - - name: Install Python packages - if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) - # PyArrow is not supported in PyPy yet, see ARROW-2651. - # TODO(SPARK-32247): scipy installation with PyPy fails for an unknown reason. - run: | - python3 -m pip install numpy pyarrow pandas scipy - python3 -m pip list - python2 -m pip install numpy pyarrow pandas scipy - python2 -m pip list - pypy3 -m pip install numpy pandas - pypy3 -m pip list - # SparkR - - name: Install R 3.6 - uses: r-lib/actions/setup-r@v1 - if: contains(matrix.modules, 'sparkr') - with: - r-version: 3.6 - - name: Install R packages - if: contains(matrix.modules, 'sparkr') - run: | - sudo apt-get install -y libcurl4-openssl-dev - sudo Rscript -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'devtools', 'e1071', 'survival', 'arrow', 'roxygen2'), repos='https://cloud.r-project.org/')" - # Show installed packages in R. - sudo Rscript -e 'pkg_list <- as.data.frame(installed.packages()[, c(1,3:4)]); pkg_list[is.na(pkg_list$Priority), 1:2, drop = FALSE]' - # Run the tests. - name: "Run tests: ${{ matrix.modules }}" run: | # Hive tests become flaky when running in parallel as it's too intensive. - if [[ "$TEST_ONLY_MODULES" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi + git diff + if [[ "$MODULES_TO_TEST" == "hive" ]]; then export SERIAL_SBT_TESTS=1; fi mkdir -p ~/.m2 - ./dev/run-tests --parallelism 2 + ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS" rm -rf ~/.m2/repository/org/apache/spark - - # Static analysis, and documentation build - lint: - name: Linters, licenses, dependencies and documentation generation - runs-on: ubuntu-latest - steps: - - name: Checkout Spark repository - uses: actions/checkout@v2 - - name: Cache Maven local repository - uses: actions/cache@v2 - with: - path: ~/.m2/repository - key: docs-maven-repo-${{ hashFiles('**/pom.xml') }} - restore-keys: | - docs-maven- - - name: Install JDK 1.8 - uses: actions/setup-java@v1 - with: - java-version: 1.8 - - name: Install Python 3.6 - uses: actions/setup-python@v2 - with: - python-version: 3.6 - architecture: x64 - - name: Install Python linter dependencies - run: | - pip3 install flake8 sphinx numpy - - name: Install R 3.6 - uses: r-lib/actions/setup-r@v1 - with: - r-version: 3.6 - - name: Install R linter dependencies and SparkR - run: | - sudo apt-get install -y libcurl4-openssl-dev - sudo Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')" - sudo Rscript -e "devtools::install_github('jimhester/lintr@v2.0.0')" - ./R/install-dev.sh - - name: Install Ruby 2.7 for documentation generation - uses: actions/setup-ruby@v1 - with: - ruby-version: 2.7 - - name: Install dependencies for documentation generation - run: | - sudo apt-get install -y libcurl4-openssl-dev pandoc - pip install sphinx mkdocs numpy - gem install jekyll jekyll-redirect-from rouge - sudo Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')" - - name: Scala linter - run: ./dev/lint-scala - - name: Java linter - run: ./dev/lint-java - - name: Python linter - run: ./dev/lint-python - - name: R linter - run: ./dev/lint-r - - name: License test - run: ./dev/check-license - - name: Dependencies test - run: ./dev/test-dependencies.sh - - name: Run documentation build - run: | - cd docs - jekyll build diff --git a/R/README.md b/R/README.md index 31174c73526f..bd59b3daad1d 100644 --- a/R/README.md +++ b/R/README.md @@ -1,4 +1,4 @@ -# R on Spark +# R on Spark. SparkR is an R package that provides a light-weight frontend to use Spark from R. diff --git a/dev/run-tests.py b/dev/run-tests.py index 03cc3230a65f..c2ed518a4762 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -101,12 +101,14 @@ def setup_test_environ(environ): os.environ[k] = v -def determine_modules_to_test(changed_modules): +def determine_modules_to_test(changed_modules, deduplicated=True): """ Given a set of modules that have changed, compute the transitive closure of those modules' dependent modules in order to determine the set of modules that should be tested. Returns a topologically-sorted list of modules (ties are broken by sorting on module names). + If ``deduplicated`` is disabled, the modules are returned without tacking the deduplication + by dependencies into account. >>> [x.name for x in determine_modules_to_test([modules.root])] ['root'] @@ -122,11 +124,29 @@ def determine_modules_to_test(changed_modules): ... # doctest: +NORMALIZE_WHITESPACE ['sql', 'avro', 'hive', 'mllib', 'sql-kafka-0-10', 'examples', 'hive-thriftserver', 'pyspark-sql', 'repl', 'sparkr', 'pyspark-mllib', 'pyspark-ml'] + >>> sorted([x.name for x in determine_modules_to_test( + ... [modules.sparkr, modules.pyspark_sql], deduplicated=False)]) + ... # doctest: +NORMALIZE_WHITESPACE + ['avro', 'examples', 'hive', 'hive-thriftserver', 'mllib', 'pyspark-ml', + 'pyspark-mllib', 'pyspark-sql', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10'] + >>> sorted([x.name for x in determine_modules_to_test( + ... [modules.sql, modules.core], deduplicated=False)]) + ... # doctest: +NORMALIZE_WHITESPACE + ['avro', 'catalyst', 'core', 'examples', 'graphx', 'hive', 'hive-thriftserver', + 'mllib', 'mllib-local', 'pyspark-core', 'pyspark-ml', 'pyspark-mllib', + 'pyspark-resource', 'pyspark-sql', 'pyspark-streaming', 'repl', 'root', + 'sparkr', 'sql', 'sql-kafka-0-10', 'streaming', 'streaming-kafka-0-10', + 'streaming-kinesis-asl'] """ modules_to_test = set() for module in changed_modules: - modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules)) + modules_to_test = modules_to_test.union( + determine_modules_to_test(module.dependent_modules, deduplicated)) modules_to_test = modules_to_test.union(set(changed_modules)) + + if not deduplicated: + return modules_to_test + # If we need to run all of the tests, then we should short-circuit and return 'root' if modules.root in modules_to_test: return [modules.root] @@ -539,6 +559,24 @@ def parse_opts(): "-p", "--parallelism", type=int, default=8, help="The number of suites to test in parallel (default %(default)d)" ) + parser.add_argument( + "-m", "--modules", type=str, + default=None, + help="A comma-separated list of modules to test " + "(default: %s)" % ",".join(sorted([m.name for m in modules.all_modules])) + ) + parser.add_argument( + "-e", "--excluded-tags", type=str, + default=None, + help="A comma-separated list of tags to exclude in the tests, " + "e.g., org.apache.spark.tags.ExtendedHiveTest " + ) + parser.add_argument( + "-i", "--included-tags", type=str, + default=None, + help="A comma-separated list of tags to include in the tests, " + "e.g., org.apache.spark.tags.ExtendedHiveTest " + ) args, unknown = parser.parse_known_args() if unknown: @@ -589,43 +627,79 @@ def main(): # /home/jenkins/anaconda2/envs/py36/bin os.environ["PATH"] = "/home/anaconda/envs/py36/bin:" + os.environ.get("PATH") else: - # else we're running locally and can use local settings + # else we're running locally or Github Actions. build_tool = "sbt" hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.7") hive_version = os.environ.get("HIVE_PROFILE", "hive2.3") - test_env = "local" + if "GITHUB_ACTIONS" in os.environ: + test_env = "github_actions" + else: + test_env = "local" print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version, "and Hive profile", hive_version, "under environment", test_env) extra_profiles = get_hadoop_profiles(hadoop_version) + get_hive_profiles(hive_version) changed_modules = None + test_modules = None changed_files = None - should_only_test_modules = "TEST_ONLY_MODULES" in os.environ + should_only_test_modules = opts.modules is not None included_tags = [] + excluded_tags = [] if should_only_test_modules: - str_test_modules = [m.strip() for m in os.environ.get("TEST_ONLY_MODULES").split(",")] + str_test_modules = [m.strip() for m in opts.modules.split(",")] test_modules = [m for m in modules.all_modules if m.name in str_test_modules] - # Directly uses test_modules as changed modules to apply tags and environments - # as if all specified test modules are changed. + + # If we're running the tests in Github Actions, attempt to detect and test + # only the affected modules. + if test_env == "github_actions": + if os.environ["GITHUB_BASE_REF"] != "": + # Pull requests + changed_files = identify_changed_files_from_git_commits( + os.environ["GITHUB_SHA"], target_branch=os.environ["GITHUB_BASE_REF"]) + else: + # Build for each commit. + changed_files = identify_changed_files_from_git_commits( + os.environ["GITHUB_SHA"], target_ref=os.environ["GITHUB_PREV_SHA"]) + print("changed_files : %s" % changed_files) + + modules_to_test = determine_modules_to_test( + determine_modules_for_files(changed_files), deduplicated=False) + print("modules_to_test : %s" % modules_to_test) + + if modules.root not in modules_to_test: + # If root module does not exist, only test the intersected modules. + # If root module is found, just run the modules as specified initially. + test_modules = list(set(modules_to_test).intersection(test_modules)) + print("test_modules : %s" % test_modules) + changed_modules = test_modules - str_excluded_tags = os.environ.get("TEST_ONLY_EXCLUDED_TAGS", None) - str_included_tags = os.environ.get("TEST_ONLY_INCLUDED_TAGS", None) - excluded_tags = [] - if str_excluded_tags: - excluded_tags = [t.strip() for t in str_excluded_tags.split(",")] - included_tags = [] - if str_included_tags: - included_tags = [t.strip() for t in str_included_tags.split(",")] + if len(changed_modules) == 0: + print("[info] There are no modules to test, exiting without testing.") + return + + # If we're running the tests in AMPLab Jenkins, calculate the diff from the targeted branch, and + # detect modules to test. elif test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"): target_branch = os.environ["ghprbTargetBranch"] changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch) changed_modules = determine_modules_for_files(changed_files) + test_modules = determine_modules_to_test(changed_modules) excluded_tags = determine_tags_to_exclude(changed_modules) + # If there is no changed module found, tests all. if not changed_modules: changed_modules = [modules.root] - excluded_tags = [] + if not test_modules: + test_modules = determine_modules_to_test(changed_modules) + + str_excluded_tags = opts.excluded_tags + str_included_tags = opts.included_tags + if str_excluded_tags: + excluded_tags.extend([t.strip() for t in str_excluded_tags.split(",")]) + if str_included_tags: + included_tags.extend([t.strip() for t in str_included_tags.split(",")]) + print("[info] Found the following changed modules:", ", ".join(x.name for x in changed_modules)) @@ -640,8 +714,6 @@ def main(): should_run_java_style_checks = False if not should_only_test_modules: - test_modules = determine_modules_to_test(changed_modules) - # license checks run_apache_rat_checks() @@ -672,40 +744,43 @@ def main(): # if "DOCS" in changed_modules and test_env == "amplab_jenkins": # build_spark_documentation() - if any(m.should_run_build_tests for m in test_modules) and test_env != "amplab_jenkins": - run_build_tests() - - # spark build - build_apache_spark(build_tool, extra_profiles) - - # backwards compatibility checks - if build_tool == "sbt": - # Note: compatibility tests only supported in sbt for now - detect_binary_inop_with_mima(extra_profiles) - # Since we did not build assembly/package before running dev/mima, we need to - # do it here because the tests still rely on it; see SPARK-13294 for details. - build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks) - - # run the test suites - run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags, included_tags) - - modules_with_python_tests = [m for m in test_modules if m.python_test_goals] - if modules_with_python_tests: - # We only run PySpark tests with coverage report in one specific job with - # Spark master with SBT in Jenkins. - is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ - run_python_tests( - modules_with_python_tests, opts.parallelism, with_coverage=is_sbt_master_job) - run_python_packaging_tests() - if any(m.should_run_r_tests for m in test_modules): - run_sparkr_tests() + print(changed_modules) + print(test_modules) + print([m for m in test_modules if m.python_test_goals]) + print([m.should_run_r_tests for m in test_modules]) + print(excluded_tags) + print(included_tags) + + # if any(m.should_run_build_tests for m in test_modules) and test_env != "amplab_jenkins": + # run_build_tests() + # + # # spark build + # build_apache_spark(build_tool, extra_profiles) + # + # # backwards compatibility checks + # if build_tool == "sbt": + # # Note: compatibility tests only supported in sbt for now + # detect_binary_inop_with_mima(extra_profiles) + # # Since we did not build assembly/package before running dev/mima, we need to + # # do it here because the tests still rely on it; see SPARK-13294 for details. + # build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks) + # + # # run the test suites + # run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags, included_tags) + # + # modules_with_python_tests = [m for m in test_modules if m.python_test_goals] + # if modules_with_python_tests: + # # We only run PySpark tests with coverage report in one specific job with + # # Spark master with SBT in Jenkins. + # is_sbt_master_job = "SPARK_MASTER_SBT_HADOOP_2_7" in os.environ + # run_python_tests( + # modules_with_python_tests, opts.parallelism, with_coverage=is_sbt_master_job) + # run_python_packaging_tests() + # if any(m.should_run_r_tests for m in test_modules): + # run_sparkr_tests() def _test(): - if "TEST_ONLY_MODULES" in os.environ: - # TODO(SPARK-32252): Enable doctests back in Github Actions. - return - import doctest failure_count = doctest.testmod()[0] if failure_count: @@ -713,5 +788,5 @@ def _test(): if __name__ == "__main__": - _test() + # _test() main() diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 5f4a8a2d2db1..bb192b487d4c 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -16,7 +16,7 @@ # """ -Worker that receives input from Piped RDD. +Worker that receives input from Piped RDD """ from __future__ import print_function from __future__ import absolute_import diff --git a/sql/core/pom.xml b/sql/core/pom.xml index c2ed4c079d3c..d42de271ce95 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -76,7 +76,7 @@ org.apache.spark diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index da542c67d9c5..5898174657c0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -61,10 +61,10 @@ private[sql] object Column { /** * A [[Column]] where an [[Encoder]] has been given for the expected input and return type. - * To create a [[TypedColumn]], use the `as` function on a [[Column]]. + * To create a [[TypedColumn]], use the `as` function on a [[Column]] * * @tparam T The input type expected for this expression. Can be `Any` if the expression is type - * checked by the analyzer instead of the compiler (i.e. `expr("sum(...)")`). + * checked by the analyzer instead of the compiler (i.e. `expr("sum(...)")`) * @tparam U The output type of this column. * * @since 1.6.0