From 1bdeb87de057187655abfaddf95491580629d7aa Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 22 Jun 2015 15:40:05 -0700 Subject: [PATCH 01/27] Move module definitions to separate file. --- dev/run-tests.py | 299 ++----------------------------- dev/sparktestsupport/__init__.py | 16 ++ dev/sparktestsupport/modules.py | 292 ++++++++++++++++++++++++++++++ 3 files changed, 321 insertions(+), 286 deletions(-) create mode 100644 dev/sparktestsupport/__init__.py create mode 100644 dev/sparktestsupport/modules.py diff --git a/dev/run-tests.py b/dev/run-tests.py index de1b4537eda5..c0652625fd2a 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -25,289 +25,16 @@ import subprocess from collections import namedtuple +import sparktestsupport.modules as modules + SPARK_HOME = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") USER_HOME = os.environ.get("HOME") # ------------------------------------------------------------------------------------------------- -# Test module definitions and functions for traversing module dependency graph +# Functions for traversing module dependency graph # ------------------------------------------------------------------------------------------------- - -all_modules = [] - - -class Module(object): - """ - A module is the basic abstraction in our test runner script. Each module consists of a set of - source files, a set of test commands, and a set of dependencies on other modules. We use modules - to define a dependency graph that lets determine which tests to run based on which files have - changed. - """ - - def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(), - sbt_test_goals=(), should_run_python_tests=False, should_run_r_tests=False): - """ - Define a new module. - - :param name: A short module name, for display in logging and error messages. - :param dependencies: A set of dependencies for this module. This should only include direct - dependencies; transitive dependencies are resolved automatically. - :param source_file_regexes: a set of regexes that match source files belonging to this - module. These regexes are applied by attempting to match at the beginning of the - filename strings. - :param build_profile_flags: A set of profile flags that should be passed to Maven or SBT in - order to build and test this module (e.g. '-PprofileName'). - :param sbt_test_goals: A set of SBT test goals for testing this module. - :param should_run_python_tests: If true, changes in this module will trigger Python tests. - For now, this has the effect of causing _all_ Python tests to be run, although in the - future this should be changed to run only a subset of the Python tests that depend - on this module. - :param should_run_r_tests: If true, changes in this module will trigger all R tests. 
- """ - self.name = name - self.dependencies = dependencies - self.source_file_prefixes = source_file_regexes - self.sbt_test_goals = sbt_test_goals - self.build_profile_flags = build_profile_flags - self.should_run_python_tests = should_run_python_tests - self.should_run_r_tests = should_run_r_tests - - self.dependent_modules = set() - for dep in dependencies: - dep.dependent_modules.add(self) - all_modules.append(self) - - def contains_file(self, filename): - return any(re.match(p, filename) for p in self.source_file_prefixes) - - -sql = Module( - name="sql", - dependencies=[], - source_file_regexes=[ - "sql/(?!hive-thriftserver)", - "bin/spark-sql", - ], - build_profile_flags=[ - "-Phive", - ], - sbt_test_goals=[ - "catalyst/test", - "sql/test", - "hive/test", - ] -) - - -hive_thriftserver = Module( - name="hive-thriftserver", - dependencies=[sql], - source_file_regexes=[ - "sql/hive-thriftserver", - "sbin/start-thriftserver.sh", - ], - build_profile_flags=[ - "-Phive-thriftserver", - ], - sbt_test_goals=[ - "hive-thriftserver/test", - ] -) - - -graphx = Module( - name="graphx", - dependencies=[], - source_file_regexes=[ - "graphx/", - ], - sbt_test_goals=[ - "graphx/test" - ] -) - - -streaming = Module( - name="streaming", - dependencies=[], - source_file_regexes=[ - "streaming", - ], - sbt_test_goals=[ - "streaming/test", - ] -) - - -streaming_kinesis_asl = Module( - name="kinesis-asl", - dependencies=[streaming], - source_file_regexes=[ - "extras/kinesis-asl/", - ], - build_profile_flags=[ - "-Pkinesis-asl", - ], - sbt_test_goals=[ - "kinesis-asl/test", - ] -) - - -streaming_zeromq = Module( - name="streaming-zeromq", - dependencies=[streaming], - source_file_regexes=[ - "external/zeromq", - ], - sbt_test_goals=[ - "streaming-zeromq/test", - ] -) - - -streaming_twitter = Module( - name="streaming-twitter", - dependencies=[streaming], - source_file_regexes=[ - "external/twitter", - ], - sbt_test_goals=[ - "streaming-twitter/test", - ] -) - - -streaming_mqtt = Module( - name="streaming-mqtt", - dependencies=[streaming], - source_file_regexes=[ - "external/mqtt", - ], - sbt_test_goals=[ - "streaming-mqtt/test", - ] -) - - -streaming_kafka = Module( - name="streaming-kafka", - dependencies=[streaming], - source_file_regexes=[ - "external/kafka", - "external/kafka-assembly", - ], - sbt_test_goals=[ - "streaming-kafka/test", - ] -) - - -streaming_flume_sink = Module( - name="streaming-flume-sink", - dependencies=[streaming], - source_file_regexes=[ - "external/flume-sink", - ], - sbt_test_goals=[ - "streaming-flume-sink/test", - ] -) - - -streaming_flume = Module( - name="streaming_flume", - dependencies=[streaming], - source_file_regexes=[ - "external/flume", - ], - sbt_test_goals=[ - "streaming-flume/test", - ] -) - - -mllib = Module( - name="mllib", - dependencies=[streaming, sql], - source_file_regexes=[ - "data/mllib/", - "mllib/", - ], - sbt_test_goals=[ - "mllib/test", - ] -) - - -examples = Module( - name="examples", - dependencies=[graphx, mllib, streaming, sql], - source_file_regexes=[ - "examples/", - ], - sbt_test_goals=[ - "examples/test", - ] -) - - -pyspark = Module( - name="pyspark", - dependencies=[mllib, streaming, streaming_kafka, sql], - source_file_regexes=[ - "python/" - ], - should_run_python_tests=True -) - - -sparkr = Module( - name="sparkr", - dependencies=[sql, mllib], - source_file_regexes=[ - "R/", - ], - should_run_r_tests=True -) - - -docs = Module( - name="docs", - dependencies=[], - source_file_regexes=[ - "docs/", - ] -) - - -ec2 = Module( - name="ec2", - 
dependencies=[], - source_file_regexes=[ - "ec2/", - ] -) - - -# The root module is a dummy module which is used to run all of the tests. -# No other modules should directly depend on this module. -root = Module( - name="root", - dependencies=[], - source_file_regexes=[], - # In order to run all of the tests, enable every test profile: - build_profile_flags= - list(set(itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))), - sbt_test_goals=[ - "test", - ], - should_run_python_tests=True, - should_run_r_tests=True -) - - def determine_modules_for_files(filenames): """ Given a list of filenames, return the set of modules that contain those files. @@ -322,12 +49,12 @@ def determine_modules_for_files(filenames): changed_modules = set() for filename in filenames: matched_at_least_one_module = False - for module in all_modules: + for module in modules.all_modules: if module.contains_file(filename): changed_modules.add(module) matched_at_least_one_module = True if not matched_at_least_one_module: - changed_modules.add(root) + changed_modules.add(modules.root) return changed_modules @@ -362,18 +89,18 @@ def determine_modules_to_test(changed_modules): Given a set of modules that have changed, compute the transitive closure of those modules' dependent modules in order to determine the set of modules that should be tested. - >>> sorted(x.name for x in determine_modules_to_test([root])) + >>> sorted(x.name for x in determine_modules_to_test([modules.root])) ['root'] - >>> sorted(x.name for x in determine_modules_to_test([graphx])) + >>> sorted(x.name for x in determine_modules_to_test([modules.graphx])) ['examples', 'graphx'] - >>> sorted(x.name for x in determine_modules_to_test([sql])) + >>> sorted(x.name for x in determine_modules_to_test([modules.sql])) ['examples', 'hive-thriftserver', 'mllib', 'pyspark', 'sparkr', 'sql'] """ # If we're going to have to run all of the tests, then we can just short-circuit # and return 'root'. No module depends on root, so if it appears then it will be # in changed_modules. 
- if root in changed_modules: - return [root] + if modules.root in changed_modules: + return [modules.root] modules_to_test = set() for module in changed_modules: modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules)) @@ -596,7 +323,7 @@ def get_hadoop_profiles(hadoop_version): def build_spark_maven(hadoop_version): # Enable all of the profiles for the build: - build_profiles = get_hadoop_profiles(hadoop_version) + root.build_profile_flags + build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags mvn_goals = ["clean", "package", "-DskipTests"] profiles_and_goals = build_profiles + mvn_goals @@ -608,7 +335,7 @@ def build_spark_maven(hadoop_version): def build_spark_sbt(hadoop_version): # Enable all of the profiles for the build: - build_profiles = get_hadoop_profiles(hadoop_version) + root.build_profile_flags + build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags sbt_goals = ["package", "assembly/assembly", "streaming-kafka-assembly/assembly"] @@ -746,7 +473,7 @@ def main(): changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch) changed_modules = determine_modules_for_files(changed_files) if not changed_modules: - changed_modules = [root] + changed_modules = [modules.root] print "[info] Found the following changed modules:", ", ".join(x.name for x in changed_modules) test_modules = determine_modules_to_test(changed_modules) diff --git a/dev/sparktestsupport/__init__.py b/dev/sparktestsupport/__init__.py new file mode 100644 index 000000000000..ecb1860df848 --- /dev/null +++ b/dev/sparktestsupport/__init__.py @@ -0,0 +1,16 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# \ No newline at end of file diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py new file mode 100644 index 000000000000..70710b3de782 --- /dev/null +++ b/dev/sparktestsupport/modules.py @@ -0,0 +1,292 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import itertools +import re + +all_modules = [] + + +class Module(object): + """ + A module is the basic abstraction in our test runner script. Each module consists of a set of + source files, a set of test commands, and a set of dependencies on other modules. We use modules + to define a dependency graph that lets determine which tests to run based on which files have + changed. + """ + + def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(), + sbt_test_goals=(), should_run_python_tests=False, should_run_r_tests=False): + """ + Define a new module. + + :param name: A short module name, for display in logging and error messages. + :param dependencies: A set of dependencies for this module. This should only include direct + dependencies; transitive dependencies are resolved automatically. + :param source_file_regexes: a set of regexes that match source files belonging to this + module. These regexes are applied by attempting to match at the beginning of the + filename strings. + :param build_profile_flags: A set of profile flags that should be passed to Maven or SBT in + order to build and test this module (e.g. '-PprofileName'). + :param sbt_test_goals: A set of SBT test goals for testing this module. + :param should_run_python_tests: If true, changes in this module will trigger Python tests. + For now, this has the effect of causing _all_ Python tests to be run, although in the + future this should be changed to run only a subset of the Python tests that depend + on this module. + :param should_run_r_tests: If true, changes in this module will trigger all R tests. + """ + self.name = name + self.dependencies = dependencies + self.source_file_prefixes = source_file_regexes + self.sbt_test_goals = sbt_test_goals + self.build_profile_flags = build_profile_flags + self.should_run_python_tests = should_run_python_tests + self.should_run_r_tests = should_run_r_tests + + self.dependent_modules = set() + for dep in dependencies: + dep.dependent_modules.add(self) + all_modules.append(self) + + def contains_file(self, filename): + return any(re.match(p, filename) for p in self.source_file_prefixes) + + +sql = Module( + name="sql", + dependencies=[], + source_file_regexes=[ + "sql/(?!hive-thriftserver)", + "bin/spark-sql", + ], + build_profile_flags=[ + "-Phive", + ], + sbt_test_goals=[ + "catalyst/test", + "sql/test", + "hive/test", + ] +) + + +hive_thriftserver = Module( + name="hive-thriftserver", + dependencies=[sql], + source_file_regexes=[ + "sql/hive-thriftserver", + "sbin/start-thriftserver.sh", + ], + build_profile_flags=[ + "-Phive-thriftserver", + ], + sbt_test_goals=[ + "hive-thriftserver/test", + ] +) + + +graphx = Module( + name="graphx", + dependencies=[], + source_file_regexes=[ + "graphx/", + ], + sbt_test_goals=[ + "graphx/test" + ] +) + + +streaming = Module( + name="streaming", + dependencies=[], + source_file_regexes=[ + "streaming", + ], + sbt_test_goals=[ + "streaming/test", + ] +) + + +streaming_kinesis_asl = Module( + name="kinesis-asl", + dependencies=[streaming], + source_file_regexes=[ + "extras/kinesis-asl/", + ], + build_profile_flags=[ + "-Pkinesis-asl", + ], + sbt_test_goals=[ + "kinesis-asl/test", + ] +) + + +streaming_zeromq = Module( + name="streaming-zeromq", + dependencies=[streaming], + source_file_regexes=[ + "external/zeromq", + ], + sbt_test_goals=[ + "streaming-zeromq/test", + ] +) + + +streaming_twitter = Module( + name="streaming-twitter", + dependencies=[streaming], + source_file_regexes=[ + "external/twitter", + ], + 
sbt_test_goals=[ + "streaming-twitter/test", + ] +) + + +streaming_mqtt = Module( + name="streaming-mqtt", + dependencies=[streaming], + source_file_regexes=[ + "external/mqtt", + ], + sbt_test_goals=[ + "streaming-mqtt/test", + ] +) + + +streaming_kafka = Module( + name="streaming-kafka", + dependencies=[streaming], + source_file_regexes=[ + "external/kafka", + "external/kafka-assembly", + ], + sbt_test_goals=[ + "streaming-kafka/test", + ] +) + + +streaming_flume_sink = Module( + name="streaming-flume-sink", + dependencies=[streaming], + source_file_regexes=[ + "external/flume-sink", + ], + sbt_test_goals=[ + "streaming-flume-sink/test", + ] +) + + +streaming_flume = Module( + name="streaming_flume", + dependencies=[streaming], + source_file_regexes=[ + "external/flume", + ], + sbt_test_goals=[ + "streaming-flume/test", + ] +) + + +mllib = Module( + name="mllib", + dependencies=[streaming, sql], + source_file_regexes=[ + "data/mllib/", + "mllib/", + ], + sbt_test_goals=[ + "mllib/test", + ] +) + + +examples = Module( + name="examples", + dependencies=[graphx, mllib, streaming, sql], + source_file_regexes=[ + "examples/", + ], + sbt_test_goals=[ + "examples/test", + ] +) + + +pyspark = Module( + name="pyspark", + dependencies=[mllib, streaming, streaming_kafka, sql], + source_file_regexes=[ + "python/" + ], + should_run_python_tests=True +) + + +sparkr = Module( + name="sparkr", + dependencies=[sql, mllib], + source_file_regexes=[ + "R/", + ], + should_run_r_tests=True +) + + +docs = Module( + name="docs", + dependencies=[], + source_file_regexes=[ + "docs/", + ] +) + + +ec2 = Module( + name="ec2", + dependencies=[], + source_file_regexes=[ + "ec2/", + ] +) + + +# The root module is a dummy module which is used to run all of the tests. +# No other modules should directly depend on this module. +root = Module( + name="root", + dependencies=[], + source_file_regexes=[], + # In order to run all of the tests, enable every test profile: + build_profile_flags= + list(set(itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))), + sbt_test_goals=[ + "test", + ], + should_run_python_tests=True, + should_run_r_tests=True +) From 311c6a99b313aa277cfc7d44be544a33c6550289 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 22 Jun 2015 15:46:32 -0700 Subject: [PATCH 02/27] Move shell utility functions to own module. 
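
The helpers moved here (exit_from_command_with_retcode, rm_r, run_cmd, is_exe,
which) do not depend on anything else in run-tests.py, so they can live in a
reusable dev/sparktestsupport/shellutils.py. A minimal usage sketch of the
extracted API (the call sites below are hypothetical, for illustration only,
and assume dev/ is on PYTHONPATH):

    from sparktestsupport.shellutils import rm_r, run_cmd, which

    rm_r("unit-tests.log")             # removes a file or directory tree, if present
    if which("java"):                  # resolves an executable on PATH, like shell `which`
        run_cmd(["java", "-version"])  # runs a command; exits the script on failure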
--- dev/run-tests.py | 56 +-------------------- dev/sparktestsupport/shellutils.py | 81 ++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+), 55 deletions(-) create mode 100644 dev/sparktestsupport/shellutils.py diff --git a/dev/run-tests.py b/dev/run-tests.py index c0652625fd2a..6b067c325868 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -21,10 +21,10 @@ import os import re import sys -import shutil import subprocess from collections import namedtuple +from sparktestsupport.shellutils import exit_from_command_with_retcode, run_cmd, rm_r, which import sparktestsupport.modules as modules SPARK_HOME = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") @@ -125,60 +125,6 @@ def get_error_codes(err_code_file): ERROR_CODES = get_error_codes(os.path.join(SPARK_HOME, "dev/run-tests-codes.sh")) -def exit_from_command_with_retcode(cmd, retcode): - print "[error] running", ' '.join(cmd), "; received return code", retcode - sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) - - -def rm_r(path): - """Given an arbitrary path properly remove it with the correct python - construct if it exists - - from: http://stackoverflow.com/a/9559881""" - - if os.path.isdir(path): - shutil.rmtree(path) - elif os.path.exists(path): - os.remove(path) - - -def run_cmd(cmd): - """Given a command as a list of arguments will attempt to execute the - command from the determined SPARK_HOME directory and, on failure, print - an error message""" - - if not isinstance(cmd, list): - cmd = cmd.split() - try: - subprocess.check_call(cmd) - except subprocess.CalledProcessError as e: - exit_from_command_with_retcode(e.cmd, e.returncode) - - -def is_exe(path): - """Check if a given path is an executable file - - from: http://stackoverflow.com/a/377028""" - - return os.path.isfile(path) and os.access(path, os.X_OK) - - -def which(program): - """Find and return the given program by its absolute path or 'None' - - from: http://stackoverflow.com/a/377028""" - - fpath = os.path.split(program)[0] - - if fpath: - if is_exe(program): - return program - else: - for path in os.environ.get("PATH").split(os.pathsep): - path = path.strip('"') - exe_file = os.path.join(path, program) - if is_exe(exe_file): - return exe_file - return None - - def determine_java_executable(): """Will return the path of the java executable that will be used by Spark's tests or `None`""" diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py new file mode 100644 index 000000000000..81ba6cba64e5 --- /dev/null +++ b/dev/sparktestsupport/shellutils.py @@ -0,0 +1,81 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import os +import shutil +import subprocess +import sys + + +def exit_from_command_with_retcode(cmd, retcode): + print "[error] running", ' '.join(cmd), "; received return code", retcode + sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) + + +def rm_r(path): + """ + Given an arbitrary path, properly remove it with the correct Python construct if it exists. + From: http://stackoverflow.com/a/9559881 + """ + + if os.path.isdir(path): + shutil.rmtree(path) + elif os.path.exists(path): + os.remove(path) + + +def run_cmd(cmd): + """ + Given a command as a list of arguments will attempt to execute the command + and, on failure, print an error message + """ + + if not isinstance(cmd, list): + cmd = cmd.split() + try: + subprocess.check_call(cmd) + except subprocess.CalledProcessError as e: + exit_from_command_with_retcode(e.cmd, e.returncode) + + +def is_exe(path): + """ + Check if a given path is an executable file. + From: http://stackoverflow.com/a/377028 + """ + + return os.path.isfile(path) and os.access(path, os.X_OK) + + +def which(program): + """ + Find and return the given program by its absolute path or 'None' if the program cannot be found. + From: http://stackoverflow.com/a/377028 + """ + + fpath = os.path.split(program)[0] + + if fpath: + if is_exe(program): + return program + else: + for path in os.environ.get("PATH").split(os.pathsep): + path = path.strip('"') + exe_file = os.path.join(path, program) + if is_exe(exe_file): + return exe_file + return None From 32660fc94545f48e5a39785061261697fe4f447f Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 22 Jun 2015 22:58:06 -0700 Subject: [PATCH 03/27] Initial cut at Python test runner refactoring --- dev/run-tests.py | 5 +- dev/sparktestsupport/__init__.py | 7 +- dev/sparktestsupport/modules.py | 106 +++++++++++++++++++++++++---- dev/sparktestsupport/shellutils.py | 2 +- python/run-tests | 87 ++--------------------- python/run-tests.py | 86 +++++++++++++++++++++++ 6 files changed, 193 insertions(+), 100 deletions(-) create mode 100644 python/run-tests.py diff --git a/dev/run-tests.py b/dev/run-tests.py index 6b067c325868..2de57fc0d1dc 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -24,13 +24,10 @@ import subprocess from collections import namedtuple +from sparktestsupport import SPARK_HOME, USER_HOME from sparktestsupport.shellutils import exit_from_command_with_retcode, run_cmd, rm_r, which import sparktestsupport.modules as modules -SPARK_HOME = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") -USER_HOME = os.environ.get("HOME") - - # ------------------------------------------------------------------------------------------------- # Functions for traversing module dependency graph # ------------------------------------------------------------------------------------------------- diff --git a/dev/sparktestsupport/__init__.py b/dev/sparktestsupport/__init__.py index ecb1860df848..9afb1da74760 100644 --- a/dev/sparktestsupport/__init__.py +++ b/dev/sparktestsupport/__init__.py @@ -13,4 +13,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-# \ No newline at end of file +# + +import os + +SPARK_HOME = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../../") +USER_HOME = os.environ.get("HOME") \ No newline at end of file diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 70710b3de782..48bbdd231364 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -30,7 +30,7 @@ class Module(object): """ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(), - sbt_test_goals=(), should_run_python_tests=False, should_run_r_tests=False): + sbt_test_goals=(), python_test_goals=(), should_run_r_tests=False): """ Define a new module. @@ -43,10 +43,7 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags= :param build_profile_flags: A set of profile flags that should be passed to Maven or SBT in order to build and test this module (e.g. '-PprofileName'). :param sbt_test_goals: A set of SBT test goals for testing this module. - :param should_run_python_tests: If true, changes in this module will trigger Python tests. - For now, this has the effect of causing _all_ Python tests to be run, although in the - future this should be changed to run only a subset of the Python tests that depend - on this module. + :param python_test_goals: A set of Python test goals for testing this module. :param should_run_r_tests: If true, changes in this module will trigger all R tests. """ self.name = name @@ -54,7 +51,7 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags= self.source_file_prefixes = source_file_regexes self.sbt_test_goals = sbt_test_goals self.build_profile_flags = build_profile_flags - self.should_run_python_tests = should_run_python_tests + self.python_test_goals = python_test_goals self.should_run_r_tests = should_run_r_tests self.dependent_modules = set() @@ -237,16 +234,101 @@ def contains_file(self, filename): ) -pyspark = Module( - name="pyspark", - dependencies=[mllib, streaming, streaming_kafka, sql], +pyspark_core = Module( + name="pyspark-core", + dependencies=[mllib, streaming, streaming_kafka], source_file_regexes=[ - "python/" + "python/(?!pyspark/(ml|mllib|sql|streaming))" ], - should_run_python_tests=True + python_test_goals=[ + "pyspark.rdd", + "pyspark.context", + "pyspark.conf", + "pyspark.broadcast", + "pyspark.accumulators", + "pyspark.serializers", + "pyspark.profiler", + "pyspark.shuffle", + "pyspark.tests", + ] +) + + +pyspark_sql = Module( + name="pyspark-sql", + dependencies=[pyspark_core, sql], + source_file_regexes=[ + "python/sql" + ], + python_test_goals=[ + "pyspark.sql.types", + "pyspark.sql.context", + "pyspark.sql.column", + "pyspark.sql.dataframe", + "pyspark.sql.group", + "pyspark.sql.functions", + "pyspark.sql.readwriter", + "pyspark.sql.window", + "pyspark.sql.tests", + ] +) + + +pyspark_streaming = Module( + name="pyspark-streaming", + dependencies=[pyspark_core, streaming, streaming_kafka], + source_file_regexes=[ + "python/streaming" + ], + python_test_goals=[ + "pyspark.streaming.util", + "pyspark.streaming.tests", + ] +) + + +pyspark_mllib = Module( + name="pyspark-mllib", + dependencies=[pyspark_core, pyspark_streaming, pyspark_sql, mllib], + source_file_regexes=[ + "python/mllib" + ], + python_test_goals=[ + "pyspark.mllib.classification", + "pyspark.mllib.clustering", + "pyspark.mllib.evaluation", + "pyspark.mllib.feature", + "pyspark.mllib.fpm", + "pyspark.mllib.linalg", + "pyspark.mllib.random", + "pyspark.mllib.recommendation", + 
"pyspark.mllib.regression", + "pyspark.mllib.stat._statistics", + "pyspark.mllib.stat.KernelDensity", + "pyspark.mllib.tree", + "pyspark.mllib.util", + "pyspark.mllib.tests", + ] ) +pyspark_ml = Module( + name="pyspark-sql", + dependencies=[pyspark_core, pyspark_mllib], + source_file_regexes=[ + "python/sql" + ], + python_test_goals=[ + "pyspark.ml.feature", + "pyspark.ml.classification", + "pyspark.ml.recommendation", + "pyspark.ml.regression", + "pyspark.ml.tuning", + "pyspark.ml.tests", + "pyspark.ml.evaluation", + ] +) + sparkr = Module( name="sparkr", dependencies=[sql, mllib], @@ -287,6 +369,6 @@ def contains_file(self, filename): sbt_test_goals=[ "test", ], - should_run_python_tests=True, + python_test_goals=list(itertools.chain.from_iterable(m.python_test_goals for m in all_modules)), should_run_r_tests=True ) diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py index 81ba6cba64e5..5176e4e15c06 100644 --- a/dev/sparktestsupport/shellutils.py +++ b/dev/sparktestsupport/shellutils.py @@ -41,7 +41,7 @@ def rm_r(path): def run_cmd(cmd): """ Given a command as a list of arguments will attempt to execute the command - and, on failure, print an error message + and, on failure, print an error message and exit. """ if not isinstance(cmd, list): diff --git a/python/run-tests b/python/run-tests index 4468fdb3f267..78da70ae53bc 100755 --- a/python/run-tests +++ b/python/run-tests @@ -18,13 +18,13 @@ # -# Figure out where the Spark framework is installed -FWDIR="$(cd "`dirname "$0"`"; cd ../; pwd)" +FWDIR="$(cd "`dirname $0`"/..; pwd)" +cd "$FWDIR" -. "$FWDIR"/bin/load-spark-env.sh +PYTHONPATH="$FWDIR/dev/" exec python -u ./python/run-tests.py + +exit -# CD into the python directory to find things on the right path -cd "$FWDIR/python" FAILED=0 LOG_FILE=unit-tests.log @@ -32,83 +32,6 @@ START=$(date +"%s") rm -f $LOG_FILE -# Remove the metastore and warehouse directory created by the HiveContext tests in Spark SQL -rm -rf metastore warehouse - -function run_test() { - echo -en "Running test: $1 ... " | tee -a $LOG_FILE - start=$(date +"%s") - SPARK_TESTING=1 time "$FWDIR"/bin/pyspark $1 > $LOG_FILE 2>&1 - - FAILED=$((PIPESTATUS[0]||$FAILED)) - - # Fail and exit on the first test failure. - if [[ $FAILED != 0 ]]; then - cat $LOG_FILE | grep -v "^[0-9][0-9]*" # filter all lines starting with a number. - echo -en "\033[31m" # Red - echo "Had test failures; see logs." - echo -en "\033[0m" # No color - exit -1 - else - now=$(date +"%s") - echo "ok ($(($now - $start))s)" - fi -} - -function run_core_tests() { - echo "Run core tests ..." - run_test "pyspark.rdd" - run_test "pyspark.context" - run_test "pyspark.conf" - run_test "pyspark.broadcast" - run_test "pyspark.accumulators" - run_test "pyspark.serializers" - run_test "pyspark.profiler" - run_test "pyspark.shuffle" - run_test "pyspark.tests" -} - -function run_sql_tests() { - echo "Run sql tests ..." - run_test "pyspark.sql.types" - run_test "pyspark.sql.context" - run_test "pyspark.sql.column" - run_test "pyspark.sql.dataframe" - run_test "pyspark.sql.group" - run_test "pyspark.sql.functions" - run_test "pyspark.sql.readwriter" - run_test "pyspark.sql.window" - run_test "pyspark.sql.tests" -} - -function run_mllib_tests() { - echo "Run mllib tests ..." 
- run_test "pyspark.mllib.classification" - run_test "pyspark.mllib.clustering" - run_test "pyspark.mllib.evaluation" - run_test "pyspark.mllib.feature" - run_test "pyspark.mllib.fpm" - run_test "pyspark.mllib.linalg" - run_test "pyspark.mllib.random" - run_test "pyspark.mllib.recommendation" - run_test "pyspark.mllib.regression" - run_test "pyspark.mllib.stat._statistics" - run_test "pyspark.mllib.stat.KernelDensity" - run_test "pyspark.mllib.tree" - run_test "pyspark.mllib.util" - run_test "pyspark.mllib.tests" -} - -function run_ml_tests() { - echo "Run ml tests ..." - run_test "pyspark.ml.feature" - run_test "pyspark.ml.classification" - run_test "pyspark.ml.recommendation" - run_test "pyspark.ml.regression" - run_test "pyspark.ml.tuning" - run_test "pyspark.ml.tests" - run_test "pyspark.ml.evaluation" -} function run_streaming_tests() { echo "Run streaming tests ..." diff --git a/python/run-tests.py b/python/run-tests.py new file mode 100644 index 000000000000..7a053e1c168e --- /dev/null +++ b/python/run-tests.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python2 + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function +import os +import re +import subprocess +import time + +from sparktestsupport import SPARK_HOME +from sparktestsupport.shellutils import which +from sparktestsupport.modules import all_modules + + +def print_red(text): + print('\033[31m' + text + '\033[0m') + + +LOG_FILE = os.path.join(SPARK_HOME, "python/unit-tests.log") + + +def run_individual_python_test(test_name, pyspark_python=None): + env = {'SPARK_TESTING': '1'} + if pyspark_python: + env["PYSPARK_PYTHON"] = pyspark_python + print(" Running test: %s ..." % test_name, end='') + start_time = time.time() + with open(LOG_FILE, 'a') as log_file: + retcode = subprocess.call( + [os.path.join(SPARK_HOME, "bin/pyspark"), test_name], + stderr=log_file, stdout=log_file, env=env) + duration = time.time() - start_time + # Exit on the first failure. + if retcode != 0: + with open(LOG_FILE, 'r') as log_file: + for line in log_file: + if not re.match('[0-9]+', line): + print(line, end='') + print_red("\nHad test failures in %s; see logs." % test_name) + exit(-1) + else: + print("ok (%fs)" % (duration / 10000.0)) + + +def main(): + # TODO: do we need to remove the metastore and warehouse created by the SQL tests? Ask Ahir. 
+ if os.path.exists(LOG_FILE): + os.remove(LOG_FILE) + python_execs = [x for x in ["python2.6", "python3.4", "pypy"] if which(x)] + if "python2.6" not in python_execs: + print("WARNING: Not testing against `python2.6` because it could not be found; falling" + " back to `python` instead") + python_execs.insert(0, "python") + print("Will test against the following Python executables: %s" % python_execs) + start_time = time.time() + for python_exec in python_execs: + print("Testing with `%s`: " % python_exec, end='') + subprocess.call([python_exec, "--version"]) + + python_modules = [m for m in all_modules if m.python_test_goals] + for module in python_modules: + print("Running %s tests ..." % module.name) + for test_goal in module.python_test_goals: + run_individual_python_test(test_goal) + total_duration = time.time() - start_time + print("Tests passed in %f seconds" % (total_duration / 1000.0)) + + +if __name__ == "__main__": + main() From dcc9c09719cafc172bf3d58b7e81c73ae4535eaa Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 22 Jun 2015 23:16:48 -0700 Subject: [PATCH 04/27] Fix time division --- python/run-tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/run-tests.py b/python/run-tests.py index 7a053e1c168e..b68721883649 100644 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -55,7 +55,7 @@ def run_individual_python_test(test_name, pyspark_python=None): print_red("\nHad test failures in %s; see logs." % test_name) exit(-1) else: - print("ok (%fs)" % (duration / 10000.0)) + print("ok (%is)" % duration) def main(): @@ -79,7 +79,7 @@ def main(): for test_goal in module.python_test_goals: run_individual_python_test(test_goal) total_duration = time.time() - start_time - print("Tests passed in %f seconds" % (total_duration / 1000.0)) + print("Tests passed in %i seconds" % total_duration) if __name__ == "__main__": From 4c9713653a05e66a279885c50c99b288ba7089fa Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 23 Jun 2015 00:28:18 -0700 Subject: [PATCH 05/27] PYTHONPATH fixes --- python/run-tests | 2 +- python/run-tests.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) mode change 100644 => 100755 python/run-tests.py diff --git a/python/run-tests b/python/run-tests index 78da70ae53bc..afee53179261 100755 --- a/python/run-tests +++ b/python/run-tests @@ -21,7 +21,7 @@ FWDIR="$(cd "`dirname $0`"/..; pwd)" cd "$FWDIR" -PYTHONPATH="$FWDIR/dev/" exec python -u ./python/run-tests.py +exec python -u ./python/run-tests.py exit diff --git a/python/run-tests.py b/python/run-tests.py old mode 100644 new mode 100755 index b68721883649..4e232bfe4be7 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -21,11 +21,17 @@ import os import re import subprocess +import sys import time -from sparktestsupport import SPARK_HOME -from sparktestsupport.shellutils import which -from sparktestsupport.modules import all_modules + +# Append `SPARK_HOME/dev` to the Python path so that we can import the sparktestsupport module +sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), "../dev/")) + + +from sparktestsupport import SPARK_HOME # noqa (suppress pep8 warnings) +from sparktestsupport.shellutils import which # noqa +from sparktestsupport.modules import all_modules # noqa def print_red(text): From 04015b9dd5d2a2f47c1ca48408bca8949709ab01 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 23 Jun 2015 18:06:48 -0700 Subject: [PATCH 06/27] First attempt at getting PySpark Kafka test to work in new runner script --- 
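Note (kept out of the commit message): the shell runner used to locate the
spark-streaming-kafka assembly jar and export PYSPARK_SUBMIT_ARGS before
launching the streaming tests; this patch moves that discovery into
KafkaStreamTests.setUpClass. A condensed sketch of the mechanism, assuming the
usual sbt/mvn assembly output path:

    import glob, os

    kafka_assembly_dir = os.path.join(os.environ["SPARK_HOME"],
                                      "external/kafka-assembly")
    # Exactly one assembly jar should match; zero or several is an error.
    jars = glob.glob(os.path.join(
        kafka_assembly_dir,
        "target/scala-*/spark-streaming-kafka-assembly-*.jar"))
    # pyspark reads PYSPARK_SUBMIT_ARGS when launching the JVM, so the jar
    # ends up on the classpath for the Kafka tests.
    os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars %s pyspark-shell" % jars[0]
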
python/pyspark/streaming/tests.py | 31 ++++++++++++ python/run-tests | 79 ------------------------------- python/run-tests.py | 1 + 3 files changed, 32 insertions(+), 79 deletions(-) diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 57049beea4db..05dd45246f56 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -15,6 +15,7 @@ # limitations under the License. # +import glob import os import sys from itertools import chain @@ -575,6 +576,36 @@ def check_output(n): class KafkaStreamTests(PySparkStreamingTestCase): timeout = 20 # seconds duration = 1 + old_pyspark_submit_args = None + + @classmethod + def setUpClass(cls): + cls.old_pyspark_submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS") + SPARK_HOME = os.environ["SPARK_HOME"] + kafka_assembly_dir = os.path.join(SPARK_HOME, "external/kafka-assembly") + jars = glob.glob( + os.path.join(kafka_assembly_dir, "target/scala-*/spark-streaming-kafka-assembly-*.jar")) + if not jars: + raise Exception( + ("Failed to find Spark Streaming kafka assembly jar in %s. " % kafka_assembly_dir) + + "You need to build Spark with " + "'build/sbt assembly/assembly streaming-kafka-assembly/assembly' or " + "'build/mvn package' before running this test") + elif len(jars) > 1: + raise Exception(("Found multiple Spark Streaming Kafka assembly JARs in %s; please " + "remove all but one") % kafka_assembly_dir) + else: + os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars %s pyspark-shell" % jars[0] + print(os.environ["PYSPARK_SUBMIT_ARGS"]) + super(KafkaStreamTests, cls).setUpClass() + + @classmethod + def tearDownClass(cls): + if cls.old_pyspark_submit_args is None: + del os.environ["PYSPARK_SUBMIT_ARGS"] + else: + os.environ["PYSPARK_SUBMIT_ARGS"] = cls.old_pyspark_submit_args + super(KafkaStreamTests, cls).tearDownClass() def setUp(self): super(KafkaStreamTests, self).setUp() diff --git a/python/run-tests b/python/run-tests index afee53179261..c27cbb4fedb1 100755 --- a/python/run-tests +++ b/python/run-tests @@ -24,82 +24,3 @@ cd "$FWDIR" exec python -u ./python/run-tests.py exit - - -FAILED=0 -LOG_FILE=unit-tests.log -START=$(date +"%s") - -rm -f $LOG_FILE - - -function run_streaming_tests() { - echo "Run streaming tests ..." - - KAFKA_ASSEMBLY_DIR="$FWDIR"/external/kafka-assembly - JAR_PATH="${KAFKA_ASSEMBLY_DIR}/target/scala-${SPARK_SCALA_VERSION}" - for f in "${JAR_PATH}"/spark-streaming-kafka-assembly-*.jar; do - if [[ ! -e "$f" ]]; then - echo "Failed to find Spark Streaming Kafka assembly jar in $KAFKA_ASSEMBLY_DIR" 1>&2 - echo "You need to build Spark with " \ - "'build/sbt assembly/assembly streaming-kafka-assembly/assembly' or" \ - "'build/mvn package' before running this program" 1>&2 - exit 1 - fi - KAFKA_ASSEMBLY_JAR="$f" - done - - export PYSPARK_SUBMIT_ARGS="--jars ${KAFKA_ASSEMBLY_JAR} pyspark-shell" - run_test "pyspark.streaming.util" - run_test "pyspark.streaming.tests" -} - -echo "Running PySpark tests. Output is in python/$LOG_FILE." 
- -export PYSPARK_PYTHON="python" - -# Try to test with Python 2.6, since that's the minimum version that we support: -if [ $(which python2.6) ]; then - export PYSPARK_PYTHON="python2.6" -fi - -echo "Testing with Python version:" -$PYSPARK_PYTHON --version - -run_core_tests -run_sql_tests -run_mllib_tests -run_ml_tests -run_streaming_tests - -# Try to test with Python 3 -if [ $(which python3.4) ]; then - export PYSPARK_PYTHON="python3.4" - echo "Testing with Python3.4 version:" - $PYSPARK_PYTHON --version - - run_core_tests - run_sql_tests - run_mllib_tests - run_ml_tests - run_streaming_tests -fi - -# Try to test with PyPy -if [ $(which pypy) ]; then - export PYSPARK_PYTHON="pypy" - echo "Testing with PyPy version:" - $PYSPARK_PYTHON --version - - run_core_tests - run_sql_tests - run_streaming_tests -fi - -if [[ $FAILED == 0 ]]; then - now=$(date +"%s") - echo -e "\033[32mTests passed \033[0min $(($now - $START)) seconds" -fi - -# TODO: in the long-run, it would be nice to use a test runner like `nose`. -# The doctest fixtures are the current barrier to doing this. diff --git a/python/run-tests.py b/python/run-tests.py index 4e232bfe4be7..fa43a3a8aee0 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -66,6 +66,7 @@ def run_individual_python_test(test_name, pyspark_python=None): def main(): # TODO: do we need to remove the metastore and warehouse created by the SQL tests? Ask Ahir. + print("Running PySpark tests. Output is in python/%s" % LOG_FILE) if os.path.exists(LOG_FILE): os.remove(LOG_FILE) python_execs = [x for x in ["python2.6", "python3.4", "pypy"] if which(x)] From aec0b8f8cdfe7017066a29b77b446c923abe3ab6 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 23 Jun 2015 18:12:01 -0700 Subject: [PATCH 07/27] Actually get the Kafka stuff to run properly --- python/pyspark/streaming/tests.py | 45 +++++++++++-------------------- 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 05dd45246f56..91ce681fbe16 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -576,36 +576,6 @@ def check_output(n): class KafkaStreamTests(PySparkStreamingTestCase): timeout = 20 # seconds duration = 1 - old_pyspark_submit_args = None - - @classmethod - def setUpClass(cls): - cls.old_pyspark_submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS") - SPARK_HOME = os.environ["SPARK_HOME"] - kafka_assembly_dir = os.path.join(SPARK_HOME, "external/kafka-assembly") - jars = glob.glob( - os.path.join(kafka_assembly_dir, "target/scala-*/spark-streaming-kafka-assembly-*.jar")) - if not jars: - raise Exception( - ("Failed to find Spark Streaming kafka assembly jar in %s. 
" % kafka_assembly_dir) + - "You need to build Spark with " - "'build/sbt assembly/assembly streaming-kafka-assembly/assembly' or " - "'build/mvn package' before running this test") - elif len(jars) > 1: - raise Exception(("Found multiple Spark Streaming Kafka assembly JARs in %s; please " - "remove all but one") % kafka_assembly_dir) - else: - os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars %s pyspark-shell" % jars[0] - print(os.environ["PYSPARK_SUBMIT_ARGS"]) - super(KafkaStreamTests, cls).setUpClass() - - @classmethod - def tearDownClass(cls): - if cls.old_pyspark_submit_args is None: - del os.environ["PYSPARK_SUBMIT_ARGS"] - else: - os.environ["PYSPARK_SUBMIT_ARGS"] = cls.old_pyspark_submit_args - super(KafkaStreamTests, cls).tearDownClass() def setUp(self): super(KafkaStreamTests, self).setUp() @@ -708,4 +678,19 @@ def test_kafka_rdd_with_leaders(self): self._validateRddResult(sendData, rdd) if __name__ == "__main__": + SPARK_HOME = os.environ["SPARK_HOME"] + kafka_assembly_dir = os.path.join(SPARK_HOME, "external/kafka-assembly") + jars = glob.glob( + os.path.join(kafka_assembly_dir, "target/scala-*/spark-streaming-kafka-assembly-*.jar")) + if not jars: + raise Exception( + ("Failed to find Spark Streaming kafka assembly jar in %s. " % kafka_assembly_dir) + + "You need to build Spark with " + "'build/sbt assembly/assembly streaming-kafka-assembly/assembly' or " + "'build/mvn package' before running this test") + elif len(jars) > 1: + raise Exception(("Found multiple Spark Streaming Kafka assembly JARs in %s; please " + "remove all but one") % kafka_assembly_dir) + else: + os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars %s pyspark-shell" % jars[0] unittest.main() From def2d8a951416514f975c9a19f9142c7f4086bfc Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 23 Jun 2015 18:38:05 -0700 Subject: [PATCH 08/27] Two minor fixes --- python/run-tests | 2 -- python/run-tests.py | 1 - 2 files changed, 3 deletions(-) diff --git a/python/run-tests b/python/run-tests index c27cbb4fedb1..c7961cd5616c 100755 --- a/python/run-tests +++ b/python/run-tests @@ -22,5 +22,3 @@ FWDIR="$(cd "`dirname $0`"/..; pwd)" cd "$FWDIR" exec python -u ./python/run-tests.py - -exit diff --git a/python/run-tests.py b/python/run-tests.py index fa43a3a8aee0..82cdfbb544ab 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -65,7 +65,6 @@ def run_individual_python_test(test_name, pyspark_python=None): def main(): - # TODO: do we need to remove the metastore and warehouse created by the SQL tests? Ask Ahir. print("Running PySpark tests. Output is in python/%s" % LOG_FILE) if os.path.exists(LOG_FILE): os.remove(LOG_FILE) From d6a77d30fa33651ac70aa36ad2bb9c5ce85d01d9 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 23 Jun 2015 22:26:43 -0700 Subject: [PATCH 09/27] Fix the tests of dev/run-tests --- dev/run-tests.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 2de57fc0d1dc..a6d959c53f5a 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -32,6 +32,7 @@ # Functions for traversing module dependency graph # ------------------------------------------------------------------------------------------------- + def determine_modules_for_files(filenames): """ Given a list of filenames, return the set of modules that contain those files. @@ -39,7 +40,7 @@ def determine_modules_for_files(filenames): file to belong to the 'root' module. 
>>> sorted(x.name for x in determine_modules_for_files(["python/pyspark/a.py", "sql/test/foo"])) - ['pyspark', 'sql'] + ['pyspark-core', 'sql'] >>> [x.name for x in determine_modules_for_files(["file_not_matched_by_any_subproject"])] ['root'] """ @@ -90,8 +91,11 @@ def determine_modules_to_test(changed_modules): ['root'] >>> sorted(x.name for x in determine_modules_to_test([modules.graphx])) ['examples', 'graphx'] - >>> sorted(x.name for x in determine_modules_to_test([modules.sql])) - ['examples', 'hive-thriftserver', 'mllib', 'pyspark', 'sparkr', 'sql'] + >>> x = sorted(x.name for x in determine_modules_to_test([modules.sql])) + >>> x # doctest: +NORMALIZE_WHITESPACE + ['examples', 'hive-thriftserver', 'mllib', 'pyspark-core', \ + 'pyspark-mllib', 'pyspark-sql', 'pyspark-sql', 'pyspark-streaming', \ + 'sparkr', 'sql'] """ # If we're going to have to run all of the tests, then we can just short-circuit # and return 'root'. No module depends on root, so if it appears then it will be From caeb040b28c7151a5a3f0898b5b418fa063ff4c6 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 24 Jun 2015 09:53:23 -0700 Subject: [PATCH 10/27] Fixes to PySpark test module definitions --- dev/run-tests.py | 5 ++--- dev/sparktestsupport/modules.py | 10 +++++----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index a6d959c53f5a..56096d39387d 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -93,9 +93,8 @@ def determine_modules_to_test(changed_modules): ['examples', 'graphx'] >>> x = sorted(x.name for x in determine_modules_to_test([modules.sql])) >>> x # doctest: +NORMALIZE_WHITESPACE - ['examples', 'hive-thriftserver', 'mllib', 'pyspark-core', \ - 'pyspark-mllib', 'pyspark-sql', 'pyspark-sql', 'pyspark-streaming', \ - 'sparkr', 'sql'] + ['examples', 'hive-thriftserver', 'mllib', 'pyspark-core', 'pyspark-ml', \ + 'pyspark-mllib', 'pyspark-sql', 'pyspark-streaming', 'sparkr', 'sql'] """ # If we're going to have to run all of the tests, then we can just short-circuit # and return 'root'. 
No module depends on root, so if it appears then it will be diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 48bbdd231364..c13bf425d71a 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -258,7 +258,7 @@ def contains_file(self, filename): name="pyspark-sql", dependencies=[pyspark_core, sql], source_file_regexes=[ - "python/sql" + "python/pyspark/sql" ], python_test_goals=[ "pyspark.sql.types", @@ -278,7 +278,7 @@ def contains_file(self, filename): name="pyspark-streaming", dependencies=[pyspark_core, streaming, streaming_kafka], source_file_regexes=[ - "python/streaming" + "python/pyspark/streaming" ], python_test_goals=[ "pyspark.streaming.util", @@ -291,7 +291,7 @@ def contains_file(self, filename): name="pyspark-mllib", dependencies=[pyspark_core, pyspark_streaming, pyspark_sql, mllib], source_file_regexes=[ - "python/mllib" + "python/pyspark/mllib" ], python_test_goals=[ "pyspark.mllib.classification", @@ -313,10 +313,10 @@ def contains_file(self, filename): pyspark_ml = Module( - name="pyspark-sql", + name="pyspark-ml", dependencies=[pyspark_core, pyspark_mllib], source_file_regexes=[ - "python/sql" + "python/pyspark/ml/" ], python_test_goals=[ "pyspark.ml.feature", From b2ab027852c596da6dcf18df2109386653beb97a Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 24 Jun 2015 13:28:13 -0700 Subject: [PATCH 11/27] Add command-line options for running individual suites in python/run-tests --- python/run-tests | 2 +- python/run-tests.py | 50 +++++++++++++++++++++++++++++++++++++++------ 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/python/run-tests b/python/run-tests index c7961cd5616c..24949657ed7a 100755 --- a/python/run-tests +++ b/python/run-tests @@ -21,4 +21,4 @@ FWDIR="$(cd "`dirname $0`"/..; pwd)" cd "$FWDIR" -exec python -u ./python/run-tests.py +exec python -u ./python/run-tests.py "$@" diff --git a/python/run-tests.py b/python/run-tests.py index 82cdfbb544ab..3c9a579303e8 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -18,6 +18,7 @@ # from __future__ import print_function +from optparse import OptionParser import os import re import subprocess @@ -34,6 +35,9 @@ from sparktestsupport.modules import all_modules # noqa +python_modules = dict((m.name, m) for m in all_modules if m.python_test_goals) + + def print_red(text): print('\033[31m' + text + '\033[0m') @@ -64,23 +68,57 @@ def run_individual_python_test(test_name, pyspark_python=None): print("ok (%is)" % duration) -def main(): - print("Running PySpark tests. 
Output is in python/%s" % LOG_FILE) - if os.path.exists(LOG_FILE): - os.remove(LOG_FILE) +def get_default_python_executables(): python_execs = [x for x in ["python2.6", "python3.4", "pypy"] if which(x)] if "python2.6" not in python_execs: print("WARNING: Not testing against `python2.6` because it could not be found; falling" " back to `python` instead") python_execs.insert(0, "python") + return python_execs + + +def parse_opts(): + parser = OptionParser( + prog="run-tests" + ) + parser.add_option( + "--python-executables", type="string", default=','.join(get_default_python_executables()), + help="A comma-separated list of Python executables to test against (default: %default)" + ) + parser.add_option( + "--modules", type="string", + default=",".join(sorted(set(python_modules.keys()) - set(['root']))), + help="A comma-separated list of Python modules to test (default: %default)" + ) + + (opts, args) = parser.parse_args() + if args: + parser.error("Unsupported arguments: %s" % ' '.join(args)) + return opts + + +def main(): + opts = parse_opts() + print("Running PySpark tests. Output is in python/%s" % LOG_FILE) + if os.path.exists(LOG_FILE): + os.remove(LOG_FILE) + python_execs = opts.python_executables.split(',') + modules_to_test = [] + for module_name in opts.modules.split(','): + if module_name in python_modules: + modules_to_test.append(python_modules[module_name]) + else: + print("Error: unrecognized module %s" % module_name) + sys.exit(-1) print("Will test against the following Python executables: %s" % python_execs) + print("Will test the following Python modules: %s" % [x.name for x in modules_to_test]) + start_time = time.time() for python_exec in python_execs: print("Testing with `%s`: " % python_exec, end='') subprocess.call([python_exec, "--version"]) - python_modules = [m for m in all_modules if m.python_test_goals] - for module in python_modules: + for module in modules_to_test: print("Running %s tests ..." 
% module.name) for test_goal in module.python_test_goals: run_individual_python_test(test_goal) From 2efd594a6fb7c323c1a3d94e0398d6f5c6b66c58 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 24 Jun 2015 13:32:37 -0700 Subject: [PATCH 12/27] Update dev/run-tests to use new Python test runner flags --- dev/run-tests.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 56096d39387d..c287600f7c33 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -352,10 +352,12 @@ def run_scala_tests(build_tool, hadoop_version, test_modules): run_scala_tests_sbt(test_modules, test_profiles) -def run_python_tests(): +def run_python_tests(modules): set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS") - run_cmd([os.path.join(SPARK_HOME, "python", "run-tests")]) + run_cmd([ + os.path.join(SPARK_HOME, "python", "run-tests"), + "--modules=%s" % ','.join(m.name for m in modules)]) def run_sparkr_tests(): @@ -447,8 +449,9 @@ def main(): # run the test suites run_scala_tests(build_tool, hadoop_version, test_modules) - if any(m.should_run_python_tests for m in test_modules): - run_python_tests() + modules_with_python_tests = [m for m in test_modules if m.python_test_goals] + if modules_with_python_tests: + run_python_tests(modules_with_python_tests) if any(m.should_run_r_tests for m in test_modules): run_sparkr_tests() From fff4d099f2e59b5ecccaa9107cb0b3becde65dac Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 24 Jun 2015 13:34:34 -0700 Subject: [PATCH 13/27] Add dev/sparktestsupport to pep8 checks --- dev/lint-python | 2 +- dev/sparktestsupport/__init__.py | 2 +- dev/sparktestsupport/modules.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dev/lint-python b/dev/lint-python index f50d149dc4d4..21f785031b5d 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -19,7 +19,7 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")" -PATHS_TO_CHECK="./python/pyspark/ ./ec2/spark_ec2.py ./examples/src/main/python/" +PATHS_TO_CHECK="./python/pyspark/ ./ec2/spark_ec2.py ./examples/src/main/python/ ./dev/sparktestsupport" PYTHON_LINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/python-lint-report.txt" cd "$SPARK_ROOT_DIR" diff --git a/dev/sparktestsupport/__init__.py b/dev/sparktestsupport/__init__.py index 9afb1da74760..a75dbdcd77a0 100644 --- a/dev/sparktestsupport/__init__.py +++ b/dev/sparktestsupport/__init__.py @@ -18,4 +18,4 @@ import os SPARK_HOME = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../../") -USER_HOME = os.environ.get("HOME") \ No newline at end of file +USER_HOME = os.environ.get("HOME") diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index c13bf425d71a..d5cf01bf3e87 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -364,8 +364,8 @@ def contains_file(self, filename): dependencies=[], source_file_regexes=[], # In order to run all of the tests, enable every test profile: - build_profile_flags= - list(set(itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))), + build_profile_flags=list(set( + itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))), sbt_test_goals=[ "test", ], From f542ac5a28902b4021641c61ff508b2ce0e1c145 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 24 Jun 2015 16:10:28 -0700 Subject: [PATCH 14/27] Fix lint check for Python 3 --- dev/lint-python | 1 + dev/sparktestsupport/shellutils.py | 2 +- 2 files changed, 2 
From fff4d099f2e59b5ecccaa9107cb0b3becde65dac Mon Sep 17 00:00:00 2001
From: Josh Rosen
Date: Wed, 24 Jun 2015 13:34:34 -0700
Subject: [PATCH 13/27] Add dev/sparktestsupport to pep8 checks

---
 dev/lint-python                  | 2 +-
 dev/sparktestsupport/__init__.py | 2 +-
 dev/sparktestsupport/modules.py  | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/dev/lint-python b/dev/lint-python
index f50d149dc4d4..21f785031b5d 100755
--- a/dev/lint-python
+++ b/dev/lint-python
@@ -19,7 +19,7 @@
 
 SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
 SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")"
-PATHS_TO_CHECK="./python/pyspark/ ./ec2/spark_ec2.py ./examples/src/main/python/"
+PATHS_TO_CHECK="./python/pyspark/ ./ec2/spark_ec2.py ./examples/src/main/python/ ./dev/sparktestsupport"
 PYTHON_LINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/python-lint-report.txt"
 
 cd "$SPARK_ROOT_DIR"
diff --git a/dev/sparktestsupport/__init__.py b/dev/sparktestsupport/__init__.py
index 9afb1da74760..a75dbdcd77a0 100644
--- a/dev/sparktestsupport/__init__.py
+++ b/dev/sparktestsupport/__init__.py
@@ -18,4 +18,4 @@
 import os
 
 SPARK_HOME = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../../")
-USER_HOME = os.environ.get("HOME")
\ No newline at end of file
+USER_HOME = os.environ.get("HOME")
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index c13bf425d71a..d5cf01bf3e87 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -364,8 +364,8 @@ def contains_file(self, filename):
     dependencies=[],
     source_file_regexes=[],
     # In order to run all of the tests, enable every test profile:
-    build_profile_flags=
-    list(set(itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))),
+    build_profile_flags=list(set(
+        itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))),
     sbt_test_goals=[
        "test",
     ],

From f542ac5a28902b4021641c61ff508b2ce0e1c145 Mon Sep 17 00:00:00 2001
From: Josh Rosen
Date: Wed, 24 Jun 2015 16:10:28 -0700
Subject: [PATCH 14/27] Fix lint check for Python 3

---
 dev/lint-python                    | 1 +
 dev/sparktestsupport/shellutils.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/dev/lint-python b/dev/lint-python
index 21f785031b5d..cd32d734ebb3 100755
--- a/dev/lint-python
+++ b/dev/lint-python
@@ -20,6 +20,7 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
 SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")"
 
 PATHS_TO_CHECK="./python/pyspark/ ./ec2/spark_ec2.py ./examples/src/main/python/ ./dev/sparktestsupport"
+PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/run-tests.py"
 PYTHON_LINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/python-lint-report.txt"
 
 cd "$SPARK_ROOT_DIR"
diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py
index 5176e4e15c06..ad9b0cc89e4a 100644
--- a/dev/sparktestsupport/shellutils.py
+++ b/dev/sparktestsupport/shellutils.py
@@ -22,7 +22,7 @@
 
 
 def exit_from_command_with_retcode(cmd, retcode):
-    print "[error] running", ' '.join(cmd), "; received return code", retcode
+    print("[error] running", ' '.join(cmd), "; received return code", retcode)
     sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
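The shellutils change matters because the module must now at least parse under Python 3, where print is only a function. Note that the multi-argument call form behaves identically on Python 2 only once a print_function future import is in effect (presumably present or added alongside this change); a two-line illustration:

    from __future__ import print_function

    # Prints "[error] running ls -l ; received return code 1" on 2.x and 3.x alike:
    print("[error] running", ' '.join(["ls", "-l"]), "; received return code", 1)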
+ print("[error] Cannot find a version of `jekyll` on the system; please" + " install one and retry to build documentation.") sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) else: run_cmd([jekyll_bin, "build"]) @@ -239,7 +242,7 @@ def exec_sbt(sbt_args=()): echo_proc.wait() for line in iter(sbt_proc.stdout.readline, ''): if not sbt_output_filter.match(line): - print line, + print(line, end='') retcode = sbt_proc.wait() if retcode > 0: @@ -262,8 +265,8 @@ def get_hadoop_profiles(hadoop_version): if hadoop_version in sbt_maven_hadoop_profiles: return sbt_maven_hadoop_profiles[hadoop_version] else: - print "[error] Could not find", hadoop_version, "in the list. Valid options", - print "are", sbt_maven_hadoop_profiles.keys() + print("[error] Could not find", hadoop_version, "in the list. Valid options" + " are", sbt_maven_hadoop_profiles.keys()) sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) @@ -273,8 +276,8 @@ def build_spark_maven(hadoop_version): mvn_goals = ["clean", "package", "-DskipTests"] profiles_and_goals = build_profiles + mvn_goals - print "[info] Building Spark (w/Hive 0.13.1) using Maven with these arguments:", - print " ".join(profiles_and_goals) + print("[info] Building Spark (w/Hive 0.13.1) using Maven with these arguments: " + " ".join(profiles_and_goals)) exec_maven(profiles_and_goals) @@ -287,8 +290,8 @@ def build_spark_sbt(hadoop_version): "streaming-kafka-assembly/assembly"] profiles_and_goals = build_profiles + sbt_goals - print "[info] Building Spark (w/Hive 0.13.1) using SBT with these arguments:", - print " ".join(profiles_and_goals) + print("[info] Building Spark (w/Hive 0.13.1) using SBT with these arguments: " + " ".join(profiles_and_goals)) exec_sbt(profiles_and_goals) @@ -316,8 +319,8 @@ def run_scala_tests_maven(test_profiles): mvn_test_goals = ["test", "--fail-at-end"] profiles_and_goals = test_profiles + mvn_test_goals - print "[info] Running Spark tests using Maven with these arguments:", - print " ".join(profiles_and_goals) + print("[info] Running Spark tests using Maven with these arguments: " + " ".join(profiles_and_goals)) exec_maven(profiles_and_goals) @@ -331,8 +334,8 @@ def run_scala_tests_sbt(test_modules, test_profiles): profiles_and_goals = test_profiles + list(sbt_test_goals) - print "[info] Running Spark tests using SBT with these arguments:", - print " ".join(profiles_and_goals) + print("[info] Running Spark tests using SBT with these arguments: " + " ".join(profiles_and_goals)) exec_sbt(profiles_and_goals) @@ -367,14 +370,14 @@ def run_sparkr_tests(): run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")]) run_cmd([os.path.join(SPARK_HOME, "R", "run-tests.sh")]) else: - print "Ignoring SparkR tests as R was not found in PATH" + print("Ignoring SparkR tests as R was not found in PATH") def main(): # Ensure the user home directory (HOME) is valid and is an absolute directory if not USER_HOME or not os.path.isabs(USER_HOME): - print "[error] Cannot determine your home directory as an absolute path;", - print "ensure the $HOME environment variable is set properly." + print("[error] Cannot determine your home directory as an absolute path;" + " ensure the $HOME environment variable is set properly.") sys.exit(1) os.chdir(SPARK_HOME) @@ -388,14 +391,14 @@ def main(): java_exe = determine_java_executable() if not java_exe: - print "[error] Cannot find a version of `java` on the system; please", - print "install one and retry." 
+ print("[error] Cannot find a version of `java` on the system; please" + " install one and retry.") sys.exit(2) java_version = determine_java_version(java_exe) if java_version.minor < 8: - print "[warn] Java 8 tests will not run because JDK version is < 1.8." + print("[warn] Java 8 tests will not run because JDK version is < 1.8.") if os.environ.get("AMPLAB_JENKINS"): # if we're on the Amplab Jenkins build servers setup variables @@ -411,8 +414,8 @@ def main(): hadoop_version = "hadoop2.3" test_env = "local" - print "[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version, - print "under environment", test_env + print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version, + "under environment", test_env) changed_modules = None changed_files = None @@ -422,7 +425,8 @@ def main(): changed_modules = determine_modules_for_files(changed_files) if not changed_modules: changed_modules = [modules.root] - print "[info] Found the following changed modules:", ", ".join(x.name for x in changed_modules) + print("[info] Found the following changed modules:", + ", ".join(x.name for x in changed_modules)) test_modules = determine_modules_to_test(changed_modules) From 4f8902cc3f819a16354584d09ad7054072a4b1ea Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 24 Jun 2015 22:59:18 -0700 Subject: [PATCH 16/27] Python lint fixes. --- dev/run-tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 2949d1e5e1f2..ec021056af2c 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -79,7 +79,7 @@ def identify_changed_files_from_git_commits(patch_sha, target_branch=None, targe else: diff_target = target_ref raw_output = subprocess.check_output(['git', 'diff', '--name-only', patch_sha, diff_target], - universal_newlines=True) + universal_newlines=True) # Remove any empty strings return [f for f in raw_output.split('\n') if f] @@ -377,7 +377,7 @@ def main(): # Ensure the user home directory (HOME) is valid and is an absolute directory if not USER_HOME or not os.path.isabs(USER_HOME): print("[error] Cannot determine your home directory as an absolute path;" - " ensure the $HOME environment variable is set properly.") + " ensure the $HOME environment variable is set properly.") sys.exit(1) os.chdir(SPARK_HOME) From 9c80469099790fbb2aa3957c2054ba44e061a76d Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 25 Jun 2015 16:59:44 -0700 Subject: [PATCH 17/27] Fix passing of PYSPARK_PYTHON --- python/run-tests.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/python/run-tests.py b/python/run-tests.py index 3c9a579303e8..76ba76f9c269 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -45,10 +45,8 @@ def print_red(text): LOG_FILE = os.path.join(SPARK_HOME, "python/unit-tests.log") -def run_individual_python_test(test_name, pyspark_python=None): - env = {'SPARK_TESTING': '1'} - if pyspark_python: - env["PYSPARK_PYTHON"] = pyspark_python +def run_individual_python_test(test_name, pyspark_python): + env = {'SPARK_TESTING': '1', 'PYSPARK_PYTHON': pyspark_python} print(" Running test: %s ..." % test_name, end='') start_time = time.time() with open(LOG_FILE, 'a') as log_file: @@ -121,7 +119,7 @@ def main(): for module in modules_to_test: print("Running %s tests ..." 
% module.name) for test_goal in module.python_test_goals: - run_individual_python_test(test_goal) + run_individual_python_test(test_goal, python_exec) total_duration = time.time() - start_time print("Tests passed in %i seconds" % total_duration) From f53db5575123ab3720cce0de8a671bc5a009d359 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 25 Jun 2015 19:25:17 -0700 Subject: [PATCH 18/27] Remove python2 flag, since the test runner script also works fine under Python 3 --- python/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/run-tests.py b/python/run-tests.py index 76ba76f9c269..d2584758c5c1 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env # # Licensed to the Apache Software Foundation (ASF) under one or more From 3b852ae617ed3c6d0fd19527a280099facf6a300 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Fri, 26 Jun 2015 01:08:54 -0700 Subject: [PATCH 19/27] Fall back to PYSPARK_PYTHON when sys.executable is None (fixes a test) --- python/pyspark/tests.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index 78265423682b..17256dfc9574 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -1421,7 +1421,8 @@ def do_termination_test(self, terminator): # start daemon daemon_path = os.path.join(os.path.dirname(__file__), "daemon.py") - daemon = Popen([sys.executable, daemon_path], stdin=PIPE, stdout=PIPE) + python_exec = sys.executable or os.environ.get("PYSPARK_PYTHON") + daemon = Popen([python_exec, daemon_path], stdin=PIPE, stdout=PIPE) # read the port number port = read_int(daemon.stdout) From 568a3fdfc505908384c8402787d4ee4d98a94665 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Fri, 26 Jun 2015 01:11:50 -0700 Subject: [PATCH 20/27] Fix hashbang --- python/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/run-tests.py b/python/run-tests.py index d2584758c5c1..b24c1b74de5d 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -1,4 +1,4 @@ -#!/usr/bin/env +#!/usr/bin/env python # # Licensed to the Apache Software Foundation (ASF) under one or more From c364ccfe5c8cee31c7898ac16e79de72ad0d07fe Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Fri, 26 Jun 2015 01:29:14 -0700 Subject: [PATCH 21/27] Use which() to convert PYSPARK_PYTHON to an absolute path before shelling out to run tests --- dev/sparktestsupport/__init__.py | 2 +- python/run-tests.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/sparktestsupport/__init__.py b/dev/sparktestsupport/__init__.py index a75dbdcd77a0..12696d98fb98 100644 --- a/dev/sparktestsupport/__init__.py +++ b/dev/sparktestsupport/__init__.py @@ -17,5 +17,5 @@ import os -SPARK_HOME = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../../") +SPARK_HOME = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "../../")) USER_HOME = os.environ.get("HOME") diff --git a/python/run-tests.py b/python/run-tests.py index b24c1b74de5d..d508eda208fe 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -46,7 +46,7 @@ def print_red(text): def run_individual_python_test(test_name, pyspark_python): - env = {'SPARK_TESTING': '1', 'PYSPARK_PYTHON': pyspark_python} + env = {'SPARK_TESTING': '1', 'PYSPARK_PYTHON': which(pyspark_python) } print(" Running test: %s ..." 
From 27a389fff86c5ca1985ae7fa8f6d0af1d51d3c1a Mon Sep 17 00:00:00 2001
From: Josh Rosen
Date: Fri, 26 Jun 2015 09:47:33 -0700
Subject: [PATCH 22/27] Skip MLLib tests for PyPy

---
 dev/sparktestsupport/modules.py | 13 ++++++++++++-
 python/run-tests.py             | 11 +++++++----
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index d5cf01bf3e87..efe3a897e9c1 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -30,7 +30,8 @@ class Module(object):
     """
 
     def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(),
-                 sbt_test_goals=(), python_test_goals=(), should_run_r_tests=False):
+                 sbt_test_goals=(), python_test_goals=(), blacklisted_python_implementations=(),
+                 should_run_r_tests=False):
         """
         Define a new module.
 
@@ -44,6 +45,9 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=
             order to build and test this module (e.g. '-PprofileName').
         :param sbt_test_goals: A set of SBT test goals for testing this module.
         :param python_test_goals: A set of Python test goals for testing this module.
+        :param blacklisted_python_implementations: A set of Python implementations that are not
+            supported by this module's Python components. The values in this set should match
+            strings returned by Python's `platform.python_implementation()`.
         :param should_run_r_tests: If true, changes in this module will trigger all R tests.
         """
         self.name = name
@@ -52,6 +56,7 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=
         self.sbt_test_goals = sbt_test_goals
         self.build_profile_flags = build_profile_flags
         self.python_test_goals = python_test_goals
+        self.blacklisted_python_implementations = blacklisted_python_implementations
         self.should_run_r_tests = should_run_r_tests
 
         self.dependent_modules = set()
@@ -308,6 +313,9 @@ def contains_file(self, filename):
         "pyspark.mllib.tree",
         "pyspark.mllib.util",
         "pyspark.mllib.tests",
+    ],
+    blacklisted_python_implementations=[
+        "PyPy"  # Skip these tests under PyPy since they require numpy and it isn't available there
     ]
 )
 
@@ -326,6 +334,9 @@ def contains_file(self, filename):
         "pyspark.ml.tuning",
         "pyspark.ml.tests",
         "pyspark.ml.evaluation",
+    ],
+    blacklisted_python_implementations=[
+        "PyPy"  # Skip these tests under PyPy since they require numpy and it isn't available there
     ]
 )
diff --git a/python/run-tests.py b/python/run-tests.py
index d508eda208fe..3eb45e47685a 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -46,7 +46,7 @@ def print_red(text):
 
 
 def run_individual_python_test(test_name, pyspark_python):
-    env = {'SPARK_TESTING': '1', 'PYSPARK_PYTHON': which(pyspark_python) }
+    env = {'SPARK_TESTING': '1', 'PYSPARK_PYTHON': which(pyspark_python)}
     print("    Running test: %s ..." % test_name, end='')
     start_time = time.time()
     with open(LOG_FILE, 'a') as log_file:
@@ -113,13 +113,16 @@ def main():
 
     start_time = time.time()
     for python_exec in python_execs:
+        python_implementation = subprocess.check_output(
+            [python_exec, "-c", "import platform; print platform.python_implementation()"]).strip()
         print("Testing with `%s`: " % python_exec, end='')
         subprocess.call([python_exec, "--version"])
 
         for module in modules_to_test:
-            print("Running %s tests ..." % module.name)
-            for test_goal in module.python_test_goals:
-                run_individual_python_test(test_goal, python_exec)
+            if python_implementation not in module.blacklisted_python_implementations:
+                print("Running %s tests ..." % module.name)
+                for test_goal in module.python_test_goals:
+                    run_individual_python_test(test_goal, python_exec)
 
     total_duration = time.time() - start_time
     print("Tests passed in %i seconds" % total_duration)
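The blacklist check asks each candidate interpreter for its implementation name before scheduling tests. The same probe run standalone against the current interpreter, in the form it takes after the later Python 3 and universal_newlines fixes in this series:

    import subprocess
    import sys

    impl = subprocess.check_output(
        [sys.executable, "-c",
         "import platform; print(platform.python_implementation())"],
        universal_newlines=True).strip()
    print(impl)  # 'CPython' on the reference interpreter, 'PyPy' under PyPy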
From 37aff00fa6fd8ec09d115f9ae7c41dedaa573d67 Mon Sep 17 00:00:00 2001
From: Josh Rosen
Date: Fri, 26 Jun 2015 10:02:03 -0700
Subject: [PATCH 23/27] Python 3 fix

---
 python/run-tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/run-tests.py b/python/run-tests.py
index 3eb45e47685a..297ba22e9961 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -114,7 +114,7 @@ def main():
     start_time = time.time()
     for python_exec in python_execs:
         python_implementation = subprocess.check_output(
-            [python_exec, "-c", "import platform; print platform.python_implementation()"]).strip()
+            [python_exec, "-c", "import platform; print(platform.python_implementation())"]).strip()
         print("Testing with `%s`: " % python_exec, end='')
         subprocess.call([python_exec, "--version"])

From 8f65ed088d1b3df22e7d364de8c23f0b034c171f Mon Sep 17 00:00:00 2001
From: Josh Rosen
Date: Fri, 26 Jun 2015 12:02:04 -0700
Subject: [PATCH 24/27] Fix handling of module in python/run-tests

---
 dev/run-tests.py    | 9 +++++----
 python/run-tests.py | 4 ++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/dev/run-tests.py b/dev/run-tests.py
index 87218e3eea70..edf669e73591 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -360,12 +360,13 @@ def run_scala_tests(build_tool, hadoop_version, test_modules):
     run_scala_tests_sbt(test_modules, test_profiles)
 
 
-def run_python_tests(modules):
+def run_python_tests(test_modules):
     set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS")
 
-    run_cmd([
-        os.path.join(SPARK_HOME, "python", "run-tests"),
-        "--modules=%s" % ','.join(m.name for m in modules)])
+    command = [os.path.join(SPARK_HOME, "python", "run-tests")]
+    if test_modules != [modules.root]:
+        command.append("--modules=%s" % ','.join(m.name for m in test_modules))
+    run_cmd(command)
 
 
 def run_sparkr_tests():
diff --git a/python/run-tests.py b/python/run-tests.py
index 297ba22e9961..6d188019f871 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -35,7 +35,7 @@
 from sparktestsupport.modules import all_modules  # noqa
 
 
-python_modules = dict((m.name, m) for m in all_modules if m.python_test_goals)
+python_modules = dict((m.name, m) for m in all_modules if m.python_test_goals if m.name != 'root')
 
 
 def print_red(text):
@@ -85,7 +85,7 @@ def parse_opts():
     )
     parser.add_option(
         "--modules", type="string",
-        default=",".join(sorted(set(python_modules.keys()) - set(['root']))),
+        default=",".join(sorted(python_modules.keys())),
         help="A comma-separated list of Python modules to test (default: %default)"
     )
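The root special case means "everything changed": when the root module is under test, the --modules flag is omitted so python/run-tests falls back to its full default list. A stubbed illustration of that branch (FakeModule and the path are placeholders, not real sparktestsupport objects):

    import os

    class FakeModule(object):
        def __init__(self, name):
            self.name = name

    root = FakeModule("root")

    def build_command(test_modules, spark_home="/path/to/spark"):
        command = [os.path.join(spark_home, "python", "run-tests")]
        if test_modules != [root]:
            command.append("--modules=%s" % ','.join(m.name for m in test_modules))
        return command

    print(build_command([root]))                       # no --modules flag
    print(build_command([FakeModule("pyspark-sql")]))  # ends with '--modules=pyspark-sql'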
"import platform; print(platform.python_implementation())"]).strip() + [python_exec, "-c", "import platform; print(platform.python_implementation())"], + universal_newlines=True).strip() print("Testing with `%s`: " % python_exec, end='') subprocess.call([python_exec, "--version"]) From 8233d61608892f8ff2da906b206f423f6a67d5b9 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Sat, 27 Jun 2015 15:39:10 -0700 Subject: [PATCH 26/27] Add python/run-tests.py to Python lint checks --- dev/lint-python | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/lint-python b/dev/lint-python index cd32d734ebb3..0c3586462cb3 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -20,7 +20,7 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")" PATHS_TO_CHECK="./python/pyspark/ ./ec2/spark_ec2.py ./examples/src/main/python/ ./dev/sparktestsupport" -PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/run-tests.py" +PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/run-tests.py ./python/run-tests.py" PYTHON_LINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/python-lint-report.txt" cd "$SPARK_ROOT_DIR" From f578d6da50cd09a3ad13dbe6dcc6b22870f227e5 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Sat, 27 Jun 2015 15:39:47 -0700 Subject: [PATCH 27/27] Fix print for Python 2.x --- dev/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index edf669e73591..c51b0d3010a0 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -180,7 +180,7 @@ def set_title_and_block(title, err_block): os.environ["CURRENT_BLOCK"] = ERROR_CODES[err_block] line_str = '=' * 72 - print() + print('') print(line_str) print(title) print(line_str)