From 1bdeb87de057187655abfaddf95491580629d7aa Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 22 Jun 2015 15:40:05 -0700 Subject: [PATCH 01/27] Move module definitions to separate file. --- dev/run-tests.py | 299 ++----------------------------- dev/sparktestsupport/__init__.py | 16 ++ dev/sparktestsupport/modules.py | 292 ++++++++++++++++++++++++++++++ 3 files changed, 321 insertions(+), 286 deletions(-) create mode 100644 dev/sparktestsupport/__init__.py create mode 100644 dev/sparktestsupport/modules.py diff --git a/dev/run-tests.py b/dev/run-tests.py index de1b4537eda5..c0652625fd2a 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -25,289 +25,16 @@ import subprocess from collections import namedtuple +import sparktestsupport.modules as modules + SPARK_HOME = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") USER_HOME = os.environ.get("HOME") # ------------------------------------------------------------------------------------------------- -# Test module definitions and functions for traversing module dependency graph +# Functions for traversing module dependency graph # ------------------------------------------------------------------------------------------------- - -all_modules = [] - - -class Module(object): - """ - A module is the basic abstraction in our test runner script. Each module consists of a set of - source files, a set of test commands, and a set of dependencies on other modules. We use modules - to define a dependency graph that lets determine which tests to run based on which files have - changed. - """ - - def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(), - sbt_test_goals=(), should_run_python_tests=False, should_run_r_tests=False): - """ - Define a new module. - - :param name: A short module name, for display in logging and error messages. - :param dependencies: A set of dependencies for this module. This should only include direct - dependencies; transitive dependencies are resolved automatically. - :param source_file_regexes: a set of regexes that match source files belonging to this - module. These regexes are applied by attempting to match at the beginning of the - filename strings. - :param build_profile_flags: A set of profile flags that should be passed to Maven or SBT in - order to build and test this module (e.g. '-PprofileName'). - :param sbt_test_goals: A set of SBT test goals for testing this module. - :param should_run_python_tests: If true, changes in this module will trigger Python tests. - For now, this has the effect of causing _all_ Python tests to be run, although in the - future this should be changed to run only a subset of the Python tests that depend - on this module. - :param should_run_r_tests: If true, changes in this module will trigger all R tests. 
- """ - self.name = name - self.dependencies = dependencies - self.source_file_prefixes = source_file_regexes - self.sbt_test_goals = sbt_test_goals - self.build_profile_flags = build_profile_flags - self.should_run_python_tests = should_run_python_tests - self.should_run_r_tests = should_run_r_tests - - self.dependent_modules = set() - for dep in dependencies: - dep.dependent_modules.add(self) - all_modules.append(self) - - def contains_file(self, filename): - return any(re.match(p, filename) for p in self.source_file_prefixes) - - -sql = Module( - name="sql", - dependencies=[], - source_file_regexes=[ - "sql/(?!hive-thriftserver)", - "bin/spark-sql", - ], - build_profile_flags=[ - "-Phive", - ], - sbt_test_goals=[ - "catalyst/test", - "sql/test", - "hive/test", - ] -) - - -hive_thriftserver = Module( - name="hive-thriftserver", - dependencies=[sql], - source_file_regexes=[ - "sql/hive-thriftserver", - "sbin/start-thriftserver.sh", - ], - build_profile_flags=[ - "-Phive-thriftserver", - ], - sbt_test_goals=[ - "hive-thriftserver/test", - ] -) - - -graphx = Module( - name="graphx", - dependencies=[], - source_file_regexes=[ - "graphx/", - ], - sbt_test_goals=[ - "graphx/test" - ] -) - - -streaming = Module( - name="streaming", - dependencies=[], - source_file_regexes=[ - "streaming", - ], - sbt_test_goals=[ - "streaming/test", - ] -) - - -streaming_kinesis_asl = Module( - name="kinesis-asl", - dependencies=[streaming], - source_file_regexes=[ - "extras/kinesis-asl/", - ], - build_profile_flags=[ - "-Pkinesis-asl", - ], - sbt_test_goals=[ - "kinesis-asl/test", - ] -) - - -streaming_zeromq = Module( - name="streaming-zeromq", - dependencies=[streaming], - source_file_regexes=[ - "external/zeromq", - ], - sbt_test_goals=[ - "streaming-zeromq/test", - ] -) - - -streaming_twitter = Module( - name="streaming-twitter", - dependencies=[streaming], - source_file_regexes=[ - "external/twitter", - ], - sbt_test_goals=[ - "streaming-twitter/test", - ] -) - - -streaming_mqtt = Module( - name="streaming-mqtt", - dependencies=[streaming], - source_file_regexes=[ - "external/mqtt", - ], - sbt_test_goals=[ - "streaming-mqtt/test", - ] -) - - -streaming_kafka = Module( - name="streaming-kafka", - dependencies=[streaming], - source_file_regexes=[ - "external/kafka", - "external/kafka-assembly", - ], - sbt_test_goals=[ - "streaming-kafka/test", - ] -) - - -streaming_flume_sink = Module( - name="streaming-flume-sink", - dependencies=[streaming], - source_file_regexes=[ - "external/flume-sink", - ], - sbt_test_goals=[ - "streaming-flume-sink/test", - ] -) - - -streaming_flume = Module( - name="streaming_flume", - dependencies=[streaming], - source_file_regexes=[ - "external/flume", - ], - sbt_test_goals=[ - "streaming-flume/test", - ] -) - - -mllib = Module( - name="mllib", - dependencies=[streaming, sql], - source_file_regexes=[ - "data/mllib/", - "mllib/", - ], - sbt_test_goals=[ - "mllib/test", - ] -) - - -examples = Module( - name="examples", - dependencies=[graphx, mllib, streaming, sql], - source_file_regexes=[ - "examples/", - ], - sbt_test_goals=[ - "examples/test", - ] -) - - -pyspark = Module( - name="pyspark", - dependencies=[mllib, streaming, streaming_kafka, sql], - source_file_regexes=[ - "python/" - ], - should_run_python_tests=True -) - - -sparkr = Module( - name="sparkr", - dependencies=[sql, mllib], - source_file_regexes=[ - "R/", - ], - should_run_r_tests=True -) - - -docs = Module( - name="docs", - dependencies=[], - source_file_regexes=[ - "docs/", - ] -) - - -ec2 = Module( - name="ec2", - 
dependencies=[], - source_file_regexes=[ - "ec2/", - ] -) - - -# The root module is a dummy module which is used to run all of the tests. -# No other modules should directly depend on this module. -root = Module( - name="root", - dependencies=[], - source_file_regexes=[], - # In order to run all of the tests, enable every test profile: - build_profile_flags= - list(set(itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))), - sbt_test_goals=[ - "test", - ], - should_run_python_tests=True, - should_run_r_tests=True -) - - def determine_modules_for_files(filenames): """ Given a list of filenames, return the set of modules that contain those files. @@ -322,12 +49,12 @@ def determine_modules_for_files(filenames): changed_modules = set() for filename in filenames: matched_at_least_one_module = False - for module in all_modules: + for module in modules.all_modules: if module.contains_file(filename): changed_modules.add(module) matched_at_least_one_module = True if not matched_at_least_one_module: - changed_modules.add(root) + changed_modules.add(modules.root) return changed_modules @@ -362,18 +89,18 @@ def determine_modules_to_test(changed_modules): Given a set of modules that have changed, compute the transitive closure of those modules' dependent modules in order to determine the set of modules that should be tested. - >>> sorted(x.name for x in determine_modules_to_test([root])) + >>> sorted(x.name for x in determine_modules_to_test([modules.root])) ['root'] - >>> sorted(x.name for x in determine_modules_to_test([graphx])) + >>> sorted(x.name for x in determine_modules_to_test([modules.graphx])) ['examples', 'graphx'] - >>> sorted(x.name for x in determine_modules_to_test([sql])) + >>> sorted(x.name for x in determine_modules_to_test([modules.sql])) ['examples', 'hive-thriftserver', 'mllib', 'pyspark', 'sparkr', 'sql'] """ # If we're going to have to run all of the tests, then we can just short-circuit # and return 'root'. No module depends on root, so if it appears then it will be # in changed_modules. 
- if root in changed_modules: - return [root] + if modules.root in changed_modules: + return [modules.root] modules_to_test = set() for module in changed_modules: modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules)) @@ -596,7 +323,7 @@ def get_hadoop_profiles(hadoop_version): def build_spark_maven(hadoop_version): # Enable all of the profiles for the build: - build_profiles = get_hadoop_profiles(hadoop_version) + root.build_profile_flags + build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags mvn_goals = ["clean", "package", "-DskipTests"] profiles_and_goals = build_profiles + mvn_goals @@ -608,7 +335,7 @@ def build_spark_maven(hadoop_version): def build_spark_sbt(hadoop_version): # Enable all of the profiles for the build: - build_profiles = get_hadoop_profiles(hadoop_version) + root.build_profile_flags + build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags sbt_goals = ["package", "assembly/assembly", "streaming-kafka-assembly/assembly"] @@ -746,7 +473,7 @@ def main(): changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch) changed_modules = determine_modules_for_files(changed_files) if not changed_modules: - changed_modules = [root] + changed_modules = [modules.root] print "[info] Found the following changed modules:", ", ".join(x.name for x in changed_modules) test_modules = determine_modules_to_test(changed_modules) diff --git a/dev/sparktestsupport/__init__.py b/dev/sparktestsupport/__init__.py new file mode 100644 index 000000000000..ecb1860df848 --- /dev/null +++ b/dev/sparktestsupport/__init__.py @@ -0,0 +1,16 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# \ No newline at end of file diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py new file mode 100644 index 000000000000..70710b3de782 --- /dev/null +++ b/dev/sparktestsupport/modules.py @@ -0,0 +1,292 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import itertools +import re + +all_modules = [] + + +class Module(object): + """ + A module is the basic abstraction in our test runner script. Each module consists of a set of + source files, a set of test commands, and a set of dependencies on other modules. We use modules + to define a dependency graph that lets determine which tests to run based on which files have + changed. + """ + + def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(), + sbt_test_goals=(), should_run_python_tests=False, should_run_r_tests=False): + """ + Define a new module. + + :param name: A short module name, for display in logging and error messages. + :param dependencies: A set of dependencies for this module. This should only include direct + dependencies; transitive dependencies are resolved automatically. + :param source_file_regexes: a set of regexes that match source files belonging to this + module. These regexes are applied by attempting to match at the beginning of the + filename strings. + :param build_profile_flags: A set of profile flags that should be passed to Maven or SBT in + order to build and test this module (e.g. '-PprofileName'). + :param sbt_test_goals: A set of SBT test goals for testing this module. + :param should_run_python_tests: If true, changes in this module will trigger Python tests. + For now, this has the effect of causing _all_ Python tests to be run, although in the + future this should be changed to run only a subset of the Python tests that depend + on this module. + :param should_run_r_tests: If true, changes in this module will trigger all R tests. + """ + self.name = name + self.dependencies = dependencies + self.source_file_prefixes = source_file_regexes + self.sbt_test_goals = sbt_test_goals + self.build_profile_flags = build_profile_flags + self.should_run_python_tests = should_run_python_tests + self.should_run_r_tests = should_run_r_tests + + self.dependent_modules = set() + for dep in dependencies: + dep.dependent_modules.add(self) + all_modules.append(self) + + def contains_file(self, filename): + return any(re.match(p, filename) for p in self.source_file_prefixes) + + +sql = Module( + name="sql", + dependencies=[], + source_file_regexes=[ + "sql/(?!hive-thriftserver)", + "bin/spark-sql", + ], + build_profile_flags=[ + "-Phive", + ], + sbt_test_goals=[ + "catalyst/test", + "sql/test", + "hive/test", + ] +) + + +hive_thriftserver = Module( + name="hive-thriftserver", + dependencies=[sql], + source_file_regexes=[ + "sql/hive-thriftserver", + "sbin/start-thriftserver.sh", + ], + build_profile_flags=[ + "-Phive-thriftserver", + ], + sbt_test_goals=[ + "hive-thriftserver/test", + ] +) + + +graphx = Module( + name="graphx", + dependencies=[], + source_file_regexes=[ + "graphx/", + ], + sbt_test_goals=[ + "graphx/test" + ] +) + + +streaming = Module( + name="streaming", + dependencies=[], + source_file_regexes=[ + "streaming", + ], + sbt_test_goals=[ + "streaming/test", + ] +) + + +streaming_kinesis_asl = Module( + name="kinesis-asl", + dependencies=[streaming], + source_file_regexes=[ + "extras/kinesis-asl/", + ], + build_profile_flags=[ + "-Pkinesis-asl", + ], + sbt_test_goals=[ + "kinesis-asl/test", + ] +) + + +streaming_zeromq = Module( + name="streaming-zeromq", + dependencies=[streaming], + source_file_regexes=[ + "external/zeromq", + ], + sbt_test_goals=[ + "streaming-zeromq/test", + ] +) + + +streaming_twitter = Module( + name="streaming-twitter", + dependencies=[streaming], + source_file_regexes=[ + "external/twitter", + ], + 
sbt_test_goals=[ + "streaming-twitter/test", + ] +) + + +streaming_mqtt = Module( + name="streaming-mqtt", + dependencies=[streaming], + source_file_regexes=[ + "external/mqtt", + ], + sbt_test_goals=[ + "streaming-mqtt/test", + ] +) + + +streaming_kafka = Module( + name="streaming-kafka", + dependencies=[streaming], + source_file_regexes=[ + "external/kafka", + "external/kafka-assembly", + ], + sbt_test_goals=[ + "streaming-kafka/test", + ] +) + + +streaming_flume_sink = Module( + name="streaming-flume-sink", + dependencies=[streaming], + source_file_regexes=[ + "external/flume-sink", + ], + sbt_test_goals=[ + "streaming-flume-sink/test", + ] +) + + +streaming_flume = Module( + name="streaming_flume", + dependencies=[streaming], + source_file_regexes=[ + "external/flume", + ], + sbt_test_goals=[ + "streaming-flume/test", + ] +) + + +mllib = Module( + name="mllib", + dependencies=[streaming, sql], + source_file_regexes=[ + "data/mllib/", + "mllib/", + ], + sbt_test_goals=[ + "mllib/test", + ] +) + + +examples = Module( + name="examples", + dependencies=[graphx, mllib, streaming, sql], + source_file_regexes=[ + "examples/", + ], + sbt_test_goals=[ + "examples/test", + ] +) + + +pyspark = Module( + name="pyspark", + dependencies=[mllib, streaming, streaming_kafka, sql], + source_file_regexes=[ + "python/" + ], + should_run_python_tests=True +) + + +sparkr = Module( + name="sparkr", + dependencies=[sql, mllib], + source_file_regexes=[ + "R/", + ], + should_run_r_tests=True +) + + +docs = Module( + name="docs", + dependencies=[], + source_file_regexes=[ + "docs/", + ] +) + + +ec2 = Module( + name="ec2", + dependencies=[], + source_file_regexes=[ + "ec2/", + ] +) + + +# The root module is a dummy module which is used to run all of the tests. +# No other modules should directly depend on this module. +root = Module( + name="root", + dependencies=[], + source_file_regexes=[], + # In order to run all of the tests, enable every test profile: + build_profile_flags= + list(set(itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))), + sbt_test_goals=[ + "test", + ], + should_run_python_tests=True, + should_run_r_tests=True +) From 311c6a99b313aa277cfc7d44be544a33c6550289 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 22 Jun 2015 15:46:32 -0700 Subject: [PATCH 02/27] Move shell utility functions to own module. 
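
The helpers moved here (exit_from_command_with_retcode, rm_r, run_cmd, is_exe,
which) do not depend on anything else in run-tests.py, so they can live in a
reusable dev/sparktestsupport/shellutils.py. A minimal usage sketch of the
extracted API (the call sites below are hypothetical, for illustration only,
and assume dev/ is on PYTHONPATH):

    from sparktestsupport.shellutils import rm_r, run_cmd, which

    rm_r("unit-tests.log")             # removes a file or directory tree, if present
    if which("java"):                  # resolves an executable on PATH, like shell `which`
        run_cmd(["java", "-version"])  # runs a command; exits the script on failure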
--- dev/run-tests.py | 56 +-------------------- dev/sparktestsupport/shellutils.py | 81 ++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+), 55 deletions(-) create mode 100644 dev/sparktestsupport/shellutils.py diff --git a/dev/run-tests.py b/dev/run-tests.py index c0652625fd2a..6b067c325868 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -21,10 +21,10 @@ import os import re import sys -import shutil import subprocess from collections import namedtuple +from sparktestsupport.shellutils import exit_from_command_with_retcode, run_cmd, rm_r, which import sparktestsupport.modules as modules SPARK_HOME = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") @@ -125,60 +125,6 @@ def get_error_codes(err_code_file): ERROR_CODES = get_error_codes(os.path.join(SPARK_HOME, "dev/run-tests-codes.sh")) -def exit_from_command_with_retcode(cmd, retcode): - print "[error] running", ' '.join(cmd), "; received return code", retcode - sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) - - -def rm_r(path): - """Given an arbitrary path properly remove it with the correct python - construct if it exists - - from: http://stackoverflow.com/a/9559881""" - - if os.path.isdir(path): - shutil.rmtree(path) - elif os.path.exists(path): - os.remove(path) - - -def run_cmd(cmd): - """Given a command as a list of arguments will attempt to execute the - command from the determined SPARK_HOME directory and, on failure, print - an error message""" - - if not isinstance(cmd, list): - cmd = cmd.split() - try: - subprocess.check_call(cmd) - except subprocess.CalledProcessError as e: - exit_from_command_with_retcode(e.cmd, e.returncode) - - -def is_exe(path): - """Check if a given path is an executable file - - from: http://stackoverflow.com/a/377028""" - - return os.path.isfile(path) and os.access(path, os.X_OK) - - -def which(program): - """Find and return the given program by its absolute path or 'None' - - from: http://stackoverflow.com/a/377028""" - - fpath = os.path.split(program)[0] - - if fpath: - if is_exe(program): - return program - else: - for path in os.environ.get("PATH").split(os.pathsep): - path = path.strip('"') - exe_file = os.path.join(path, program) - if is_exe(exe_file): - return exe_file - return None - - def determine_java_executable(): """Will return the path of the java executable that will be used by Spark's tests or `None`""" diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py new file mode 100644 index 000000000000..81ba6cba64e5 --- /dev/null +++ b/dev/sparktestsupport/shellutils.py @@ -0,0 +1,81 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import os +import shutil +import subprocess +import sys + + +def exit_from_command_with_retcode(cmd, retcode): + print "[error] running", ' '.join(cmd), "; received return code", retcode + sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) + + +def rm_r(path): + """ + Given an arbitrary path, properly remove it with the correct Python construct if it exists. + From: http://stackoverflow.com/a/9559881 + """ + + if os.path.isdir(path): + shutil.rmtree(path) + elif os.path.exists(path): + os.remove(path) + + +def run_cmd(cmd): + """ + Given a command as a list of arguments will attempt to execute the command + and, on failure, print an error message + """ + + if not isinstance(cmd, list): + cmd = cmd.split() + try: + subprocess.check_call(cmd) + except subprocess.CalledProcessError as e: + exit_from_command_with_retcode(e.cmd, e.returncode) + + +def is_exe(path): + """ + Check if a given path is an executable file. + From: http://stackoverflow.com/a/377028 + """ + + return os.path.isfile(path) and os.access(path, os.X_OK) + + +def which(program): + """ + Find and return the given program by its absolute path or 'None' if the program cannot be found. + From: http://stackoverflow.com/a/377028 + """ + + fpath = os.path.split(program)[0] + + if fpath: + if is_exe(program): + return program + else: + for path in os.environ.get("PATH").split(os.pathsep): + path = path.strip('"') + exe_file = os.path.join(path, program) + if is_exe(exe_file): + return exe_file + return None From 32660fc94545f48e5a39785061261697fe4f447f Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 22 Jun 2015 22:58:06 -0700 Subject: [PATCH 03/27] Initial cut at Python test runner refactoring --- dev/run-tests.py | 5 +- dev/sparktestsupport/__init__.py | 7 +- dev/sparktestsupport/modules.py | 106 +++++++++++++++++++++++++---- dev/sparktestsupport/shellutils.py | 2 +- python/run-tests | 87 ++--------------------- python/run-tests.py | 86 +++++++++++++++++++++++ 6 files changed, 193 insertions(+), 100 deletions(-) create mode 100644 python/run-tests.py diff --git a/dev/run-tests.py b/dev/run-tests.py index 6b067c325868..2de57fc0d1dc 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -24,13 +24,10 @@ import subprocess from collections import namedtuple +from sparktestsupport import SPARK_HOME, USER_HOME from sparktestsupport.shellutils import exit_from_command_with_retcode, run_cmd, rm_r, which import sparktestsupport.modules as modules -SPARK_HOME = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..") -USER_HOME = os.environ.get("HOME") - - # ------------------------------------------------------------------------------------------------- # Functions for traversing module dependency graph # ------------------------------------------------------------------------------------------------- diff --git a/dev/sparktestsupport/__init__.py b/dev/sparktestsupport/__init__.py index ecb1860df848..9afb1da74760 100644 --- a/dev/sparktestsupport/__init__.py +++ b/dev/sparktestsupport/__init__.py @@ -13,4 +13,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-# \ No newline at end of file +# + +import os + +SPARK_HOME = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../../") +USER_HOME = os.environ.get("HOME") \ No newline at end of file diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 70710b3de782..48bbdd231364 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -30,7 +30,7 @@ class Module(object): """ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(), - sbt_test_goals=(), should_run_python_tests=False, should_run_r_tests=False): + sbt_test_goals=(), python_test_goals=(), should_run_r_tests=False): """ Define a new module. @@ -43,10 +43,7 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags= :param build_profile_flags: A set of profile flags that should be passed to Maven or SBT in order to build and test this module (e.g. '-PprofileName'). :param sbt_test_goals: A set of SBT test goals for testing this module. - :param should_run_python_tests: If true, changes in this module will trigger Python tests. - For now, this has the effect of causing _all_ Python tests to be run, although in the - future this should be changed to run only a subset of the Python tests that depend - on this module. + :param python_test_goals: A set of Python test goals for testing this module. :param should_run_r_tests: If true, changes in this module will trigger all R tests. """ self.name = name @@ -54,7 +51,7 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags= self.source_file_prefixes = source_file_regexes self.sbt_test_goals = sbt_test_goals self.build_profile_flags = build_profile_flags - self.should_run_python_tests = should_run_python_tests + self.python_test_goals = python_test_goals self.should_run_r_tests = should_run_r_tests self.dependent_modules = set() @@ -237,16 +234,101 @@ def contains_file(self, filename): ) -pyspark = Module( - name="pyspark", - dependencies=[mllib, streaming, streaming_kafka, sql], +pyspark_core = Module( + name="pyspark-core", + dependencies=[mllib, streaming, streaming_kafka], source_file_regexes=[ - "python/" + "python/(?!pyspark/(ml|mllib|sql|streaming))" ], - should_run_python_tests=True + python_test_goals=[ + "pyspark.rdd", + "pyspark.context", + "pyspark.conf", + "pyspark.broadcast", + "pyspark.accumulators", + "pyspark.serializers", + "pyspark.profiler", + "pyspark.shuffle", + "pyspark.tests", + ] +) + + +pyspark_sql = Module( + name="pyspark-sql", + dependencies=[pyspark_core, sql], + source_file_regexes=[ + "python/sql" + ], + python_test_goals=[ + "pyspark.sql.types", + "pyspark.sql.context", + "pyspark.sql.column", + "pyspark.sql.dataframe", + "pyspark.sql.group", + "pyspark.sql.functions", + "pyspark.sql.readwriter", + "pyspark.sql.window", + "pyspark.sql.tests", + ] +) + + +pyspark_streaming = Module( + name="pyspark-streaming", + dependencies=[pyspark_core, streaming, streaming_kafka], + source_file_regexes=[ + "python/streaming" + ], + python_test_goals=[ + "pyspark.streaming.util", + "pyspark.streaming.tests", + ] +) + + +pyspark_mllib = Module( + name="pyspark-mllib", + dependencies=[pyspark_core, pyspark_streaming, pyspark_sql, mllib], + source_file_regexes=[ + "python/mllib" + ], + python_test_goals=[ + "pyspark.mllib.classification", + "pyspark.mllib.clustering", + "pyspark.mllib.evaluation", + "pyspark.mllib.feature", + "pyspark.mllib.fpm", + "pyspark.mllib.linalg", + "pyspark.mllib.random", + "pyspark.mllib.recommendation", + 
"pyspark.mllib.regression", + "pyspark.mllib.stat._statistics", + "pyspark.mllib.stat.KernelDensity", + "pyspark.mllib.tree", + "pyspark.mllib.util", + "pyspark.mllib.tests", + ] ) +pyspark_ml = Module( + name="pyspark-sql", + dependencies=[pyspark_core, pyspark_mllib], + source_file_regexes=[ + "python/sql" + ], + python_test_goals=[ + "pyspark.ml.feature", + "pyspark.ml.classification", + "pyspark.ml.recommendation", + "pyspark.ml.regression", + "pyspark.ml.tuning", + "pyspark.ml.tests", + "pyspark.ml.evaluation", + ] +) + sparkr = Module( name="sparkr", dependencies=[sql, mllib], @@ -287,6 +369,6 @@ def contains_file(self, filename): sbt_test_goals=[ "test", ], - should_run_python_tests=True, + python_test_goals=list(itertools.chain.from_iterable(m.python_test_goals for m in all_modules)), should_run_r_tests=True ) diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py index 81ba6cba64e5..5176e4e15c06 100644 --- a/dev/sparktestsupport/shellutils.py +++ b/dev/sparktestsupport/shellutils.py @@ -41,7 +41,7 @@ def rm_r(path): def run_cmd(cmd): """ Given a command as a list of arguments will attempt to execute the command - and, on failure, print an error message + and, on failure, print an error message and exit. """ if not isinstance(cmd, list): diff --git a/python/run-tests b/python/run-tests index 4468fdb3f267..78da70ae53bc 100755 --- a/python/run-tests +++ b/python/run-tests @@ -18,13 +18,13 @@ # -# Figure out where the Spark framework is installed -FWDIR="$(cd "`dirname "$0"`"; cd ../; pwd)" +FWDIR="$(cd "`dirname $0`"/..; pwd)" +cd "$FWDIR" -. "$FWDIR"/bin/load-spark-env.sh +PYTHONPATH="$FWDIR/dev/" exec python -u ./python/run-tests.py + +exit -# CD into the python directory to find things on the right path -cd "$FWDIR/python" FAILED=0 LOG_FILE=unit-tests.log @@ -32,83 +32,6 @@ START=$(date +"%s") rm -f $LOG_FILE -# Remove the metastore and warehouse directory created by the HiveContext tests in Spark SQL -rm -rf metastore warehouse - -function run_test() { - echo -en "Running test: $1 ... " | tee -a $LOG_FILE - start=$(date +"%s") - SPARK_TESTING=1 time "$FWDIR"/bin/pyspark $1 > $LOG_FILE 2>&1 - - FAILED=$((PIPESTATUS[0]||$FAILED)) - - # Fail and exit on the first test failure. - if [[ $FAILED != 0 ]]; then - cat $LOG_FILE | grep -v "^[0-9][0-9]*" # filter all lines starting with a number. - echo -en "\033[31m" # Red - echo "Had test failures; see logs." - echo -en "\033[0m" # No color - exit -1 - else - now=$(date +"%s") - echo "ok ($(($now - $start))s)" - fi -} - -function run_core_tests() { - echo "Run core tests ..." - run_test "pyspark.rdd" - run_test "pyspark.context" - run_test "pyspark.conf" - run_test "pyspark.broadcast" - run_test "pyspark.accumulators" - run_test "pyspark.serializers" - run_test "pyspark.profiler" - run_test "pyspark.shuffle" - run_test "pyspark.tests" -} - -function run_sql_tests() { - echo "Run sql tests ..." - run_test "pyspark.sql.types" - run_test "pyspark.sql.context" - run_test "pyspark.sql.column" - run_test "pyspark.sql.dataframe" - run_test "pyspark.sql.group" - run_test "pyspark.sql.functions" - run_test "pyspark.sql.readwriter" - run_test "pyspark.sql.window" - run_test "pyspark.sql.tests" -} - -function run_mllib_tests() { - echo "Run mllib tests ..." 
- run_test "pyspark.mllib.classification" - run_test "pyspark.mllib.clustering" - run_test "pyspark.mllib.evaluation" - run_test "pyspark.mllib.feature" - run_test "pyspark.mllib.fpm" - run_test "pyspark.mllib.linalg" - run_test "pyspark.mllib.random" - run_test "pyspark.mllib.recommendation" - run_test "pyspark.mllib.regression" - run_test "pyspark.mllib.stat._statistics" - run_test "pyspark.mllib.stat.KernelDensity" - run_test "pyspark.mllib.tree" - run_test "pyspark.mllib.util" - run_test "pyspark.mllib.tests" -} - -function run_ml_tests() { - echo "Run ml tests ..." - run_test "pyspark.ml.feature" - run_test "pyspark.ml.classification" - run_test "pyspark.ml.recommendation" - run_test "pyspark.ml.regression" - run_test "pyspark.ml.tuning" - run_test "pyspark.ml.tests" - run_test "pyspark.ml.evaluation" -} function run_streaming_tests() { echo "Run streaming tests ..." diff --git a/python/run-tests.py b/python/run-tests.py new file mode 100644 index 000000000000..7a053e1c168e --- /dev/null +++ b/python/run-tests.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python2 + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function +import os +import re +import subprocess +import time + +from sparktestsupport import SPARK_HOME +from sparktestsupport.shellutils import which +from sparktestsupport.modules import all_modules + + +def print_red(text): + print('\033[31m' + text + '\033[0m') + + +LOG_FILE = os.path.join(SPARK_HOME, "python/unit-tests.log") + + +def run_individual_python_test(test_name, pyspark_python=None): + env = {'SPARK_TESTING': '1'} + if pyspark_python: + env["PYSPARK_PYTHON"] = pyspark_python + print(" Running test: %s ..." % test_name, end='') + start_time = time.time() + with open(LOG_FILE, 'a') as log_file: + retcode = subprocess.call( + [os.path.join(SPARK_HOME, "bin/pyspark"), test_name], + stderr=log_file, stdout=log_file, env=env) + duration = time.time() - start_time + # Exit on the first failure. + if retcode != 0: + with open(LOG_FILE, 'r') as log_file: + for line in log_file: + if not re.match('[0-9]+', line): + print(line, end='') + print_red("\nHad test failures in %s; see logs." % test_name) + exit(-1) + else: + print("ok (%fs)" % (duration / 10000.0)) + + +def main(): + # TODO: do we need to remove the metastore and warehouse created by the SQL tests? Ask Ahir. 
+ if os.path.exists(LOG_FILE): + os.remove(LOG_FILE) + python_execs = [x for x in ["python2.6", "python3.4", "pypy"] if which(x)] + if "python2.6" not in python_execs: + print("WARNING: Not testing against `python2.6` because it could not be found; falling" + " back to `python` instead") + python_execs.insert(0, "python") + print("Will test against the following Python executables: %s" % python_execs) + start_time = time.time() + for python_exec in python_execs: + print("Testing with `%s`: " % python_exec, end='') + subprocess.call([python_exec, "--version"]) + + python_modules = [m for m in all_modules if m.python_test_goals] + for module in python_modules: + print("Running %s tests ..." % module.name) + for test_goal in module.python_test_goals: + run_individual_python_test(test_goal) + total_duration = time.time() - start_time + print("Tests passed in %f seconds" % (total_duration / 1000.0)) + + +if __name__ == "__main__": + main() From dcc9c09719cafc172bf3d58b7e81c73ae4535eaa Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 22 Jun 2015 23:16:48 -0700 Subject: [PATCH 04/27] Fix time division --- python/run-tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/run-tests.py b/python/run-tests.py index 7a053e1c168e..b68721883649 100644 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -55,7 +55,7 @@ def run_individual_python_test(test_name, pyspark_python=None): print_red("\nHad test failures in %s; see logs." % test_name) exit(-1) else: - print("ok (%fs)" % (duration / 10000.0)) + print("ok (%is)" % duration) def main(): @@ -79,7 +79,7 @@ def main(): for test_goal in module.python_test_goals: run_individual_python_test(test_goal) total_duration = time.time() - start_time - print("Tests passed in %f seconds" % (total_duration / 1000.0)) + print("Tests passed in %i seconds" % total_duration) if __name__ == "__main__": From 4c9713653a05e66a279885c50c99b288ba7089fa Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 23 Jun 2015 00:28:18 -0700 Subject: [PATCH 05/27] PYTHONPATH fixes --- python/run-tests | 2 +- python/run-tests.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) mode change 100644 => 100755 python/run-tests.py diff --git a/python/run-tests b/python/run-tests index 78da70ae53bc..afee53179261 100755 --- a/python/run-tests +++ b/python/run-tests @@ -21,7 +21,7 @@ FWDIR="$(cd "`dirname $0`"/..; pwd)" cd "$FWDIR" -PYTHONPATH="$FWDIR/dev/" exec python -u ./python/run-tests.py +exec python -u ./python/run-tests.py exit diff --git a/python/run-tests.py b/python/run-tests.py old mode 100644 new mode 100755 index b68721883649..4e232bfe4be7 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -21,11 +21,17 @@ import os import re import subprocess +import sys import time -from sparktestsupport import SPARK_HOME -from sparktestsupport.shellutils import which -from sparktestsupport.modules import all_modules + +# Append `SPARK_HOME/dev` to the Python path so that we can import the sparktestsupport module +sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), "../dev/")) + + +from sparktestsupport import SPARK_HOME # noqa (suppress pep8 warnings) +from sparktestsupport.shellutils import which # noqa +from sparktestsupport.modules import all_modules # noqa def print_red(text): From 04015b9dd5d2a2f47c1ca48408bca8949709ab01 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 23 Jun 2015 18:06:48 -0700 Subject: [PATCH 06/27] First attempt at getting PySpark Kafka test to work in new runner script --- 
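Note (kept out of the commit message): the shell runner used to locate the
spark-streaming-kafka assembly jar and export PYSPARK_SUBMIT_ARGS before
launching the streaming tests; this patch moves that discovery into
KafkaStreamTests.setUpClass. A condensed sketch of the mechanism, assuming the
usual sbt/mvn assembly output path:

    import glob, os

    kafka_assembly_dir = os.path.join(os.environ["SPARK_HOME"],
                                      "external/kafka-assembly")
    # Exactly one assembly jar should match; zero or several is an error.
    jars = glob.glob(os.path.join(
        kafka_assembly_dir,
        "target/scala-*/spark-streaming-kafka-assembly-*.jar"))
    # pyspark reads PYSPARK_SUBMIT_ARGS when launching the JVM, so the jar
    # ends up on the classpath for the Kafka tests.
    os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars %s pyspark-shell" % jars[0]
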
python/pyspark/streaming/tests.py | 31 ++++++++++++ python/run-tests | 79 ------------------------------- python/run-tests.py | 1 + 3 files changed, 32 insertions(+), 79 deletions(-) diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 57049beea4db..05dd45246f56 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -15,6 +15,7 @@ # limitations under the License. # +import glob import os import sys from itertools import chain @@ -575,6 +576,36 @@ def check_output(n): class KafkaStreamTests(PySparkStreamingTestCase): timeout = 20 # seconds duration = 1 + old_pyspark_submit_args = None + + @classmethod + def setUpClass(cls): + cls.old_pyspark_submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS") + SPARK_HOME = os.environ["SPARK_HOME"] + kafka_assembly_dir = os.path.join(SPARK_HOME, "external/kafka-assembly") + jars = glob.glob( + os.path.join(kafka_assembly_dir, "target/scala-*/spark-streaming-kafka-assembly-*.jar")) + if not jars: + raise Exception( + ("Failed to find Spark Streaming kafka assembly jar in %s. " % kafka_assembly_dir) + + "You need to build Spark with " + "'build/sbt assembly/assembly streaming-kafka-assembly/assembly' or " + "'build/mvn package' before running this test") + elif len(jars) > 1: + raise Exception(("Found multiple Spark Streaming Kafka assembly JARs in %s; please " + "remove all but one") % kafka_assembly_dir) + else: + os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars %s pyspark-shell" % jars[0] + print(os.environ["PYSPARK_SUBMIT_ARGS"]) + super(KafkaStreamTests, cls).setUpClass() + + @classmethod + def tearDownClass(cls): + if cls.old_pyspark_submit_args is None: + del os.environ["PYSPARK_SUBMIT_ARGS"] + else: + os.environ["PYSPARK_SUBMIT_ARGS"] = cls.old_pyspark_submit_args + super(KafkaStreamTests, cls).tearDownClass() def setUp(self): super(KafkaStreamTests, self).setUp() diff --git a/python/run-tests b/python/run-tests index afee53179261..c27cbb4fedb1 100755 --- a/python/run-tests +++ b/python/run-tests @@ -24,82 +24,3 @@ cd "$FWDIR" exec python -u ./python/run-tests.py exit - - -FAILED=0 -LOG_FILE=unit-tests.log -START=$(date +"%s") - -rm -f $LOG_FILE - - -function run_streaming_tests() { - echo "Run streaming tests ..." - - KAFKA_ASSEMBLY_DIR="$FWDIR"/external/kafka-assembly - JAR_PATH="${KAFKA_ASSEMBLY_DIR}/target/scala-${SPARK_SCALA_VERSION}" - for f in "${JAR_PATH}"/spark-streaming-kafka-assembly-*.jar; do - if [[ ! -e "$f" ]]; then - echo "Failed to find Spark Streaming Kafka assembly jar in $KAFKA_ASSEMBLY_DIR" 1>&2 - echo "You need to build Spark with " \ - "'build/sbt assembly/assembly streaming-kafka-assembly/assembly' or" \ - "'build/mvn package' before running this program" 1>&2 - exit 1 - fi - KAFKA_ASSEMBLY_JAR="$f" - done - - export PYSPARK_SUBMIT_ARGS="--jars ${KAFKA_ASSEMBLY_JAR} pyspark-shell" - run_test "pyspark.streaming.util" - run_test "pyspark.streaming.tests" -} - -echo "Running PySpark tests. Output is in python/$LOG_FILE." 
- -export PYSPARK_PYTHON="python" - -# Try to test with Python 2.6, since that's the minimum version that we support: -if [ $(which python2.6) ]; then - export PYSPARK_PYTHON="python2.6" -fi - -echo "Testing with Python version:" -$PYSPARK_PYTHON --version - -run_core_tests -run_sql_tests -run_mllib_tests -run_ml_tests -run_streaming_tests - -# Try to test with Python 3 -if [ $(which python3.4) ]; then - export PYSPARK_PYTHON="python3.4" - echo "Testing with Python3.4 version:" - $PYSPARK_PYTHON --version - - run_core_tests - run_sql_tests - run_mllib_tests - run_ml_tests - run_streaming_tests -fi - -# Try to test with PyPy -if [ $(which pypy) ]; then - export PYSPARK_PYTHON="pypy" - echo "Testing with PyPy version:" - $PYSPARK_PYTHON --version - - run_core_tests - run_sql_tests - run_streaming_tests -fi - -if [[ $FAILED == 0 ]]; then - now=$(date +"%s") - echo -e "\033[32mTests passed \033[0min $(($now - $START)) seconds" -fi - -# TODO: in the long-run, it would be nice to use a test runner like `nose`. -# The doctest fixtures are the current barrier to doing this. diff --git a/python/run-tests.py b/python/run-tests.py index 4e232bfe4be7..fa43a3a8aee0 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -66,6 +66,7 @@ def run_individual_python_test(test_name, pyspark_python=None): def main(): # TODO: do we need to remove the metastore and warehouse created by the SQL tests? Ask Ahir. + print("Running PySpark tests. Output is in python/%s" % LOG_FILE) if os.path.exists(LOG_FILE): os.remove(LOG_FILE) python_execs = [x for x in ["python2.6", "python3.4", "pypy"] if which(x)] From aec0b8f8cdfe7017066a29b77b446c923abe3ab6 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 23 Jun 2015 18:12:01 -0700 Subject: [PATCH 07/27] Actually get the Kafka stuff to run properly --- python/pyspark/streaming/tests.py | 45 +++++++++++-------------------- 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 05dd45246f56..91ce681fbe16 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -576,36 +576,6 @@ def check_output(n): class KafkaStreamTests(PySparkStreamingTestCase): timeout = 20 # seconds duration = 1 - old_pyspark_submit_args = None - - @classmethod - def setUpClass(cls): - cls.old_pyspark_submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS") - SPARK_HOME = os.environ["SPARK_HOME"] - kafka_assembly_dir = os.path.join(SPARK_HOME, "external/kafka-assembly") - jars = glob.glob( - os.path.join(kafka_assembly_dir, "target/scala-*/spark-streaming-kafka-assembly-*.jar")) - if not jars: - raise Exception( - ("Failed to find Spark Streaming kafka assembly jar in %s. 
" % kafka_assembly_dir) + - "You need to build Spark with " - "'build/sbt assembly/assembly streaming-kafka-assembly/assembly' or " - "'build/mvn package' before running this test") - elif len(jars) > 1: - raise Exception(("Found multiple Spark Streaming Kafka assembly JARs in %s; please " - "remove all but one") % kafka_assembly_dir) - else: - os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars %s pyspark-shell" % jars[0] - print(os.environ["PYSPARK_SUBMIT_ARGS"]) - super(KafkaStreamTests, cls).setUpClass() - - @classmethod - def tearDownClass(cls): - if cls.old_pyspark_submit_args is None: - del os.environ["PYSPARK_SUBMIT_ARGS"] - else: - os.environ["PYSPARK_SUBMIT_ARGS"] = cls.old_pyspark_submit_args - super(KafkaStreamTests, cls).tearDownClass() def setUp(self): super(KafkaStreamTests, self).setUp() @@ -708,4 +678,19 @@ def test_kafka_rdd_with_leaders(self): self._validateRddResult(sendData, rdd) if __name__ == "__main__": + SPARK_HOME = os.environ["SPARK_HOME"] + kafka_assembly_dir = os.path.join(SPARK_HOME, "external/kafka-assembly") + jars = glob.glob( + os.path.join(kafka_assembly_dir, "target/scala-*/spark-streaming-kafka-assembly-*.jar")) + if not jars: + raise Exception( + ("Failed to find Spark Streaming kafka assembly jar in %s. " % kafka_assembly_dir) + + "You need to build Spark with " + "'build/sbt assembly/assembly streaming-kafka-assembly/assembly' or " + "'build/mvn package' before running this test") + elif len(jars) > 1: + raise Exception(("Found multiple Spark Streaming Kafka assembly JARs in %s; please " + "remove all but one") % kafka_assembly_dir) + else: + os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars %s pyspark-shell" % jars[0] unittest.main() From def2d8a951416514f975c9a19f9142c7f4086bfc Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 23 Jun 2015 18:38:05 -0700 Subject: [PATCH 08/27] Two minor fixes --- python/run-tests | 2 -- python/run-tests.py | 1 - 2 files changed, 3 deletions(-) diff --git a/python/run-tests b/python/run-tests index c27cbb4fedb1..c7961cd5616c 100755 --- a/python/run-tests +++ b/python/run-tests @@ -22,5 +22,3 @@ FWDIR="$(cd "`dirname $0`"/..; pwd)" cd "$FWDIR" exec python -u ./python/run-tests.py - -exit diff --git a/python/run-tests.py b/python/run-tests.py index fa43a3a8aee0..82cdfbb544ab 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -65,7 +65,6 @@ def run_individual_python_test(test_name, pyspark_python=None): def main(): - # TODO: do we need to remove the metastore and warehouse created by the SQL tests? Ask Ahir. print("Running PySpark tests. Output is in python/%s" % LOG_FILE) if os.path.exists(LOG_FILE): os.remove(LOG_FILE) From d6a77d30fa33651ac70aa36ad2bb9c5ce85d01d9 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 23 Jun 2015 22:26:43 -0700 Subject: [PATCH 09/27] Fix the tests of dev/run-tests --- dev/run-tests.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 2de57fc0d1dc..a6d959c53f5a 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -32,6 +32,7 @@ # Functions for traversing module dependency graph # ------------------------------------------------------------------------------------------------- + def determine_modules_for_files(filenames): """ Given a list of filenames, return the set of modules that contain those files. @@ -39,7 +40,7 @@ def determine_modules_for_files(filenames): file to belong to the 'root' module. 
>>> sorted(x.name for x in determine_modules_for_files(["python/pyspark/a.py", "sql/test/foo"])) - ['pyspark', 'sql'] + ['pyspark-core', 'sql'] >>> [x.name for x in determine_modules_for_files(["file_not_matched_by_any_subproject"])] ['root'] """ @@ -90,8 +91,11 @@ def determine_modules_to_test(changed_modules): ['root'] >>> sorted(x.name for x in determine_modules_to_test([modules.graphx])) ['examples', 'graphx'] - >>> sorted(x.name for x in determine_modules_to_test([modules.sql])) - ['examples', 'hive-thriftserver', 'mllib', 'pyspark', 'sparkr', 'sql'] + >>> x = sorted(x.name for x in determine_modules_to_test([modules.sql])) + >>> x # doctest: +NORMALIZE_WHITESPACE + ['examples', 'hive-thriftserver', 'mllib', 'pyspark-core', \ + 'pyspark-mllib', 'pyspark-sql', 'pyspark-sql', 'pyspark-streaming', \ + 'sparkr', 'sql'] """ # If we're going to have to run all of the tests, then we can just short-circuit # and return 'root'. No module depends on root, so if it appears then it will be From caeb040b28c7151a5a3f0898b5b418fa063ff4c6 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 24 Jun 2015 09:53:23 -0700 Subject: [PATCH 10/27] Fixes to PySpark test module definitions --- dev/run-tests.py | 5 ++--- dev/sparktestsupport/modules.py | 10 +++++----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index a6d959c53f5a..56096d39387d 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -93,9 +93,8 @@ def determine_modules_to_test(changed_modules): ['examples', 'graphx'] >>> x = sorted(x.name for x in determine_modules_to_test([modules.sql])) >>> x # doctest: +NORMALIZE_WHITESPACE - ['examples', 'hive-thriftserver', 'mllib', 'pyspark-core', \ - 'pyspark-mllib', 'pyspark-sql', 'pyspark-sql', 'pyspark-streaming', \ - 'sparkr', 'sql'] + ['examples', 'hive-thriftserver', 'mllib', 'pyspark-core', 'pyspark-ml', \ + 'pyspark-mllib', 'pyspark-sql', 'pyspark-streaming', 'sparkr', 'sql'] """ # If we're going to have to run all of the tests, then we can just short-circuit # and return 'root'. 
No module depends on root, so if it appears then it will be diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 48bbdd231364..c13bf425d71a 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -258,7 +258,7 @@ def contains_file(self, filename): name="pyspark-sql", dependencies=[pyspark_core, sql], source_file_regexes=[ - "python/sql" + "python/pyspark/sql" ], python_test_goals=[ "pyspark.sql.types", @@ -278,7 +278,7 @@ def contains_file(self, filename): name="pyspark-streaming", dependencies=[pyspark_core, streaming, streaming_kafka], source_file_regexes=[ - "python/streaming" + "python/pyspark/streaming" ], python_test_goals=[ "pyspark.streaming.util", @@ -291,7 +291,7 @@ def contains_file(self, filename): name="pyspark-mllib", dependencies=[pyspark_core, pyspark_streaming, pyspark_sql, mllib], source_file_regexes=[ - "python/mllib" + "python/pyspark/mllib" ], python_test_goals=[ "pyspark.mllib.classification", @@ -313,10 +313,10 @@ def contains_file(self, filename): pyspark_ml = Module( - name="pyspark-sql", + name="pyspark-ml", dependencies=[pyspark_core, pyspark_mllib], source_file_regexes=[ - "python/sql" + "python/pyspark/ml/" ], python_test_goals=[ "pyspark.ml.feature", From b2ab027852c596da6dcf18df2109386653beb97a Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 24 Jun 2015 13:28:13 -0700 Subject: [PATCH 11/27] Add command-line options for running individual suites in python/run-tests --- python/run-tests | 2 +- python/run-tests.py | 50 +++++++++++++++++++++++++++++++++++++++------ 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/python/run-tests b/python/run-tests index c7961cd5616c..24949657ed7a 100755 --- a/python/run-tests +++ b/python/run-tests @@ -21,4 +21,4 @@ FWDIR="$(cd "`dirname $0`"/..; pwd)" cd "$FWDIR" -exec python -u ./python/run-tests.py +exec python -u ./python/run-tests.py "$@" diff --git a/python/run-tests.py b/python/run-tests.py index 82cdfbb544ab..3c9a579303e8 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -18,6 +18,7 @@ # from __future__ import print_function +from optparse import OptionParser import os import re import subprocess @@ -34,6 +35,9 @@ from sparktestsupport.modules import all_modules # noqa +python_modules = dict((m.name, m) for m in all_modules if m.python_test_goals) + + def print_red(text): print('\033[31m' + text + '\033[0m') @@ -64,23 +68,57 @@ def run_individual_python_test(test_name, pyspark_python=None): print("ok (%is)" % duration) -def main(): - print("Running PySpark tests. 
Output is in python/%s" % LOG_FILE) - if os.path.exists(LOG_FILE): - os.remove(LOG_FILE) +def get_default_python_executables(): python_execs = [x for x in ["python2.6", "python3.4", "pypy"] if which(x)] if "python2.6" not in python_execs: print("WARNING: Not testing against `python2.6` because it could not be found; falling" " back to `python` instead") python_execs.insert(0, "python") + return python_execs + + +def parse_opts(): + parser = OptionParser( + prog="run-tests" + ) + parser.add_option( + "--python-executables", type="string", default=','.join(get_default_python_executables()), + help="A comma-separated list of Python executables to test against (default: %default)" + ) + parser.add_option( + "--modules", type="string", + default=",".join(sorted(set(python_modules.keys()) - set(['root']))), + help="A comma-separated list of Python modules to test (default: %default)" + ) + + (opts, args) = parser.parse_args() + if args: + parser.error("Unsupported arguments: %s" % ' '.join(args)) + return opts + + +def main(): + opts = parse_opts() + print("Running PySpark tests. Output is in python/%s" % LOG_FILE) + if os.path.exists(LOG_FILE): + os.remove(LOG_FILE) + python_execs = opts.python_executables.split(',') + modules_to_test = [] + for module_name in opts.modules.split(','): + if module_name in python_modules: + modules_to_test.append(python_modules[module_name]) + else: + print("Error: unrecognized module %s" % module_name) + sys.exit(-1) print("Will test against the following Python executables: %s" % python_execs) + print("Will test the following Python modules: %s" % [x.name for x in modules_to_test]) + start_time = time.time() for python_exec in python_execs: print("Testing with `%s`: " % python_exec, end='') subprocess.call([python_exec, "--version"]) - python_modules = [m for m in all_modules if m.python_test_goals] - for module in python_modules: + for module in modules_to_test: print("Running %s tests ..." 
% module.name) for test_goal in module.python_test_goals: run_individual_python_test(test_goal) From 2efd594a6fb7c323c1a3d94e0398d6f5c6b66c58 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 24 Jun 2015 13:32:37 -0700 Subject: [PATCH 12/27] Update dev/run-tests to use new Python test runner flags --- dev/run-tests.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 56096d39387d..c287600f7c33 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -352,10 +352,12 @@ def run_scala_tests(build_tool, hadoop_version, test_modules): run_scala_tests_sbt(test_modules, test_profiles) -def run_python_tests(): +def run_python_tests(modules): set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS") - run_cmd([os.path.join(SPARK_HOME, "python", "run-tests")]) + run_cmd([ + os.path.join(SPARK_HOME, "python", "run-tests"), + "--modules=%s" % ','.join(m.name for m in modules)]) def run_sparkr_tests(): @@ -447,8 +449,9 @@ def main(): # run the test suites run_scala_tests(build_tool, hadoop_version, test_modules) - if any(m.should_run_python_tests for m in test_modules): - run_python_tests() + modules_with_python_tests = [m for m in test_modules if m.python_test_goals] + if modules_with_python_tests: + run_python_tests(modules_with_python_tests) if any(m.should_run_r_tests for m in test_modules): run_sparkr_tests() From fff4d099f2e59b5ecccaa9107cb0b3becde65dac Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 24 Jun 2015 13:34:34 -0700 Subject: [PATCH 13/27] Add dev/sparktestsupport to pep8 checks --- dev/lint-python | 2 +- dev/sparktestsupport/__init__.py | 2 +- dev/sparktestsupport/modules.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dev/lint-python b/dev/lint-python index f50d149dc4d4..21f785031b5d 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -19,7 +19,7 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")" -PATHS_TO_CHECK="./python/pyspark/ ./ec2/spark_ec2.py ./examples/src/main/python/" +PATHS_TO_CHECK="./python/pyspark/ ./ec2/spark_ec2.py ./examples/src/main/python/ ./dev/sparktestsupport" PYTHON_LINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/python-lint-report.txt" cd "$SPARK_ROOT_DIR" diff --git a/dev/sparktestsupport/__init__.py b/dev/sparktestsupport/__init__.py index 9afb1da74760..a75dbdcd77a0 100644 --- a/dev/sparktestsupport/__init__.py +++ b/dev/sparktestsupport/__init__.py @@ -18,4 +18,4 @@ import os SPARK_HOME = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../../") -USER_HOME = os.environ.get("HOME") \ No newline at end of file +USER_HOME = os.environ.get("HOME") diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index c13bf425d71a..d5cf01bf3e87 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -364,8 +364,8 @@ def contains_file(self, filename): dependencies=[], source_file_regexes=[], # In order to run all of the tests, enable every test profile: - build_profile_flags= - list(set(itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))), + build_profile_flags=list(set( + itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))), sbt_test_goals=[ "test", ], From f542ac5a28902b4021641c61ff508b2ce0e1c145 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 24 Jun 2015 16:10:28 -0700 Subject: [PATCH 14/27] Fix lint check for Python 3 --- dev/lint-python | 1 + dev/sparktestsupport/shellutils.py | 2 +- 2 files changed, 2 
From fff4d099f2e59b5ecccaa9107cb0b3becde65dac Mon Sep 17 00:00:00 2001
From: Josh Rosen
Date: Wed, 24 Jun 2015 13:34:34 -0700
Subject: [PATCH 13/27] Add dev/sparktestsupport to pep8 checks

---
 dev/lint-python                  | 2 +-
 dev/sparktestsupport/__init__.py | 2 +-
 dev/sparktestsupport/modules.py  | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/dev/lint-python b/dev/lint-python
index f50d149dc4d4..21f785031b5d 100755
--- a/dev/lint-python
+++ b/dev/lint-python
@@ -19,7 +19,7 @@
 
 SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
 SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")"
-PATHS_TO_CHECK="./python/pyspark/ ./ec2/spark_ec2.py ./examples/src/main/python/"
+PATHS_TO_CHECK="./python/pyspark/ ./ec2/spark_ec2.py ./examples/src/main/python/ ./dev/sparktestsupport"
 PYTHON_LINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/python-lint-report.txt"
 
 cd "$SPARK_ROOT_DIR"
diff --git a/dev/sparktestsupport/__init__.py b/dev/sparktestsupport/__init__.py
index 9afb1da74760..a75dbdcd77a0 100644
--- a/dev/sparktestsupport/__init__.py
+++ b/dev/sparktestsupport/__init__.py
@@ -18,4 +18,4 @@
 import os
 
 SPARK_HOME = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../../")
-USER_HOME = os.environ.get("HOME")
\ No newline at end of file
+USER_HOME = os.environ.get("HOME")
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index c13bf425d71a..d5cf01bf3e87 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -364,8 +364,8 @@ def contains_file(self, filename):
     dependencies=[],
     source_file_regexes=[],
     # In order to run all of the tests, enable every test profile:
-    build_profile_flags=
-    list(set(itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))),
+    build_profile_flags=list(set(
+        itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))),
     sbt_test_goals=[
        "test",
     ],

From f542ac5a28902b4021641c61ff508b2ce0e1c145 Mon Sep 17 00:00:00 2001
From: Josh Rosen
Date: Wed, 24 Jun 2015 16:10:28 -0700
Subject: [PATCH 14/27] Fix lint check for Python 3

---
 dev/lint-python                    | 1 +
 dev/sparktestsupport/shellutils.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/dev/lint-python b/dev/lint-python
index 21f785031b5d..cd32d734ebb3 100755
--- a/dev/lint-python
+++ b/dev/lint-python
@@ -20,6 +20,7 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
 SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")"
 
 PATHS_TO_CHECK="./python/pyspark/ ./ec2/spark_ec2.py ./examples/src/main/python/ ./dev/sparktestsupport"
+PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/run-tests.py"
 PYTHON_LINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/python-lint-report.txt"
 
 cd "$SPARK_ROOT_DIR"
diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py
index 5176e4e15c06..ad9b0cc89e4a 100644
--- a/dev/sparktestsupport/shellutils.py
+++ b/dev/sparktestsupport/shellutils.py
@@ -22,7 +22,7 @@
 
 
 def exit_from_command_with_retcode(cmd, retcode):
-    print "[error] running", ' '.join(cmd), "; received return code", retcode
+    print("[error] running", ' '.join(cmd), "; received return code", retcode)
     sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
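The shellutils change matters because the module must now at least parse under Python 3, where print is only a function. Note that the multi-argument call form behaves identically on Python 2 only once a print_function future import is in effect (presumably present or added alongside this change); a two-line illustration:

    from __future__ import print_function

    # Prints "[error] running ls -l ; received return code 1" on 2.x and 3.x alike:
    print("[error] running", ' '.join(["ls", "-l"]), "; received return code", 1)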
+ print("[error] Cannot find a version of `jekyll` on the system; please" + " install one and retry to build documentation.") sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) else: run_cmd([jekyll_bin, "build"]) @@ -239,7 +242,7 @@ def exec_sbt(sbt_args=()): echo_proc.wait() for line in iter(sbt_proc.stdout.readline, ''): if not sbt_output_filter.match(line): - print line, + print(line, end='') retcode = sbt_proc.wait() if retcode > 0: @@ -262,8 +265,8 @@ def get_hadoop_profiles(hadoop_version): if hadoop_version in sbt_maven_hadoop_profiles: return sbt_maven_hadoop_profiles[hadoop_version] else: - print "[error] Could not find", hadoop_version, "in the list. Valid options", - print "are", sbt_maven_hadoop_profiles.keys() + print("[error] Could not find", hadoop_version, "in the list. Valid options" + " are", sbt_maven_hadoop_profiles.keys()) sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) @@ -273,8 +276,8 @@ def build_spark_maven(hadoop_version): mvn_goals = ["clean", "package", "-DskipTests"] profiles_and_goals = build_profiles + mvn_goals - print "[info] Building Spark (w/Hive 0.13.1) using Maven with these arguments:", - print " ".join(profiles_and_goals) + print("[info] Building Spark (w/Hive 0.13.1) using Maven with these arguments: " + " ".join(profiles_and_goals)) exec_maven(profiles_and_goals) @@ -287,8 +290,8 @@ def build_spark_sbt(hadoop_version): "streaming-kafka-assembly/assembly"] profiles_and_goals = build_profiles + sbt_goals - print "[info] Building Spark (w/Hive 0.13.1) using SBT with these arguments:", - print " ".join(profiles_and_goals) + print("[info] Building Spark (w/Hive 0.13.1) using SBT with these arguments: " + " ".join(profiles_and_goals)) exec_sbt(profiles_and_goals) @@ -316,8 +319,8 @@ def run_scala_tests_maven(test_profiles): mvn_test_goals = ["test", "--fail-at-end"] profiles_and_goals = test_profiles + mvn_test_goals - print "[info] Running Spark tests using Maven with these arguments:", - print " ".join(profiles_and_goals) + print("[info] Running Spark tests using Maven with these arguments: " + " ".join(profiles_and_goals)) exec_maven(profiles_and_goals) @@ -331,8 +334,8 @@ def run_scala_tests_sbt(test_modules, test_profiles): profiles_and_goals = test_profiles + list(sbt_test_goals) - print "[info] Running Spark tests using SBT with these arguments:", - print " ".join(profiles_and_goals) + print("[info] Running Spark tests using SBT with these arguments: " + " ".join(profiles_and_goals)) exec_sbt(profiles_and_goals) @@ -367,14 +370,14 @@ def run_sparkr_tests(): run_cmd([os.path.join(SPARK_HOME, "R", "install-dev.sh")]) run_cmd([os.path.join(SPARK_HOME, "R", "run-tests.sh")]) else: - print "Ignoring SparkR tests as R was not found in PATH" + print("Ignoring SparkR tests as R was not found in PATH") def main(): # Ensure the user home directory (HOME) is valid and is an absolute directory if not USER_HOME or not os.path.isabs(USER_HOME): - print "[error] Cannot determine your home directory as an absolute path;", - print "ensure the $HOME environment variable is set properly." + print("[error] Cannot determine your home directory as an absolute path;" + " ensure the $HOME environment variable is set properly.") sys.exit(1) os.chdir(SPARK_HOME) @@ -388,14 +391,14 @@ def main(): java_exe = determine_java_executable() if not java_exe: - print "[error] Cannot find a version of `java` on the system; please", - print "install one and retry." 
+ print("[error] Cannot find a version of `java` on the system; please" + " install one and retry.") sys.exit(2) java_version = determine_java_version(java_exe) if java_version.minor < 8: - print "[warn] Java 8 tests will not run because JDK version is < 1.8." + print("[warn] Java 8 tests will not run because JDK version is < 1.8.") if os.environ.get("AMPLAB_JENKINS"): # if we're on the Amplab Jenkins build servers setup variables @@ -411,8 +414,8 @@ def main(): hadoop_version = "hadoop2.3" test_env = "local" - print "[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version, - print "under environment", test_env + print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version, + "under environment", test_env) changed_modules = None changed_files = None @@ -422,7 +425,8 @@ def main(): changed_modules = determine_modules_for_files(changed_files) if not changed_modules: changed_modules = [modules.root] - print "[info] Found the following changed modules:", ", ".join(x.name for x in changed_modules) + print("[info] Found the following changed modules:", + ", ".join(x.name for x in changed_modules)) test_modules = determine_modules_to_test(changed_modules) From 4f8902cc3f819a16354584d09ad7054072a4b1ea Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 24 Jun 2015 22:59:18 -0700 Subject: [PATCH 16/27] Python lint fixes. --- dev/run-tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 2949d1e5e1f2..ec021056af2c 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -79,7 +79,7 @@ def identify_changed_files_from_git_commits(patch_sha, target_branch=None, targe else: diff_target = target_ref raw_output = subprocess.check_output(['git', 'diff', '--name-only', patch_sha, diff_target], - universal_newlines=True) + universal_newlines=True) # Remove any empty strings return [f for f in raw_output.split('\n') if f] @@ -377,7 +377,7 @@ def main(): # Ensure the user home directory (HOME) is valid and is an absolute directory if not USER_HOME or not os.path.isabs(USER_HOME): print("[error] Cannot determine your home directory as an absolute path;" - " ensure the $HOME environment variable is set properly.") + " ensure the $HOME environment variable is set properly.") sys.exit(1) os.chdir(SPARK_HOME) From 9c80469099790fbb2aa3957c2054ba44e061a76d Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 25 Jun 2015 16:59:44 -0700 Subject: [PATCH 17/27] Fix passing of PYSPARK_PYTHON --- python/run-tests.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/python/run-tests.py b/python/run-tests.py index 3c9a579303e8..76ba76f9c269 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -45,10 +45,8 @@ def print_red(text): LOG_FILE = os.path.join(SPARK_HOME, "python/unit-tests.log") -def run_individual_python_test(test_name, pyspark_python=None): - env = {'SPARK_TESTING': '1'} - if pyspark_python: - env["PYSPARK_PYTHON"] = pyspark_python +def run_individual_python_test(test_name, pyspark_python): + env = {'SPARK_TESTING': '1', 'PYSPARK_PYTHON': pyspark_python} print(" Running test: %s ..." % test_name, end='') start_time = time.time() with open(LOG_FILE, 'a') as log_file: @@ -121,7 +119,7 @@ def main(): for module in modules_to_test: print("Running %s tests ..." 
% module.name) for test_goal in module.python_test_goals: - run_individual_python_test(test_goal) + run_individual_python_test(test_goal, python_exec) total_duration = time.time() - start_time print("Tests passed in %i seconds" % total_duration) From f53db5575123ab3720cce0de8a671bc5a009d359 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 25 Jun 2015 19:25:17 -0700 Subject: [PATCH 18/27] Remove python2 flag, since the test runner script also works fine under Python 3 --- python/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/run-tests.py b/python/run-tests.py index 76ba76f9c269..d2584758c5c1 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env # # Licensed to the Apache Software Foundation (ASF) under one or more From 3b852ae617ed3c6d0fd19527a280099facf6a300 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Fri, 26 Jun 2015 01:08:54 -0700 Subject: [PATCH 19/27] Fall back to PYSPARK_PYTHON when sys.executable is None (fixes a test) --- python/pyspark/tests.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index 78265423682b..17256dfc9574 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -1421,7 +1421,8 @@ def do_termination_test(self, terminator): # start daemon daemon_path = os.path.join(os.path.dirname(__file__), "daemon.py") - daemon = Popen([sys.executable, daemon_path], stdin=PIPE, stdout=PIPE) + python_exec = sys.executable or os.environ.get("PYSPARK_PYTHON") + daemon = Popen([python_exec, daemon_path], stdin=PIPE, stdout=PIPE) # read the port number port = read_int(daemon.stdout) From 568a3fdfc505908384c8402787d4ee4d98a94665 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Fri, 26 Jun 2015 01:11:50 -0700 Subject: [PATCH 20/27] Fix hashbang --- python/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/run-tests.py b/python/run-tests.py index d2584758c5c1..b24c1b74de5d 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -1,4 +1,4 @@ -#!/usr/bin/env +#!/usr/bin/env python # # Licensed to the Apache Software Foundation (ASF) under one or more From c364ccfe5c8cee31c7898ac16e79de72ad0d07fe Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Fri, 26 Jun 2015 01:29:14 -0700 Subject: [PATCH 21/27] Use which() to convert PYSPARK_PYTHON to an absolute path before shelling out to run tests --- dev/sparktestsupport/__init__.py | 2 +- python/run-tests.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/sparktestsupport/__init__.py b/dev/sparktestsupport/__init__.py index a75dbdcd77a0..12696d98fb98 100644 --- a/dev/sparktestsupport/__init__.py +++ b/dev/sparktestsupport/__init__.py @@ -17,5 +17,5 @@ import os -SPARK_HOME = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../../") +SPARK_HOME = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "../../")) USER_HOME = os.environ.get("HOME") diff --git a/python/run-tests.py b/python/run-tests.py index b24c1b74de5d..d508eda208fe 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -46,7 +46,7 @@ def print_red(text): def run_individual_python_test(test_name, pyspark_python): - env = {'SPARK_TESTING': '1', 'PYSPARK_PYTHON': pyspark_python} + env = {'SPARK_TESTING': '1', 'PYSPARK_PYTHON': which(pyspark_python) } print(" Running test: %s ..." 
From 27a389fff86c5ca1985ae7fa8f6d0af1d51d3c1a Mon Sep 17 00:00:00 2001
From: Josh Rosen
Date: Fri, 26 Jun 2015 09:47:33 -0700
Subject: [PATCH 22/27] Skip MLLib tests for PyPy

---
 dev/sparktestsupport/modules.py | 13 ++++++++++++-
 python/run-tests.py             | 11 +++++++----
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index d5cf01bf3e87..efe3a897e9c1 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -30,7 +30,8 @@ class Module(object):
     """
 
     def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(),
-                 sbt_test_goals=(), python_test_goals=(), should_run_r_tests=False):
+                 sbt_test_goals=(), python_test_goals=(), blacklisted_python_implementations=(),
+                 should_run_r_tests=False):
         """
         Define a new module.
 
@@ -44,6 +45,9 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=
             order to build and test this module (e.g. '-PprofileName').
         :param sbt_test_goals: A set of SBT test goals for testing this module.
         :param python_test_goals: A set of Python test goals for testing this module.
+        :param blacklisted_python_implementations: A set of Python implementations that are not
+            supported by this module's Python components. The values in this set should match
+            strings returned by Python's `platform.python_implementation()`.
         :param should_run_r_tests: If true, changes in this module will trigger all R tests.
         """
         self.name = name
@@ -52,6 +56,7 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=
         self.sbt_test_goals = sbt_test_goals
         self.build_profile_flags = build_profile_flags
         self.python_test_goals = python_test_goals
+        self.blacklisted_python_implementations = blacklisted_python_implementations
         self.should_run_r_tests = should_run_r_tests
 
         self.dependent_modules = set()
@@ -308,6 +313,9 @@ def contains_file(self, filename):
         "pyspark.mllib.tree",
         "pyspark.mllib.util",
         "pyspark.mllib.tests",
+    ],
+    blacklisted_python_implementations=[
+        "PyPy"  # Skip these tests under PyPy since they require numpy and it isn't available there
     ]
 )
 
@@ -326,6 +334,9 @@ def contains_file(self, filename):
         "pyspark.ml.tuning",
         "pyspark.ml.tests",
         "pyspark.ml.evaluation",
+    ],
+    blacklisted_python_implementations=[
+        "PyPy"  # Skip these tests under PyPy since they require numpy and it isn't available there
     ]
 )
diff --git a/python/run-tests.py b/python/run-tests.py
index d508eda208fe..3eb45e47685a 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -46,7 +46,7 @@ def print_red(text):
 
 
 def run_individual_python_test(test_name, pyspark_python):
-    env = {'SPARK_TESTING': '1', 'PYSPARK_PYTHON': which(pyspark_python) }
+    env = {'SPARK_TESTING': '1', 'PYSPARK_PYTHON': which(pyspark_python)}
     print("    Running test: %s ..." % test_name, end='')
     start_time = time.time()
     with open(LOG_FILE, 'a') as log_file:
@@ -113,13 +113,16 @@ def main():
 
     start_time = time.time()
     for python_exec in python_execs:
+        python_implementation = subprocess.check_output(
+            [python_exec, "-c", "import platform; print platform.python_implementation()"]).strip()
         print("Testing with `%s`: " % python_exec, end='')
         subprocess.call([python_exec, "--version"])
 
         for module in modules_to_test:
-            print("Running %s tests ..." % module.name)
-            for test_goal in module.python_test_goals:
-                run_individual_python_test(test_goal, python_exec)
+            if python_implementation not in module.blacklisted_python_implementations:
+                print("Running %s tests ..." % module.name)
+                for test_goal in module.python_test_goals:
+                    run_individual_python_test(test_goal, python_exec)
 
     total_duration = time.time() - start_time
     print("Tests passed in %i seconds" % total_duration)
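The blacklist check asks each candidate interpreter for its implementation name before scheduling tests. The same probe run standalone against the current interpreter, in the form it takes after the later Python 3 and universal_newlines fixes in this series:

    import subprocess
    import sys

    impl = subprocess.check_output(
        [sys.executable, "-c",
         "import platform; print(platform.python_implementation())"],
        universal_newlines=True).strip()
    print(impl)  # 'CPython' on the reference interpreter, 'PyPy' under PyPy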
From 37aff00fa6fd8ec09d115f9ae7c41dedaa573d67 Mon Sep 17 00:00:00 2001
From: Josh Rosen
Date: Fri, 26 Jun 2015 10:02:03 -0700
Subject: [PATCH 23/27] Python 3 fix

---
 python/run-tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/run-tests.py b/python/run-tests.py
index 3eb45e47685a..297ba22e9961 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -114,7 +114,7 @@ def main():
     start_time = time.time()
     for python_exec in python_execs:
         python_implementation = subprocess.check_output(
-            [python_exec, "-c", "import platform; print platform.python_implementation()"]).strip()
+            [python_exec, "-c", "import platform; print(platform.python_implementation())"]).strip()
         print("Testing with `%s`: " % python_exec, end='')
         subprocess.call([python_exec, "--version"])

From 8f65ed088d1b3df22e7d364de8c23f0b034c171f Mon Sep 17 00:00:00 2001
From: Josh Rosen
Date: Fri, 26 Jun 2015 12:02:04 -0700
Subject: [PATCH 24/27] Fix handling of module in python/run-tests

---
 dev/run-tests.py    | 9 +++++----
 python/run-tests.py | 4 ++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/dev/run-tests.py b/dev/run-tests.py
index 87218e3eea70..edf669e73591 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -360,12 +360,13 @@ def run_scala_tests(build_tool, hadoop_version, test_modules):
     run_scala_tests_sbt(test_modules, test_profiles)
 
 
-def run_python_tests(modules):
+def run_python_tests(test_modules):
     set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS")
 
-    run_cmd([
-        os.path.join(SPARK_HOME, "python", "run-tests"),
-        "--modules=%s" % ','.join(m.name for m in modules)])
+    command = [os.path.join(SPARK_HOME, "python", "run-tests")]
+    if test_modules != [modules.root]:
+        command.append("--modules=%s" % ','.join(m.name for m in test_modules))
+    run_cmd(command)
 
 
 def run_sparkr_tests():
diff --git a/python/run-tests.py b/python/run-tests.py
index 297ba22e9961..6d188019f871 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -35,7 +35,7 @@
 from sparktestsupport.modules import all_modules  # noqa
 
 
-python_modules = dict((m.name, m) for m in all_modules if m.python_test_goals)
+python_modules = dict((m.name, m) for m in all_modules if m.python_test_goals if m.name != 'root')
 
 
 def print_red(text):
@@ -85,7 +85,7 @@ def parse_opts():
     )
     parser.add_option(
         "--modules", type="string",
-        default=",".join(sorted(set(python_modules.keys()) - set(['root']))),
+        default=",".join(sorted(python_modules.keys())),
         help="A comma-separated list of Python modules to test (default: %default)"
     )
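The root special case means "everything changed": when the root module is under test, the --modules flag is omitted so python/run-tests falls back to its full default list. A stubbed illustration of that branch (FakeModule and the path are placeholders, not real sparktestsupport objects):

    import os

    class FakeModule(object):
        def __init__(self, name):
            self.name = name

    root = FakeModule("root")

    def build_command(test_modules, spark_home="/path/to/spark"):
        command = [os.path.join(spark_home, "python", "run-tests")]
        if test_modules != [root]:
            command.append("--modules=%s" % ','.join(m.name for m in test_modules))
        return command

    print(build_command([root]))                       # no --modules flag
    print(build_command([FakeModule("pyspark-sql")]))  # ends with '--modules=pyspark-sql'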
"import platform; print(platform.python_implementation())"]).strip() + [python_exec, "-c", "import platform; print(platform.python_implementation())"], + universal_newlines=True).strip() print("Testing with `%s`: " % python_exec, end='') subprocess.call([python_exec, "--version"]) From 8233d61608892f8ff2da906b206f423f6a67d5b9 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Sat, 27 Jun 2015 15:39:10 -0700 Subject: [PATCH 26/27] Add python/run-tests.py to Python lint checks --- dev/lint-python | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/lint-python b/dev/lint-python index cd32d734ebb3..0c3586462cb3 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -20,7 +20,7 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")" PATHS_TO_CHECK="./python/pyspark/ ./ec2/spark_ec2.py ./examples/src/main/python/ ./dev/sparktestsupport" -PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/run-tests.py" +PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/run-tests.py ./python/run-tests.py" PYTHON_LINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/python-lint-report.txt" cd "$SPARK_ROOT_DIR" From f578d6da50cd09a3ad13dbe6dcc6b22870f227e5 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Sat, 27 Jun 2015 15:39:47 -0700 Subject: [PATCH 27/27] Fix print for Python 2.x --- dev/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index edf669e73591..c51b0d3010a0 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -180,7 +180,7 @@ def set_title_and_block(title, err_block): os.environ["CURRENT_BLOCK"] = ERROR_CODES[err_block] line_str = '=' * 72 - print() + print('') print(line_str) print(title) print(line_str)