Remove assembly in tests.
JoshRosen committed Mar 14, 2016
commit 2c101932a7b94ff8b4aa14b09bea6728da4a4bdd
dev/run-tests.py (1 addition, 14 deletions)
@@ -323,7 +323,7 @@ def get_hadoop_profiles(hadoop_version):
 def build_spark_maven(hadoop_version):
     # Enable all of the profiles for the build:
     build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags
-    mvn_goals = ["clean", "package", "-DskipTests"]
+    mvn_goals = ["clean", "package", "-DskipTests", "-pl", "!assembly"]
A contributor commented:
If you're looking to speed up the build, building and testing in one shot might be a good thing to do, at least for Maven. Using -fae would let the build go as far as it can when something fails.

No need to do that in this change, though.
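For illustration, a minimal sketch (not part of this change) of what that suggestion could look like in dev/run-tests.py, reusing the script's existing get_hadoop_profiles, modules.root.build_profile_flags, and exec_maven helpers; the function name and exact goal list here are hypothetical:

def build_and_test_spark_maven(hadoop_version):
    # Hypothetical variant: build and run tests in a single Maven invocation,
    # still skipping the assembly module and using -fae (fail-at-end) so the
    # build keeps going as far as it can after a failure.
    build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags
    mvn_goals = ["clean", "install", "-fae", "-pl", "!assembly"]
    profiles_and_goals = build_profiles + mvn_goals
    print("[info] Building and testing Spark using Maven with these arguments: ",
          " ".join(profiles_and_goals))
    exec_maven(profiles_and_goals)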

     profiles_and_goals = build_profiles + mvn_goals

     print("[info] Building Spark (w/Hive 1.2.1) using Maven with these arguments: ",
@@ -349,16 +349,6 @@ def build_spark_sbt(hadoop_version):
     exec_sbt(profiles_and_goals)


-def build_spark_assembly_sbt(hadoop_version):
-    # Enable all of the profiles for the build:
-    build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags
-    sbt_goals = ["assembly/assembly"]
-    profiles_and_goals = build_profiles + sbt_goals
-    print("[info] Building Spark assembly (w/Hive 1.2.1) using SBT with these arguments: ",
-          " ".join(profiles_and_goals))
-    exec_sbt(profiles_and_goals)
-
-
 def build_apache_spark(build_tool, hadoop_version):
     """Will build Spark against Hive v1.2.1 given the passed in build tool (either `sbt` or
     `maven`). Defaults to using `sbt`."""
@@ -574,9 +564,6 @@ def main():
     if build_tool == "sbt":
         # Note: compatibility tests only supported in sbt for now
         detect_binary_inop_with_mima()
-        # Since we did not build assembly/assembly before running dev/mima, we need to
-        # do it here because the tests still rely on it; see SPARK-13294 for details.
-        build_spark_assembly_sbt(hadoop_version)

     # run the test suites
     run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags)
@@ -144,10 +144,38 @@ List<String> buildClassPath(String appClassPath) throws IOException {
     boolean isTesting = "1".equals(getenv("SPARK_TESTING"));
     if (prependClasses || isTesting) {
       String scala = getScalaVersion();
-      List<String> projects = Arrays.asList("core", "repl", "mllib", "graphx",
-        "streaming", "tools", "sql/catalyst", "sql/core", "sql/hive", "sql/hive-thriftserver",
-        "yarn", "launcher",
-        "common/network-common", "common/network-shuffle", "common/network-yarn");
+      // All projects except assemblies:
+      List<String> projects = Arrays.asList(
+        "common/network-common",
+        "common/network-shuffle",
+        "common/network-yarn",
+        "common/sketch",
+        "common/tags",
+        "common/unsafe",
+        "core",
+        "examples",
+        "external/akka",
+        "external/docker-integration-tests",
+        "external/flume",
+        "external/flume-sink",
+        "external/kafka",
+        "external/kinesis-asl",
+        "external/mqtt",
+        "external/spark-ganglia-lgpl",
+        "external/twitter",
+        "external/zeromq",
+        "graphx",
+        "launcher",
+        "mllib",
+        "repl",
+        "sql/catalyst",
+        "sql/core",
+        "sql/hive",
+        "sql/hive-thriftserver",
+        "streaming",
+        "tools",
+        "yarn"
+      );
       if (prependClasses) {
         if (!isTesting) {
           System.err.println(
python/run-tests.py (23 additions, 4 deletions)
@@ -54,10 +54,27 @@ def print_red(text):
 LOGGER = logging.getLogger()


-def run_individual_python_test(test_name, pyspark_python):
+def get_spark_dist_classpath():
+    original_working_dir = os.getcwd()
+    os.chdir(SPARK_HOME)
+    cp = subprocess_check_output(
+        ["./build/sbt", "export assembly/managedClasspath"], universal_newlines=True)
+    cp = cp.strip().split("\n")[-1]
+    os.chdir(original_working_dir)
+    return cp
+
+
+def run_individual_python_test(test_name, pyspark_python, spark_dist_classpath):
     env = dict(os.environ)
-    env.update({'SPARK_TESTING': '1', 'PYSPARK_PYTHON': which(pyspark_python),
-                'PYSPARK_DRIVER_PYTHON': which(pyspark_python)})
+    env.update({
+        # Setting SPARK_DIST_CLASSPATH is a simple way to make sure that any child processes
+        # launched by the tests have access to the correct test-time classpath.
+        'SPARK_DIST_CLASSPATH': spark_dist_classpath,
+        'SPARK_TESTING': '1',
+        'SPARK_PREPEND_CLASSES': '1',
+        'PYSPARK_PYTHON': which(pyspark_python),
+        'PYSPARK_DRIVER_PYTHON': which(pyspark_python),
+    })
     LOGGER.debug("Starting test(%s): %s", pyspark_python, test_name)
     start_time = time.time()
     try:
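As an aside (not part of this diff), the managedClasspath lookup above could avoid mutating the process working directory by passing cwd to subprocess directly; a minimal sketch, with the function name made up for illustration:

import subprocess

def get_spark_dist_classpath_via_cwd(spark_home):
    # Ask sbt to export the assembly module's managed classpath; the classpath
    # string is the last line of sbt's output.
    output = subprocess.check_output(
        ["./build/sbt", "export assembly/managedClasspath"],
        cwd=spark_home, universal_newlines=True)
    return output.strip().split("\n")[-1]

# e.g. get_spark_dist_classpath_via_cwd(SPARK_HOME)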
@@ -175,14 +192,16 @@ def main():
                 priority = 100
             task_queue.put((priority, (python_exec, test_goal)))

+    spark_dist_classpath = get_spark_dist_classpath()
+
     def process_queue(task_queue):
         while True:
             try:
                 (priority, (python_exec, test_goal)) = task_queue.get_nowait()
             except Queue.Empty:
                 break
             try:
-                run_individual_python_test(test_goal, python_exec)
+                run_individual_python_test(test_goal, python_exec, spark_dist_classpath)
             finally:
                 task_queue.task_done()

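Finally, a generic illustration (not code from this PR) of how the environment assembled in run_individual_python_test is consumed: it is passed to the subprocess that runs each test, so bin/pyspark and any JVM it launches see SPARK_DIST_CLASSPATH, SPARK_PREPEND_CLASSES, and SPARK_TESTING. The classpath value and test module below are placeholders:

import os
import subprocess

spark_home = os.environ.get("SPARK_HOME", ".")
env = dict(os.environ)
env.update({
    # Placeholder classpath; in the real script this comes from get_spark_dist_classpath().
    'SPARK_DIST_CLASSPATH': '/path/to/managed/classpath/jars/*',
    'SPARK_TESTING': '1',
    'SPARK_PREPEND_CLASSES': '1',
})

# Run one pyspark test module with that environment.
retcode = subprocess.call(
    [os.path.join(spark_home, "bin/pyspark"), "pyspark.tests"],
    env=env)
print("exit code:", retcode)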