Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
1bdeb87
Move module definitions to separate file.
JoshRosen Jun 22, 2015
311c6a9
Move shell utility functions to own module.
JoshRosen Jun 22, 2015
32660fc
Initial cut at Python test runner refactoring
JoshRosen Jun 23, 2015
dcc9c09
Fix time division
JoshRosen Jun 23, 2015
4c97136
PYTHONPATH fixes
JoshRosen Jun 23, 2015
04015b9
First attempt at getting PySpark Kafka test to work in new runner script
JoshRosen Jun 24, 2015
aec0b8f
Actually get the Kafka stuff to run properly
JoshRosen Jun 24, 2015
def2d8a
Two minor fixes
JoshRosen Jun 24, 2015
d6a77d3
Fix the tests of dev/run-tests
JoshRosen Jun 24, 2015
caeb040
Fixes to PySpark test module definitions
JoshRosen Jun 24, 2015
b2ab027
Add command-line options for running individual suites in python/run-…
JoshRosen Jun 24, 2015
2efd594
Update dev/run-tests to use new Python test runner flags
JoshRosen Jun 24, 2015
fff4d09
Add dev/sparktestsupport to pep8 checks
JoshRosen Jun 24, 2015
f542ac5
Fix lint check for Python 3
JoshRosen Jun 24, 2015
8f3244c
Use universal_newlines to fix dev/run-tests doctest failures on Pytho…
JoshRosen Jun 25, 2015
4f8902c
Python lint fixes.
JoshRosen Jun 25, 2015
d33e525
Merge remote-tracking branch 'origin/master' into run-tests-python-mo…
JoshRosen Jun 25, 2015
9c80469
Fix passing of PYSPARK_PYTHON
JoshRosen Jun 25, 2015
f53db55
Remove python2 flag, since the test runner script also works fine und…
JoshRosen Jun 26, 2015
3b852ae
Fall back to PYSPARK_PYTHON when sys.executable is None (fixes a test)
JoshRosen Jun 26, 2015
568a3fd
Fix hashbang
JoshRosen Jun 26, 2015
c364ccf
Use which() to convert PYSPARK_PYTHON to an absolute path before shel…
JoshRosen Jun 26, 2015
27a389f
Skip MLLib tests for PyPy
JoshRosen Jun 26, 2015
37aff00
Python 3 fix
JoshRosen Jun 26, 2015
8f65ed0
Fix handling of module in python/run-tests
JoshRosen Jun 26, 2015
34c98d2
Fix universal_newlines for Python 3
JoshRosen Jun 27, 2015
8233d61
Add python/run-tests.py to Python lint checks
JoshRosen Jun 27, 2015
f578d6d
Fix print for Python 2.x
JoshRosen Jun 27, 2015
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
First attempt at getting PySpark Kafka test to work in new runner script
  • Loading branch information
JoshRosen committed Jun 24, 2015
commit 04015b9dd5d2a2f47c1ca48408bca8949709ab01
31 changes: 31 additions & 0 deletions python/pyspark/streaming/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# limitations under the License.
#

import glob
import os
import sys
from itertools import chain
Expand Down Expand Up @@ -575,6 +576,36 @@ def check_output(n):
class KafkaStreamTests(PySparkStreamingTestCase):
timeout = 20 # seconds
duration = 1
old_pyspark_submit_args = None

@classmethod
def setUpClass(cls):
    """Point PYSPARK_SUBMIT_ARGS at the Kafka assembly jar before starting Spark.

    Saves the caller's PYSPARK_SUBMIT_ARGS (restored in tearDownClass), locates
    exactly one spark-streaming-kafka-assembly jar under SPARK_HOME, and raises
    if zero or multiple jars are found.
    """
    # Remember the pre-test value so tearDownClass can restore it; None
    # means the variable was not set at all.
    cls.old_pyspark_submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS")
    spark_home = os.environ["SPARK_HOME"]
    assembly_dir = os.path.join(spark_home, "external/kafka-assembly")
    jar_pattern = os.path.join(
        assembly_dir, "target/scala-*/spark-streaming-kafka-assembly-*.jar")
    found_jars = glob.glob(jar_pattern)
    # Guard clauses: the build must have produced exactly one assembly jar.
    if not found_jars:
        raise Exception(
            ("Failed to find Spark Streaming kafka assembly jar in %s. " % assembly_dir) +
            "You need to build Spark with "
            "'build/sbt assembly/assembly streaming-kafka-assembly/assembly' or "
            "'build/mvn package' before running this test")
    if len(found_jars) > 1:
        raise Exception(("Found multiple Spark Streaming Kafka assembly JARs in %s; please "
                         "remove all but one") % assembly_dir)
    os.environ["PYSPARK_SUBMIT_ARGS"] = "--jars %s pyspark-shell" % found_jars[0]
    print(os.environ["PYSPARK_SUBMIT_ARGS"])
    super(KafkaStreamTests, cls).setUpClass()

@classmethod
def tearDownClass(cls):
    """Restore PYSPARK_SUBMIT_ARGS to its pre-setUpClass state, then tear down Spark."""
    saved = cls.old_pyspark_submit_args
    if saved is not None:
        os.environ["PYSPARK_SUBMIT_ARGS"] = saved
    else:
        # Variable did not exist before setUpClass ran, so remove it entirely.
        del os.environ["PYSPARK_SUBMIT_ARGS"]
    super(KafkaStreamTests, cls).tearDownClass()

def setUp(self):
super(KafkaStreamTests, self).setUp()
Expand Down
79 changes: 0 additions & 79 deletions python/run-tests
Original file line number Diff line number Diff line change
Expand Up @@ -24,82 +24,3 @@ cd "$FWDIR"
# Hand control over to the new Python-based runner; exec replaces this shell,
# so the trailing `exit` is only a safeguard and is normally unreachable.
exec python -u ./python/run-tests.py

exit


# Track whether any suite failed; set non-zero by the run_test helpers.
FAILED=0
# All suite output is appended to this log file.
LOG_FILE=unit-tests.log
# Wall-clock start time (epoch seconds) for the summary at the end.
START=$(date +"%s")

# Start from a clean log for this run.
rm -f $LOG_FILE

# Run the PySpark streaming suites. Locates the single Kafka assembly jar
# built for the current Scala version and exposes it to spark-submit via
# PYSPARK_SUBMIT_ARGS before invoking the test modules.
function run_streaming_tests() {
echo "Run streaming tests ..."

KAFKA_ASSEMBLY_DIR="$FWDIR"/external/kafka-assembly
JAR_PATH="${KAFKA_ASSEMBLY_DIR}/target/scala-${SPARK_SCALA_VERSION}"
# The glob should match exactly one jar; if it matches nothing the literal
# pattern is passed through and the -e test fails, producing the error below.
for f in "${JAR_PATH}"/spark-streaming-kafka-assembly-*.jar; do
if [[ ! -e "$f" ]]; then
echo "Failed to find Spark Streaming Kafka assembly jar in $KAFKA_ASSEMBLY_DIR" 1>&2
echo "You need to build Spark with " \
"'build/sbt assembly/assembly streaming-kafka-assembly/assembly' or" \
"'build/mvn package' before running this program" 1>&2
exit 1
fi
KAFKA_ASSEMBLY_JAR="$f"
done

# Make the jar available to the pyspark-shell launched by the test modules.
export PYSPARK_SUBMIT_ARGS="--jars ${KAFKA_ASSEMBLY_JAR} pyspark-shell"
run_test "pyspark.streaming.util"
run_test "pyspark.streaming.tests"
}

echo "Running PySpark tests. Output is in python/$LOG_FILE."

# Default interpreter; overridden below when specific versions are present.
export PYSPARK_PYTHON="python"

# Try to test with Python 2.6, since that's the minimum version that we support:
if [ $(which python2.6) ]; then
export PYSPARK_PYTHON="python2.6"
fi

echo "Testing with Python version:"
$PYSPARK_PYTHON --version

# Full suite run under the selected Python 2 interpreter.
# (run_core_tests and friends are defined earlier in this script.)
run_core_tests
run_sql_tests
run_mllib_tests
run_ml_tests
run_streaming_tests

# Try to test with Python 3
if [ $(which python3.4) ]; then
export PYSPARK_PYTHON="python3.4"
echo "Testing with Python3.4 version:"
$PYSPARK_PYTHON --version

run_core_tests
run_sql_tests
run_mllib_tests
run_ml_tests
run_streaming_tests
fi

# Try to test with PyPy
# NOTE: the MLlib/ML suites are deliberately skipped under PyPy.
if [ $(which pypy) ]; then
export PYSPARK_PYTHON="pypy"
echo "Testing with PyPy version:"
$PYSPARK_PYTHON --version

run_core_tests
run_sql_tests
run_streaming_tests
fi

# Print a green success banner with elapsed wall-clock time if nothing failed.
if [[ $FAILED == 0 ]]; then
now=$(date +"%s")
echo -e "\033[32mTests passed \033[0min $(($now - $START)) seconds"
fi

# TODO: in the long-run, it would be nice to use a test runner like `nose`.
# The doctest fixtures are the current barrier to doing this.
1 change: 1 addition & 0 deletions python/run-tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def run_individual_python_test(test_name, pyspark_python=None):

def main():
# TODO: do we need to remove the metastore and warehouse created by the SQL tests? Ask Ahir.
print("Running PySpark tests. Output is in python/%s" % LOG_FILE)
if os.path.exists(LOG_FILE):
os.remove(LOG_FILE)
python_execs = [x for x in ["python2.6", "python3.4", "pypy"] if which(x)]
Expand Down