Address review comments: clarify the SPARK-26549 comment and move the worker reuse test into WorkerReuseTest

xuanyuanking · xuanyuanking · commit 4868e82256c0 · 2019-01-09T11:13:47.000+08:00
diff --git a/python/pyspark/context.py b/python/pyspark/context.py
@@ -493,9 +493,13 @@ def getStart(split):
                 return start0 + int((split * size / numSlices)) * step
 
             def f(split, iterator):
-                # it's an empty iterator here but we need this line for triggering the logic of
-                # checking END_OF_DATA_SECTION during load iterator in runtime, thus make sure
-                # worker reuse takes effect. See more details in SPARK-26549.
+                # The iterator is empty here, but we still need to consume it to trigger
+                # the signal handling in FramedSerializer.load_stream, for instance,
+                # SpecialLengths.END_OF_DATA_SECTION in _read_with_length. Since
+                # FramedSerializer.load_stream produces a generator, control must enter
+                # that function at least once. We do this by explicitly converting the
+                # empty iterator to a list, thus making sure worker reuse takes effect.
+                # See SPARK-26549 for more details.
                 assert len(list(iterator)) == 0
                 return xrange(getStart(split), getStart(split + 1), step)
 
diff --git a/python/pyspark/tests/test_worker.py b/python/pyspark/tests/test_worker.py
@@ -22,7 +22,7 @@
 
 from py4j.protocol import Py4JJavaError
 
-from pyspark.testing.utils import ReusedPySparkTestCase, QuietTest
+from pyspark.testing.utils import ReusedPySparkTestCase, PySparkTestCase, QuietTest
 
 if sys.version_info[0] >= 3:
     xrange = range
@@ -144,14 +144,15 @@ def test_with_different_versions_of_python(self):
         finally:
             self.sc.pythonVer = version
 
+
+class WorkerReuseTest(PySparkTestCase):
+
     def test_reuse_worker_of_parallelize_xrange(self):
-        def get_worker_pid(input_rdd):
-            return input_rdd.map(lambda x: os.getpid()).collect()
-        rdd = self.sc.parallelize(xrange(20), 20)
-        worker_pids = get_worker_pid(rdd)
-        pids = get_worker_pid(rdd)
-        for pid in pids:
-            self.assertTrue(pid in worker_pids)
+        rdd = self.sc.parallelize(xrange(20), 8)
+        previous_pids = rdd.map(lambda x: os.getpid()).collect()
+        current_pids = rdd.map(lambda x: os.getpid()).collect()
+        for pid in current_pids:
+            self.assertTrue(pid in previous_pids)
 
 
 if __name__ == "__main__":