Simplify the approach: fix worker reuse only for the parallelize(xrange) case

apache · xuanyuanking · Jan 5, 2019 · Jan 7, 2019 · Jan 9, 2019 · Jan 7, 2019
commit ab451e5b4e152450e3fda7ef677deef52bf359a1
diff --git a/python/pyspark/context.py b/python/pyspark/context.py
@@ -493,6 +493,10 @@ def getStart(split):
                 return start0 + int((split * size / numSlices)) * step
 
             def f(split, iterator):
+                # Drain the (empty) iterator so the worker reads END_OF_DATA_SECTION and can be
+                # reused (SPARK-26549); done outside the assert, which `python -O` strips.
+                leftover = list(iterator)
+                assert not leftover
                 return xrange(getStart(split), getStart(split + 1), step)
 
             return self.parallelize([], numSlices).mapPartitionsWithIndex(f)

diff --git a/python/pyspark/tests/test_worker.py b/python/pyspark/tests/test_worker.py
@@ -144,10 +144,10 @@ def test_with_different_versions_of_python(self):
         finally:
             self.sc.pythonVer = version
 
-    def test_reuse_worker(self):
+    def test_reuse_worker_of_parallelize_xrange(self):
         def get_worker_pid(input_rdd):
             return input_rdd.map(lambda x: os.getpid()).collect()
-        rdd = self.sc.parallelize(range(20), 20)
+        rdd = self.sc.parallelize(xrange(20), 20)
         worker_pids = get_worker_pid(rdd)
         pids = get_worker_pid(rdd)
         for pid in pids:

diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py
@@ -446,12 +446,7 @@ def process():
         pickleSer._write_with_length((aid, accum._value), outfile)
 
     # check end of stream
-    res = read_int(infile)
-    if sys.version >= '3' and res == SpecialLengths.END_OF_DATA_SECTION:
-        # skip the END_OF_DATA_SECTION for Python3, otherwise the worker reuse will take
-        # no effect, see SPARK-26549 for more details.
-        res = read_int(infile)
-    if res == SpecialLengths.END_OF_STREAM:
+    if read_int(infile) == SpecialLengths.END_OF_STREAM:
         write_int(SpecialLengths.END_OF_STREAM, outfile)
     else:
         # write a different value to tell JVM to not reuse this worker