1 change: 1 addition & 0 deletions python/pyspark/testing/utils.py
@@ -18,6 +18,7 @@
 import os
 import struct
 import sys
+import threading
 import unittest
 
 from pyspark import SparkContext, SparkConf
15 changes: 15 additions & 0 deletions python/pyspark/tests/test_worker.py
@@ -1,3 +1,4 @@
+# -*- encoding: utf-8 -*-
 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements. See the NOTICE file distributed with
@@ -150,6 +151,20 @@ def test_with_different_versions_of_python(self):
         finally:
             self.sc.pythonVer = version
 
+    def test_python_exception_non_hanging(self):
+        # SPARK-21045: exceptions with non-ASCII characters shall not hang PySpark.
+        try:
+            def f():
+                raise Exception("exception with 中 and \xd6\xd0")
+
+            self.sc.parallelize([1]).map(lambda x: f()).count()
+        except Py4JJavaError as e:
+            if sys.version_info.major < 3:
+                # we have to use unicode here to avoid UnicodeDecodeError
+                self.assertRegexpMatches(unicode(e).encode("utf-8"), "exception with 中")
+            else:
+                self.assertRegexpMatches(str(e), "exception with 中")
+
 
 class WorkerReuseTest(PySparkTestCase):
 
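An aside (illustrative, not part of the PR): the Python 2 branch above exists because mixing byte strings and unicode strings triggers implicit ASCII codec conversions that fail on non-ASCII data, so the test coerces the error to unicode and encodes it to UTF-8 explicitly before matching:

# Python 2 only (illustrative sketch):
u = u"exception with 中"
str(u)                      # UnicodeEncodeError: 'ascii' codec can't encode character
u + "中"                    # UnicodeDecodeError: the byte string is decoded as ASCII first
unicode(u).encode("utf-8")  # explicit UTF-8 bytes -- safe to regex-match against bytes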
14 changes: 12 additions & 2 deletions python/pyspark/worker.py
@@ -44,7 +44,7 @@
 from pyspark.util import _get_argspec, fail_on_stopiteration
 from pyspark import shuffle
 
-if sys.version >= '3':
+if sys.version_info.major >= 3:
     basestring = str
 else:
     from itertools import imap as map  # use iterator map by default
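Why the comparison changed (my inference, not stated in the thread shown here): sys.version >= '3' compares version strings lexicographically, which would misclassify a hypothetical two-digit major version, whereas sys.version_info compares numerically:

# Illustrative: lexicographic vs. numeric version comparison.
"10.0.0" >= "3"   # False -- '1' sorts before '3', so Python 10 would look "old"
(10, 0) >= (3,)   # True  -- sys.version_info is a tuple and compares numerically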
@@ -598,8 +598,18 @@ def process():
         process()
     except Exception:
         try:
+            exc_info = traceback.format_exc()
+            if sys.version_info.major < 3:
+                if isinstance(exc_info, unicode):
+                    exc_info = exc_info.encode("utf-8")
+                else:
+                    # exc_info may contain bytes in other encodings; replace any
+                    # invalid bytes and convert the result back to UTF-8
+                    exc_info = exc_info.decode("utf-8", "replace").encode("utf-8")
+            else:
+                exc_info = exc_info.encode("utf-8")
             write_int(SpecialLengths.PYTHON_EXCEPTION_THROWN, outfile)
-            write_with_length(traceback.format_exc().encode("utf-8"), outfile)
+            write_with_length(exc_info, outfile)
         except IOError:
             # the JVM closed the socket
             pass

Review comment (Member), on the if sys.version_info.major < 3: line: "Likewise, let's drop this right after we drop Python 2, which I will do right after Spark 3."
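End to end, the sanitizing logic amounts to the following standalone sketch (illustrative; sanitize_traceback is a hypothetical helper, not a function in the PR):

import sys
import traceback

def sanitize_traceback(exc_info):
    # Return UTF-8 bytes that are always safe to write to the JVM socket.
    if sys.version_info.major < 3:
        if isinstance(exc_info, unicode):  # noqa: F821 -- Python 2 only
            return exc_info.encode("utf-8")
        # A Python 2 byte string may carry bytes from another encoding
        # (e.g. "\xd6\xd0", GBK for 中). Calling .encode("utf-8") on it
        # would first decode it implicitly as ASCII and raise
        # UnicodeDecodeError; decoding with errors="replace" maps invalid
        # bytes to U+FFFD, so the round-trip always yields valid UTF-8.
        return exc_info.decode("utf-8", "replace").encode("utf-8")
    return exc_info.encode("utf-8")

try:
    raise Exception("exception with 中 and \xd6\xd0")
except Exception:
    payload = sanitize_traceback(traceback.format_exc())  # never raises on bad bytes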