Explicitly check supported types in toPandas when Arrow optimization is enabled

apache · HyukjinKwon · Feb 16, 2018 · Feb 16, 2018 · Feb 16, 2018 · c79c6df7284b9717fe4e4c26090dcb51bf7712da
commit c79c6df7284b9717fe4e4c26090dcb51bf7712da
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
@@ -1988,10 +1988,11 @@ def toPandas(self):
         if self.sql_ctx.getConf("spark.sql.execution.arrow.enabled", "false").lower() == "true":
             try:
                 from pyspark.sql.types import _check_dataframe_convert_date, \
-                    _check_dataframe_localize_timestamps
+                    _check_dataframe_localize_timestamps, to_arrow_schema
                 from pyspark.sql.utils import require_minimum_pyarrow_version
                 import pyarrow
                 require_minimum_pyarrow_version()
+                to_arrow_schema(self.schema)
                 tables = self._collectAsArrow()
                 if tables:
                     table = pyarrow.concat_tables(tables)
@@ -2000,10 +2001,12 @@ def toPandas(self):
                     return _check_dataframe_localize_timestamps(pdf, timezone)
                 else:
                     return pd.DataFrame.from_records([], columns=self.columns)
-            except ImportError as e:
-                msg = "note: pyarrow must be installed and available on calling Python process " \
-                      "if using spark.sql.execution.arrow.enabled=true"
-                raise ImportError("%s\n%s" % (_exception_message(e), msg))
+            except Exception as e:
+                msg = (
+                    "Note: toPandas attempted Arrow optimization because "
+                    "'spark.sql.execution.arrow.enabled' is set to true. Please set it to false "
+                    "to disable this.")
+                raise RuntimeError("%s\n%s" % (_exception_message(e), msg))
         else:
             pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
 

diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
@@ -3497,7 +3497,14 @@ def test_unsupported_datatype(self):
         schema = StructType([StructField("map", MapType(StringType(), IntegerType()), True)])
         df = self.spark.createDataFrame([(None,)], schema=schema)
         with QuietTest(self.sc):
-            with self.assertRaisesRegexp(Exception, 'Unsupported data type'):
+            with self.assertRaisesRegexp(Exception, 'Unsupported type'):
+                df.toPandas()
+
+        df = self.spark.createDataFrame([(None,)], schema="a binary")
+        with QuietTest(self.sc):
+            with self.assertRaisesRegexp(
+                    Exception,
+                    'Unsupported type.*\nNote: toPandas attempted Arrow optimization because'):
                 df.toPandas()
 
     def test_null_conversion(self):