Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next commit
Explicitly specify supported types in toPandas
  • Loading branch information
HyukjinKwon committed Feb 16, 2018
commit c79c6df7284b9717fe4e4c26090dcb51bf7712da
13 changes: 8 additions & 5 deletions python/pyspark/sql/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1988,10 +1988,11 @@ def toPandas(self):
if self.sql_ctx.getConf("spark.sql.execution.arrow.enabled", "false").lower() == "true":
try:
from pyspark.sql.types import _check_dataframe_convert_date, \
_check_dataframe_localize_timestamps
_check_dataframe_localize_timestamps, to_arrow_schema
from pyspark.sql.utils import require_minimum_pyarrow_version
import pyarrow
require_minimum_pyarrow_version()
to_arrow_schema(self.schema)
tables = self._collectAsArrow()
if tables:
table = pyarrow.concat_tables(tables)
Expand All @@ -2000,10 +2001,12 @@ def toPandas(self):
return _check_dataframe_localize_timestamps(pdf, timezone)
else:
return pd.DataFrame.from_records([], columns=self.columns)
except ImportError as e:
msg = "note: pyarrow must be installed and available on calling Python process " \
"if using spark.sql.execution.arrow.enabled=true"
raise ImportError("%s\n%s" % (_exception_message(e), msg))
except Exception as e:
msg = (
"Note: toPandas attempted Arrow optimization because "
"'spark.sql.execution.arrow.enabled' is set to true. Please set it to false "
"to disable this.")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, this says why it's trying Arrow and how to turn it off, but it doesn't say why I would have to turn it off. Perhaps say something like "PyArrow is not found" (if we know that is the cause)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, that should be part of the original message. For example, I don't have PyArrow installed for PyPy in my local environment, so it shows an error like:

RuntimeError: PyArrow >= 0.8.0 must be installed; however, it was not found.
Note: toPandas attempted Arrow optimization because 'spark.sql.execution.arrow.enabled' is set to true. Please set it to false to disable this.

raise RuntimeError("%s\n%s" % (_exception_message(e), msg))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should the same type of error be raised instead of RuntimeError?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup, please open a PR if you have a better idea.

else:
pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)

Expand Down
9 changes: 8 additions & 1 deletion python/pyspark/sql/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -3497,7 +3497,14 @@ def test_unsupported_datatype(self):
schema = StructType([StructField("map", MapType(StringType(), IntegerType()), True)])
df = self.spark.createDataFrame([(None,)], schema=schema)
with QuietTest(self.sc):
with self.assertRaisesRegexp(Exception, 'Unsupported data type'):
with self.assertRaisesRegexp(Exception, 'Unsupported type'):
df.toPandas()

df = self.spark.createDataFrame([(None,)], schema="a binary")
with QuietTest(self.sc):
with self.assertRaisesRegexp(
Exception,
'Unsupported type.*\nNote: toPandas attempted Arrow optimization because'):
df.toPandas()

def test_null_conversion(self):
Expand Down