
Commit 0526805

Infer 'mixed' types as strings when using Arrow serialization (#702)

* Add failing test
* More generously infer columns as strings if Arrow thought they were binary
* Add failing test for true binary values
* Infer 'mixed' columns by checking the first value
* Update FORK.md

1 parent fdcc7b9 · commit 0526805

File tree

3 files changed: +29 −4 lines changed


FORK.md

Lines changed: 3 additions & 2 deletions

@@ -24,9 +24,10 @@
 # Added

 * Gradle plugin to easily create custom docker images for use with k8s
-* Filter rLibDir by exists so that daemon.R references the correct file [460](https://github.com/palantir/spark/pull/460)
+* Filter rLibDir by exists so that daemon.R references the correct file [(#460)](https://github.com/palantir/spark/pull/460)
 * Implementation of the shuffle I/O plugins from SPARK-25299 that asynchronously backs up shuffle files to remote storage
-* Add pre-installed conda configuration and use to find rlib directory [700](https://github.com/palantir/spark/pull/700)
+* Add pre-installed conda configuration and use to find rlib directory [(#700)](https://github.com/palantir/spark/pull/700)
+* Supports Arrow-serialization of Python 2 strings [(#678)](https://github.com/palantir/spark/pull/678)

 # Reverted

 * [SPARK-25908](https://issues.apache.org/jira/browse/SPARK-25908) - Removal of `monotonicall_increasing_id`, `toDegree`, `toRadians`, `approxCountDistinct`, `unionAll`

python/pyspark/sql/tests/test_arrow.py

Lines changed: 14 additions & 0 deletions

@@ -357,6 +357,20 @@ def test_createDataFrame_with_str_struct_col(self):
         df, df_arrow = self._createDataFrame_toggle(pdf)
         self.assertEqual(df.schema, df_arrow.schema)

+    def test_createDataFrame_with_str_binary_mixed(self):
+        import pandas as pd
+        pdf = pd.DataFrame({"a": [u"unicode-value", "binary-under-python-2"]})
+
+        df, df_arrow = self._createDataFrame_toggle(pdf)
+        self.assertEqual(df.schema, df_arrow.schema)
+
+    def test_createDataFrame_with_real_binary(self):
+        import pandas as pd
+        pdf = pd.DataFrame({"a": [bytearray(b"a"), bytearray(b"c")]})
+
+        df, df_arrow = self._createDataFrame_toggle(pdf)
+        self.assertEqual(df.schema, df_arrow.schema)
+
     def test_createDataFrame_fallback_enabled(self):
         with QuietTest(self.sc):
             with self.sql_conf({"spark.sql.execution.arrow.fallback.enabled": True}):
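The two new tests hinge on what `pd.api.types.infer_dtype` reports for a column's values. A small standalone check (outside Spark) illustrates the three cases the patch distinguishes; the inputs here are illustrative, with Python 3 `bytes` standing in for the Python 2 `str` values the commit targets:

```python
import pandas as pd

# infer_dtype is the pandas helper the patch consults per column.
print(pd.api.types.infer_dtype(["a", "b"], skipna=True))    # string
print(pd.api.types.infer_dtype([b"a", b"b"], skipna=True))  # bytes
print(pd.api.types.infer_dtype(["a", b"b"], skipna=True))   # mixed
```

The 'mixed' result is why the patch needs the extra first-value check: a column mixing unicode and byte strings cannot be classified by dtype alone.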

python/pyspark/sql/types.py

Lines changed: 12 additions & 2 deletions

@@ -1694,10 +1694,20 @@ def from_arrow_schema(arrow_schema):
 def _infer_binary_columns_as_arrow_string(schema, pandas_df):
     import pandas as pd
     import pyarrow as pa
+    import six

     for field_index, field in enumerate(schema):
-        if field.type == pa.binary() and \
-                pd.api.types.infer_dtype(pandas_df.iloc[:, field_index], skipna=True) == "string":
+        if not field.type == pa.binary():
+            continue
+
+        inferred_dtype = pd.api.types.infer_dtype(pandas_df.iloc[:, field_index], skipna=True)
+        if inferred_dtype == 'string':
+            is_string_column = True
+        elif inferred_dtype == 'mixed' and len(pandas_df.index) > 0:
+            first_value = pandas_df.iloc[0, field_index]
+            is_string_column = isinstance(first_value, six.string_types)
+
+        if is_string_column:
             field_as_string = pa.field(field.name, pa.string())
             schema = schema.set(field_index, field_as_string)
