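Fix verifySchema handling for np.ndarray input in Spark Connect createDataFrame; enable verifySchema parity test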

apache · xinrong-meng · Nov 14, 2024 · Nov 15, 2024 · Nov 15, 2024 · Nov 15, 2024
commit d524d1f2bef2710ab7d32f5959d202a6aa9efaef
diff --git a/python/pyspark/sql/connect/session.py b/python/pyspark/sql/connect/session.py
@@ -655,13 +655,15 @@ def createDataFrame(
                 _table = pa.Table.from_arrays(
                     [pa.array(data[::, i]) for i in range(0, data.shape[1])], _cols
                 )
-            _table.cast()
 
             # The _table should already have the proper column names.
             _cols = None
 
-            if verifySchema is _NoValue:
-                verifySchema = True
+            if verifySchema is not _NoValue:
+                warnings.warn(
+                    "'verifySchema' is ignored. It is not supported"
+                    " with np.ndarray input on Spark Connect."
+                )
 
         else:
             _data = list(data)
@@ -702,7 +704,7 @@ def createDataFrame(
             # Spark Connect will try its best to build the Arrow table with the
             # inferred schema in the client side, and then rename the columns and
             # cast the datatypes in the server side.
-            _table = LocalDataToArrowConversion.convert(_data, _schema, verifySchema)
+            _table = LocalDataToArrowConversion.convert(_data, _schema, cast(bool, verifySchema))
 
         # TODO: Beside the validation on number of columns, we should also check
         # whether the Arrow Schema is compatible with the user provided Schema.

diff --git a/python/pyspark/sql/tests/connect/test_parity_arrow.py b/python/pyspark/sql/tests/connect/test_parity_arrow.py
@@ -137,7 +137,6 @@ def test_toPandas_udt(self):
     def test_create_dataframe_namedtuples(self):
         self.check_create_dataframe_namedtuples(True)
 
-    @unittest.skip("Spark Connect does not support verifySchema.")
     def test_createDataFrame_verifySchema(self):
         super().test_createDataFrame_verifySchema()