verifySchema on Connect

apache · xinrong-meng · Nov 14, 2024 · Nov 15, 2024 · Nov 15, 2024 · Nov 15, 2024
commit a211003a50f0cfa88500e4915411b5d8fe831ecb
diff --git a/python/pyspark/sql/connect/conversion.py b/python/pyspark/sql/connect/conversion.py
@@ -322,7 +322,7 @@ def convert_other(value: Any) -> Any:
             return lambda value: value
 
     @staticmethod
-    def convert(data: Sequence[Any], schema: StructType) -> "pa.Table":
+    def convert(data: Sequence[Any], schema: StructType, verifySchema: bool = False) -> "pa.Table":
         assert isinstance(data, list) and len(data) > 0
 
         assert schema is not None and isinstance(schema, StructType)
@@ -372,8 +372,8 @@ def convert(data: Sequence[Any], schema: StructType) -> "pa.Table":
                 ]
             )
         )
-
-        return pa.Table.from_arrays(pylist, schema=pa_schema)
+        table = pa.Table.from_arrays(pylist, schema=pa_schema)
+        return table.cast(pa_schema, safe=verifySchema)
 
 
 class ArrowTableToRowsConversion:

diff --git a/python/pyspark/sql/connect/session.py b/python/pyspark/sql/connect/session.py
@@ -50,6 +50,7 @@
 )
 import urllib
 
+from pyspark._globals import _NoValue, _NoValueType
 from pyspark.sql.connect.dataframe import DataFrame
 from pyspark.sql.dataframe import DataFrame as ParentDataFrame
 from pyspark.sql.connect.logging import logger
@@ -449,7 +450,7 @@ def createDataFrame(
         data: Union["pd.DataFrame", "np.ndarray", "pa.Table", Iterable[Any]],
         schema: Optional[Union[AtomicType, StructType, str, List[str], Tuple[str, ...]]] = None,
         samplingRatio: Optional[float] = None,
-        verifySchema: Optional[bool] = None,
+        verifySchema: Union[_NoValueType, bool] = _NoValue,
     ) -> "ParentDataFrame":
         assert data is not None
         if isinstance(data, DataFrame):
@@ -461,9 +462,6 @@ def createDataFrame(
         if samplingRatio is not None:
             warnings.warn("'samplingRatio' is ignored. It is not supported with Spark Connect.")
 
-        if verifySchema is not None:
-            warnings.warn("'verifySchema' is ignored. It is not supported with Spark Connect.")
-
         _schema: Optional[Union[AtomicType, StructType]] = None
         _cols: Optional[List[str]] = None
         _num_cols: Optional[int] = None
@@ -576,7 +574,10 @@ def createDataFrame(
                 "spark.sql.session.timeZone", "spark.sql.execution.pandas.convertToArrowArraySafely"
             )
 
-            ser = ArrowStreamPandasSerializer(cast(str, timezone), safecheck == "true")
+            if verifySchema is _NoValue:
+                verifySchema = safecheck == "true"
+
+            ser = ArrowStreamPandasSerializer(cast(str, timezone), verifySchema)
 
             _table = pa.Table.from_batches(
                 [
@@ -596,6 +597,9 @@ def createDataFrame(
                 ).cast(arrow_schema)
 
         elif isinstance(data, pa.Table):
+            if verifySchema is _NoValue:
+                verifySchema = False
+
             prefer_timestamp_ntz = is_timestamp_ntz_preferred()
 
             (timezone,) = self._client.get_configs("spark.sql.session.timeZone")
@@ -613,7 +617,10 @@ def createDataFrame(
 
             _table = (
                 _check_arrow_table_timestamps_localize(data, schema, True, timezone)
-                .cast(to_arrow_schema(schema, error_on_duplicated_field_names_in_struct=True))
+                .cast(
+                    to_arrow_schema(schema, error_on_duplicated_field_names_in_struct=True),
+                    safe=verifySchema,
+                )
                 .rename_columns(schema.names)
             )
 
@@ -648,10 +655,14 @@ def createDataFrame(
                 _table = pa.Table.from_arrays(
                     [pa.array(data[::, i]) for i in range(0, data.shape[1])], _cols
                 )
+            _table.cast()
 
             # The _table should already have the proper column names.
             _cols = None
 
+            if verifySchema is _NoValue:
+                verifySchema = True
+
         else:
             _data = list(data)
 
@@ -683,12 +694,15 @@ def createDataFrame(
                         errorClass="CANNOT_DETERMINE_TYPE", messageParameters={}
                     )
 
+            if verifySchema is _NoValue:
+                verifySchema = True
+
             from pyspark.sql.connect.conversion import LocalDataToArrowConversion
 
             # Spark Connect will try its best to build the Arrow table with the
             # inferred schema in the client side, and then rename the columns and
             # cast the datatypes in the server side.
-            _table = LocalDataToArrowConversion.convert(_data, _schema)
+            _table = LocalDataToArrowConversion.convert(_data, _schema, verifySchema)
 
         # TODO: Beside the validation on number of columns, we should also check
         # whether the Arrow Schema is compatible with the user provided Schema.