Fix code formatting (import wrapping, trailing commas, whitespace, quote style)

apache · bogao007 · Jan 22, 2024 · Feb 6, 2024 · Feb 6, 2024 · Jun 20, 2024
commit df9ea9e228e4d94c6c83ae16bf7c14b38acfaa5e
diff --git a/python/pyspark/sql/pandas/group_ops.py b/python/pyspark/sql/pandas/group_ops.py
@@ -23,7 +23,10 @@
 from pyspark.sql.column import Column
 from pyspark.sql.dataframe import DataFrame
 from pyspark.sql.streaming.state import GroupStateTimeout
-from pyspark.sql.streaming.stateful_processor_api_client import StatefulProcessorApiClient, StatefulProcessorHandleState
+from pyspark.sql.streaming.stateful_processor_api_client import (
+    StatefulProcessorApiClient,
+    StatefulProcessorHandleState,
+)
 from pyspark.sql.streaming.stateful_processor import StatefulProcessor, StatefulProcessorHandle
 from pyspark.sql.types import StructType, _parse_datatype_string
 
@@ -35,7 +38,7 @@
         PandasCogroupedMapFunction,
         ArrowGroupedMapFunction,
         ArrowCogroupedMapFunction,
-        DataFrameLike as PandasDataFrameLike
+        DataFrameLike as PandasDataFrameLike,
     )
     from pyspark.sql.group import GroupedData
 
@@ -361,13 +364,12 @@ def applyInPandasWithState(
         )
         return DataFrame(jdf, self.session)
 
-
     def transformWithStateInPandas(
         self,
         statefulProcessor: StatefulProcessor,
         outputStructType: Union[StructType, str],
         outputMode: str,
-        timeMode: str
+        timeMode: str,
     ) -> DataFrame:
         """
         Invokes methods defined in the stateful processor used in arbitrary state API v2.
@@ -453,26 +455,30 @@ def transformWithStateInPandas(
 
         This API is experimental.
         """
-        
+
         from pyspark.sql import GroupedData
         from pyspark.sql.functions import pandas_udf
+
         assert isinstance(self, GroupedData)
 
-        def transformWithStateUDF(statefulProcessorApiClient: StatefulProcessorApiClient, key: Any,
-                                  inputRows: Iterator["PandasDataFrameLike"]
-                                  ) -> Iterator["PandasDataFrameLike"]:
+        def transformWithStateUDF(
+            statefulProcessorApiClient: StatefulProcessorApiClient,
+            key: Any,
+            inputRows: Iterator["PandasDataFrameLike"],
+        ) -> Iterator["PandasDataFrameLike"]:
             handle = StatefulProcessorHandle(statefulProcessorApiClient)
 
             if statefulProcessorApiClient.handle_state == StatefulProcessorHandleState.CREATED:
                 statefulProcessor.init(handle)
                 statefulProcessorApiClient.set_handle_state(
-                    StatefulProcessorHandleState.INITIALIZED)
+                    StatefulProcessorHandleState.INITIALIZED
+                )
 
             statefulProcessorApiClient.set_implicit_key(key)
             result = statefulProcessor.handleInputRows(key, inputRows)
 
             return result
-        
+
         if isinstance(outputStructType, str):
             outputStructType = cast(StructType, _parse_datatype_string(outputStructType))
 
@@ -483,7 +489,7 @@ def transformWithStateUDF(statefulProcessorApiClient: StatefulProcessorApiClient
         )
         df = self._df
         udf_column = udf(*[df[col] for col in df.columns])
-        
+
         jdf = self._jgd.transformWithStateInPandas(
             udf_column._jc.expr(),
             self.session._jsparkSession.parseDataType(outputStructType.json()),
@@ -492,7 +498,6 @@ def transformWithStateUDF(statefulProcessorApiClient: StatefulProcessorApiClient
         )
         return DataFrame(jdf, self.session)
 
-
     def applyInArrow(
         self, func: "ArrowGroupedMapFunction", schema: Union[StructType, str]
     ) -> "DataFrame":

diff --git a/python/pyspark/sql/pandas/serializers.py b/python/pyspark/sql/pandas/serializers.py
@@ -26,7 +26,14 @@
 from typing import Any
 from pyspark.errors import PySparkRuntimeError, PySparkTypeError, PySparkValueError
 from pyspark.loose_version import LooseVersion
-from pyspark.serializers import Serializer, read_int, write_int, UTF8Deserializer, CPickleSerializer, write_with_length
+from pyspark.serializers import (
+    Serializer,
+    read_int,
+    write_int,
+    UTF8Deserializer,
+    CPickleSerializer,
+    write_with_length,
+)
 from pyspark.sql.pandas.types import (
     from_arrow_type,
     to_arrow_type,
@@ -1139,16 +1146,10 @@ class TransformWithStateInPandasSerializer(ArrowStreamPandasUDFSerializer):
         Limit of the number of records that can be written to a single ArrowRecordBatch in memory.
     """
 
-    def __init__(
-            self,
-            timezone,
-            safecheck,
-            assign_cols_by_name,
-            arrow_max_records_per_batch):
-        super(
-            TransformWithStateInPandasSerializer,
-            self
-        ).__init__(timezone, safecheck, assign_cols_by_name)
+    def __init__(self, timezone, safecheck, assign_cols_by_name, arrow_max_records_per_batch):
+        super(TransformWithStateInPandasSerializer, self).__init__(
+            timezone, safecheck, assign_cols_by_name
+        )
         self.arrow_max_records_per_batch = arrow_max_records_per_batch
         self.key_offsets = None
 
@@ -1170,7 +1171,9 @@ def load_stream(self, stream):
 
         def generate_data_batches(batches):
             for batch in batches:
-                data_pandas = [self.arrow_to_pandas(c) for c in pa.Table.from_batches([batch]).itercolumns()]
+                data_pandas = [
+                    self.arrow_to_pandas(c) for c in pa.Table.from_batches([batch]).itercolumns()
+                ]
                 key_series = [data_pandas[o] for o in self.key_offsets]
                 batch_key = tuple(s[0] for s in key_series)
                 yield (batch_key, data_pandas)
@@ -1181,11 +1184,10 @@ def generate_data_batches(batches):
         for k, g in groupby(data_batches, key=lambda x: x[0]):
             yield (k, g)
 
-
     def dump_stream(self, iterator, stream):
         """
         Read through an iterator of (iterator of pandas DataFrame, state), serialize them to Arrow
         RecordBatches, and write batches to stream.
         """
-        result = [(b, t) for x in iterator for y, t in x for b in y]    
+        result = [(b, t) for x in iterator for y, t in x for b in y]
         super().dump_stream(result, stream)
diff --git a/python/pyspark/sql/streaming/StateMessage_pb2.py b/python/pyspark/sql/streaming/StateMessage_pb2.py
@@ -24,48 +24,49 @@
 from google.protobuf import descriptor_pool as _descriptor_pool
 from google.protobuf import symbol_database as _symbol_database
 from google.protobuf.internal import builder as _builder
+
 # @@protoc_insertion_point(imports)
 
 _sym_db = _symbol_database.Default()
 
 
-
-
-DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x12StateMessage.proto\x12.org.apache.spark.sql.execution.streaming.state\"\xe9\x02\n\x0cStateRequest\x12\x0f\n\x07version\x18\x01 \x01(\x05\x12\x66\n\x15statefulProcessorCall\x18\x02 \x01(\x0b\x32\x45.org.apache.spark.sql.execution.streaming.state.StatefulProcessorCallH\x00\x12\x64\n\x14stateVariableRequest\x18\x03 \x01(\x0b\x32\x44.org.apache.spark.sql.execution.streaming.state.StateVariableRequestH\x00\x12p\n\x1aimplicitGroupingKeyRequest\x18\x04 \x01(\x0b\x32J.org.apache.spark.sql.execution.streaming.state.ImplicitGroupingKeyRequestH\x00\x42\x08\n\x06method\"9\n\rStateResponse\x12\x12\n\nstatusCode\x18\x01 \x01(\x05\x12\x14\n\x0c\x65rrorMessage\x18\x02 \x01(\t\"\x89\x03\n\x15StatefulProcessorCall\x12X\n\x0esetHandleState\x18\x01 \x01(\x0b\x32>.org.apache.spark.sql.execution.streaming.state.SetHandleStateH\x00\x12Y\n\rgetValueState\x18\x02 \x01(\x0b\x32@.org.apache.spark.sql.execution.streaming.state.StateCallCommandH\x00\x12X\n\x0cgetListState\x18\x03 \x01(\x0b\x32@.org.apache.spark.sql.execution.streaming.state.StateCallCommandH\x00\x12W\n\x0bgetMapState\x18\x04 \x01(\x0b\x32@.org.apache.spark.sql.execution.streaming.state.StateCallCommandH\x00\x42\x08\n\x06method\"z\n\x14StateVariableRequest\x12X\n\x0evalueStateCall\x18\x01 \x01(\x0b\x32>.org.apache.spark.sql.execution.streaming.state.ValueStateCallH\x00\x42\x08\n\x06method\"\xe0\x01\n\x1aImplicitGroupingKeyRequest\x12X\n\x0esetImplicitKey\x18\x01 \x01(\x0b\x32>.org.apache.spark.sql.execution.streaming.state.SetImplicitKeyH\x00\x12^\n\x11removeImplicitKey\x18\x02 \x01(\x0b\x32\x41.org.apache.spark.sql.execution.streaming.state.RemoveImplicitKeyH\x00\x42\x08\n\x06method\"5\n\x10StateCallCommand\x12\x11\n\tstateName\x18\x01 \x01(\t\x12\x0e\n\x06schema\x18\x02 \x01(\t\"\xe1\x02\n\x0eValueStateCall\x12\x11\n\tstateName\x18\x01 \x01(\t\x12H\n\x06\x65xists\x18\x02 \x01(\x0b\x32\x36.org.apache.spark.sql.execution.streaming.state.ExistsH\x00\x12\x42\n\x03get\x18\x03 \x01(\x0b\x32\x33.org.apache.spark.sql.execution.streaming.state.GetH\x00\x12\\\n\x10valueStateUpdate\x18\x04 \x01(\x0b\x32@.org.apache.spark.sql.execution.streaming.state.ValueStateUpdateH\x00\x12\x46\n\x05\x63lear\x18\x05 \x01(\x0b\x32\x35.org.apache.spark.sql.execution.streaming.state.ClearH\x00\x42\x08\n\x06method\"\x1d\n\x0eSetImplicitKey\x12\x0b\n\x03key\x18\x01 \x01(\x0c\"\x13\n\x11RemoveImplicitKey\"\x08\n\x06\x45xists\"\x05\n\x03Get\"1\n\x10ValueStateUpdate\x12\x0e\n\x06schema\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\x0c\"\x07\n\x05\x43lear\"\\\n\x0eSetHandleState\x12J\n\x05state\x18\x01 \x01(\x0e\x32;.org.apache.spark.sql.execution.streaming.state.HandleState*K\n\x0bHandleState\x12\x0b\n\x07\x43REATED\x10\x00\x12\x0f\n\x0bINITIALIZED\x10\x01\x12\x12\n\x0e\x44\x41TA_PROCESSED\x10\x02\x12\n\n\x06\x43LOSED\x10\x03\x62\x06proto3')
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
+    b'\n\x12StateMessage.proto\x12.org.apache.spark.sql.execution.streaming.state"\xe9\x02\n\x0cStateRequest\x12\x0f\n\x07version\x18\x01 \x01(\x05\x12\x66\n\x15statefulProcessorCall\x18\x02 \x01(\x0b\x32\x45.org.apache.spark.sql.execution.streaming.state.StatefulProcessorCallH\x00\x12\x64\n\x14stateVariableRequest\x18\x03 \x01(\x0b\x32\x44.org.apache.spark.sql.execution.streaming.state.StateVariableRequestH\x00\x12p\n\x1aimplicitGroupingKeyRequest\x18\x04 \x01(\x0b\x32J.org.apache.spark.sql.execution.streaming.state.ImplicitGroupingKeyRequestH\x00\x42\x08\n\x06method"9\n\rStateResponse\x12\x12\n\nstatusCode\x18\x01 \x01(\x05\x12\x14\n\x0c\x65rrorMessage\x18\x02 \x01(\t"\x89\x03\n\x15StatefulProcessorCall\x12X\n\x0esetHandleState\x18\x01 \x01(\x0b\x32>.org.apache.spark.sql.execution.streaming.state.SetHandleStateH\x00\x12Y\n\rgetValueState\x18\x02 \x01(\x0b\x32@.org.apache.spark.sql.execution.streaming.state.StateCallCommandH\x00\x12X\n\x0cgetListState\x18\x03 \x01(\x0b\x32@.org.apache.spark.sql.execution.streaming.state.StateCallCommandH\x00\x12W\n\x0bgetMapState\x18\x04 \x01(\x0b\x32@.org.apache.spark.sql.execution.streaming.state.StateCallCommandH\x00\x42\x08\n\x06method"z\n\x14StateVariableRequest\x12X\n\x0evalueStateCall\x18\x01 \x01(\x0b\x32>.org.apache.spark.sql.execution.streaming.state.ValueStateCallH\x00\x42\x08\n\x06method"\xe0\x01\n\x1aImplicitGroupingKeyRequest\x12X\n\x0esetImplicitKey\x18\x01 \x01(\x0b\x32>.org.apache.spark.sql.execution.streaming.state.SetImplicitKeyH\x00\x12^\n\x11removeImplicitKey\x18\x02 \x01(\x0b\x32\x41.org.apache.spark.sql.execution.streaming.state.RemoveImplicitKeyH\x00\x42\x08\n\x06method"5\n\x10StateCallCommand\x12\x11\n\tstateName\x18\x01 \x01(\t\x12\x0e\n\x06schema\x18\x02 \x01(\t"\xe1\x02\n\x0eValueStateCall\x12\x11\n\tstateName\x18\x01 \x01(\t\x12H\n\x06\x65xists\x18\x02 \x01(\x0b\x32\x36.org.apache.spark.sql.execution.streaming.state.ExistsH\x00\x12\x42\n\x03get\x18\x03 \x01(\x0b\x32\x33.org.apache.spark.sql.execution.streaming.state.GetH\x00\x12\\\n\x10valueStateUpdate\x18\x04 \x01(\x0b\x32@.org.apache.spark.sql.execution.streaming.state.ValueStateUpdateH\x00\x12\x46\n\x05\x63lear\x18\x05 \x01(\x0b\x32\x35.org.apache.spark.sql.execution.streaming.state.ClearH\x00\x42\x08\n\x06method"\x1d\n\x0eSetImplicitKey\x12\x0b\n\x03key\x18\x01 \x01(\x0c"\x13\n\x11RemoveImplicitKey"\x08\n\x06\x45xists"\x05\n\x03Get"1\n\x10ValueStateUpdate\x12\x0e\n\x06schema\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\x0c"\x07\n\x05\x43lear"\\\n\x0eSetHandleState\x12J\n\x05state\x18\x01 \x01(\x0e\x32;.org.apache.spark.sql.execution.streaming.state.HandleState*K\n\x0bHandleState\x12\x0b\n\x07\x43REATED\x10\x00\x12\x0f\n\x0bINITIALIZED\x10\x01\x12\x12\n\x0e\x44\x41TA_PROCESSED\x10\x02\x12\n\n\x06\x43LOSED\x10\x03\x62\x06proto3'
+)
 
 _globals = globals()
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
-_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'StateMessage_pb2', _globals)
+_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "StateMessage_pb2", _globals)
 if not _descriptor._USE_C_DESCRIPTORS:
-  DESCRIPTOR._loaded_options = None
-  _globals['_HANDLESTATE']._serialized_start=1874
-  _globals['_HANDLESTATE']._serialized_end=1949
-  _globals['_STATEREQUEST']._serialized_start=71
-  _globals['_STATEREQUEST']._serialized_end=432
-  _globals['_STATERESPONSE']._serialized_start=434
-  _globals['_STATERESPONSE']._serialized_end=491
-  _globals['_STATEFULPROCESSORCALL']._serialized_start=494
-  _globals['_STATEFULPROCESSORCALL']._serialized_end=887
-  _globals['_STATEVARIABLEREQUEST']._serialized_start=889
-  _globals['_STATEVARIABLEREQUEST']._serialized_end=1011
-  _globals['_IMPLICITGROUPINGKEYREQUEST']._serialized_start=1014
-  _globals['_IMPLICITGROUPINGKEYREQUEST']._serialized_end=1238
-  _globals['_STATECALLCOMMAND']._serialized_start=1240
-  _globals['_STATECALLCOMMAND']._serialized_end=1293
-  _globals['_VALUESTATECALL']._serialized_start=1296
-  _globals['_VALUESTATECALL']._serialized_end=1649
-  _globals['_SETIMPLICITKEY']._serialized_start=1651
-  _globals['_SETIMPLICITKEY']._serialized_end=1680
-  _globals['_REMOVEIMPLICITKEY']._serialized_start=1682
-  _globals['_REMOVEIMPLICITKEY']._serialized_end=1701
-  _globals['_EXISTS']._serialized_start=1703
-  _globals['_EXISTS']._serialized_end=1711
-  _globals['_GET']._serialized_start=1713
-  _globals['_GET']._serialized_end=1718
-  _globals['_VALUESTATEUPDATE']._serialized_start=1720
-  _globals['_VALUESTATEUPDATE']._serialized_end=1769
-  _globals['_CLEAR']._serialized_start=1771
-  _globals['_CLEAR']._serialized_end=1778
-  _globals['_SETHANDLESTATE']._serialized_start=1780
-  _globals['_SETHANDLESTATE']._serialized_end=1872
+    DESCRIPTOR._loaded_options = None
+    _globals["_HANDLESTATE"]._serialized_start = 1874
+    _globals["_HANDLESTATE"]._serialized_end = 1949
+    _globals["_STATEREQUEST"]._serialized_start = 71
+    _globals["_STATEREQUEST"]._serialized_end = 432
+    _globals["_STATERESPONSE"]._serialized_start = 434
+    _globals["_STATERESPONSE"]._serialized_end = 491
+    _globals["_STATEFULPROCESSORCALL"]._serialized_start = 494
+    _globals["_STATEFULPROCESSORCALL"]._serialized_end = 887
+    _globals["_STATEVARIABLEREQUEST"]._serialized_start = 889
+    _globals["_STATEVARIABLEREQUEST"]._serialized_end = 1011
+    _globals["_IMPLICITGROUPINGKEYREQUEST"]._serialized_start = 1014
+    _globals["_IMPLICITGROUPINGKEYREQUEST"]._serialized_end = 1238
+    _globals["_STATECALLCOMMAND"]._serialized_start = 1240
+    _globals["_STATECALLCOMMAND"]._serialized_end = 1293
+    _globals["_VALUESTATECALL"]._serialized_start = 1296
+    _globals["_VALUESTATECALL"]._serialized_end = 1649
+    _globals["_SETIMPLICITKEY"]._serialized_start = 1651
+    _globals["_SETIMPLICITKEY"]._serialized_end = 1680
+    _globals["_REMOVEIMPLICITKEY"]._serialized_start = 1682
+    _globals["_REMOVEIMPLICITKEY"]._serialized_end = 1701
+    _globals["_EXISTS"]._serialized_start = 1703
+    _globals["_EXISTS"]._serialized_end = 1711
+    _globals["_GET"]._serialized_start = 1713
+    _globals["_GET"]._serialized_end = 1718
+    _globals["_VALUESTATEUPDATE"]._serialized_start = 1720
+    _globals["_VALUESTATEUPDATE"]._serialized_end = 1769
+    _globals["_CLEAR"]._serialized_start = 1771
+    _globals["_CLEAR"]._serialized_end = 1778
+    _globals["_SETHANDLESTATE"]._serialized_start = 1780
+    _globals["_SETHANDLESTATE"]._serialized_end = 1872
 # @@protoc_insertion_point(module_scope)