Commits (55)
b1175e4
[WIP] Poc.
sahnib Jan 22, 2024
0a98ed8
Introduce Protobuf.
sahnib Feb 6, 2024
8e2b193
Fixing things.
sahnib Feb 6, 2024
16e4c17
support timeMode for python state v2 API
bogao007 Jun 20, 2024
92ef716
Add protobuf for serde
sahnib Feb 13, 2024
c3eaf38
protobuf change
bogao007 Jun 20, 2024
609d94e
Initial commit
bogao007 Jun 27, 2024
a27f9d9
better error handling, support value state with different types
bogao007 Jun 28, 2024
684939b
addressed comments
bogao007 Jul 3, 2024
7f65fbd
fix
bogao007 Jul 3, 2024
c25d7da
Added support for unix domain socket
bogao007 Jul 11, 2024
9c8c616
removed unrelated log lines, addressed part of the comments
bogao007 Jul 17, 2024
c641192
fix
bogao007 Jul 17, 2024
8d3da4e
Addressed comments
bogao007 Jul 19, 2024
cc9bf95
removed unnecessary print
bogao007 Jul 19, 2024
f7df2dc
rename
bogao007 Jul 19, 2024
27cd169
fix
bogao007 Jul 19, 2024
3b5b3e5
removed duplicate proto file
bogao007 Jul 20, 2024
5d910d8
revert unrelated changes
bogao007 Jul 20, 2024
df859ab
fix
bogao007 Jul 20, 2024
654f2f6
Added unit tests for transformWithStateInPandas
bogao007 Jul 24, 2024
38832a6
Merge branch 'master' into state-v2-initial
bogao007 Jul 24, 2024
0585ac0
fix and rename
bogao007 Jul 24, 2024
0ee5029
update test
bogao007 Jul 24, 2024
6232c81
Added lisences
bogao007 Jul 25, 2024
41f8234
fixed format issues
bogao007 Jul 25, 2024
d57633f
fix
bogao007 Jul 25, 2024
df9ea9e
fix format
bogao007 Jul 25, 2024
68f7a7e
doc
bogao007 Jul 25, 2024
ca5216b
addressed comments
bogao007 Jul 26, 2024
c9e3a7c
structured log
bogao007 Jul 26, 2024
2320805
suppress auto generated proto file
bogao007 Jul 29, 2024
6e5de2e
fix linter
bogao007 Jul 29, 2024
200ec5e
fixed dependency issue
bogao007 Jul 29, 2024
dd3e46b
make protobuf as local dependency
bogao007 Jul 30, 2024
e8360d4
fix dependency issue
bogao007 Jul 30, 2024
82983af
fix
bogao007 Jul 30, 2024
49dbc16
fix lint
bogao007 Jul 30, 2024
d4e04ea
fix
bogao007 Jul 30, 2024
e108f60
updated fix
bogao007 Jul 30, 2024
bae26c2
reformat
bogao007 Jul 30, 2024
d96fa9e
addressed comments
bogao007 Jul 31, 2024
92531db
fix linter
bogao007 Jul 31, 2024
d507793
linter
bogao007 Jul 31, 2024
5dcb4c8
addressed comments
bogao007 Aug 2, 2024
37be02a
address comment
bogao007 Aug 2, 2024
f63687f
addressed comments
bogao007 Aug 9, 2024
263c087
Merge branch 'master' into state-v2-initial
bogao007 Aug 10, 2024
c7b0a4f
address comments
bogao007 Aug 12, 2024
c80b292
address comments
bogao007 Aug 12, 2024
81276f3
address comments
bogao007 Aug 14, 2024
5886b5c
fix lint
bogao007 Aug 14, 2024
23e54b4
fix lint
bogao007 Aug 14, 2024
2ba4fd0
address comments
bogao007 Aug 14, 2024
2a9c20b
fix test
bogao007 Aug 14, 2024
removed unrelated log lines, addressed part of the comments
bogao007 committed Jul 17, 2024
commit 9c8c6169a32961fa3237ceaed8c2a82c6f7dea7d
31 changes: 5 additions & 26 deletions python/pyspark/sql/pandas/group_ops.py
@@ -378,24 +378,7 @@ def transformWithStateInPandas(self,
invocations.

The `stateful_processor` should be a Python class that implements the interface defined in
pyspark.sql.streaming.stateful_processor. The stateful processor consists of 3 functions:
`init`, `handleInputRows`, and `close`.

The `init` function will be invoked as the first method that allows for users to initialize
all their state variables and perform other init actions before handling data.

The `handleInputRows` function will allow users to interact with input data rows. It should
take parameters (key, Iterator[`pandas.DataFrame`]) and return another
Iterator[`pandas.DataFrame`]. For each group, all columns are passed together as
`pandas.DataFrame` to the `handleInputRows` function, and the returned `pandas.DataFrame`
across all invocations are combined as a :class:`DataFrame`. Note that the `handleInputRows`
function should not guess the number of elements in the iterator. To process all data, the
`handleInputRows` function needs to iterate over all elements and process them. On the
other hand, the `handleInputRows` function is not strictly required to iterate through all
elements in the iterator if it intends to read only a part of the data.

The `close` function will be called as the last method that allows for users to perform any
cleanup or teardown operations.
pyspark.sql.streaming.stateful_processor.StatefulProcessor.

The `outputStructType` should be a :class:`StructType` describing the schema of all
elements in the returned value, `pandas.DataFrame`. The column labels of all elements in
@@ -410,13 +393,13 @@ def transformWithStateInPandas(self,

Parameters
----------
stateful_processor : StatefulProcessor
Instance of statefulProcessor whose functions will be invoked by the operator.
stateful_processor : :class:`pyspark.sql.streaming.stateful_processor.StatefulProcessor`
Instance of StatefulProcessor whose functions will be invoked by the operator.
outputStructType : :class:`pyspark.sql.types.DataType` or str
the type of the output records. The value can be either a
The type of the output records. The value can be either a
:class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.
outputMode : str
the output mode of the stateful processor.
The output mode of the stateful processor.
timeMode : str
The time mode semantics of the stateful processor for timers and TTL.

@@ -463,14 +446,10 @@ def transformWithStateUDF(state_api_client: StateApiClient, key: Any,
inputRows: Iterator["PandasDataFrameLike"]) -> Iterator["PandasDataFrameLike"]:
handle = StatefulProcessorHandle(state_api_client)

print(f"checking handle state: {state_api_client.handle_state}")
if (state_api_client.handle_state == StatefulProcessorHandleState.CREATED):
print("initializing stateful processor")
stateful_processor.init(handle)
print("setting handle state to initialized")
state_api_client.set_handle_state(StatefulProcessorHandleState.INITIALIZED)

print(f"handling input rows for key: {key[0]}")
state_api_client.set_implicit_key(str(key[0]))
result = stateful_processor.handleInputRows(key, inputRows)
state_api_client.remove_implicit_key()
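Restated outside the closure, the wrapped UDF's control flow is a small state machine: `init()` runs once per worker while the handle is CREATED, and every batch is bracketed by implicit-key management. A sketch under those assumptions (the try/finally is an illustrative hardening, not in the diff):

```python
from typing import Any, Iterator

def drive_processor(
    processor: "StatefulProcessor",
    client: "StateApiClient",
    key: Any,
    rows: Iterator["PandasDataFrameLike"],
) -> Iterator["PandasDataFrameLike"]:
    # One-time initialization: CREATED -> INITIALIZED.
    handle = StatefulProcessorHandle(client)
    if client.handle_state == StatefulProcessorHandleState.CREATED:
        processor.init(handle)
        client.set_handle_state(StatefulProcessorHandleState.INITIALIZED)
    # Scope all state operations to the current grouping key.
    client.set_implicit_key(str(key[0]))
    try:
        yield from processor.handleInputRows(key, rows)
    finally:
        client.remove_implicit_key()
```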
13 changes: 0 additions & 13 deletions python/pyspark/sql/pandas/serializers.py
@@ -1171,7 +1171,6 @@ def load_stream(self, stream):
chunks for each group, so that the caller can lazily materialize the data chunk.
"""
import pyarrow as pa
from itertools import tee

def generate_data_batches(batches):
for batch in batches:
@@ -1180,11 +1179,9 @@ def generate_data_batches(batches):
batch_key = tuple(s[0] for s in key_series)
yield (batch_key, data_pandas)

print("Generating data batches...")
_batches = super(ArrowStreamPandasSerializer, self).load_stream(stream)
data_batches = generate_data_batches(_batches)

print("Returning data batches...")
for k, g in groupby(data_batches, key=lambda x: x[0]):
yield (k, g)

@@ -1196,13 +1193,3 @@ def dump_stream(self, iterator, stream):
"""
result = [(b, t) for x in iterator for y, t in x for b in y]
super().dump_stream(result, stream)

class ImplicitGroupingKeyTracker:
def __init__(self) -> None:
self._key = None

def setKey(self, key: Any) -> None:
self._key = key

def getKey(self) -> Any:
return self._key
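The load_stream contract is easy to see with plain itertools.groupby: (key, pandas chunk) tuples arriving key-sorted are regrouped into (key, lazy iterator of chunks) pairs. A self-contained sketch with toy data (not Spark's actual Arrow stream):

```python
from itertools import groupby

import pandas as pd

# Toy stand-in for the (batch_key, data_pandas) tuples that
# generate_data_batches yields; real input arrives from Arrow record
# batches, key-sorted within the stream.
data_batches = [
    (("a",), pd.DataFrame({"value": [1]})),
    (("a",), pd.DataFrame({"value": [2]})),
    (("b",), pd.DataFrame({"value": [3]})),
]

for key, chunks in groupby(data_batches, key=lambda x: x[0]):
    # `chunks` is a lazy iterator of (key, DataFrame) tuples for this key,
    # matching what the stateful processor consumes in handleInputRows.
    total = sum(df["value"].sum() for _, df in chunks)
    print(key, total)  # (('a',), 3) then (('b',), 3)
```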
13 changes: 0 additions & 13 deletions python/pyspark/sql/streaming/state_api_client.py
@@ -41,12 +41,10 @@ def __init__(
self._client_socket.connect(server_address)
self.sockfile = self._client_socket.makefile("rwb",
int(os.environ.get("SPARK_BUFFER_SIZE",65536)))
Contributor review comment: How do we use the buffer size here?
print(f"client is ready - connection established")
self.handle_state = StatefulProcessorHandleState.CREATED
self.utf8_deserializer = UTF8Deserializer()

def set_handle_state(self, state: StatefulProcessorHandleState) -> None:
Contributor review comment: Could we add 1-2 line comments for these functions?
print(f"setting handle state to: {state}")
proto_state = self._get_proto_state(state)
set_handle_state = stateMessage.SetHandleState(state=proto_state)
handle_call = stateMessage.StatefulProcessorCall(setHandleState=set_handle_state)
@@ -60,40 +58,33 @@ def set_handle_state(self, state: StatefulProcessorHandleState) -> None:
self.handle_state = state
else:
raise Exception(f"Error setting handle state: {response_message.errorMessage}")
print(f"setHandleState status= {status}")

def set_implicit_key(self, key: str) -> None:
print(f"setting implicit key: {key}")
set_implicit_key = stateMessage.SetImplicitKey(key=key)
request = stateMessage.ImplicitGroupingKeyRequest(setImplicitKey=set_implicit_key)
message = stateMessage.StateRequest(implicitGroupingKeyRequest=request)

self._send_proto_message(message)
response_message = self._receive_proto_message()
status = response_message.statusCode
print(f"setImplicitKey status= {status}")
if (status != 0):
raise Exception(f"Error setting implicit key: {response_message.errorMessage}")

def remove_implicit_key(self) -> None:
print(f"removing implicit key")
remove_implicit_key = stateMessage.RemoveImplicitKey()
request = stateMessage.ImplicitGroupingKeyRequest(removeImplicitKey=remove_implicit_key)
message = stateMessage.StateRequest(implicitGroupingKeyRequest=request)

self._send_proto_message(message)
response_message = self._receive_proto_message()
status = response_message.statusCode
print(f"removeImplicitKey status= {status}")
if (status != 0):
raise Exception(f"Error removing implicit key: {response_message.errorMessage}")

def get_value_state(self, state_name: str, schema: Union[StructType, str]) -> None:
if isinstance(schema, str):
schema = cast(StructType, _parse_datatype_string(schema))

print(f"initializing value state: {state_name}")

state_call_command = stateMessage.StateCallCommand()
state_call_command.stateName = state_name
state_call_command.schema = schema.json()
@@ -120,22 +111,18 @@ def _get_proto_state(self,

def _send_proto_message(self, message: stateMessage.StateRequest) -> None:
serialized_msg = message.SerializeToString()
print(f"sending message -- len = {len(serialized_msg)} {str(serialized_msg)}")
write_int(0, self.sockfile)
write_int(len(serialized_msg), self.sockfile)
self.sockfile.write(serialized_msg)
self.sockfile.flush()

def _receive_proto_message(self) -> stateMessage.StateResponse:
serialized_msg = self._receive_str()
print(f"received response message -- len = {len(serialized_msg)} {str(serialized_msg)}")
# proto3 will not serialize the message if the value is default, in this case 0
if (len(serialized_msg) == 0):
return stateMessage.StateResponse(statusCode=0)
message = stateMessage.StateResponse()
message.ParseFromString(serialized_msg.encode('utf-8'))
print(f"received response message -- status = {str(message.statusCode)},"
f" message = {message.errorMessage}")
return message

def _receive_str(self) -> str:
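The client/server exchange above is a simple length-prefixed protobuf framing. A standalone sketch of the write side, assuming `struct.pack(">i", ...)` matches Spark's big-endian `write_int` helper and that the leading 0 is the marker used in `_send_proto_message`:

```python
import struct

def send_frame(sock_file, message) -> None:
    # Mirrors _send_proto_message: a 0 marker, then a 4-byte big-endian
    # length prefix, then the serialized protobuf bytes.
    payload = message.SerializeToString()
    sock_file.write(struct.pack(">i", 0))             # marker, as in the diff
    sock_file.write(struct.pack(">i", len(payload)))  # length prefix
    sock_file.write(payload)
    sock_file.flush()

# On the read side, note the proto3 quirk handled in _receive_proto_message:
# a response whose fields are all default values serializes to zero bytes,
# so an empty payload must be decoded as StateResponse(statusCode=0).
```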
87 changes: 87 additions & 0 deletions python/pyspark/sql/streaming/stateful_processor.py
@@ -33,6 +33,10 @@


class ValueState:
"""
Class used for arbitrary stateful operations with the v2 API to capture single value state.
"""
Contributor review comment: We should not call transformWithState the v2 API, as only a few people would know what v2 is. Please call it by its name.

def __init__(self,
value_state_client: ValueStateClient,
state_name: str,
@@ -42,9 +46,19 @@ def __init__(self,
self.schema = schema

def exists(self) -> bool:
"""
Whether state exists or not.

.. versionadded:: 4.0.0
"""
Member review comment: Adding it at the class-level docstring should be enough.
return self._value_state_client.exists(self._state_name)

def get(self) -> Any:
Contributor review comment: Again, we expect Row as the state value, not a pandas DataFrame. Please let me know if you are proposing a pandas DataFrame as a better fit for more state types.
"""
Get the state value if it exists.

.. versionadded:: 4.0.0
"""
value_str = self._value_state_client.get(self._state_name)
columns = [field.name for field in self.schema.fields]
dtypes = {}
@@ -67,35 +81,108 @@ def get(self) -> Any:
return df

def update(self, new_value: Any) -> None:
"""
Update the value of the state.

.. versionadded:: 4.0.0
"""
self._value_state_client.update(self._state_name, self.schema, new_value)

def clear(self) -> None:
"""
Remove this state.

.. versionadded:: 4.0.0
"""
self._value_state_client.clear(self._state_name)

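Taken together, the four methods support a read-modify-write loop. A sketch against a hypothetical `count_state` variable (created via handle.getValueState, shown below); note that at this stage of the PR the value travels as a UTF-8 string, and get() reconstructs a one-row pandas DataFrame from the declared schema:

```python
# Hypothetical state variable with illustrative column name "count_value",
# assumed to have been created in init() via
# handle.getValueState("count", "count_value int").
if count_state.exists():
    count = int(count_state.get()["count_value"].iloc[0])
else:
    count = 0
count_state.update(str(count + 1))  # value is sent as a UTF-8 string here
# count_state.clear() would drop the value for the current grouping key.
```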

class StatefulProcessorHandle:
"""
Represents the operation handle provided to the stateful processor used in the arbitrary state
API v2.
"""
Contributor review comment: nit: transformWithState

def __init__(
self,
state_api_client: StateApiClient) -> None:
self.state_api_client = state_api_client

def getValueState(self, state_name: str, schema: Union[StructType, str]) -> ValueState:
"""
Function to create a new single value state variable of the given type, or return an
existing one. The user must call this function only within the `init()` method of the
StatefulProcessor.

.. versionadded:: 4.0.0

Parameters
----------
state_name : str
name of the state variable
schema : :class:`pyspark.sql.types.DataType` or str
The schema of the state variable. The value can be either a
:class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.
"""
self.state_api_client.get_value_state(state_name, schema)
return ValueState(ValueStateClient(self.state_api_client), state_name, schema)


class StatefulProcessor(ABC):
"""
Class that represents the arbitrary stateful logic that needs to be provided by the user to
perform stateful manipulations on keyed streams.
"""

@abstractmethod
def init(self, handle: StatefulProcessorHandle) -> None:
"""
Function that will be invoked as the first method, allowing users to initialize all
their state variables and perform other init actions before handling data.

.. versionadded:: 4.0.0

Parameters
----------
handle : :class:`pyspark.sql.streaming.stateful_processor.StatefulProcessorHandle`
Handle to the stateful processor that provides access to the state store and other
stateful processing related APIs.
"""
pass

@abstractmethod
def handleInputRows(
self,
key: Any,
rows: Iterator["PandasDataFrameLike"]) -> Iterator["PandasDataFrameLike"]:
"""
Function that will allow users to interact with input data rows along with the grouping key.
It should take parameters (key, Iterator[`pandas.DataFrame`]) and return another
Iterator[`pandas.DataFrame`]. For each group, all columns are passed together as
`pandas.DataFrame` to the function, and the returned `pandas.DataFrame` across all
invocations are combined as a :class:`DataFrame`. Note that the function should not
guess the number of elements in the iterator. To process all data, the `handleInputRows`
function needs to iterate over all elements and process them. On the other hand, the
`handleInputRows` function is not strictly required to iterate through all elements in the
iterator if it intends to read only a part of the data.

.. versionadded:: 4.0.0

Parameters
----------
key : Any
Grouping key.
rows : iterable of :class:`pandas.DataFrame`
Iterator of input rows associated with the grouping key.
"""
pass

@abstractmethod
def close(self) -> None:
"""
Function called as the last method, allowing users to perform any cleanup or teardown
operations.

.. versionadded:: 4.0.0
"""
pass
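A complete (hypothetical) processor implementing the three-method contract above; the schemas, column names, and counting logic are illustrative only, and the import path is assumed to follow this diff:

```python
from typing import Any, Iterator

import pandas as pd

from pyspark.sql.streaming.stateful_processor import (
    StatefulProcessor,
    StatefulProcessorHandle,
)

class RunningCountProcessor(StatefulProcessor):
    """Illustrative processor: emits a running count of rows per key."""

    def init(self, handle: StatefulProcessorHandle) -> None:
        # Create (or re-attach to) the single-value state before any data arrives.
        self.count_state = handle.getValueState("count", "count_value int")

    def handleInputRows(
        self, key: Any, rows: Iterator[pd.DataFrame]
    ) -> Iterator[pd.DataFrame]:
        count = 0
        if self.count_state.exists():
            count = int(self.count_state.get()["count_value"].iloc[0])
        # Iterate every chunk: the iterator's length must not be guessed.
        for pdf in rows:
            count += len(pdf)
        # At this stage of the PR, the value is shipped as a UTF-8 string.
        self.count_state.update(str(count))
        yield pd.DataFrame({"key": [key[0]], "count": [count]})

    def close(self) -> None:
        # Nothing to tear down in this sketch.
        pass
```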
8 changes: 0 additions & 8 deletions python/pyspark/sql/streaming/value_state_client.py
@@ -29,7 +29,6 @@ def __init__(
self._state_api_client = state_api_client

def exists(self, state_name: str) -> bool:
print(f"checking value state exists: {state_name}")
exists_call = stateMessage.Exists(stateName=state_name)
value_state_call = stateMessage.ValueStateCall(exists=exists_call)
state_variable_request = stateMessage.StateVariableRequest(valueStateCall=value_state_call)
@@ -38,7 +37,6 @@ def exists(self, state_name: str) -> bool:
self._state_api_client._send_proto_message(message)
response_message = self._state_api_client._receive_proto_message()
status = response_message.statusCode
print(f"valueStateExists status= {status}")
if (status == 0):
return True
elif (status == -1):
Expand All @@ -48,7 +46,6 @@ def exists(self, state_name: str) -> bool:
raise Exception(f"Error checking value state exists: {response_message.errorMessage}")

def get(self, state_name: str) -> Any:
print(f"getting value state: {state_name}")
get_call = stateMessage.Get(stateName=state_name)
value_state_call = stateMessage.ValueStateCall(get=get_call)
state_variable_request = stateMessage.StateVariableRequest(valueStateCall=value_state_call)
@@ -57,7 +54,6 @@ def get(self, state_name: str) -> Any:
self._state_api_client._send_proto_message(message)
response_message = self._state_api_client._receive_proto_message()
status = response_message.statusCode
print(f"valueStateGet status= {status}")
if (status == 0):
return self._state_api_client._receive_str()
else:
Expand All @@ -66,7 +62,6 @@ def get(self, state_name: str) -> Any:
def update(self, state_name: str, schema: Union[StructType, str], value: str) -> None:
if isinstance(schema, str):
schema = cast(StructType, _parse_datatype_string(schema))
print(f"updating value state: {state_name}")
byteStr = value.encode('utf-8')
update_call = stateMessage.Update(stateName=state_name, schema=schema.json(), value=byteStr)
value_state_call = stateMessage.ValueStateCall(update=update_call)
@@ -76,12 +71,10 @@ def update(self, state_name: str, schema: Union[StructType, str], value: str) ->
self._state_api_client._send_proto_message(message)
response_message = self._state_api_client._receive_proto_message()
status = response_message.statusCode
print(f"valueStateUpdate status= {status}")
if (status != 0):
raise Exception(f"Error updating value state: {response_message.errorMessage}")

def clear(self, state_name: str) -> None:
print(f"clearing value state: {state_name}")
clear_call = stateMessage.Clear(stateName=state_name)
value_state_call = stateMessage.ValueStateCall(clear=clear_call)
state_variable_request = stateMessage.StateVariableRequest(valueStateCall=value_state_call)
@@ -90,6 +83,5 @@ def clear(self, state_name: str) -> None:
self._state_api_client._send_proto_message(message)
response_message = self._state_api_client._receive_proto_message()
status = response_message.statusCode
print(f"valueStateClear status= {status}")
if (status != 0):
raise Exception(f"Error clearing value state: {response_message.errorMessage}")
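exists/get/update/clear all repeat the same build-send-check choreography. One possible consolidation, shown only to make the shared pattern explicit (not part of this PR; exists() would still need its own 0/-1 status handling, and get() its extra payload read):

```python
def _call_value_state(client, value_state_call, error_prefix: str):
    # Hypothetical helper: wrap a ValueStateCall, send it, and fail loudly
    # on a non-zero status, mirroring the four methods above.
    request = stateMessage.StateVariableRequest(valueStateCall=value_state_call)
    message = stateMessage.StateRequest(stateVariableRequest=request)
    client._send_proto_message(message)
    response = client._receive_proto_message()
    if response.statusCode != 0:
        raise Exception(f"{error_prefix}: {response.errorMessage}")
    return response

# With it, clear() collapses to roughly:
#   _call_value_state(
#       self._state_api_client,
#       stateMessage.ValueStateCall(clear=stateMessage.Clear(stateName=state_name)),
#       "Error clearing value state",
#   )
```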
2 changes: 1 addition & 1 deletion python/pyspark/worker.py
@@ -1631,7 +1631,7 @@ def extract_key_value_indexes(grouped_arg_offsets):
# support combining multiple UDFs.
assert num_udfs == 1

# See FlatMapGroupsInPandasExec for how arg_offsets are used to
# See TransformWithStateInPandasExec for how arg_offsets are used to
# distinguish between grouping attributes and data attributes
arg_offsets, f = read_single_udf(
pickleSer, infile, eval_type, runner_conf, udf_index=0, profiler=profiler