init #24 (Closed)

Changes from 1 commit
Commits (32)
9f038aa  [SPARK-50112] Moving Avro files to sql/core so they can be used by Tr… (ericm-db, Oct 25, 2024)
28c3dbd  moving scala to scala dir (ericm-db, Oct 25, 2024)
2e33fd1  adding deprecated one (ericm-db, Oct 25, 2024)
b037859  init (ericm-db, Oct 25, 2024)
c1db91d  adding enum (ericm-db, Oct 25, 2024)
a30a29d  feedback and test (ericm-db, Oct 25, 2024)
2ebf6a8  creating utils class (ericm-db, Oct 25, 2024)
0559480  micheal feedback (ericm-db, Oct 31, 2024)
d3845a5  ValueState post-refactor (ericm-db, Nov 1, 2024)
35b3b0d  multivalue state encoder (ericm-db, Nov 1, 2024)
dcf0df7  encodeToUnsafeRow avro method (ericm-db, Nov 2, 2024)
dfc6b1e  using correct val (ericm-db, Nov 4, 2024)
5b98aa6  comments (ericm-db, Nov 4, 2024)
0d37ffd  calling encodeUnsafeRow (ericm-db, Nov 4, 2024)
9a1f825  merge into upstream/master (ericm-db, Nov 5, 2024)
5c8dd33  Merge remote-tracking branch 'upstream/master' into avro (ericm-db, Nov 5, 2024)
9b8dd5d  [SPARK-50127] Implement Avro encoding for MapState and PrefixKeyScanS… (ericm-db, Nov 7, 2024)
448ea76  making schema conversion lazy (ericm-db, Nov 7, 2024)
386fbf1  batch succeeds (ericm-db, Nov 7, 2024)
896e24f  actually enabling ttl (ericm-db, Nov 7, 2024)
15c5f71  including hidden files (ericm-db, Nov 7, 2024)
1f5e5f7  testWithEncodingTypes (ericm-db, Nov 7, 2024)
1826d5a  no longer relying on unsaferow (ericm-db, Nov 8, 2024)
c5ef895  everything but batch works (ericm-db, Nov 8, 2024)
e22e1a2  splitting it up (ericm-db, Nov 8, 2024)
730cae0  easy feedback to address (ericm-db, Nov 9, 2024)
754ce6c  batch works (ericm-db, Nov 9, 2024)
b6dbfdb  added test suite for non-contiguous ordinals (ericm-db, Nov 11, 2024)
e6f0b7a  using negative/null val marker (ericm-db, Nov 11, 2024)
ca660c0  removing log line (ericm-db, Nov 11, 2024)
41de8ae  getAvroEnc (ericm-db, Nov 11, 2024)
c49acd2  init (ericm-db, Nov 5, 2024)
easy feedback to address
ericm-db committed Nov 9, 2024
commit 730cae08f5d6e3308a0f56ee6a06cec385b814a8
@@ -259,13 +259,13 @@ class IncrementalExecution(
     }
   }
 
-  object StateStoreColumnFamilySchemas extends SparkPlanPartialRule {
+  object StateStoreColumnFamilySchemasRule extends SparkPlanPartialRule {
     override val rule: PartialFunction[SparkPlan, SparkPlan] = {
       case statefulOp: StatefulOperator =>
         statefulOp match {
-          case transformWithStateExec: TransformWithStateExec =>
-            transformWithStateExec.copy(
-              columnFamilySchemas = transformWithStateExec.getColFamilySchemas()
+          case op: TransformWithStateExec =>
+            op.copy(
+              columnFamilySchemas = op.getColFamilySchemas()
             )
           case _ => statefulOp
         }
@@ -565,7 +565,7 @@
     // The rule below doesn't change the plan but can cause the side effect that
     // metadata/schema is written in the checkpoint directory of stateful operator.
     planWithStateOpId transform StateSchemaAndOperatorMetadataRule.rule
-    val planWithStateSchemas = planWithStateOpId transform StateStoreColumnFamilySchemas.rule
+    val planWithStateSchemas = planWithStateOpId transform StateStoreColumnFamilySchemasRule.rule
     simulateWatermarkPropagation(planWithStateSchemas)
     planWithStateSchemas transform WatermarkPropagationRule.rule
   }
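For context on the hunks above: they only rename the partial rule object and shorten a pattern variable. The rule itself is a PartialFunction applied over the physical plan via transform. Below is a minimal, self-contained sketch of that pattern, using toy Plan classes as stand-ins rather than Spark's actual SparkPlan API.

// Toy stand-ins for SparkPlan nodes; not Spark's actual classes.
sealed trait Plan { def children: Seq[Plan] }
case class StatefulOp(schemas: Map[String, String], children: Seq[Plan]) extends Plan
case class OtherOp(children: Seq[Plan]) extends Plan

object PartialRuleSketch {
  // Apply a partial rule top-down, leaving nodes the rule does not cover unchanged.
  def transform(plan: Plan, rule: PartialFunction[Plan, Plan]): Plan = {
    val applied = rule.applyOrElse(plan, identity[Plan])
    applied match {
      case s: StatefulOp => s.copy(children = s.children.map(transform(_, rule)))
      case o: OtherOp => o.copy(children = o.children.map(transform(_, rule)))
    }
  }

  // Rough analogue of the renamed rule: fill in schemas on the stateful operator.
  val fillSchemas: PartialFunction[Plan, Plan] = {
    case s: StatefulOp if s.schemas.isEmpty => s.copy(schemas = Map("default" -> "avro"))
  }

  def main(args: Array[String]): Unit = {
    val plan = OtherOp(Seq(StatefulOp(Map.empty, Nil)))
    println(transform(plan, fillSchemas))
    // OtherOp(List(StatefulOp(Map(default -> avro),List())))
  }
}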
@@ -303,56 +303,6 @@ object StreamingSymmetricHashJoinHelper extends Logging {
     }
   }
 
-  /**
-   * A custom RDD that allows partitions to be "zipped" together, while ensuring the tasks'
-   * preferred location is based on which executors have the required join state stores already
-   * loaded. This class is a variant of [[org.apache.spark.rdd.ZippedPartitionsRDD2]] which only
-   * changes signature of `f` by taking in a map of column family schemas. This is used for
-   * passing the column family schemas when there is initial state for the TransformWithStateExec
-   * operator
-   */
-  class StateStoreAwareZipPartitionsRDDWithSchemas[A: ClassTag, B: ClassTag, V: ClassTag](
-      sc: SparkContext,
-      var f: (Int, Iterator[A], Iterator[B], Map[String, StateStoreColFamilySchema]) => Iterator[V],
-      var rdd1: RDD[A],
-      var rdd2: RDD[B],
-      stateInfo: StatefulOperatorStateInfo,
-      stateStoreNames: Seq[String],
-      @transient private val storeCoordinator: Option[StateStoreCoordinatorRef],
-      schemas: Map[String, StateStoreColFamilySchema])
-    extends ZippedPartitionsBaseRDD[V](sc, List(rdd1, rdd2)) {
-
-    /**
-     * Set the preferred location of each partition using the executor that has the related
-     * [[StateStoreProvider]] already loaded.
-     */
-    override def getPreferredLocations(partition: Partition): Seq[String] = {
-      stateStoreNames.flatMap { storeName =>
-        val stateStoreProviderId = StateStoreProviderId(stateInfo, partition.index, storeName)
-        storeCoordinator.flatMap(_.getLocation(stateStoreProviderId))
-      }.distinct
-    }
-
-    override def compute(s: Partition, context: TaskContext): Iterator[V] = {
-      val partitions = s.asInstanceOf[ZippedPartitionsPartition].partitions
-      if (partitions(0).index != partitions(1).index) {
-        throw new IllegalStateException(s"Partition ID should be same in both side: " +
-          s"left ${partitions(0).index} , right ${partitions(1).index}")
-      }
-
-      val partitionId = partitions(0).index
-      f(partitionId, rdd1.iterator(partitions(0), context),
-        rdd2.iterator(partitions(1), context), schemas)
-    }
-
-    override def clearDependencies(): Unit = {
-      super.clearDependencies()
-      rdd1 = null
-      rdd2 = null
-      f = null
-    }
-  }
-
   implicit class StateStoreAwareZipPartitionsHelper[T: ClassTag](dataRDD: RDD[T]) {
     /**
      * Function used by `StreamingSymmetricHashJoinExec` to zip together the partitions of two
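The class deleted above was a schema-carrying variant of the state-store-aware zip-partitions RDD: it pairs same-index partitions of two RDDs and hands both iterators, plus the column family schema map, to the user function f. A rough sketch of that pairing logic, with plain collections standing in for RDDs (hypothetical names, not Spark's API):

object ZipPartitionsSketch {
  // Pair up same-index "partitions" of two datasets and apply f to each pair,
  // passing a shared schema map along, as the removed RDD variant did.
  def zipPartitions[A, B, V](
      left: Seq[Seq[A]],
      right: Seq[Seq[B]],
      schemas: Map[String, String])(
      f: (Int, Iterator[A], Iterator[B], Map[String, String]) => Iterator[V]): Seq[Seq[V]] = {
    // Mirrors the index check in compute(): both sides must line up partition by partition.
    require(left.size == right.size, "both sides must have the same number of partitions")
    left.indices.map { partitionId =>
      f(partitionId, left(partitionId).iterator, right(partitionId).iterator, schemas).toSeq
    }
  }

  def main(args: Array[String]): Unit = {
    val out = zipPartitions(Seq(Seq(1, 2), Seq(3)), Seq(Seq("a", "b"), Seq("c")),
      Map("default" -> "avro")) { (id, as, bs, schemas) =>
      as.zip(bs).map { case (a, b) => s"$id: $a-$b (${schemas.size} schema(s))" }
    }
    out.flatten.foreach(println)
  }
}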
@@ -685,7 +685,9 @@ class RangeKeyScanStateEncoder(
   }
 
   def encodePrefixKeyForRangeScan(
-      row: UnsafeRow, avroType: Schema): Array[Byte] = {
+      row: UnsafeRow,
+      avroType: Schema
+  ): Array[Byte] = {
     val record = new GenericData.Record(avroType)
     var fieldIdx = 0
     rangeScanKeyFieldsWithOrdinal.zipWithIndex.foreach { case (fieldWithOrdinal, idx) =>
@@ -887,11 +889,11 @@
  * It uses the first byte of the generated byte array to store the version the describes how the
  * row is encoded in the rest of the byte array. Currently, the default version is 0,
  *
- * If the avroEnc is specified, we are using Avro encoding for this column family's keys
  * VERSION 0: [ VERSION (1 byte) | ROW (N bytes) ]
  * The bytes of a UnsafeRow is written unmodified to starting from offset 1
  * (offset 0 is the version byte of value 0). That is, if the unsafe row has N bytes,
  * then the generated array byte will be N+1 bytes.
+ * If the avroEnc is specified, we are using Avro encoding for this column family's keys
  */
 class NoPrefixKeyStateEncoder(
     keySchema: StructType,
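The comment touched in the hunk above describes the VERSION 0 key layout: one version byte followed by the row bytes unchanged, so an N-byte row becomes an N+1-byte array. A minimal sketch of that framing on raw byte arrays, assuming plain Array[Byte] input rather than the actual UnsafeRow-based encoder:

object VersionedKeyFraming {
  val Version0: Byte = 0

  // Encode: [ VERSION (1 byte) | ROW (N bytes) ], N + 1 bytes total.
  def encode(rowBytes: Array[Byte]): Array[Byte] = {
    val out = new Array[Byte](rowBytes.length + 1)
    out(0) = Version0
    System.arraycopy(rowBytes, 0, out, 1, rowBytes.length)
    out
  }

  // Decode: check the version byte, then return the remaining N bytes.
  def decode(encoded: Array[Byte]): Array[Byte] = {
    require(encoded.nonEmpty && encoded(0) == Version0, s"unsupported version: ${encoded.headOption}")
    java.util.Arrays.copyOfRange(encoded, 1, encoded.length)
  }

  def main(args: Array[String]): Unit = {
    val row = Array[Byte](1, 2, 3)
    val enc = encode(row)
    assert(enc.length == row.length + 1)
    assert(decode(enc).sameElements(row))
    println(enc.mkString(", ")) // 0, 1, 2, 3
  }
}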
@@ -89,49 +89,6 @@ package object state {
         extraOptions,
         useMultipleValuesPerKey)
     }
-
-    /** Map each partition of an RDD along with data in a [[StateStore]] that passes the
-     * column family schemas to the storeUpdateFunction. Used to pass Avro encoders/decoders
-     * to executors */
-    def mapPartitionsWithStateStoreWithSchemas[U: ClassTag](
-        stateInfo: StatefulOperatorStateInfo,
-        keySchema: StructType,
-        valueSchema: StructType,
-        keyStateEncoderSpec: KeyStateEncoderSpec,
-        sessionState: SessionState,
-        storeCoordinator: Option[StateStoreCoordinatorRef],
-        useColumnFamilies: Boolean = false,
-        extraOptions: Map[String, String] = Map.empty,
-        useMultipleValuesPerKey: Boolean = false,
-        columnFamilySchemas: Map[String, StateStoreColFamilySchema] = Map.empty)(
-        storeUpdateFunction: (StateStore, Iterator[T], Map[String, StateStoreColFamilySchema]) => Iterator[U]): StateStoreRDD[T, U] = {
-
-      val cleanedF = dataRDD.sparkContext.clean(storeUpdateFunction)
-      val wrappedF = (store: StateStore, iter: Iterator[T]) => {
-        // Abort the state store in case of error
-        TaskContext.get().addTaskCompletionListener[Unit](_ => {
-          if (!store.hasCommitted) store.abort()
-        })
-        cleanedF(store, iter, columnFamilySchemas)
-      }
-
-      new StateStoreRDD(
-        dataRDD,
-        wrappedF,
-        stateInfo.checkpointLocation,
-        stateInfo.queryRunId,
-        stateInfo.operatorId,
-        stateInfo.storeVersion,
-        stateInfo.stateStoreCkptIds,
-        keySchema,
-        valueSchema,
-        keyStateEncoderSpec,
-        sessionState,
-        storeCoordinator,
-        useColumnFamilies,
-        extraOptions,
-        useMultipleValuesPerKey)
-    }
     // scalastyle:on
 
     /** Map each partition of an RDD along with data in a [[ReadStateStore]]. */
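The helper deleted above mostly wrapped the user's storeUpdateFunction: it threaded the column family schema map through and registered a task-completion hook that aborts the store if it never committed. A toy sketch of that wrapping pattern, with a simplified Store trait and a plain callback standing in for TaskContext (illustrative names only, not Spark's API):

object WrappedStoreUpdateSketch {
  // Simplified stand-in for StateStore.
  trait Store {
    def hasCommitted: Boolean
    def abort(): Unit
  }

  // Wrap the user's update function so that the schema map is passed through and
  // the store is aborted when the task finishes without a commit.
  def wrap[T, U](
      schemas: Map[String, String],
      onTaskCompletion: (() => Unit) => Unit)(
      f: (Store, Iterator[T], Map[String, String]) => Iterator[U]): (Store, Iterator[T]) => Iterator[U] = {
    (store, iter) => {
      onTaskCompletion(() => if (!store.hasCommitted) store.abort())
      f(store, iter, schemas)
    }
  }

  def main(args: Array[String]): Unit = {
    var listeners = List.empty[() => Unit]
    val store = new Store {
      def hasCommitted: Boolean = false
      def abort(): Unit = println("store aborted")
    }
    val wrapped = wrap[Int, String](Map("default" -> "avro"), cb => listeners ::= cb) {
      (_, iter, schemas) => iter.map(i => s"$i with ${schemas.size} schema(s)")
    }
    wrapped(store, Iterator(1, 2)).foreach(println)
    // Simulate task completion: the store never committed, so it gets aborted.
    listeners.foreach(_.apply())
  }
}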