files written correctly

ericm-db · ericm-db · Jul 8, 2024 · Jul 8, 2024 · Jul 8, 2024 · Jul 9, 2024
commit 37392bf9233c7fc44d3a4952ece39ae5eb69237b
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala
@@ -40,6 +40,7 @@ import org.apache.spark.sql.catalyst.streaming.InternalOutputModes._
 import org.apache.spark.sql.connector.catalog.{SupportsWrite, Table}
 import org.apache.spark.sql.connector.read.streaming.{Offset => OffsetV2, ReadLimit, SparkDataStream}
 import org.apache.spark.sql.connector.write.{LogicalWriteInfoImpl, SupportsTruncate, Write}
+import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.execution.command.StreamingExplainCommand
 import org.apache.spark.sql.execution.streaming.sources.ForeachBatchUserFuncException
 import org.apache.spark.sql.internal.SQLConf
@@ -239,6 +240,23 @@ abstract class StreamExecution(
    */
   val commitLog = new CommitLog(sparkSession, checkpointFile("commits"))
 
+
+  /**
+   * Operator state metadata logs keyed by operator id. Computed on first access from the
+   * executed plan of the latest execution context.
+   */
+  lazy val operatorStateMetadataLogs: Map[Long, OperatorStateMetadataLog] = {
+    val latestExecutedPlan = getLatestExecutionContext().executionPlan.executedPlan
+    populateOperatorStateMetadatas(latestExecutedPlan)
+  }
+
+  /**
+   * Builds one [[OperatorStateMetadataLog]] per [[StateStoreWriter]] node found in `plan`
+   * whose `stateInfo` is defined, keyed by that info's operator id.
+   *
+   * @param plan physical plan to scan for state store writers
+   * @return map from operator id to the metadata log rooted at the writer's metadata file path
+   */
+  private def populateOperatorStateMetadatas(
+      plan: SparkPlan): Map[Long, OperatorStateMetadataLog] = {
+    plan.flatMap {
+      case writer: StateStoreWriter =>
+        // Writers without state info contribute nothing to the result.
+        for (info <- writer.stateInfo) yield {
+          val logLocation = writer.metadataFilePath()
+          info.operatorId -> new OperatorStateMetadataLog(sparkSession, logLocation.toString)
+        }
+      case _ => Seq.empty
+    }.toMap
+  }
+
   /**
    * Whether all fields of the query have been initialized.
    *
    * @return true when the current value of `state` is anything other than INITIALIZING
    */
   private def isInitialized: Boolean = state.get != INITIALIZING
 

diff --git a/...core/src/main/scala/org/apache/spark/sql/execution/streaming/TransformWithStateExec.scala b/...core/src/main/scala/org/apache/spark/sql/execution/streaming/TransformWithStateExec.scala
@@ -440,6 +440,20 @@ case class TransformWithStateExec(
     OperatorStateMetadataV2(operatorInfo, stateStoreInfo, json)
   }
 
+  /**
+   * Resolves the state schema location for this operator:
+   * `<checkpointLocation>/<operatorId>[/<storeName>]/_metadata/schema`.
+   *
+   * @param storeName optional store name; when given, it is inserted as a directory level
+   *                  between the operator directory and `_metadata`
+   * @return path of the schema location under the operator's state checkpoint directory
+   */
+  private def stateSchemaFilePath(storeName: Option[String] = None): Path = {
+    val operatorDir =
+      new Path(getStateInfo.checkpointLocation, getStateInfo.operatorId.toString)
+    // Without a store name the metadata directory sits directly under the operator directory.
+    val metadataParent = storeName.fold(operatorDir)(name => new Path(operatorDir, name))
+    new Path(new Path(metadataParent, "_metadata"), "schema")
+  }
+
   override protected def doExecute(): RDD[InternalRow] = {
     metrics // force lazy init at driver
 

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateSuite.scala
@@ -804,10 +804,24 @@ class TransformWithStateSuite extends StateStoreMetricsTest
     }
   }
 
+  /**
+   * Opens the operator state metadata log for `operatorId` under `checkpointDir`.
+   *
+   * @param checkpointDir root checkpoint directory of the query
+   * @param operatorId id of the stateful operator, used as the `state/<operatorId>` directory
+   * @return a metadata log rooted at the operator's metadata file path
+   */
+  private def fetchOperatorStateMetadataLog(
+      checkpointDir: String,
+      operatorId: Int): OperatorStateMetadataLog = {
+    val conf = spark.sessionState.newHadoopConf()
+    val operatorStateDir = new Path(checkpointDir, s"state/$operatorId")
+    val metadataLocation = OperatorStateMetadataV2.metadataFilePath(operatorStateDir)
+    new OperatorStateMetadataLog(conf, metadataLocation.toString)
+  }
+
   private def fetchColumnFamilySchemas(
       checkpointDir: String,
       operatorId: Int): List[ColumnFamilySchema] = {
-    fetchStateSchemaV3File(checkpointDir, operatorId).getLatest().get._2
+    // Look up the schema file path recorded in the latest operator metadata entry (first
+    // state store only), then read the column family schemas stored at that path.
+    val latestEntry = fetchOperatorStateMetadataLog(checkpointDir, operatorId).getLatest().get
+    val metadataV2 = latestEntry._2.asInstanceOf[OperatorStateMetadataV2]
+    val schemaLocation = metadataV2.stateStoreInfo.head.stateSchemaFilePath
+    fetchStateSchemaV3File(checkpointDir, operatorId).get(new Path(schemaLocation))
   }
 
   private def fetchStateSchemaV3File(