hdfsmetadatalog
ericm-db committed Jul 9, 2024
commit 77ffe9531c082ee6a47d2a62f86779ff4d957690
@@ -25,6 +25,7 @@ import scala.jdk.CollectionConverters._
 import scala.reflect.ClassTag
 
 import org.apache.commons.io.IOUtils
+import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs._
 import org.json4s.{Formats, NoTypeHints}
 import org.json4s.jackson.Serialization
@@ -47,10 +48,25 @@ import org.apache.spark.util.ArrayImplicits._
  *
  * Note: [[HDFSMetadataLog]] doesn't support S3-like file systems as they don't guarantee listing
  * files in a directory always shows the latest files.
+ * @param hadoopConf Hadoop configuration that is used to read / write metadata files.
+ * @param path Path to the directory that will be used for writing metadata.
+ * @param metadataCacheEnabled Whether to cache the batches' metadata in memory.
  */
-class HDFSMetadataLog[T <: AnyRef : ClassTag](sparkSession: SparkSession, path: String)
+class HDFSMetadataLog[T <: AnyRef : ClassTag](
+    hadoopConf: Configuration,
+    path: String,
+    val metadataCacheEnabled: Boolean = false)
   extends MetadataLog[T] with Logging {
 
+  def this(sparkSession: SparkSession, path: String) = {
+    this(
+      sparkSession.sessionState.newHadoopConf(),
+      path,
+      metadataCacheEnabled = sparkSession.sessionState.conf.getConf(
+        SQLConf.STREAMING_METADATA_CACHE_ENABLED)
+    )
+  }
+
   private implicit val formats: Formats = Serialization.formats(NoTypeHints)
 
   /** Needed to serialize type T into JSON when using Jackson */
@@ -64,15 +80,12 @@ class HDFSMetadataLog[T <: AnyRef : ClassTag](sparkSession: SparkSession, path:
   val metadataPath = new Path(path)
 
   protected val fileManager =
-    CheckpointFileManager.create(metadataPath, sparkSession.sessionState.newHadoopConf())
+    CheckpointFileManager.create(metadataPath, hadoopConf)
 
   if (!fileManager.exists(metadataPath)) {
     fileManager.mkdirs(metadataPath)
   }
 
-  protected val metadataCacheEnabled: Boolean
-    = sparkSession.sessionState.conf.getConf(SQLConf.STREAMING_METADATA_CACHE_ENABLED)
-
   /**
    * Cache the latest two batches. [[StreamExecution]] usually just accesses the latest two batches
    * when committing offsets, this cache will save some file system operations.
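For reference, a minimal sketch (not part of the commit) of how the class can be constructed after this change. The checkpoint path and session setup here are hypothetical, and HDFSMetadataLog lives in Spark's internal org.apache.spark.sql.execution.streaming package, so this is illustrative rather than a supported public API:

import org.apache.hadoop.conf.Configuration
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.streaming.HDFSMetadataLog

// Hypothetical checkpoint location for illustration only.
val path = "/tmp/checkpoints/offsets"

// New primary constructor: only a Hadoop Configuration is required, so the
// log can be created without an active SparkSession. The cache flag is now
// passed in explicitly instead of being read from SQLConf inside the class.
val hadoopConf = new Configuration()
val log = new HDFSMetadataLog[String](hadoopConf, path, metadataCacheEnabled = true)
log.add(0, "batch-0 metadata")

// Existing call sites keep compiling through the secondary constructor, which
// derives both the Configuration and the cache flag from the session state.
val spark = SparkSession.builder().master("local[*]").getOrCreate()
val sessionLog = new HDFSMetadataLog[String](spark, path)

Taking the Configuration directly also decouples the log from the session, so a bare Configuration can stand in for a full SparkSession in tests.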