[SPARK-15698][SQL][Streaming] Add the ability to remove the old MetadataLog in FileStreamSource #13513
Changes from 1 commit (6cc43a3)
@@ -17,20 +17,36 @@

package org.apache.spark.sql.execution.streaming

<<<<<<< 92ce8d4849a0341c4636e70821b7be57ad3055b1
import scala.collection.JavaConverters._
=======
import java.util.UUID

import org.apache.hadoop.fs.Path
import scala.collection.mutable.ArrayBuffer
import scala.util.control.NonFatal
>>>>>>> Add the ability to remove the old MetadataLog in FileStreamSource

import org.apache.hadoop.fs.{Path, PathFilter}

import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.internal.Logging
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
<<<<<<< 92ce8d4849a0341c4636e70821b7be57ad3055b1
import org.apache.spark.sql.execution.datasources.{DataSource, ListingFileCatalog, LogicalRelation}
=======
import org.apache.spark.sql.execution.datasources.{CaseInsensitiveMap, DataSource, ListingFileCatalog, LogicalRelation}
import org.apache.spark.sql.internal.SQLConf
>>>>>>> Add the ability to remove the old MetadataLog in FileStreamSource
import org.apache.spark.sql.types.StructType

/**
<<<<<<< 92ce8d4849a0341c4636e70821b7be57ad3055b1
 * A very simple source that reads files from the given directory as they appear.
 *
 * TODO: Clean up the metadata log files periodically.
=======
 * A very simple source that reads text files from the given directory as they appear.
>>>>>>> Add the ability to remove the old MetadataLog in FileStreamSource
 */
class FileStreamSource(
    sparkSession: SparkSession,

@@ -40,6 +56,7 @@ class FileStreamSource(
    metadataPath: String,
    options: Map[String, String]) extends Source with Logging {

<<<<<<< 92ce8d4849a0341c4636e70821b7be57ad3055b1
  import FileStreamSource._

  private val sourceOptions = new FileStreamOptions(options)

@@ -51,6 +68,11 @@ class FileStreamSource(

  private val metadataLog = new HDFSMetadataLog[Array[FileEntry]](sparkSession, metadataPath)

=======
  private val fs = new Path(path).getFileSystem(sparkSession.sessionState.newHadoopConf())
  private val qualifiedBasePath = fs.makeQualified(new Path(path)) // can contains glob patterns
  private val metadataLog = new FileStreamSourceLog(sparkSession, metadataPath)
>>>>>>> Add the ability to remove the old MetadataLog in FileStreamSource
  private var maxBatchId = metadataLog.getLatest().map(_._1).getOrElse(-1L)

  /** Maximum number of new files to be considered in each batch */

@@ -234,3 +256,86 @@ object FileStreamSource {
    }
  }
}

class FileStreamSourceLog(sparkSession: SparkSession, path: String)
  extends HDFSMetadataLog[Seq[String]](sparkSession, path) {

  // Configurations about metadata compaction
  private val compactInterval = sparkSession.conf.get(SQLConf.FILE_SOURCE_LOG_COMPACT_INTERVAL)
  require(compactInterval > 0,
    s"Please set ${SQLConf.FILE_SOURCE_LOG_COMPACT_INTERVAL.key} (was $compactInterval) to a " +
      s"positive value.")

  private val fileCleanupDelayMs = sparkSession.conf.get(SQLConf.FILE_SOURCE_LOG_CLEANUP_DELAY)

  private val isDeletingExpiredLog = sparkSession.conf.get(SQLConf.FILE_SOURCE_LOG_DELETION)

  private var compactBatchId: Long = -1L

  private def isCompactionBatch(batchId: Long, compactInterval: Long): Boolean = {
    batchId % compactInterval == 0
  }

  override def add(batchId: Long, metadata: Seq[String]): Boolean = {
    if (isCompactionBatch(batchId, compactInterval)) {
      compactMetadataLog(batchId - 1)
    }

    super.add(batchId, metadata)
  }

  private def compactMetadataLog(batchId: Long): Unit = {
    // read out compact metadata and merge with new metadata.
    val batches = super.get(Some(compactBatchId), Some(batchId))
    val totalMetadata = batches.flatMap(_._2)
    if (totalMetadata.isEmpty) {
      return
    }

    // Remove old compact metadata file and rewrite.
    val renamedPath = new Path(path, s".${batchId.toString}-${UUID.randomUUID.toString}.tmp")
    fileManager.rename(batchIdToPath(batchId), renamedPath)

    var isSuccess = false
    try {
      isSuccess = super.add(batchId, totalMetadata)
    } catch {
      case NonFatal(e) => isSuccess = false
    } finally {
      if (!isSuccess) {
        // Rollback to the previous status if compaction is failed.
        fileManager.delete(batchIdToPath(batchId))
        fileManager.rename(renamedPath, batchIdToPath(batchId))
        return
      } else {
        fileManager.delete(renamedPath)
      }
    }

    compactBatchId = batchId

    // Remove expired metadata log
    if (isDeletingExpiredLog) {
      removeOlderThan(compactBatchId)
    }
  }

  private def removeOlderThan(batchId: Long): Unit = {
    val expiredTime = System.currentTimeMillis() - fileCleanupDelayMs
    fileManager.list(metadataPath, new PathFilter {
      override def accept(path: Path): Boolean = {
        try {
          val id = pathToBatchId(path)
          id < batchId
        } catch {
          case _: NumberFormatException =>
            false
        }
      }
    }).foreach { f =>
      if (f.getModificationTime <= expiredTime) {
        fileManager.delete(f.getPath)
      }
    }
  }
}
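For orientation (not part of the diff): with the default compact interval of 10, `isCompactionBatch` treats batches 0, 10, 20, ... as compaction batches, and the `add` override folds everything since the last compaction into a single log file before writing the new batch. Below is a minimal, self-contained sketch of just that cadence check; the object name is made up, and the interval value is simply the default from the SQLConf change that follows.

```scala
object CompactionCadenceSketch {
  // Mirrors FileStreamSourceLog.isCompactionBatch: a batch triggers compaction
  // when its id is an exact multiple of the configured compact interval.
  def isCompactionBatch(batchId: Long, compactInterval: Long): Boolean =
    batchId % compactInterval == 0

  def main(args: Array[String]): Unit = {
    // Default of spark.sql.streaming.fileSource.log.compactInterval (see SQLConf diff below).
    val compactInterval = 10L
    val compactionBatches = (0L to 30L).filter(isCompactionBatch(_, compactInterval))
    println(compactionBatches.mkString(", ")) // prints: 0, 10, 20, 30
  }
}
```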
@@ -544,7 +544,28 @@ object SQLConf {
       .internal()
       .doc("How long that a file is guaranteed to be visible for all readers.")
       .timeConf(TimeUnit.MILLISECONDS)
-      .createWithDefault(60 * 1000L) // 10 minutes
+      .createWithDefault(60 * 10 * 1000L) // 10 minutes
+
+  val FILE_SOURCE_LOG_DELETION = SQLConfigBuilder("spark.sql.streaming.fileSource.log.deletion")
+    .internal()
+    .doc("Whether to delete the expired log files in file stream source.")
+    .booleanConf
+    .createWithDefault(true)
+
+  val FILE_SOURCE_LOG_COMPACT_INTERVAL =
+    SQLConfigBuilder("spark.sql.streaming.fileSource.log.compactInterval")
+      .internal()
+      .doc("Number of log files after which all the previous files " +
+        "are compacted into the next log file.")
+      .intConf
+      .createWithDefault(10)
+
+  val FILE_SOURCE_LOG_CLEANUP_DELAY =
+    SQLConfigBuilder("spark.sql.streaming.fileSource.log.cleanupDelay")
+      .internal()
+      .doc("How long in milliseconds a file is guaranteed to be visible for all readers.")
+      .timeConf(TimeUnit.MILLISECONDS)
+      .createWithDefault(60 * 10 * 1000L) // 10 minutes
 
   val STREAMING_SCHEMA_INFERENCE =
     SQLConfigBuilder("spark.sql.streaming.schemaInference")
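As a usage note (not part of the patch): once this change is in, the three new options can be set like any other SQL conf. The sketch below assumes the patch is applied; the config keys come from this diff, the builder calls are standard Spark API, and the object name, app name, and values are arbitrary illustrations, not recommendations.

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical tuning example for the file stream source metadata log.
object FileSourceLogConfExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("file-source-log-conf-example")
      // Compact the file source metadata log every 5 batches instead of the default 10.
      .config("spark.sql.streaming.fileSource.log.compactInterval", "5")
      // Keep superseded log files for 30 minutes before deleting them.
      .config("spark.sql.streaming.fileSource.log.cleanupDelay", "30m")
      // Or disable deletion of expired log files entirely.
      .config("spark.sql.streaming.fileSource.log.deletion", "false")
      .getOrCreate()

    // ... define and start a file-based streaming query here ...

    spark.stop()
  }
}
```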
Review comment: I'd move `(was $compactInterval)` to the end of the message.
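If that suggestion were applied, the `require` call might read roughly as follows (a sketch of the reviewer's wording, not the code actually committed):

```scala
require(compactInterval > 0,
  s"Please set ${SQLConf.FILE_SOURCE_LOG_COMPACT_INTERVAL.key} to a positive value " +
    s"(was $compactInterval).")
```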