[SPARK-23880][SQL] Do not trigger any jobs for caching data #21018
InMemoryRelation.scala

```diff
@@ -32,19 +32,6 @@ import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.LongAccumulator
 
-object InMemoryRelation {
-  def apply(
-      useCompression: Boolean,
-      batchSize: Int,
-      storageLevel: StorageLevel,
-      child: SparkPlan,
-      tableName: Option[String],
-      logicalPlan: LogicalPlan): InMemoryRelation =
-    new InMemoryRelation(child.output, useCompression, batchSize, storageLevel, child, tableName)(
-      statsOfPlanToCache = logicalPlan.stats, outputOrdering = logicalPlan.outputOrdering)
-}
-
-
 /**
  * CachedBatch is a cached batch of rows.
  *
```
```diff
@@ -55,58 +42,41 @@ object InMemoryRelation {
 private[columnar]
 case class CachedBatch(numRows: Int, buffers: Array[Array[Byte]], stats: InternalRow)
 
-case class InMemoryRelation(
-    output: Seq[Attribute],
+case class CachedRDDBuilder(
     useCompression: Boolean,
     batchSize: Int,
     storageLevel: StorageLevel,
-    @transient child: SparkPlan,
+    @transient cachedPlan: SparkPlan,
     tableName: Option[String])(
-    @transient var _cachedColumnBuffers: RDD[CachedBatch] = null,
-    val sizeInBytesStats: LongAccumulator = child.sqlContext.sparkContext.longAccumulator,
-    statsOfPlanToCache: Statistics,
-    override val outputOrdering: Seq[SortOrder])
-  extends logical.LeafNode with MultiInstanceRelation {
-
-  override protected def innerChildren: Seq[SparkPlan] = Seq(child)
-
-  override def doCanonicalize(): logical.LogicalPlan =
-    copy(output = output.map(QueryPlan.normalizeExprId(_, child.output)),
-      storageLevel = StorageLevel.NONE,
-      child = child.canonicalized,
-      tableName = None)(
-      _cachedColumnBuffers,
-      sizeInBytesStats,
-      statsOfPlanToCache,
-      outputOrdering)
+    @transient private var _cachedColumnBuffers: RDD[CachedBatch] = null) {
 
-  override def producedAttributes: AttributeSet = outputSet
-
-  @transient val partitionStatistics = new PartitionStatistics(output)
+  val sizeInBytesStats: LongAccumulator = cachedPlan.sqlContext.sparkContext.longAccumulator
 
-  override def computeStats(): Statistics = {
-    if (sizeInBytesStats.value == 0L) {
-      // Underlying columnar RDD hasn't been materialized, use the stats from the plan to cache.
-      // Note that we should drop the hint info here. We may cache a plan whose root node is a hint
-      // node. When we lookup the cache with a semantically same plan without hint info, the plan
-      // returned by cache lookup should not have hint info. If we lookup the cache with a
-      // semantically same plan with a different hint info, `CacheManager.useCachedData` will take
-      // care of it and retain the hint info in the lookup input plan.
-      statsOfPlanToCache.copy(hints = HintInfo())
-    } else {
-      Statistics(sizeInBytes = sizeInBytesStats.value.longValue)
+  def cachedColumnBuffers: RDD[CachedBatch] = {
+    if (_cachedColumnBuffers == null) {
+      synchronized {
+        if (_cachedColumnBuffers == null) {
+          _cachedColumnBuffers = buildBuffers()
+        }
+      }
     }
+    _cachedColumnBuffers
   }
 
-  // If the cached column buffers were not passed in, we calculate them in the constructor.
-  // As in Spark, the actual work of caching is lazy.
-  if (_cachedColumnBuffers == null) {
-    buildBuffers()
+  def clearCache(blocking: Boolean = true): Unit = {
+    if (_cachedColumnBuffers != null) {
+      synchronized {
+        if (_cachedColumnBuffers != null) {
+          _cachedColumnBuffers.unpersist(blocking)
+          _cachedColumnBuffers = null
+        }
+      }
+    }
   }
 
-  private def buildBuffers(): Unit = {
-    val output = child.output
-    val cached = child.execute().mapPartitionsInternal { rowIterator =>
+  private def buildBuffers(): RDD[CachedBatch] = {
+    val output = cachedPlan.output
+    val cached = cachedPlan.execute().mapPartitionsInternal { rowIterator =>
       new Iterator[CachedBatch] {
         def next(): CachedBatch = {
           val columnBuilders = output.map { attribute =>
```

Review comment on `_cachedColumnBuffers.unpersist(blocking)`:

Contributor: shall we also do
Author: ok
```diff
@@ -154,32 +124,77 @@ case class InMemoryRelation(
 
     cached.setName(
       tableName.map(n => s"In-memory table $n")
-        .getOrElse(StringUtils.abbreviate(child.toString, 1024)))
-    _cachedColumnBuffers = cached
+        .getOrElse(StringUtils.abbreviate(cachedPlan.toString, 1024)))
+    cached
   }
+}
+
+object InMemoryRelation {
+
+  def apply(
+      useCompression: Boolean,
+      batchSize: Int,
+      storageLevel: StorageLevel,
+      child: SparkPlan,
+      tableName: Option[String],
+      logicalPlan: LogicalPlan): InMemoryRelation = {
+    val cacheBuilder = CachedRDDBuilder(useCompression, batchSize, storageLevel, child, tableName)()
+    new InMemoryRelation(child.output, cacheBuilder)(
+      statsOfPlanToCache = logicalPlan.stats, outputOrdering = logicalPlan.outputOrdering)
+  }
+
+  def apply(cacheBuilder: CachedRDDBuilder, logicalPlan: LogicalPlan): InMemoryRelation = {
+    new InMemoryRelation(cacheBuilder.cachedPlan.output, cacheBuilder)(
+      statsOfPlanToCache = logicalPlan.stats, outputOrdering = logicalPlan.outputOrdering)
+  }
+}
+
+case class InMemoryRelation(
+    output: Seq[Attribute],
+    @transient cacheBuilder: CachedRDDBuilder)(
+    statsOfPlanToCache: Statistics,
+    override val outputOrdering: Seq[SortOrder])
+  extends logical.LeafNode with MultiInstanceRelation {
+
+  override protected def innerChildren: Seq[SparkPlan] = Seq(cachedPlan)
+
+  override def doCanonicalize(): logical.LogicalPlan =
+    copy(output = output.map(QueryPlan.normalizeExprId(_, cachedPlan.output)),
+      cacheBuilder)(
+      statsOfPlanToCache,
+      outputOrdering)
+
+  override def producedAttributes: AttributeSet = outputSet
+
+  @transient val partitionStatistics = new PartitionStatistics(output)
+
+  def cachedPlan: SparkPlan = cacheBuilder.cachedPlan
+
+  override def computeStats(): Statistics = {
+    if (cacheBuilder.sizeInBytesStats.value == 0L) {
+      // Underlying columnar RDD hasn't been materialized, use the stats from the plan to cache.
+      // Note that we should drop the hint info here. We may cache a plan whose root node is a hint
+      // node. When we lookup the cache with a semantically same plan without hint info, the plan
+      // returned by cache lookup should not have hint info. If we lookup the cache with a
+      // semantically same plan with a different hint info, `CacheManager.useCachedData` will take
+      // care of it and retain the hint info in the lookup input plan.
+      statsOfPlanToCache.copy(hints = HintInfo())
+    } else {
+      Statistics(sizeInBytes = cacheBuilder.sizeInBytesStats.value.longValue)
+    }
+  }
 
   def withOutput(newOutput: Seq[Attribute]): InMemoryRelation = {
-    InMemoryRelation(
-      newOutput, useCompression, batchSize, storageLevel, child, tableName)(
-      _cachedColumnBuffers, sizeInBytesStats, statsOfPlanToCache, outputOrdering)
+    InMemoryRelation(newOutput, cacheBuilder)(statsOfPlanToCache, outputOrdering)
   }
 
   override def newInstance(): this.type = {
     new InMemoryRelation(
       output.map(_.newInstance()),
-      useCompression,
-      batchSize,
-      storageLevel,
-      child,
-      tableName)(
-      _cachedColumnBuffers,
-      sizeInBytesStats,
+      cacheBuilder)(
       statsOfPlanToCache,
       outputOrdering).asInstanceOf[this.type]
   }
 
-  def cachedColumnBuffers: RDD[CachedBatch] = _cachedColumnBuffers
-
-  override protected def otherCopyArgs: Seq[AnyRef] =
-    Seq(_cachedColumnBuffers, sizeInBytesStats, statsOfPlanToCache)
+  override protected def otherCopyArgs: Seq[AnyRef] = Seq(statsOfPlanToCache)
 }
```
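The key change above is that `CachedRDDBuilder` builds and names the cached `RDD[CachedBatch]` only when `cachedColumnBuffers` is first requested, instead of in the `InMemoryRelation` constructor. As a rough analogy outside Spark SQL internals, here is a minimal sketch of the same lazy-holder pattern on plain RDDs; the `LazyRddHolder` name and the demo query are hypothetical and not part of the PR.

```scala
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel

// Hypothetical stand-in for CachedRDDBuilder: nothing is built or persisted
// until `get` is called for the first time; double-checked locking keeps two
// threads from building the RDD twice.
class LazyRddHolder[T](build: () => RDD[T], level: StorageLevel) {
  @transient private var cached: RDD[T] = null

  def get: RDD[T] = {
    if (cached == null) {
      synchronized {
        if (cached == null) {
          // persist() only marks the RDD as cached; no job runs here.
          cached = build().persist(level)
        }
      }
    }
    cached
  }

  def clear(blocking: Boolean = true): Unit = synchronized {
    if (cached != null) {
      cached.unpersist(blocking)
      cached = null
    }
  }
}

object LazyRddHolderDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("lazy-cache-sketch").getOrCreate()
    val sc = spark.sparkContext

    val holder = new LazyRddHolder[Long](
      () => sc.range(0, 1000000).map(_ * 2), StorageLevel.MEMORY_ONLY)

    // Creating the holder (analogous to df.cache()) submits no Spark job.
    // Only an action on the held RDD materializes and populates the cache.
    val firstCount = holder.get.count()
    println(s"count = $firstCount")

    holder.clear()
    spark.stop()
  }
}
```

Since `persist()` only marks the RDD and no work happens until an action touches it, deferring `buildBuffers()` this way is what lets `df.cache()` return without submitting jobs.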
CachedTableSuite.scala

```diff
@@ -22,6 +22,7 @@ import scala.concurrent.duration._
 import scala.language.postfixOps
 
 import org.apache.spark.CleanerListener
+import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart}
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.expressions.SubqueryExpression
 import org.apache.spark.sql.execution.{RDDScanExec, SparkPlan}
@@ -52,7 +53,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext
     val plan = spark.table(tableName).queryExecution.sparkPlan
     plan.collect {
       case InMemoryTableScanExec(_, _, relation) =>
-        relation.cachedColumnBuffers.id
+        relation.cacheBuilder.cachedColumnBuffers.id
       case _ =>
         fail(s"Table $tableName is not cached\n" + plan)
     }.head
@@ -78,7 +79,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext
   private def getNumInMemoryTablesRecursively(plan: SparkPlan): Int = {
     plan.collect {
       case InMemoryTableScanExec(_, _, relation) =>
-        getNumInMemoryTablesRecursively(relation.child) + 1
+        getNumInMemoryTablesRecursively(relation.cachedPlan) + 1
     }.sum
   }
 
@@ -200,7 +201,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext
     spark.catalog.cacheTable("testData")
     assertResult(0, "Double InMemoryRelations found, cacheTable() is not idempotent") {
       spark.table("testData").queryExecution.withCachedData.collect {
-        case r @ InMemoryRelation(_, _, _, _, _: InMemoryTableScanExec, _) => r
+        case r: InMemoryRelation if r.cachedPlan.isInstanceOf[InMemoryTableScanExec] => r
       }.size
     }
 
@@ -367,12 +368,12 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext
     val toBeCleanedAccIds = new HashSet[Long]
 
     val accId1 = spark.table("t1").queryExecution.withCachedData.collect {
-      case i: InMemoryRelation => i.sizeInBytesStats.id
+      case i: InMemoryRelation => i.cacheBuilder.sizeInBytesStats.id
     }.head
     toBeCleanedAccIds += accId1
 
     val accId2 = spark.table("t1").queryExecution.withCachedData.collect {
-      case i: InMemoryRelation => i.sizeInBytesStats.id
+      case i: InMemoryRelation => i.cacheBuilder.sizeInBytesStats.id
     }.head
     toBeCleanedAccIds += accId2
 
@@ -794,4 +795,29 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext
       }
     }
   }
+
+  private def checkIfNoJobTriggered[T](f: => T): T = {
+    var numJobTrigered = 0
+    val jobListener = new SparkListener {
+      override def onJobStart(jobStart: SparkListenerJobStart): Unit = {
+        numJobTrigered += 1
+      }
+    }
+    sparkContext.addSparkListener(jobListener)
+    try {
+      val result = f
+      sparkContext.listenerBus.waitUntilEmpty(10000L)
+      assert(numJobTrigered === 0)
+      result
+    } finally {
+      sparkContext.removeSparkListener(jobListener)
+    }
+  }
+
+  test("SPARK-23880 table cache should be lazy and don't trigger any jobs") {
+    val cachedData = checkIfNoJobTriggered {
+      spark.range(1002).filter('id > 1000).orderBy('id.desc).cache()
+    }
+    assert(cachedData.collect === Seq(1001))
+  }
 }
```

Review comment on `test("SPARK-23880 table cache should be lazy and don't trigger any jobs")`:

Member: Without the changes in this PR, this test can still pass. :)
Author: Oh, I'll recheck. Thanks!
Review discussion on the `synchronized` blocks in `CachedRDDBuilder`:

Reviewer: `_cachedColumnBuffers` is `private[sql]`, so I'm not sure this `synchronized` can be very effective.

Author: I feel thread contention is low here, so I prefer simpler code. But I welcome suggestions for code that is both more efficient and simpler.

Reviewer: We should either not care about thread-safety at all or do it right. Please prove that `CachedRDDBuilder` will never be accessed by multiple threads and remove these `synchronized` blocks, or make `_cachedColumnBuffers` private.

Author: OK, I'll recheck and update.

Author: In this PR, without `synchronized`, I found that multi-threaded queries wrongly built four RDDs for a single cache. Either way, I think we should make `_cachedColumnBuffers` private, so I fixed that.