-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-23626][CORE] Eagerly compute RDD.partitions on entire DAG when submitting job to DAGScheduler #34265
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-23626][CORE] Eagerly compute RDD.partitions on entire DAG when submitting job to DAGScheduler #34265
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -732,6 +732,32 @@ private[spark] class DAGScheduler( | |
| missing.toList | ||
| } | ||
|
|
||
/**
 * Invoke `.partitions` on the given RDD and on every RDD it transitively
 * depends on, forcing each `getPartitions()` to run now rather than later.
 */
private def eagerlyComputePartitionsForRddAndAncestors(rdd: RDD[_]): Unit = {
  val seen = new HashSet[RDD[_]]
  // Iterative traversal with an explicit work list instead of recursion,
  // so a very deep RDD lineage cannot trigger a StackOverflowError.
  val pending = new ListBuffer[RDD[_]]
  pending += rdd

  while (pending.nonEmpty) {
    val current = pending.remove(0)
    // HashSet.add returns false when the element was already present,
    // so each RDD in the DAG is processed exactly once.
    if (seen.add(current)) {
      // Eagerly compute:
      current.partitions

      current.dependencies.foreach(dep => pending.prepend(dep.rdd))
    }
  }
}
|
|
||
| /** | ||
| * Registers the given jobId among the jobs that need the given stage and | ||
| * all of that stage's ancestors. | ||
|
|
@@ -841,6 +867,11 @@ private[spark] class DAGScheduler( | |
| "Total number of partitions: " + maxPartitions) | ||
| } | ||
|
|
||
| // SPARK-23626: `RDD.getPartitions()` can be slow, so we eagerly compute | ||
| // `.partitions` on every RDD in the DAG to ensure that `getPartitions()` | ||
| // is evaluated outside of the DAGScheduler's single-threaded event loop: | ||
| eagerlyComputePartitionsForRddAndAncestors(rdd) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would it be a good idea to add an assertion in the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think that's a good idea, but I'd like to defer it to a separate followup PR. I've filed https://issues.apache.org/jira/browse/SPARK-37009 to track that. |
||
|
|
||
| val jobId = nextJobId.getAndIncrement() | ||
| if (partitions.isEmpty) { | ||
| val clonedProperties = Utils.cloneProperties(properties) | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.