-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-13136][SQL] Create a dedicated Broadcast exchange operator #11083
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
aa7120e
c2b7533
6a5568a
9adecdd
d0194fb
02a61b8
c12c8e6
c7dd7ae
e847383
d73f11c
9c0f4bf
da4a966
681f347
7db240a
3ad839d
1116768
a5501cf
c7429bb
b12bbc2
9d52650
54b558d
f33d2cb
28363c8
f812a31
4b5978b
c8c175e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -388,19 +388,6 @@ private[sql] case class EnsureRequirements(sqlContext: SQLContext) extends Rule[ | |
| withCoordinator | ||
| } | ||
|
|
||
| /** | ||
| * Create a [[Broadcast]] operator for a given [[BroadcastMode]] and [[SparkPlan]]. | ||
| */ | ||
| private def createBroadcast(mode: BroadcastMode, plan: SparkPlan): Broadcast = mode match { | ||
| case IdentityBroadcastMode => | ||
| Broadcast(mode, identity, plan) | ||
| case HashSetBroadcastMode(keys) => | ||
| Broadcast(mode, HashSemiJoin.buildKeyHashSet(keys, plan, _), plan) | ||
| case HashedRelationBroadcastMode(canJoinKeyFitWithinLong, keys) => | ||
| Broadcast(mode, HashedRelation(canJoinKeyFitWithinLong, keys, plan, _), plan) | ||
| case _ => sys.error(s"Unknown BroadcastMode: $mode") | ||
| } | ||
|
|
||
| private def ensureDistributionAndOrdering(operator: SparkPlan): SparkPlan = { | ||
| val requiredChildDistributions: Seq[Distribution] = operator.requiredChildDistribution | ||
| val requiredChildOrderings: Seq[Seq[SortOrder]] = operator.requiredChildOrdering | ||
|
|
@@ -415,11 +402,11 @@ private[sql] case class EnsureRequirements(sqlContext: SQLContext) extends Rule[ | |
| case (child, BroadcastDistribution(m1)) => | ||
| child match { | ||
| // The child is broadcasting the same variable: keep the child. | ||
| case Broadcast(m2, _, _) if m1 == m2 => child | ||
| case Broadcast(m2, _) if m1 == m2 => child | ||
|
||
| // The child is broadcasting a different variable: replace the child. | ||
| case Broadcast(m2, _, src) => createBroadcast(m1, src) | ||
| case Broadcast(m2, src) => Broadcast(m1, src) | ||
| // Create a broadcast on top of the child. | ||
| case _ => createBroadcast(m1, child) | ||
| case _ => Broadcast(m1, child) | ||
| } | ||
| case (child, distribution) => | ||
| Exchange(createPartitioning(distribution, defaultNumPreShufflePartitions), child) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -80,13 +80,6 @@ trait HashSemiJoin { | |
| } | ||
|
|
||
| private[execution] object HashSemiJoin { | ||
| def buildKeyHashSet( | ||
| keys: Seq[Expression], | ||
| plan: SparkPlan, | ||
| rows: Array[InternalRow]): java.util.HashSet[InternalRow] = { | ||
| buildKeyHashSet(keys, plan.output, rows.iterator) | ||
| } | ||
|
|
||
| def buildKeyHashSet( | ||
| keys: Seq[Expression], | ||
| attributes: Seq[Attribute], | ||
|
|
@@ -110,4 +103,10 @@ private[execution] object HashSemiJoin { | |
| } | ||
|
|
||
| /** HashSetBroadcastMode requires that the input rows are broadcasted as a set. */ | ||
| private[execution] case class HashSetBroadcastMode(keys: Seq[Expression]) extends BroadcastMode | ||
| private[execution] case class HashSetBroadcastMode( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is ok for now since the following isn't the scope of this pull request. in the future we should just use the hashed relations to do this, rather than using a hashset.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I created https://issues.apache.org/jira/browse/SPARK-13422 to track this one. |
||
| keys: Seq[Expression], | ||
| attributes: Seq[Attribute]) extends BroadcastMode { | ||
| def apply(rows: Array[InternalRow]): java.util.HashSet[InternalRow] = { | ||
| HashSemiJoin.buildKeyHashSet(keys, attributes, rows.iterator) | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need to merge this class with Exchange?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You want to add this to
Exchange.scala?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@yhuai It took a while for your question to sink in. We could integrate this with Exchange. It makes sense because both operators are distributing data and their structure actually is quite similar. I am not huge fan of this because we will end up conflating concerns by implementing two naturally separate code and data paths in one operator.
Lemme know what you think.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is a logical exchange, but probably deserve two different physical operator.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One way is ... rename the current exchange ShuffleExchange, and rename Broadcast BroadcastExchange.