[SPARK-13136][SQL] Create a dedicated Broadcast exchange operator #11083
Changes from 9 commits
@@ -17,6 +17,7 @@
 package org.apache.spark.sql.catalyst.plans.physical

+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.types.{DataType, IntegerType}

@@ -75,6 +76,12 @@ case class OrderedDistribution(ordering: Seq[SortOrder]) extends Distribution {
   def clustering: Set[Expression] = ordering.map(_.child).toSet
 }

+/**
+ * Represents data where tuples are broadcast to every node. It is quite common that the
+ * entire set of tuples is first transformed into a different data structure.
+ */
+case class BroadcastDistribution(f: Iterable[InternalRow] => Any = identity) extends Distribution
+
 /**
  * Describes how an operator's output is split across partitions. The `compatibleWith`,
  * `guarantees`, and `satisfies` methods describe relationships between child partitionings,

@@ -213,7 +220,10 @@ case class RoundRobinPartitioning(numPartitions: Int) extends Partitioning {
 case object SinglePartition extends Partitioning {
   val numPartitions = 1

-  override def satisfies(required: Distribution): Boolean = true
+  override def satisfies(required: Distribution): Boolean = required match {
+    case _: BroadcastDistribution => false
+    case _ => true
+  }

   override def compatibleWith(other: Partitioning): Boolean = other.numPartitions == 1

Contributor comment (on the new BroadcastDistribution case): I think this is OK for now, but technically we don't need to introduce an exchange if both sides of the join have only one partition. I guess this framework does not currently handle that.
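To make the intent concrete, here is a minimal, hypothetical sketch of how the new distribution would be used; the operator, its name, and the closure are illustrative assumptions and are not part of this diff. An operator declares BroadcastDistribution for the child it wants broadcast and relies on the planner to insert the broadcast exchange.

// Hypothetical sketch: an operator asks the planner to broadcast its right child.
// The planner (EnsureRequirements) would then be expected to insert the Broadcast exchange.
case class ExampleBroadcastJoin(left: SparkPlan, right: SparkPlan) extends BinaryNode {
  override def output: Seq[Attribute] = left.output ++ right.output

  // Stream the left side as-is; ask for the right side as a broadcast of a materialized array.
  override def requiredChildDistribution: Seq[Distribution] =
    UnspecifiedDistribution :: BroadcastDistribution(rows => rows.toArray) :: Nil

  override protected def doExecute(): RDD[InternalRow] =
    left.execute() // a real operator would join this against right.executeBroadcast() here
}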
@@ -0,0 +1,91 @@ (new file)
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.spark.sql.execution

import scala.concurrent._
import scala.concurrent.duration._

import org.apache.spark.broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.util.ThreadUtils

/**
 * Collects the result of its child SparkPlan, transforms it with the given function, and
 * finally broadcasts the transformed result.
 */
case class Broadcast(f: Iterable[InternalRow] => Any, child: SparkPlan) extends UnaryNode {

  override def output: Seq[Attribute] = child.output

  override private[sql] lazy val metrics = Map(
    "numRows" -> SQLMetrics.createLongMetric(sparkContext, "number of rows")
  )

  val timeout: Duration = {
    val timeoutValue = sqlContext.conf.broadcastTimeout
    if (timeoutValue < 0) {
      Duration.Inf
    } else {
      timeoutValue.seconds
    }
  }

  @transient
  private lazy val relationFuture: Future[broadcast.Broadcast[Any]] = {
    val numBuildRows = longMetric("numRows")

    // relationFuture is only materialized from doPrepare/doExecuteBroadcast, which run inside
    // executeQuery, so the execution id can be read correctly here.
    val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
    Future {
      // This will run in another thread. Set the execution id so that we can connect these jobs
      // with the correct execution.
      SQLExecution.withExecutionId(sparkContext, executionId) {
        // Note that we use .execute().collect() because we don't want to convert data to Scala
        // types
        val input: Array[InternalRow] = child.execute().map { row =>
          numBuildRows += 1
          row.copy()
        }.collect()

        // Construct and broadcast the relation.
        sparkContext.broadcast(f(input))
      }
    }(Broadcast.executionContext)
  }

  override protected def doPrepare(): Unit = {
    // Materialize the future.
    relationFuture
  }

  override protected def doExecute(): RDD[InternalRow] = {
    child.execute() // TODO throw an Exception here?
  }

  override protected[sql] def doExecuteBroadcast[T](): broadcast.Broadcast[T] = {
    val result = Await.result(relationFuture, timeout)
    result.asInstanceOf[broadcast.Broadcast[T]]
  }
}

object Broadcast {
  private[execution] val executionContext = ExecutionContext.fromExecutorService(
    ThreadUtils.newDaemonCachedThreadPool("build-broadcast", 128))
}
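As a usage illustration only (none of this appears in the diff; `buildPlan`, `streamedPlan`, and the closure are hypothetical), a parent operator would wrap its build side in the new operator and read the broadcast value once per task:

// Hypothetical planner output: broadcast the build side as a plain array of rows.
val broadcastBuild: SparkPlan = Broadcast(rows => rows.toArray, buildPlan)

// Hypothetical consumer (e.g. inside a join's doExecute): the blocking Await happens once on
// the driver via doExecuteBroadcast; tasks only dereference the broadcast variable.
val relation = broadcastBuild.executeBroadcast[Array[InternalRow]]()
streamedPlan.execute().mapPartitions { iter =>
  val buildRows = relation.value // already shipped to the executor by the broadcast machinery
  iter // ... join the streamed rows against buildRows here
}

The blocking wait in doExecuteBroadcast is bounded by the broadcastTimeout setting read above; a negative value means wait indefinitely.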
@@ -22,6 +22,7 @@ import java.util.concurrent.atomic.AtomicBoolean
 import scala.collection.mutable.ArrayBuffer

 import org.apache.spark.Logging
+import org.apache.spark.broadcast
 import org.apache.spark.rdd.{RDD, RDDOperationScope}
 import org.apache.spark.sql.{Row, SQLContext}
 import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}

@@ -98,14 +99,29 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
   def requiredChildOrdering: Seq[Seq[SortOrder]] = Seq.fill(children.size)(Nil)

   /**
-   * Returns the result of this query as an RDD[InternalRow] by delegating to doExecute
-   * after adding query plan information to created RDDs for visualization.
-   * Concrete implementations of SparkPlan should override doExecute instead.
+   * Returns the result of this query as an RDD[InternalRow] by delegating to doExecute after
+   * preparations. Concrete implementations of SparkPlan should override doExecute.
    */
-  final def execute(): RDD[InternalRow] = {
+  final def execute(): RDD[InternalRow] = executeQuery {
     doExecute()
   }

+  /**
+   * Returns the result of this query as a broadcast variable by delegating to
+   * doExecuteBroadcast after preparations. Concrete implementations of SparkPlan should
+   * override doExecuteBroadcast.
+   */
+  final def executeBroadcast[T](): broadcast.Broadcast[T] = executeQuery {
+    doExecuteBroadcast()
+  }
+
+  /**
+   * Execute a query after preparing the query and adding query plan information to created RDDs
+   * for visualization.
+   */
+  private final def executeQuery[T](query: => T): T = {
     RDDOperationScope.withScope(sparkContext, nodeName, false, true) {
       prepare()
-      doExecute()
+      query
     }
   }

@@ -135,6 +151,14 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
    */
   protected def doExecute(): RDD[InternalRow]

+  /**
+   * Overridden by concrete implementations of SparkPlan.
+   * Produces the result of the query as a broadcast variable.
+   */
+  protected[sql] def doExecuteBroadcast[T](): broadcast.Broadcast[T] = {
+    throw new NotImplementedError(s"$nodeName does not implement doExecuteBroadcast")
+  }
+
   /**
    * Runs this query returning the result as an array.
    */

Contributor comment (on the NotImplementedError line): UnsupportedOperationException?
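For reference, the change the reviewer suggests would be a one-line swap; a sketch only, not part of this diff:

protected[sql] def doExecuteBroadcast[T](): broadcast.Broadcast[T] = {
  // Report an unsupported operation on this node rather than a missing implementation.
  throw new UnsupportedOperationException(s"$nodeName does not implement doExecuteBroadcast")
}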
@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution
 import scala.collection.mutable.ArrayBuffer

+import org.apache.spark.broadcast
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.SQLContext
 import org.apache.spark.sql.catalyst.InternalRow

@@ -360,6 +361,9 @@ private[sql] case class CollapseCodegenStages(sqlContext: SQLContext) extends Ru
       // the generated code will be huge if there are too many columns
       val haveManyColumns = plan.output.length > 200
       !willFallback && !haveManyColumns
+    // Collapse a broadcast into the stage - it should not contain any code that can be
+    // code-generated.
+    case _: Broadcast => true
     case _ => false
   }

@@ -370,10 +374,10 @@
     var inputs = ArrayBuffer[SparkPlan]()
     val combined = plan.transform {
       // The build side can't be compiled together
-      case b @ BroadcastHashJoin(_, _, BuildLeft, _, left, right) =>
-        b.copy(left = apply(left))
-      case b @ BroadcastHashJoin(_, _, BuildRight, _, left, right) =>
-        b.copy(right = apply(right))
+      case b @ BroadcastHashJoin(_, _, BuildLeft, _, Broadcast(f, left), _) =>
+        b.copy(left = Broadcast(f, apply(left)))
+      case b @ BroadcastHashJoin(_, _, BuildRight, _, _, Broadcast(f, right)) =>
+        b.copy(right = Broadcast(f, apply(right)))
       case p if !supportCodegen(p) =>
         val input = apply(p) // collapse them recursively
         inputs += input

Reviewer comment: I'm thinking maybe it's better to just declare that we want a hashed broadcast distribution, and then not take a closure. The reason it is bad to take a closure is that this won't work if we want to whole-stage codegen the building of the hash table, or if we want to change the internal engine to a push-based model.
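To make that suggestion concrete, one possible shape is sketched below; the `BroadcastMode` trait and the mode implementations are illustrative assumptions, not something introduced by this PR:

// Instead of an opaque closure, the required distribution names the desired representation.
trait BroadcastMode {
  def transform(rows: Array[InternalRow]): Any
}

// Ship the rows as-is.
case object IdentityBroadcastMode extends BroadcastMode {
  override def transform(rows: Array[InternalRow]): Any = rows
}

// Ship a hash table keyed by the given expressions; the building step is now visible to the
// planner and could itself be code-generated or pushed into a different execution engine.
case class HashedRelationBroadcastMode(keys: Seq[Expression]) extends BroadcastMode {
  override def transform(rows: Array[InternalRow]): Any =
    rows.groupBy(row => keys.map(_.eval(row)))
}

// The distribution (and the Broadcast exchange) would then carry a mode instead of a function.
case class BroadcastDistribution(mode: BroadcastMode) extends Distribution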