[SPARK-20392][SQL] Set barrier to prevent re-entering a tree #17770
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
This commit makes `AnalysisBarrier` a transparent wrapper and bakes it into `Dataset.logicalPlan`, replacing the separate `planWithBarrier` field. First, the `AnalysisBarrier` node itself gains two overrides:

```diff
@@ -919,4 +919,11 @@ case class AnalysisBarrier(child: LogicalPlan) extends LeafNode {
   override def output: Seq[Attribute] = child.output
   override def analyzed: Boolean = true
   override def isStreaming: Boolean = child.isStreaming
+  override lazy val canonicalized: LogicalPlan = child.canonicalized
+
+  override def find(f: LogicalPlan => Boolean): Option[LogicalPlan] = if (f(this)) {
+    Some(this)
+  } else {
+    child.find(f)
+  }
 }
```
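These overrides are what make the barrier transparent: `AnalysisBarrier` extends `LeafNode`, so children-based traversals stop at it, while `find` is overridden to keep searching the wrapped subtree. A minimal, self-contained sketch of that behavior, using toy stand-in types rather than Spark's actual `LogicalPlan` API:

```scala
// Toy model of the barrier idea: a wrapper that reports no children, so
// tree-walking rules never re-enter the subtree, but still lets find()
// look through to the wrapped plan.
sealed trait Node {
  def children: Seq[Node]
  def find(f: Node => Boolean): Option[Node] =
    if (f(this)) Some(this) else children.view.flatMap(_.find(f)).headOption
}
case class Relation(name: String) extends Node { def children = Nil }
case class Filter(child: Node) extends Node { def children = Seq(child) }
case class Barrier(child: Node) extends Node {
  def children = Nil // pose as a leaf: rules walking children skip the subtree
  override def find(f: Node => Boolean): Option[Node] =
    if (f(this)) Some(this) else child.find(f) // but find() still sees through
}

object BarrierDemo extends App {
  val plan = Barrier(Filter(Relation("t")))
  assert(plan.children.isEmpty)                         // traversal stops here
  assert(plan.find(_.isInstanceOf[Relation]).isDefined) // find() does not
}
```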
In `Dataset.scala`, `logicalPlan` itself now ends in an `AnalysisBarrier`, so the analyzer will not traverse or resolve the already-analyzed subtree again:

```diff
@@ -177,19 +177,18 @@ class Dataset[T] private[sql](
   @transient private[sql] val logicalPlan: LogicalPlan = {
     // For various commands (like DDL) and queries with side effects, we force query execution
     // to happen right away to let these side effects take place eagerly.
-    queryExecution.analyzed match {
+    val analyzed = queryExecution.analyzed match {
       case c: Command =>
         LocalRelation(c.output, queryExecution.executedPlan.executeCollect())
       case u @ Union(children) if children.forall(_.isInstanceOf[Command]) =>
         LocalRelation(u.output, queryExecution.executedPlan.executeCollect())
       case _ =>
         queryExecution.analyzed
     }
+    // Wraps analyzed logical plans with an analysis barrier so we won't traverse/resolve it again.
+    AnalysisBarrier(analyzed)
   }
-
-  // Wraps analyzed logical plans with an analysis barrier so we won't traverse/resolve it again.
-  @transient private val planWithBarrier: LogicalPlan = AnalysisBarrier(logicalPlan)
 
   /**
    * Currently [[ExpressionEncoder]] is the only implementation of [[Encoder]], here we turn the
    * passed in encoder to [[ExpressionEncoder]] explicitly, and mark it implicit so that we can use
```
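Because the barrier is now baked into `logicalPlan`, every operator built on a Dataset automatically starts from an already-analyzed, barrier-protected subtree, and re-analysis only has to resolve the newly added node on top. A usage-level sketch, assuming a `SparkSession` named `spark`:

```scala
import spark.implicits._

val df = spark.range(1000).toDF("id")      // analyzed once, then wrapped in AnalysisBarrier
val filtered = df.filter($"id" > 10)       // analyzer only resolves the new Filter on top
val doubled = filtered.select(($"id" * 2).as("doubled")) // ...and only the new Project here
```

Without the barrier, each chained operation would re-traverse and re-resolve the whole accumulated plan, which makes long chains increasingly expensive.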
All remaining call sites in `Dataset` then switch from the removed `planWithBarrier` back to `logicalPlan`:

```diff
@@ -416,7 +415,7 @@ class Dataset[T] private[sql](
    */
   @Experimental
   @InterfaceStability.Evolving
-  def as[U : Encoder]: Dataset[U] = Dataset[U](sparkSession, planWithBarrier)
+  def as[U : Encoder]: Dataset[U] = Dataset[U](sparkSession, logicalPlan)
 
   /**
    * Converts this strongly typed collection of data to generic `DataFrame` with columns renamed.
```
```diff
@@ -619,7 +618,7 @@ class Dataset[T] private[sql](
     require(parsedDelay.milliseconds >= 0 && parsedDelay.months >= 0,
       s"delay threshold ($delayThreshold) should not be negative.")
     EliminateEventTimeWatermark(
-      EventTimeWatermark(UnresolvedAttribute(eventTime), parsedDelay, planWithBarrier))
+      EventTimeWatermark(UnresolvedAttribute(eventTime), parsedDelay, logicalPlan))
   }
 
   /**
```
```diff
@@ -793,7 +792,7 @@ class Dataset[T] private[sql](
    * @since 2.0.0
    */
   def join(right: Dataset[_]): DataFrame = withPlan {
-    Join(planWithBarrier, right.planWithBarrier, joinType = Inner, None)
+    Join(logicalPlan, right.logicalPlan, joinType = Inner, None)
   }
 
   /**
```
```diff
@@ -871,7 +870,7 @@ class Dataset[T] private[sql](
     // Analyze the self join. The assumption is that the analyzer will disambiguate left vs right
     // by creating a new instance for one of the branch.
     val joined = sparkSession.sessionState.executePlan(
-      Join(planWithBarrier, right.planWithBarrier, joinType = JoinType(joinType), None))
+      Join(logicalPlan, right.logicalPlan, joinType = JoinType(joinType), None))
       .analyzed.asInstanceOf[Join]
 
     withPlan {
```
```diff
@@ -932,7 +931,7 @@ class Dataset[T] private[sql](
     // Trigger analysis so in the case of self-join, the analyzer will clone the plan.
     // After the cloning, left and right side will have distinct expression ids.
     val plan = withPlan(
-      Join(planWithBarrier, right.planWithBarrier, JoinType(joinType), Some(joinExprs.expr)))
+      Join(logicalPlan, right.logicalPlan, JoinType(joinType), Some(joinExprs.expr)))
       .queryExecution.analyzed.asInstanceOf[Join]
 
     // If auto self join alias is disabled, return the plan.
```
```diff
@@ -941,8 +940,8 @@ class Dataset[T] private[sql](
     }
 
     // If left/right have no output set intersection, return the plan.
-    val lanalyzed = withPlan(this.planWithBarrier).queryExecution.analyzed
-    val ranalyzed = withPlan(right.planWithBarrier).queryExecution.analyzed
+    val lanalyzed = withPlan(this.logicalPlan).queryExecution.analyzed
+    val ranalyzed = withPlan(right.logicalPlan).queryExecution.analyzed
     if (lanalyzed.outputSet.intersect(ranalyzed.outputSet).isEmpty) {
       return withPlan(plan)
     }
```
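The three hunks above all serve the self-join path: when both sides of a join come from the same Dataset, their attributes initially share expression IDs, and the comments note that analysis is expected to clone one branch to disambiguate. A usage-level sketch of the case being handled (hypothetical data, assuming `spark.implicits._`):

```scala
import spark.implicits._

val people = Seq(("alice", 1), ("bob", 2)).toDF("name", "id")
// Both branches reference the same plan, so `people("id")` is ambiguous until
// the analyzer clones one side and gives it fresh expression IDs; the
// outputSet-intersection check above detects when that has already happened.
val selfJoined = people.join(people, people("id") === people("id"))
```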
```diff
@@ -974,7 +973,7 @@ class Dataset[T] private[sql](
    * @since 2.1.0
    */
   def crossJoin(right: Dataset[_]): DataFrame = withPlan {
-    Join(planWithBarrier, right.planWithBarrier, joinType = Cross, None)
+    Join(logicalPlan, right.logicalPlan, joinType = Cross, None)
   }
 
   /**
```
```diff
@@ -1006,8 +1005,8 @@ class Dataset[T] private[sql](
     // etc.
     val joined = sparkSession.sessionState.executePlan(
       Join(
-        this.planWithBarrier,
-        other.planWithBarrier,
+        this.logicalPlan,
+        other.logicalPlan,
         JoinType(joinType),
         Some(condition.expr))).analyzed.asInstanceOf[Join]
 
```
```diff
@@ -1177,7 +1176,7 @@ class Dataset[T] private[sql](
    */
   @scala.annotation.varargs
   def hint(name: String, parameters: String*): Dataset[T] = withTypedPlan {
-    Hint(name, parameters, planWithBarrier)
+    Hint(name, parameters, logicalPlan)
   }
 
   /**
```
```diff
@@ -1203,7 +1202,7 @@ class Dataset[T] private[sql](
    * @since 1.6.0
    */
   def as(alias: String): Dataset[T] = withTypedPlan {
-    SubqueryAlias(alias, planWithBarrier)
+    SubqueryAlias(alias, logicalPlan)
   }
 
   /**
```
```diff
@@ -1241,7 +1240,7 @@ class Dataset[T] private[sql](
    */
   @scala.annotation.varargs
   def select(cols: Column*): DataFrame = withPlan {
-    Project(cols.map(_.named), planWithBarrier)
+    Project(cols.map(_.named), logicalPlan)
   }
 
   /**
```
```diff
@@ -1296,8 +1295,8 @@ class Dataset[T] private[sql](
   @InterfaceStability.Evolving
   def select[U1](c1: TypedColumn[T, U1]): Dataset[U1] = {
     implicit val encoder = c1.encoder
-    val project = Project(c1.withInputType(exprEnc, planWithBarrier.output).named :: Nil,
-      planWithBarrier)
+    val project = Project(c1.withInputType(exprEnc, logicalPlan.output).named :: Nil,
+      logicalPlan)
 
     if (encoder.flat) {
       new Dataset[U1](sparkSession, project, encoder)
```
```diff
@@ -1315,8 +1314,8 @@ class Dataset[T] private[sql](
   protected def selectUntyped(columns: TypedColumn[_, _]*): Dataset[_] = {
     val encoders = columns.map(_.encoder)
     val namedColumns =
-      columns.map(_.withInputType(exprEnc, planWithBarrier.output).named)
-    val execution = new QueryExecution(sparkSession, Project(namedColumns, planWithBarrier))
+      columns.map(_.withInputType(exprEnc, logicalPlan.output).named)
+    val execution = new QueryExecution(sparkSession, Project(namedColumns, logicalPlan))
     new Dataset(sparkSession, execution, ExpressionEncoder.tuple(encoders))
   }
 
```
```diff
@@ -1392,7 +1391,7 @@ class Dataset[T] private[sql](
    * @since 1.6.0
    */
   def filter(condition: Column): Dataset[T] = withTypedPlan {
-    Filter(condition.expr, planWithBarrier)
+    Filter(condition.expr, logicalPlan)
   }
 
   /**
```
```diff
@@ -1569,7 +1568,7 @@ class Dataset[T] private[sql](
   @Experimental
   @InterfaceStability.Evolving
   def groupByKey[K: Encoder](func: T => K): KeyValueGroupedDataset[K, T] = {
-    val inputPlan = planWithBarrier
+    val inputPlan = logicalPlan
     val withGroupingKey = AppendColumns(func, inputPlan)
     val executed = sparkSession.sessionState.executePlan(withGroupingKey)
 
```
```diff
@@ -1715,7 +1714,7 @@ class Dataset[T] private[sql](
    * @since 2.0.0
    */
   def limit(n: Int): Dataset[T] = withTypedPlan {
-    Limit(Literal(n), planWithBarrier)
+    Limit(Literal(n), logicalPlan)
   }
 
   /**
```
`union` needs special handling: now that `logicalPlan` is itself barrier-wrapped, the barriers are stripped so `CombineUnions` can flatten nested `Union`s, then re-applied to the flattened children:

```diff
@@ -1744,7 +1743,8 @@ class Dataset[T] private[sql](
   def union(other: Dataset[T]): Dataset[T] = withSetOperator {
     // This breaks caching, but it's usually ok because it addresses a very specific use case:
     // using union to union many files or partitions.
-    CombineUnions(Union(logicalPlan, other.logicalPlan)).mapChildren(AnalysisBarrier)
+    CombineUnions(Union(EliminateBarriers(logicalPlan), EliminateBarriers(other.logicalPlan)))
+      .mapChildren(AnalysisBarrier)
   }
 
   /**
```
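Stripping and re-adding barriers matters here because `CombineUnions` flattens directly nested `Union` nodes, and a barrier in between would hide them. A self-contained toy sketch of the strip-then-flatten idea (simplified stand-in types, not the PR's actual `EliminateBarriers` rule):

```scala
sealed trait Node
case class Relation(name: String) extends Node
case class Union(children: Seq[Node]) extends Node
case class Barrier(child: Node) extends Node

object UnionDemo extends App {
  // Strip barrier wrappers so nested Unions become visible again.
  def eliminateBarriers(n: Node): Node = n match {
    case Barrier(child)  => eliminateBarriers(child)
    case Union(children) => Union(children.map(eliminateBarriers))
    case other           => other
  }

  // Flatten directly nested Unions into one n-ary Union.
  def combineUnions(n: Node): Node = n match {
    case Union(children) =>
      Union(children.map(combineUnions).flatMap {
        case Union(grandChildren) => grandChildren
        case other                => Seq(other)
      })
    case other => other
  }

  // a.union(b).union(c) builds a nested tree with a barrier between the levels.
  val nested = Union(Seq(
    Barrier(Union(Seq(Relation("a"), Relation("b")))),
    Barrier(Relation("c"))))
  assert(combineUnions(eliminateBarriers(nested)) ==
    Union(Seq(Relation("a"), Relation("b"), Relation("c"))))
}
```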
```diff
@@ -1758,7 +1758,7 @@ class Dataset[T] private[sql](
    * @since 1.6.0
    */
   def intersect(other: Dataset[T]): Dataset[T] = withSetOperator {
-    Intersect(planWithBarrier, other.planWithBarrier)
+    Intersect(logicalPlan, other.logicalPlan)
   }
 
   /**
```
```diff
@@ -1772,7 +1772,7 @@ class Dataset[T] private[sql](
    * @since 2.0.0
    */
   def except(other: Dataset[T]): Dataset[T] = withSetOperator {
-    Except(planWithBarrier, other.planWithBarrier)
+    Except(logicalPlan, other.logicalPlan)
   }
 
   /**
```
```diff
@@ -1793,7 +1793,7 @@ class Dataset[T] private[sql](
       s"Fraction must be nonnegative, but got ${fraction}")
 
     withTypedPlan {
-      Sample(0.0, fraction, withReplacement, seed, planWithBarrier)()
+      Sample(0.0, fraction, withReplacement, seed, logicalPlan)()
     }
   }
 
```
```diff
@@ -1835,15 +1835,15 @@ class Dataset[T] private[sql](
     // overlapping splits. To prevent this, we explicitly sort each input partition to make the
     // ordering deterministic. Note that MapTypes cannot be sorted and are explicitly pruned out
     // from the sort order.
-    val sortOrder = planWithBarrier.output
+    val sortOrder = logicalPlan.output
       .filter(attr => RowOrdering.isOrderable(attr.dataType))
       .map(SortOrder(_, Ascending))
     val plan = if (sortOrder.nonEmpty) {
-      Sort(sortOrder, global = false, planWithBarrier)
+      Sort(sortOrder, global = false, logicalPlan)
     } else {
       // SPARK-12662: If sort order is empty, we materialize the dataset to guarantee determinism
       cache()
-      planWithBarrier
+      logicalPlan
     }
     val sum = weights.sum
     val normalizedCumWeights = weights.map(_ / sum).scanLeft(0.0d)(_ + _)
```
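A worked example of the weight handling at the end of this hunk: `scanLeft` turns the normalized weights into cumulative boundaries, and `randomSplit` then uses each adjacent pair as the `[lowerBound, upperBound)` of one split's `Sample`:

```scala
val weights = Array(1.0, 2.0, 3.0)
val sum = weights.sum                                                 // 6.0
val normalizedCumWeights = weights.map(_ / sum).scanLeft(0.0d)(_ + _)
// Array(0.0, 0.1666..., 0.5, 1.0)
val bounds = normalizedCumWeights.sliding(2).toList
// List(Array(0.0, 0.1666...), Array(0.1666..., 0.5), Array(0.5, 1.0))
```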
```diff
@@ -1927,7 +1927,7 @@ class Dataset[T] private[sql](
 
     withPlan {
       Generate(generator, join = true, outer = false,
-        qualifier = None, generatorOutput = Nil, planWithBarrier)
+        qualifier = None, generatorOutput = Nil, logicalPlan)
     }
   }
 
```
```diff
@@ -1968,7 +1968,7 @@ class Dataset[T] private[sql](
 
     withPlan {
       Generate(generator, join = true, outer = false,
-        qualifier = None, generatorOutput = Nil, planWithBarrier)
+        qualifier = None, generatorOutput = Nil, logicalPlan)
     }
   }
 
```
```diff
@@ -2131,7 +2131,7 @@ class Dataset[T] private[sql](
       }
       cols
     }
-    Deduplicate(groupCols, planWithBarrier, isStreaming)
+    Deduplicate(groupCols, logicalPlan, isStreaming)
   }
 
   /**
```
```diff
@@ -2280,7 +2280,7 @@ class Dataset[T] private[sql](
   @Experimental
   @InterfaceStability.Evolving
   def filter(func: T => Boolean): Dataset[T] = {
-    withTypedPlan(TypedFilter(func, planWithBarrier))
+    withTypedPlan(TypedFilter(func, logicalPlan))
   }
 
   /**
```
```diff
@@ -2294,7 +2294,7 @@ class Dataset[T] private[sql](
   @Experimental
   @InterfaceStability.Evolving
   def filter(func: FilterFunction[T]): Dataset[T] = {
-    withTypedPlan(TypedFilter(func, planWithBarrier))
+    withTypedPlan(TypedFilter(func, logicalPlan))
   }
 
   /**
```
```diff
@@ -2308,7 +2308,7 @@ class Dataset[T] private[sql](
   @Experimental
   @InterfaceStability.Evolving
   def map[U : Encoder](func: T => U): Dataset[U] = withTypedPlan {
-    MapElements[T, U](func, planWithBarrier)
+    MapElements[T, U](func, logicalPlan)
   }
 
   /**
```
```diff
@@ -2323,7 +2323,7 @@ class Dataset[T] private[sql](
   @InterfaceStability.Evolving
   def map[U](func: MapFunction[T, U], encoder: Encoder[U]): Dataset[U] = {
     implicit val uEnc = encoder
-    withTypedPlan(MapElements[T, U](func, planWithBarrier))
+    withTypedPlan(MapElements[T, U](func, logicalPlan))
   }
 
   /**
```
```diff
@@ -2339,7 +2339,7 @@ class Dataset[T] private[sql](
   def mapPartitions[U : Encoder](func: Iterator[T] => Iterator[U]): Dataset[U] = {
     new Dataset[U](
       sparkSession,
-      MapPartitions[T, U](func, planWithBarrier),
+      MapPartitions[T, U](func, logicalPlan),
       implicitly[Encoder[U]])
   }
 
```
```diff
@@ -2370,7 +2370,7 @@ class Dataset[T] private[sql](
     val rowEncoder = encoder.asInstanceOf[ExpressionEncoder[Row]]
     Dataset.ofRows(
       sparkSession,
-      MapPartitionsInR(func, packageNames, broadcastVars, schema, rowEncoder, planWithBarrier))
+      MapPartitionsInR(func, packageNames, broadcastVars, schema, rowEncoder, logicalPlan))
   }
 
   /**
```
```diff
@@ -2525,7 +2525,7 @@ class Dataset[T] private[sql](
    * @since 1.6.0
    */
   def repartition(numPartitions: Int): Dataset[T] = withTypedPlan {
-    Repartition(numPartitions, shuffle = true, planWithBarrier)
+    Repartition(numPartitions, shuffle = true, logicalPlan)
   }
 
   /**
```
```diff
@@ -2539,7 +2539,7 @@ class Dataset[T] private[sql](
    */
   @scala.annotation.varargs
   def repartition(numPartitions: Int, partitionExprs: Column*): Dataset[T] = withTypedPlan {
-    RepartitionByExpression(partitionExprs.map(_.expr), planWithBarrier, numPartitions)
+    RepartitionByExpression(partitionExprs.map(_.expr), logicalPlan, numPartitions)
   }
 
   /**
```
```diff
@@ -2555,8 +2555,7 @@ class Dataset[T] private[sql](
   @scala.annotation.varargs
   def repartition(partitionExprs: Column*): Dataset[T] = withTypedPlan {
     RepartitionByExpression(
-      partitionExprs.map(_.expr), planWithBarrier,
-      sparkSession.sessionState.conf.numShufflePartitions)
+      partitionExprs.map(_.expr), logicalPlan, sparkSession.sessionState.conf.numShufflePartitions)
   }
 
   /**
```
```diff
@@ -2577,7 +2576,7 @@ class Dataset[T] private[sql](
    * @since 1.6.0
    */
   def coalesce(numPartitions: Int): Dataset[T] = withTypedPlan {
-    Repartition(numPartitions, shuffle = false, planWithBarrier)
+    Repartition(numPartitions, shuffle = false, logicalPlan)
   }
 
   /**
```
```diff
@@ -2666,7 +2665,7 @@ class Dataset[T] private[sql](
    */
   lazy val rdd: RDD[T] = {
     val objectType = exprEnc.deserializer.dataType
-    val deserialized = CatalystSerde.deserialize[T](planWithBarrier)
+    val deserialized = CatalystSerde.deserialize[T](logicalPlan)
     sparkSession.sessionState.executePlan(deserialized).toRdd.mapPartitions { rows =>
       rows.map(_.get(0, objectType).asInstanceOf[T])
     }
```
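`rdd` plans a deserializer on top of the (barrier-wrapped) logical plan, executes it to an `RDD` of `InternalRow`s each holding a single object column, and unwraps that column into `T`. A usage-level round trip, assuming `spark.implicits._`:

```scala
import spark.implicits._

val ds = Seq(1, 2, 3).toDS()               // Dataset[Int]
val doubled = ds.rdd.map(_ * 2).collect()  // back in RDD land: Array(2, 4, 6)
```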
```diff
@@ -2765,7 +2764,7 @@ class Dataset[T] private[sql](
       comment = None,
       properties = Map.empty,
       originalText = None,
-      child = planWithBarrier,
+      child = logicalPlan,
       allowExisting = false,
       replace = replace,
       viewType = viewType)
```
```diff
@@ -2936,7 +2935,7 @@ class Dataset[T] private[sql](
       }
     }
     withTypedPlan {
-      Sort(sortOrder, global = global, planWithBarrier)
+      Sort(sortOrder, global = global, logicalPlan)
     }
   }
```
A review comment on the `AnalysisBarrier` hunk:

> Adding `override lazy val canonicalized: LogicalPlan = child.canonicalized`?

Reply:

> It should be fine to use the default `canonicalized`.
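For context on this exchange: `canonicalized` is what plan-equality checks such as `sameResult` compare (used, for example, when matching cached plans), so the choice decides whether a barrier-wrapped plan compares equal to its bare child. A minimal, self-contained sketch of the difference, using toy stand-in types rather than Spark's actual API:

```scala
sealed trait Plan {
  def canonicalized: Plan = this // default: a node canonicalizes to itself
  def sameResult(other: Plan): Boolean = canonicalized == other.canonicalized
}
case class Scan(table: String) extends Plan
case class Barrier(child: Plan) extends Plan {
  // Delegating makes the barrier invisible to equality checks.
  override def canonicalized: Plan = child.canonicalized
}

object CanonicalizedDemo extends App {
  val scan = Scan("t")
  assert(Barrier(scan).sameResult(scan)) // holds only because of the delegation;
  // with the default canonicalized, Barrier(scan) would not equal scan
}
```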