-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-4226][SQL] SparkSQL - Add support for subqueries in predicates('in' clause) #3249
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0a41e91
0134915
152fd23
9e361df
86a4430
4ee8c18
834acda
f1b7d30
dc424df
4afc469
03db47b
a27cca6
7653eee
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,7 @@ import org.apache.spark.util.collection.OpenHashSet | |
| import org.apache.spark.sql.AnalysisException | ||
| import org.apache.spark.sql.catalyst.errors.TreeNodeException | ||
| import org.apache.spark.sql.catalyst.expressions._ | ||
| import org.apache.spark.sql.catalyst.plans._ | ||
| import org.apache.spark.sql.catalyst.plans.logical._ | ||
| import org.apache.spark.sql.catalyst.rules._ | ||
| import org.apache.spark.sql.types._ | ||
|
|
@@ -59,6 +60,7 @@ class Analyzer(catalog: Catalog, | |
| ResolveGroupingAnalytics :: | ||
| ResolveSortReferences :: | ||
| ImplicitGenerate :: | ||
| SubQueryExpressions :: | ||
| ResolveFunctions :: | ||
| GlobalAggregates :: | ||
| UnresolvedHavingClauseAttributes :: | ||
|
|
@@ -422,6 +424,108 @@ class Analyzer(catalog: Catalog, | |
| Generate(g, join = false, outer = false, None, child) | ||
| } | ||
| } | ||
|
|
||
/**
 * Rewrites a filter whose condition contains an `IN (subquery)` predicate into a left semi join.
 *
 * For example
 * {{{
 *   SELECT T1.x FROM T1 WHERE T1.x IN (SELECT T2.y FROM T2)
 * }}}
 * is transformed to the equivalent of
 * {{{
 *   SELECT T1.x FROM T1 LEFT SEMI JOIN T2 ON T1.x = T2.y
 * }}}
 *
 * Only one `IN (subquery)` predicate per filter is supported, and it has to be a top-level
 * conjunct of the filter condition.
 */
object SubQueryExpressions extends Rule[LogicalPlan] {

  def apply(plan: LogicalPlan): LogicalPlan = plan transform {
    case p: LogicalPlan if !p.childrenResolved => p
    case filter @ Filter(conditions, child) =>
      val (inSubqueries, remaining) = splitConjuncts(conditions).partition(isInSubquery)
      // A semi join only filters rows in, so an IN subquery nested under NOT / OR / any other
      // expression cannot be pulled out of the condition without changing the query result.
      if (remaining.exists(containsInSubquery)) {
        throw new TreeNodeException(
          filter,
          "SubQuery expression is only supported as a top level conjunct of the where clause.")
      }
      inSubqueries match {
        case Seq() => filter // No subqueries.
        case Seq(In(value, Seq(SubqueryExpression(subquery)))) =>
          // The subquery predicate is evaluated by the join, so only the remaining conjuncts
          // are carried over; a true literal stands in when there are none.
          val parentConds = remaining.reduceOption(And).getOrElse(Literal(true))
          createLeftSemiJoin(child, value, subquery, parentConds)
        case _ =>
          throw new TreeNodeException(filter, "Only one SubQuery expression is supported.")
      }
  }

  /** Splits a condition into the operands of its top-level chain of `And`s. */
  private def splitConjuncts(condition: Expression): Seq[Expression] = condition match {
    case And(left, right) => splitConjuncts(left) ++ splitConjuncts(right)
    case other => other :: Nil
  }

  /** True when `e` itself is an `IN` predicate whose only list element is a subquery. */
  private def isInSubquery(e: Expression): Boolean = e match {
    case In(_, Seq(SubqueryExpression(_))) => true
    case _ => false
  }

  /** True when `e` or any expression below it is an `IN (subquery)` predicate. */
  private def containsInSubquery(e: Expression): Boolean =
    e.collect { case in @ In(_, Seq(SubqueryExpression(_))) => in }.nonEmpty

  /**
   * Creates a LeftSemi join between the parent query and the subquery named in the 'IN'
   * predicate, using the conjunction of the parent conditions and the conditions derived from
   * the subquery as the join condition.
   *
   * @param left plan of the parent query (the child of the rewritten filter)
   * @param value expression on the left-hand side of the 'IN' predicate
   * @param subquery plan of the subquery on the right-hand side of the 'IN' predicate
   * @param parentConds conditions of the parent filter without the subquery predicate
   */
  def createLeftSemiJoin(
      left: LogicalPlan,
      value: Expression,
      subquery: LogicalPlan,
      parentConds: Expression): LogicalPlan = {
    val (transformedPlan, subqueryConds) = transformAndGetConditions(value, subquery)
    // Add both parent query conditions and subquery conditions as join conditions.
    val allPredicates = And(parentConds, subqueryConds)
    Join(left, transformedPlan, LeftSemi, Some(allPredicates))
  }

  /**
   * Rewrites the projections of the subquery plan so that every projected expression is aliased
   * as `sqc<index>`, adding the attributes used by a subquery filter to the projection, and
   * returns the rewritten plan together with the predicates to use as join condition.
   *
   * @param value expression on the left-hand side of the 'IN' predicate
   * @param subquery plan of the subquery
   * @return the rewritten subquery plan and the conjunction of the collected predicates
   */
  def transformAndGetConditions(
      value: Expression,
      subquery: LogicalPlan): (LogicalPlan, Expression) = {
    // Predicates collected as a side effect of the plan rewrite below.
    val joinConds = new scala.collection.mutable.ArrayBuffer[Expression]()
    // TODO : we only decorrelate subqueries in very specific cases like the one mentioned in the
    // documentation of this rule. More complex queries, such as subqueries inside subqueries,
    // can be supported in future. Every Project in the subquery plan is matched below, so a
    // nested Project would contribute an additional equality predicate.
    val transformedPlan = subquery transform {
      case project @ Project(projectList, f @ Filter(condition, child)) =>
        checkSingleSelectItem(project)
        // NOTE(review): a reviewer asked why the relation is resolved here, expecting the
        // subquery to be resolved already before this rule runs - TODO confirm it is needed.
        val resolvedChild = ResolveRelations(child)
        def resolvesInChild(a: Attribute): Boolean =
          resolvedChild.resolve(a.name, resolver).isDefined

        // Add the expressions to the projections which are used as filters in subquery.
        val toBeAddedExprs =
          f.references.filter(a => resolvesInChild(a) && !project.outputSet.contains(a))
        // Create exactly one alias per projected expression. The same Alias instance has to
        // back both the new projection and the rewritten condition, because each Alias
        // construction is given its own expression id.
        val aliasedByName = (projectList ++ toBeAddedExprs).zipWithIndex.map {
          case (exp, index) => exp.name -> Alias(exp, s"sqc$index")()
        }
        val aliases = aliasedByName.map(_._2)
        val nameToAlias = aliasedByName.toMap

        // NOTE(review): a reviewer questioned why the subquery condition is handled at all,
        // referring to the Hive wiki description of subqueries - TODO revisit.
        // Replace the condition column names with alias names; attributes without an alias
        // are left untouched.
        val transformedConds = condition.transform {
          case a: Attribute if resolvesInChild(a) =>
            nameToAlias.get(a.name).map(_.toAttribute).getOrElse(a)
        }
        // Join the first projection column of subquery to the main query and add as condition.
        // TODO : We can avoid if the parent condition already has this condition.
        joinConds += EqualTo(value, aliases(0).toAttribute)
        // NOTE(review): a reviewer objected to connecting the subquery filter with the join
        // condition, since the whole plan becomes a left semi join - TODO revisit.
        joinConds += transformedConds
        Project(aliases, child)

      case project @ Project(projectList, child) =>
        checkSingleSelectItem(project)
        // Case 1 Uncorrelated queries
        // Create aliases for all projection expressions.
        val aliases = projectList.zipWithIndex.map { case (exp, index) =>
          Alias(exp, s"sqc$index")()
        }
        // Take the first projection expression as join condition.
        joinConds += EqualTo(value, aliases(0).toAttribute)
        Project(aliases, child)
    }
    // Without any Project in the subquery nothing was collected; fail with a descriptive error
    // instead of the exception thrown by reducing an empty collection.
    val subqueryConds = joinConds.reduceOption(And).getOrElse {
      throw new TreeNodeException(subquery, "SubQuery must contain a Select List")
    }
    (transformedPlan, subqueryConds)
  }

  /** Rejects subqueries that select more than one item. */
  private def checkSingleSelectItem(project: Project): Unit = {
    if (project.projectList.size > 1) {
      throw new TreeNodeException(
        project,
        "SubQuery can contain only one item in Select List")
    }
  }
}
| } | ||
|
|
||
| /** | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,39 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.sql.catalyst.expressions | ||
|
|
||
| import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan | ||
|
|
||
/**
 * Placeholder expression carrying the subquery of an 'IN' predicate.
 * For example, in 'SELECT * FROM src a WHERE a.key in (SELECT b.key FROM src b)' the
 * subquery is 'SELECT b.key FROM src b'.
 *
 * The expression always reports itself as unresolved and fails when evaluated, because it is
 * expected to be converted to a join query.
 *
 * @param subquery logical plan of the nested query
 */
// NOTE(review): a reviewer suggested representing the subquery with a dedicated logical plan
// node rather than with a placeholder expression - TODO revisit.
case class SubqueryExpression(subquery: LogicalPlan) extends Expression {

  type EvaluatedType = Any

  // The nested plan is a constructor argument, not a child expression.
  override def children: Seq[Expression] = Nil

  // Never reported as resolved.
  override lazy val resolved: Boolean = false

  override def foldable: Boolean = false

  override def nullable: Boolean = true

  // The type of the first output column of the subquery.
  override def dataType = subquery.output.head.dataType

  override def toString: String = s"SubqueryExpression(${subquery.output.mkString(",")})"

  override def eval(input: Row): Any = {
    val message = s"SubqueryExpression eval should not be called since it will be converted" +
      " to join query"
    sys.error(message)
  }
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We are not going to handle the non-Subquery case here, right? How about