-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-16355] [SPARK-16354] [SQL] Fix Bugs When LIMIT/TABLESAMPLE is Non-foldable, Zero or Negative #14034
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-16355] [SPARK-16354] [SQL] Fix Bugs When LIMIT/TABLESAMPLE is Non-foldable, Zero or Negative #14034
Changes from 3 commits
1255968
bdf4e56
3c402d3
5b36fbc
a2a828f
f600ba4
8fd72f6
1abdbb9
3036847
d135b77
028aa79
01137dc
0ebbdfe
dec5ad9
2e6f8d8
d66870b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -660,7 +660,12 @@ case class GlobalLimit(limitExpr: Expression, child: LogicalPlan) extends UnaryN | |
| } | ||
| override lazy val statistics: Statistics = { | ||
| val limit = limitExpr.eval().asInstanceOf[Int] | ||
| val sizeInBytes = (limit: Long) * output.map(a => a.dataType.defaultSize).sum | ||
| var sizeInBytes = (limit: Long) * output.map(a => a.dataType.defaultSize).sum | ||
| if (sizeInBytes == 0) { | ||
|
||
| // sizeInBytes can't be zero, or sizeInBytes of BinaryNode will also be zero | ||
| // (product of children). | ||
| sizeInBytes = 1 | ||
| } | ||
| child.statistics.copy(sizeInBytes = sizeInBytes) | ||
| } | ||
| } | ||
|
|
@@ -675,7 +680,12 @@ case class LocalLimit(limitExpr: Expression, child: LogicalPlan) extends UnaryNo | |
| } | ||
| override lazy val statistics: Statistics = { | ||
| val limit = limitExpr.eval().asInstanceOf[Int] | ||
| val sizeInBytes = (limit: Long) * output.map(a => a.dataType.defaultSize).sum | ||
| var sizeInBytes = (limit: Long) * output.map(a => a.dataType.defaultSize).sum | ||
| if (sizeInBytes == 0) { | ||
| // sizeInBytes can't be zero, or sizeInBytes of BinaryNode will also be zero | ||
| // (product of children). | ||
| sizeInBytes = 1 | ||
| } | ||
| child.statistics.copy(sizeInBytes = sizeInBytes) | ||
| } | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,10 +17,12 @@ | |
|
|
||
| package org.apache.spark.sql | ||
|
|
||
| import org.apache.spark.sql.catalyst.plans.logical.{GlobalLimit, Join, LocalLimit} | ||
| import org.apache.spark.sql.test.SharedSQLContext | ||
| import org.apache.spark.sql.types._ | ||
|
|
||
| class StatisticsSuite extends QueryTest with SharedSQLContext { | ||
| import testImplicits._ | ||
|
|
||
| test("SPARK-15392: DataFrame created from RDD should not be broadcasted") { | ||
| val rdd = sparkContext.range(1, 100).map(i => Row(i, i)) | ||
|
|
@@ -31,4 +33,46 @@ class StatisticsSuite extends QueryTest with SharedSQLContext { | |
| spark.sessionState.conf.autoBroadcastJoinThreshold) | ||
| } | ||
|
|
||
| test("estimates the size of limit") { | ||
| withTempTable("test") { | ||
| Seq(("one", 1), ("two", 2), ("three", 3), ("four", 4)).toDF("k", "v") | ||
| .createOrReplaceTempView("test") | ||
| Seq((0, 1), (1, 24), (2, 48)).foreach { case (limit, expected) => | ||
| val df = sql(s"""SELECT * FROM test limit $limit""") | ||
|
|
||
| val sizesGlobalLimit = df.queryExecution.analyzed.collect { case g: GlobalLimit => | ||
| g.statistics.sizeInBytes | ||
| } | ||
| assert(sizesGlobalLimit.size === 1, s"Size wrong for:\n ${df.queryExecution}") | ||
| assert(sizesGlobalLimit(0).equals(BigInt(expected)), | ||
| s"expected exact size $expected for table 'test', got: ${sizesGlobalLimit(0)}") | ||
|
|
||
| val sizesLocalLimit = df.queryExecution.analyzed.collect { case l: LocalLimit => | ||
| l.statistics.sizeInBytes | ||
| } | ||
| assert(sizesLocalLimit.size === 1, s"Size wrong for:\n ${df.queryExecution}") | ||
| assert(sizesLocalLimit(0).equals(BigInt(expected)), | ||
| s"expected exact size $expected for table 'test', got: ${sizesLocalLimit(0)}") | ||
| } | ||
| } | ||
| } | ||
|
|
||
| test("estimates the size of a limit 0 on outer join") { | ||
| withTempTable("test") { | ||
| Seq(("one", 1), ("two", 2), ("three", 3), ("four", 4)).toDF("k", "v") | ||
| .createOrReplaceTempView("test") | ||
| val df1 = spark.table("test") | ||
| val df2 = spark.table("test").limit(0) | ||
| val df = df1.join(df2, Seq("k"), "left") | ||
|
|
||
| val sizes = df.queryExecution.analyzed.collect { case g: Join => | ||
| g.statistics.sizeInBytes | ||
| } | ||
|
|
||
| assert(sizes.size === 1, s"Size wrong for:\n ${df.queryExecution}") | ||
|
||
| assert(sizes(0).equals(BigInt(96)), | ||
|
||
| s"expected exact size 96 for table 'test', got: ${sizes(0)}") | ||
| } | ||
| } | ||
|
|
||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is the limit expression guaranteed to be literal?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nope. Users can input an expression here. For example,
spark/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
Line 234 in e5d703b
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ah, but it's still foldable. Is it possible it's non-foldable?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It sounds like nobody supports it. All of the mainstream DB vendors only support integer
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We do not support non-foldable limit clauses.
spark/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
Lines 67 to 89 in d063898
spark/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
Lines 398 to 401 in d063898
But, we do not issue an exception if users do it. Thus, the error we got is strange:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let me do it in this PR. Thank you for your review! : )