-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-34538][SQL] Hive Metastore support filter by not-in #31646
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3f95df6
3cefafe
0dc1c4a
b9c7f44
cf3ba56
0415448
113bd62
01b7466
e37f5f9
93eedd3
e8c7b6c
062b1d4
4704ec3
5acbf6a
17aea47
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -748,6 +748,15 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { | |
| values.map(value => s"$name = $value").mkString("(", " or ", ")") | ||
| } | ||
|
|
||
| def convertNotInToAnd(name: String, values: Seq[String]): String = { | ||
| values.map(value => s"$name != $value").mkString("(", " and ", ")") | ||
| } | ||
|
|
||
| def hasNullLiteral(list: Seq[Expression]): Boolean = list.exists { | ||
| case Literal(null, _) => true | ||
| case _ => false | ||
| } | ||
|
|
||
| val useAdvanced = SQLConf.get.advancedPartitionPredicatePushdownEnabled | ||
| val inSetThreshold = SQLConf.get.metastorePartitionPruningInSetThreshold | ||
|
|
||
|
|
@@ -763,10 +772,20 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { | |
| } | ||
|
|
||
| def convert(expr: Expression): Option[String] = expr match { | ||
| case Not(InSet(_, values)) if values.size > inSetThreshold => | ||
| None | ||
|
|
||
| case Not(In(_, list)) if hasNullLiteral(list) => None | ||
| case Not(InSet(_, list)) if list.contains(null) => None | ||
|
|
||
| case In(ExtractAttribute(SupportedAttribute(name)), ExtractableLiterals(values)) | ||
| if useAdvanced => | ||
| Some(convertInToOr(name, values)) | ||
|
|
||
| case Not(In(ExtractAttribute(SupportedAttribute(name)), ExtractableLiterals(values))) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
I think this rewrite is incorrect. We need to make sure that none of the values in the IN list is null.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. My bad, missed this. For |
||
| if useAdvanced => | ||
| Some(convertNotInToAnd(name, values)) | ||
|
|
||
| case InSet(child, values) if useAdvanced && values.size > inSetThreshold => | ||
| val dataType = child.dataType | ||
| // Skip null here is safe, more details could see at ExtractableLiterals. | ||
|
|
@@ -779,10 +798,18 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { | |
| if useAdvanced && child.dataType == DateType => | ||
| Some(convertInToOr(name, values)) | ||
|
|
||
| case Not(InSet(child @ ExtractAttribute(SupportedAttribute(name)), | ||
| ExtractableDateValues(values))) if useAdvanced && child.dataType == DateType => | ||
| Some(convertNotInToAnd(name, values)) | ||
|
|
||
| case InSet(ExtractAttribute(SupportedAttribute(name)), ExtractableValues(values)) | ||
| if useAdvanced => | ||
| Some(convertInToOr(name, values)) | ||
|
|
||
| case Not(InSet(ExtractAttribute(SupportedAttribute(name)), ExtractableValues(values))) | ||
| if useAdvanced => | ||
| Some(convertNotInToAnd(name, values)) | ||
|
|
||
| case op @ SpecialBinaryComparison( | ||
| ExtractAttribute(SupportedAttribute(name)), ExtractableLiteral(value)) => | ||
| Some(s"$name ${op.symbol} $value") | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -418,6 +418,76 @@ class HivePartitionFilteringSuite(version: String) | |
| dateStrValue) | ||
| } | ||
|
|
||
| test("getPartitionsByFilter: not in/inset string type") { | ||
| def check(condition: Expression, result: Seq[String]): Unit = { | ||
| testMetastorePartitionFiltering( | ||
| condition, | ||
| dsValue, | ||
| hValue, | ||
| result, | ||
| dateValue, | ||
| dateStrValue | ||
| ) | ||
| } | ||
|
|
||
| check( | ||
| Not(In(attr("chunk"), Seq(Literal("aa"), Literal("ab")))), | ||
| Seq("ba", "bb") | ||
| ) | ||
| check( | ||
| Not(In(attr("chunk"), Seq(Literal("aa"), Literal("ab"), Literal(null)))), | ||
| chunkValue | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, this test can detect the correctness bug in null handling. |
||
| ) | ||
|
|
||
| check( | ||
| Not(InSet(attr("chunk"), Set(Literal("aa").eval(), Literal("ab").eval()))), | ||
| Seq("ba", "bb") | ||
| ) | ||
| check( | ||
| Not(InSet(attr("chunk"), Set("aa", "ab", null))), | ||
| chunkValue | ||
| ) | ||
| } | ||
|
|
||
| test("getPartitionsByFilter: not in/inset date type") { | ||
| def check(condition: Expression, result: Seq[String]): Unit = { | ||
| testMetastorePartitionFiltering( | ||
| condition, | ||
| dsValue, | ||
| hValue, | ||
| chunkValue, | ||
| result, | ||
| dateStrValue | ||
| ) | ||
| } | ||
|
|
||
| check( | ||
| Not(In(attr("d"), | ||
| Seq(Literal(Date.valueOf("2019-01-01")), | ||
| Literal(Date.valueOf("2019-01-02"))))), | ||
| Seq("2019-01-03") | ||
| ) | ||
| check( | ||
| Not(In(attr("d"), | ||
| Seq(Literal(Date.valueOf("2019-01-01")), | ||
| Literal(Date.valueOf("2019-01-02")), Literal(null)))), | ||
| dateValue | ||
| ) | ||
|
|
||
| check( | ||
| Not(InSet(attr("d"), | ||
| Set(Literal(Date.valueOf("2019-01-01")).eval(), | ||
| Literal(Date.valueOf("2019-01-02")).eval()))), | ||
| Seq("2019-01-03") | ||
| ) | ||
| check( | ||
| Not(InSet(attr("d"), | ||
| Set(Literal(Date.valueOf("2019-01-01")).eval(), | ||
| Literal(Date.valueOf("2019-01-02")).eval(), null))), | ||
| dateValue | ||
| ) | ||
| } | ||
|
|
||
| test("getPartitionsByFilter: cast(datestr as date)= 2020-01-01") { | ||
| testMetastorePartitionFiltering( | ||
| attr("datestr").cast(DateType) === Date.valueOf("2020-01-01"), | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
More than 10,000 values will cause a stack overflow in the Hive Metastore.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about not pushing it down if the number of values exceeds the threshold? In this case it cannot be converted to a form like
>= and <=.