Skip to content
Next Next commit
init
  • Loading branch information
ulysses-you committed Feb 25, 2021
commit 3f95df68bf0721bca63f949a64aaf9d7382fed5d
Original file line number Diff line number Diff line change
Expand Up @@ -748,6 +748,10 @@ private[client] class Shim_v0_13 extends Shim_v0_12 {
values.map(value => s"$name = $value").mkString("(", " or ", ")")
}

def convertNotInToAnd(name: String, values: Seq[String]): String = {
values.map(value => s"$name != $value").mkString("(", " and ", ")")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

More than 10,000 values will cause the Hive Metastore stack overflow.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about stoping push it If it's values size exceeds the threshold ? In this case it can not be covert like >= and <=.

}

val useAdvanced = SQLConf.get.advancedPartitionPredicatePushdownEnabled
val inSetThreshold = SQLConf.get.metastorePartitionPruningInSetThreshold

Expand All @@ -767,6 +771,10 @@ private[client] class Shim_v0_13 extends Shim_v0_12 {
if useAdvanced =>
Some(convertInToOr(name, values))

case Not(In(ExtractAttribute(SupportedAttribute(name)), ExtractableLiterals(values)))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not(a IN (null, 2)): if a = 1, the final result is null.
a != 2: if a = 1, the final result is true

I think this rewrite is incorrect. We need to make sure the values of IN are all not null.

Copy link
Contributor Author

@ulysses-you ulysses-you Mar 10, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My bad, missed this. For Not(InSet) case, null value can not be skip safely, it should convert to and(xx != null).

if useAdvanced =>
Some(convertNotInToAnd(name, values))

case InSet(child, values) if useAdvanced && values.size > inSetThreshold =>
val dataType = child.dataType
// Skip null here is safe, more details could see at ExtractableLiterals.
Expand All @@ -779,10 +787,18 @@ private[client] class Shim_v0_13 extends Shim_v0_12 {
if useAdvanced && child.dataType == DateType =>
Some(convertInToOr(name, values))

case Not(InSet(child @ ExtractAttribute(SupportedAttribute(name)),
ExtractableDateValues(values))) if useAdvanced && child.dataType == DateType =>
Some(convertNotInToAnd(name, values))

case InSet(ExtractAttribute(SupportedAttribute(name)), ExtractableValues(values))
if useAdvanced =>
Some(convertInToOr(name, values))

case Not(InSet(ExtractAttribute(SupportedAttribute(name)), ExtractableValues(values)))
if useAdvanced =>
Some(convertNotInToAnd(name, values))

case op @ SpecialBinaryComparison(
ExtractAttribute(SupportedAttribute(name)), ExtractableLiteral(value)) =>
Some(s"$name ${op.symbol} $value")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,28 @@ class FiltersSuite extends SparkFunSuite with Logging with PlanTest {
(a("datecol", DateType) =!= Literal(Date.valueOf("2019-01-01"))) :: Nil,
"datecol != 2019-01-01")

filterTest("not-in int filter",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shall we add a test case for NULL as while, e.g. NOT IN (1, 2, ..., NULL)? Given we have bug before.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes it should be, add the null value for test.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems exists some issue about null value. Created #31659.

(Not(In(a("intcol", IntegerType), Seq(Literal(1), Literal(2))))) :: Nil,
"(intcol != 1 and intcol != 2)")

filterTest("not-in string filter",
(Not(In(a("strcol", StringType), Seq(Literal("a"), Literal("b"))))) :: Nil,
"""(strcol != "a" and strcol != "b")""")

filterTest("not-inset, int filter",
(Not(InSet(a("intcol", IntegerType), Set(1, 2)))) :: Nil,
"(intcol != 1 and intcol != 2)")

filterTest("not-inset, string filter",
(Not(InSet(a("strcol", StringType), Set(Literal("a").eval(), Literal("b").eval())))) :: Nil,
"""(strcol != "a" and strcol != "b")""")

filterTest("not-inset, date filter",
(Not(InSet(a("datecol", DateType),
Set(Literal(Date.valueOf("2020-01-01")).eval(),
Literal(Date.valueOf("2020-01-02")).eval())))) :: Nil,
"""(datecol != "2020-01-01" and datecol != "2020-01-02")""")

// Applying the predicate `x IN (NULL)` should return an empty set, but since this optimization
// will be applied by Catalyst, this filter converter does not need to account for this.
filterTest("SPARK-24879 IN predicates with only NULLs will not cause a NPE",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,52 @@ class HivePartitionFilteringSuite(version: String)
dateStrValue)
}

test("getPartitionsByFilter: not in chunk") {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the effective test that can catch correctness bugs. Does it test null?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The coverage of previous test is not good. I rewrite the test and include you mentioned.

testMetastorePartitionFiltering(
Not(In(attr("chunk"), Seq(Literal("aa"), Literal("ab")))),
dsValue,
hValue,
Seq("ba", "bb"),
dateValue,
dateStrValue
)
}

test("getPartitionsByFilter: not in ds") {
testMetastorePartitionFiltering(
Not(In(attr("ds"), Seq(Literal(20170102)))),
Seq(20170101, 20170103),
hValue,
chunkValue,
dateValue,
dateStrValue
)
}

test("getPartitionsByFilter: not inset date") {
testMetastorePartitionFiltering(
Not(InSet(attr("d"),
Set(Literal(Date.valueOf("2019-01-01")).eval(),
Literal(Date.valueOf("2019-01-02")).eval()))),
dsValue,
hValue,
chunkValue,
Seq("2019-01-03") ++ defaultPartition,
dateStrValue
)
}

test("getPartitionsByFilter: not inset h") {
testMetastorePartitionFiltering(
Not(InSet(attr("h"), Set(1, 2))),
dsValue,
Seq(0, 3, 4),
chunkValue,
dateValue,
dateStrValue
)
}

test("getPartitionsByFilter: cast(datestr as date)= 2020-01-01") {
testMetastorePartitionFiltering(
attr("datestr").cast(DateType) === Date.valueOf("2020-01-01"),
Expand Down