Skip to content
Closed
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ package org.apache.spark.sql.catalyst.expressions
*
* The following rules are applied:
* - Names and nullability hints for [[org.apache.spark.sql.types.DataType]]s are stripped.
* - Names for [[GetStructField]] are stripped.
* - Commutative and associative operations ([[Add]] and [[Multiply]]) have their children ordered
* by `hashCode`.
* - [[EqualTo]] and [[EqualNullSafe]] are reordered by `hashCode`.
Expand All @@ -37,10 +38,11 @@ object Canonicalize {
expressionReorder(ignoreNamesTypes(e))
}

/** Remove names and nullability from types. */
/** Remove names and nullability from types, and names from `GetStructField`. */
private[expressions] def ignoreNamesTypes(e: Expression): Expression = e match {
case a: AttributeReference =>
AttributeReference("none", a.dataType.asNullable)(exprId = a.exprId)
case GetStructField(child, ordinal, Some(_)) => GetStructField(child, ordinal, None)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks not precisely matched the comments of ignoreNamesTypes. It's better to change it accordingly.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

/** Remove names and nullability from types. */

I can change it to and / or.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment of Canonicalize says:

The following rules are applied:
 *  - Names and nullability hints for [[org.apache.spark.sql.types.DataType]]s are stripped.
...

It's also needed to be update.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

/** Remove names and nullability from types. */

Actually after this change it is not only for types.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. I re-wrote it a bit. Should look okay now.

case _ => e
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions
import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.plans.logical.Range
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class CanonicalizeSuite extends SparkFunSuite {

Expand Down Expand Up @@ -50,4 +51,32 @@ class CanonicalizeSuite extends SparkFunSuite {
assert(range.where(arrays1).sameResult(range.where(arrays2)))
assert(!range.where(arrays1).sameResult(range.where(arrays3)))
}

test("SPARK-26402: accessing nested fields with different cases in case insensitive mode") {
val expId = NamedExpression.newExprId
val qualifier = Seq.empty[String]
val structType = StructType(
StructField("a", StructType(StructField("b", IntegerType, false) :: Nil), false) :: Nil)

// GetStructField with different names are semantically equal
val fieldB1 = GetStructField(
AttributeReference("data1", structType, false)(expId, qualifier),
0, Some("b1"))
Copy link
Member

@dongjoon-hyun dongjoon-hyun Dec 21, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for nit-picking. This should be a1 (and a2 at line 67) because this is the first level.
Consequently, fieldB1 -> fieldA1?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! Done.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

    val fieldA1 = GetStructField(
      AttributeReference("data1", structType, false)(expId, qualifier),
      0, Some("a1"))
    val fieldA2 = GetStructField(
      AttributeReference("data2", structType, false)(expId, qualifier),
      0, Some("a2"))
    assert(fieldA1.semanticEquals(fieldA2))

    val fieldB1 = GetStructField(
      GetStructField(
        AttributeReference("data1", structType, false)(expId, qualifier),
        0, Some("a1")),
      0, Some("b1"))
    val fieldB2 = GetStructField(
      GetStructField(
        AttributeReference("data2", structType, false)(expId, qualifier),
        0, Some("a2")),
      0, Some("b2"))
    assert(fieldB1.semanticEquals(fieldB2))

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@dongjoon-hyun I put the ordering wrong. Addressed as you suggested. Thanks!

val fieldB2 = GetStructField(
AttributeReference("data2", structType, false)(expId, qualifier),
0, Some("b2"))
assert(fieldB1.semanticEquals(fieldB2))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line will fail to build.


val fieldA1 = GetStructField(
GetStructField(
AttributeReference("data1", structType, false)(expId, qualifier),
0, Some("a1")),
0, Some("b1"))
val fieldA2 = GetStructField(
GetStructField(
AttributeReference("data2", structType, false)(expId, qualifier),
0, Some("a2")),
0, Some("b2"))
assert(fieldA1.semanticEquals(fieldA2))
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLite
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.rules._
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class BinaryComparisonSimplificationSuite extends PlanTest with PredicateHelper {

Expand Down Expand Up @@ -92,4 +93,33 @@ class BinaryComparisonSimplificationSuite extends PlanTest with PredicateHelper
val correctAnswer = nonNullableRelation.analyze
comparePlans(actual, correctAnswer)
}

test("SPARK-26402: accessing nested fields with different cases in case insensitive mode") {
val expId = NamedExpression.newExprId
val qualifier = Seq.empty[String]
val structType = StructType(
StructField("a", StructType(StructField("b", IntegerType, false) :: Nil), false) :: Nil)

val fieldA1 = GetStructField(
GetStructField(
AttributeReference("data1", structType, false)(expId, qualifier),
0, Some("a1")),
0, Some("b1"))
val fieldA2 = GetStructField(
GetStructField(
AttributeReference("data2", structType, false)(expId, qualifier),
0, Some("a2")),
0, Some("b2"))

// GetStructField with different names are semantically equal; thus, `EqualTo(fieldA1, fieldA2)`
// will be optimized to `TrueLiteral` by `SimplifyBinaryComparison`.
val originalQuery = nonNullableRelation
.where(EqualTo(fieldA1, fieldA2))
.analyze

val optimized = Optimize.execute(originalQuery)
val correctAnswer = nonNullableRelation.where(Literal.TrueLiteral).analyze
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is removed eventually. To pass the test, we need to remove where(Literal.TrueLiteral) here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, this has BooleanSimplification. Removed it.


comparePlans(optimized, correctAnswer)
}
}
19 changes: 19 additions & 0 deletions sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2937,6 +2937,25 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
}
}
}

test("SPARK-26402: accessing nested fields with different cases in case insensitive mode") {
withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") {
val msg = intercept[AnalysisException] {
withTable("t") {
sql("create table t (s struct<i: Int>) using json")
checkAnswer(sql("select s.I from t group by s.i"), Nil)
}
}.message
assert(msg.contains("No such struct field I in i"))
}

withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") {
withTable("t") {
sql("create table t (s struct<i: Int>) using json")
checkAnswer(sql("select s.I from t group by s.i"), Nil)
}
}
}
}

case class Foo(bar: Option[String])