Skip to content

Commit d08f53d

Browse files
MaxGekkcloud-fan
authored andcommitted
[SPARK-24605][SQL] size(null) returns null instead of -1
## What changes were proposed in this pull request? In PR, I propose new behavior of `size(null)` under the config flag `spark.sql.legacy.sizeOfNull`. If the former one is disabled, the `size()` function returns `null` for `null` input. By default the `spark.sql.legacy.sizeOfNull` is enabled to keep backward compatibility with previous versions. In that case, `size(null)` returns `-1`. ## How was this patch tested? Modified existing tests for the `size()` function to check new behavior (`null`) and old one (`-1`). Author: Maxim Gekk <[email protected]> Closes #21598 from MaxGekk/legacy-size-of-null.
1 parent 1b9368f commit d08f53d

File tree

5 files changed

+93
-39
lines changed

5 files changed

+93
-39
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -67,37 +67,61 @@ trait BinaryArrayExpressionWithImplicitCast extends BinaryExpression
6767

6868

6969
/**
70-
* Given an array or map, returns its size. Returns -1 if null.
70+
* Given an array or map, returns total number of elements in it.
7171
*/
7272
@ExpressionDescription(
73-
usage = "_FUNC_(expr) - Returns the size of an array or a map. Returns -1 if null.",
73+
usage = """
74+
_FUNC_(expr) - Returns the size of an array or a map.
75+
The function returns -1 if its input is null and spark.sql.legacy.sizeOfNull is set to true.
76+
If spark.sql.legacy.sizeOfNull is set to false, the function returns null for null input.
77+
By default, the spark.sql.legacy.sizeOfNull parameter is set to true.
78+
""",
7479
examples = """
7580
Examples:
7681
> SELECT _FUNC_(array('b', 'd', 'c', 'a'));
7782
4
83+
> SELECT _FUNC_(map('a', 1, 'b', 2));
84+
2
85+
> SELECT _FUNC_(NULL);
86+
-1
7887
""")
79-
case class Size(child: Expression) extends UnaryExpression with ExpectsInputTypes {
88+
case class Size(
89+
child: Expression,
90+
legacySizeOfNull: Boolean)
91+
extends UnaryExpression with ExpectsInputTypes {
92+
93+
def this(child: Expression) =
94+
this(
95+
child,
96+
legacySizeOfNull = SQLConf.get.getConf(SQLConf.LEGACY_SIZE_OF_NULL))
97+
8098
override def dataType: DataType = IntegerType
8199
override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(ArrayType, MapType))
82-
override def nullable: Boolean = false
100+
override def nullable: Boolean = if (legacySizeOfNull) false else super.nullable
83101

84102
override def eval(input: InternalRow): Any = {
85103
val value = child.eval(input)
86104
if (value == null) {
87-
-1
105+
if (legacySizeOfNull) -1 else null
88106
} else child.dataType match {
89107
case _: ArrayType => value.asInstanceOf[ArrayData].numElements()
90108
case _: MapType => value.asInstanceOf[MapData].numElements()
109+
case other => throw new UnsupportedOperationException(
110+
s"The size function doesn't support the operand type ${other.getClass.getCanonicalName}")
91111
}
92112
}
93113

94114
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
95-
val childGen = child.genCode(ctx)
96-
ev.copy(code = code"""
115+
if (legacySizeOfNull) {
116+
val childGen = child.genCode(ctx)
117+
ev.copy(code = code"""
97118
boolean ${ev.isNull} = false;
98119
${childGen.code}
99120
${CodeGenerator.javaType(dataType)} ${ev.value} = ${childGen.isNull} ? -1 :
100121
(${childGen.value}).numElements();""", isNull = FalseLiteral)
122+
} else {
123+
defineCodeGen(ctx, ev, c => s"($c).numElements()")
124+
}
101125
}
102126
}
103127

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1324,6 +1324,12 @@ object SQLConf {
13241324
"Other column values can be ignored during parsing even if they are malformed.")
13251325
.booleanConf
13261326
.createWithDefault(true)
1327+
1328+
val LEGACY_SIZE_OF_NULL = buildConf("spark.sql.legacy.sizeOfNull")
1329+
.doc("If it is set to true, size of null returns -1. This behavior was inherited from Hive. " +
1330+
"The size function returns null for null input if the flag is disabled.")
1331+
.booleanConf
1332+
.createWithDefault(true)
13271333
}
13281334

13291335
/**
@@ -1686,6 +1692,8 @@ class SQLConf extends Serializable with Logging {
16861692

16871693
def csvColumnPruning: Boolean = getConf(SQLConf.CSV_PARSER_COLUMN_PRUNING)
16881694

1695+
def legacySizeOfNull: Boolean = getConf(SQLConf.LEGACY_SIZE_OF_NULL)
1696+
16891697
/** ********************** SQLConf functionality methods ************ */
16901698

16911699
/** Set Spark SQL configuration properties. */

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,25 +24,37 @@ import org.apache.spark.sql.types._
2424

2525
class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
2626

27-
test("Array and Map Size") {
27+
def testSize(legacySizeOfNull: Boolean, sizeOfNull: Any): Unit = {
2828
val a0 = Literal.create(Seq(1, 2, 3), ArrayType(IntegerType))
2929
val a1 = Literal.create(Seq[Integer](), ArrayType(IntegerType))
3030
val a2 = Literal.create(Seq(1, 2), ArrayType(IntegerType))
3131

32-
checkEvaluation(Size(a0), 3)
33-
checkEvaluation(Size(a1), 0)
34-
checkEvaluation(Size(a2), 2)
32+
checkEvaluation(Size(a0, legacySizeOfNull), 3)
33+
checkEvaluation(Size(a1, legacySizeOfNull), 0)
34+
checkEvaluation(Size(a2, legacySizeOfNull), 2)
3535

3636
val m0 = Literal.create(Map("a" -> "a", "b" -> "b"), MapType(StringType, StringType))
3737
val m1 = Literal.create(Map[String, String](), MapType(StringType, StringType))
3838
val m2 = Literal.create(Map("a" -> "a"), MapType(StringType, StringType))
3939

40-
checkEvaluation(Size(m0), 2)
41-
checkEvaluation(Size(m1), 0)
42-
checkEvaluation(Size(m2), 1)
40+
checkEvaluation(Size(m0, legacySizeOfNull), 2)
41+
checkEvaluation(Size(m1, legacySizeOfNull), 0)
42+
checkEvaluation(Size(m2, legacySizeOfNull), 1)
43+
44+
checkEvaluation(
45+
Size(Literal.create(null, MapType(StringType, StringType)), legacySizeOfNull),
46+
expected = sizeOfNull)
47+
checkEvaluation(
48+
Size(Literal.create(null, ArrayType(StringType)), legacySizeOfNull),
49+
expected = sizeOfNull)
50+
}
51+
52+
test("Array and Map Size - legacy") {
53+
testSize(legacySizeOfNull = true, sizeOfNull = -1)
54+
}
4355

44-
checkEvaluation(Size(Literal.create(null, MapType(StringType, StringType))), -1)
45-
checkEvaluation(Size(Literal.create(null, ArrayType(StringType))), -1)
56+
test("Array and Map Size") {
57+
testSize(legacySizeOfNull = false, sizeOfNull = null)
4658
}
4759

4860
test("MapKeys/MapValues") {

sql/core/src/main/scala/org/apache/spark/sql/functions.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3431,7 +3431,7 @@ object functions {
34313431
* @group collection_funcs
34323432
* @since 1.5.0
34333433
*/
3434-
def size(e: Column): Column = withExpr { Size(e.expr) }
3434+
def size(e: Column): Column = withExpr { new Size(e.expr) }
34353435

34363436
/**
34373437
* Sorts the input array for the given column in ascending order,

sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala

Lines changed: 32 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -487,26 +487,29 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext {
487487
}.getMessage().contains("only supports array input"))
488488
}
489489

490-
test("array size function") {
490+
def testSizeOfArray(sizeOfNull: Any): Unit = {
491491
val df = Seq(
492492
(Seq[Int](1, 2), "x"),
493493
(Seq[Int](), "y"),
494494
(Seq[Int](1, 2, 3), "z"),
495495
(null, "empty")
496496
).toDF("a", "b")
497-
checkAnswer(
498-
df.select(size($"a")),
499-
Seq(Row(2), Row(0), Row(3), Row(-1))
500-
)
501-
checkAnswer(
502-
df.selectExpr("size(a)"),
503-
Seq(Row(2), Row(0), Row(3), Row(-1))
504-
)
505497

506-
checkAnswer(
507-
df.selectExpr("cardinality(a)"),
508-
Seq(Row(2L), Row(0L), Row(3L), Row(-1L))
509-
)
498+
checkAnswer(df.select(size($"a")), Seq(Row(2), Row(0), Row(3), Row(sizeOfNull)))
499+
checkAnswer(df.selectExpr("size(a)"), Seq(Row(2), Row(0), Row(3), Row(sizeOfNull)))
500+
checkAnswer(df.selectExpr("cardinality(a)"), Seq(Row(2L), Row(0L), Row(3L), Row(sizeOfNull)))
501+
}
502+
503+
test("array size function - legacy") {
504+
withSQLConf(SQLConf.LEGACY_SIZE_OF_NULL.key -> "true") {
505+
testSizeOfArray(sizeOfNull = -1)
506+
}
507+
}
508+
509+
test("array size function") {
510+
withSQLConf(SQLConf.LEGACY_SIZE_OF_NULL.key -> "false") {
511+
testSizeOfArray(sizeOfNull = null)
512+
}
510513
}
511514

512515
test("dataframe arrays_zip function") {
@@ -567,21 +570,28 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext {
567570
}
568571
}
569572

570-
test("map size function") {
573+
def testSizeOfMap(sizeOfNull: Any): Unit = {
571574
val df = Seq(
572575
(Map[Int, Int](1 -> 1, 2 -> 2), "x"),
573576
(Map[Int, Int](), "y"),
574577
(Map[Int, Int](1 -> 1, 2 -> 2, 3 -> 3), "z"),
575578
(null, "empty")
576579
).toDF("a", "b")
577-
checkAnswer(
578-
df.select(size($"a")),
579-
Seq(Row(2), Row(0), Row(3), Row(-1))
580-
)
581-
checkAnswer(
582-
df.selectExpr("size(a)"),
583-
Seq(Row(2), Row(0), Row(3), Row(-1))
584-
)
580+
581+
checkAnswer(df.select(size($"a")), Seq(Row(2), Row(0), Row(3), Row(sizeOfNull)))
582+
checkAnswer(df.selectExpr("size(a)"), Seq(Row(2), Row(0), Row(3), Row(sizeOfNull)))
583+
}
584+
585+
test("map size function - legacy") {
586+
withSQLConf(SQLConf.LEGACY_SIZE_OF_NULL.key -> "true") {
587+
testSizeOfMap(sizeOfNull = -1: Int)
588+
}
589+
}
590+
591+
test("map size function") {
592+
withSQLConf(SQLConf.LEGACY_SIZE_OF_NULL.key -> "false") {
593+
testSizeOfMap(sizeOfNull = null)
594+
}
585595
}
586596

587597
test("map_keys/map_values function") {

0 commit comments

Comments
 (0)