[SPARK-53559][SQL][CATALYST] Fix HLL sketch updates to use raw collation key bytes

Chris Boumalhab · cboumalh · dtenedor · commit 86d310b97f85 · 2025-09-16T11:18:47.000-07:00
### What changes were proposed in this pull request? - Extract the input UTF8String. - Ignore strings that are collation-equal to the empty string when updating the sketch. Before: ``` val cKey = CollationFactory.getCollationKey(v.asInstanceOf[UTF8String], st.collationId) sketch.update(cKey.toString) ``` After: ``` val collation = CollationFactory.fetchCollation(st.collationId) val str = v.asInstanceOf[UTF8String] if (!collation.equalsFunction(str, UTF8String.EMPTY_UTF8)) { sketch.update(collation.sortKeyFunction.apply(str)) } ``` ### Why are the changes needed? As discussed in #51298 (comment). Collation keys are arbitrary byte sequences, not guaranteed to be valid UTF-8. Converting them to a Java String replaces invalid UTF-8 bytes with U+FFFD (the replacement character). This can collapse distinct strings into identical values, causing the sketch to treat different strings as the same. Also, string collations must be considered when updating a sketch. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Build repo and test suites ### Was this patch authored or co-authored using generative AI tooling? No ### Jira https://issues.apache.org/jira/browse/SPARK-53559 Closes #52316 from cboumalh/SPARK-53559_refactor_hll_sketch_update. Lead-authored-by: Chris Boumalhab <cboumalh@amazon.com> Co-authored-by: Chris Boumalhab <84485659+cboumalh@users.noreply.github.com> Signed-off-by: Daniel Tenedorio <daniel.tenedorio@databricks.com>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/datasketchesAggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/datasketchesAggregates.scala
@@ -130,6 +130,10 @@ case class HllSketchAgg(
    * Evaluate the input row and update the HllSketch instance with the row's value. The update
    * function only supports a subset of Spark SQL types, and an exception will be thrown for
    * unsupported types.
+   * Notes:
+   *   - Null values are ignored.
+   *   - Empty byte arrays are ignored.
+   *   - Strings that are collation-equal to the empty string are ignored.
    *
    * @param sketch The HllSketch instance.
    * @param input  an input row
@@ -146,8 +150,11 @@ case class HllSketchAgg(
         case IntegerType => sketch.update(v.asInstanceOf[Int])
         case LongType => sketch.update(v.asInstanceOf[Long])
         case st: StringType =>
-          val cKey = CollationFactory.getCollationKey(v.asInstanceOf[UTF8String], st.collationId)
-          sketch.update(cKey.toString)
+          val collation = CollationFactory.fetchCollation(st.collationId)
+          val str = v.asInstanceOf[UTF8String]
+          if (!collation.equalsFunction(str, UTF8String.EMPTY_UTF8)) {
+            sketch.update(collation.sortKeyFunction.apply(str))
+          }
         case BinaryType => sketch.update(v.asInstanceOf[Array[Byte]])
         case dataType => throw new SparkUnsupportedOperationException(
           errorClass = "_LEGACY_ERROR_TEMP_3121",
diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/hll.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/hll.sql.out
@@ -6,6 +6,20 @@ DropTable true, false
 +- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t1
 
 
+-- !query
+DROP TABLE IF EXISTS hll_binary_test
+-- !query analysis
+DropTable true, false
++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.hll_binary_test
+
+
+-- !query
+DROP TABLE IF EXISTS hll_string_test
+-- !query analysis
+DropTable true, false
++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.hll_string_test
+
+
 -- !query
 CREATE TABLE t1 USING JSON AS VALUES (0), (1), (2), (2), (2), (3), (4) as tab(col)
 -- !query analysis
@@ -14,6 +28,34 @@ CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`t1`, ErrorIfExis
       +- LocalRelation [col#x]
 
 
+-- !query
+CREATE TABLE hll_binary_test (bytes BINARY) USING PARQUET
+-- !query analysis
+CreateDataSourceTableCommand `spark_catalog`.`default`.`hll_binary_test`, false
+
+
+-- !query
+CREATE TABLE hll_string_test (s STRING) USING PARQUET
+-- !query analysis
+CreateDataSourceTableCommand `spark_catalog`.`default`.`hll_string_test`, false
+
+
+-- !query
+INSERT INTO hll_binary_test VALUES (X''), (CAST('  ' AS BINARY)), (X'e280'), (X'c1'), (X'c120')
+-- !query analysis
+InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/hll_binary_test, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/hll_binary_test], Append, `spark_catalog`.`default`.`hll_binary_test`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/hll_binary_test), [bytes]
++- Project [col1#x AS bytes#x]
+   +- LocalRelation [col1#x]
+
+
+-- !query
+INSERT INTO hll_string_test VALUES (''), ('  '), (CAST(X'C1' AS STRING)), (CAST(X'80' AS STRING)), ('\uFFFD'), ('Å'), ('å'), ('a\u030A'), ('Å '), ('å  '), ('a\u030A   ')
+-- !query analysis
+InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/hll_string_test, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/hll_string_test], Append, `spark_catalog`.`default`.`hll_string_test`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/hll_string_test), [s]
++- Project [col1#x AS s#x]
+   +- LocalRelation [col1#x]
+
+
 -- !query
 SELECT hll_sketch_estimate(hll_sketch_agg(col)) AS result FROM t1
 -- !query analysis
@@ -22,6 +64,78 @@ Aggregate [hll_sketch_estimate(hll_sketch_agg(col#x, 12, 0, 0)) AS result#xL]
    +- Relation spark_catalog.default.t1[col#x] json
 
 
+-- !query
+SELECT hll_sketch_estimate(hll_sketch_agg(bytes)) FROM hll_binary_test
+-- !query analysis
+Aggregate [hll_sketch_estimate(hll_sketch_agg(bytes#x, 12, 0, 0)) AS hll_sketch_estimate(hll_sketch_agg(bytes, 12))#xL]
++- SubqueryAlias spark_catalog.default.hll_binary_test
+   +- Relation spark_catalog.default.hll_binary_test[bytes#x] parquet
+
+
+-- !query
+SELECT hll_sketch_estimate(hll_sketch_agg(s)) utf8_b FROM hll_string_test
+-- !query analysis
+Aggregate [hll_sketch_estimate(hll_sketch_agg(s#x, 12, 0, 0)) AS utf8_b#xL]
++- SubqueryAlias spark_catalog.default.hll_string_test
+   +- Relation spark_catalog.default.hll_string_test[s#x] parquet
+
+
+-- !query
+SELECT hll_sketch_estimate(hll_sketch_agg(s COLLATE UTF8_LCASE)) utf8_lc FROM hll_string_test
+-- !query analysis
+Aggregate [hll_sketch_estimate(hll_sketch_agg(collate(s#x, UTF8_LCASE), 12, 0, 0)) AS utf8_lc#xL]
++- SubqueryAlias spark_catalog.default.hll_string_test
+   +- Relation spark_catalog.default.hll_string_test[s#x] parquet
+
+
+-- !query
+SELECT hll_sketch_estimate(hll_sketch_agg(s COLLATE UNICODE)) unicode FROM hll_string_test
+-- !query analysis
+Aggregate [hll_sketch_estimate(hll_sketch_agg(collate(s#x, UNICODE), 12, 0, 0)) AS unicode#xL]
++- SubqueryAlias spark_catalog.default.hll_string_test
+   +- Relation spark_catalog.default.hll_string_test[s#x] parquet
+
+
+-- !query
+SELECT hll_sketch_estimate(hll_sketch_agg(s COLLATE UNICODE_CI)) unicode_ci FROM hll_string_test
+-- !query analysis
+Aggregate [hll_sketch_estimate(hll_sketch_agg(collate(s#x, UNICODE_CI), 12, 0, 0)) AS unicode_ci#xL]
++- SubqueryAlias spark_catalog.default.hll_string_test
+   +- Relation spark_catalog.default.hll_string_test[s#x] parquet
+
+
+-- !query
+SELECT hll_sketch_estimate(hll_sketch_agg(s COLLATE UTF8_BINARY_RTRIM)) utf8_b_rt FROM hll_string_test
+-- !query analysis
+Aggregate [hll_sketch_estimate(hll_sketch_agg(collate(s#x, UTF8_BINARY_RTRIM), 12, 0, 0)) AS utf8_b_rt#xL]
++- SubqueryAlias spark_catalog.default.hll_string_test
+   +- Relation spark_catalog.default.hll_string_test[s#x] parquet
+
+
+-- !query
+SELECT hll_sketch_estimate(hll_sketch_agg(s COLLATE UTF8_LCASE_RTRIM)) utf8_lc_rt FROM hll_string_test
+-- !query analysis
+Aggregate [hll_sketch_estimate(hll_sketch_agg(collate(s#x, UTF8_LCASE_RTRIM), 12, 0, 0)) AS utf8_lc_rt#xL]
++- SubqueryAlias spark_catalog.default.hll_string_test
+   +- Relation spark_catalog.default.hll_string_test[s#x] parquet
+
+
+-- !query
+SELECT hll_sketch_estimate(hll_sketch_agg(s COLLATE UNICODE_RTRIM)) unicode_rt FROM hll_string_test
+-- !query analysis
+Aggregate [hll_sketch_estimate(hll_sketch_agg(collate(s#x, UNICODE_RTRIM), 12, 0, 0)) AS unicode_rt#xL]
++- SubqueryAlias spark_catalog.default.hll_string_test
+   +- Relation spark_catalog.default.hll_string_test[s#x] parquet
+
+
+-- !query
+SELECT hll_sketch_estimate(hll_sketch_agg(s COLLATE UNICODE_CI_RTRIM)) unicode_ci_rt FROM hll_string_test
+-- !query analysis
+Aggregate [hll_sketch_estimate(hll_sketch_agg(collate(s#x, UNICODE_CI_RTRIM), 12, 0, 0)) AS unicode_ci_rt#xL]
++- SubqueryAlias spark_catalog.default.hll_string_test
+   +- Relation spark_catalog.default.hll_string_test[s#x] parquet
+
+
 -- !query
 SELECT hll_sketch_estimate(hll_sketch_agg(col, 12))
 FROM VALUES (50), (60), (60), (60), (75), (100) tab(col)
@@ -213,3 +327,17 @@ DROP TABLE IF EXISTS t1
 -- !query analysis
 DropTable true, false
 +- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t1
+
+
+-- !query
+DROP TABLE IF EXISTS hll_binary_test
+-- !query analysis
+DropTable true, false
++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.hll_binary_test
+
+
+-- !query
+DROP TABLE IF EXISTS hll_string_test
+-- !query analysis
+DropTable true, false
++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.hll_string_test
diff --git a/sql/core/src/test/resources/sql-tests/inputs/hll.sql b/sql/core/src/test/resources/sql-tests/inputs/hll.sql
@@ -1,10 +1,40 @@
 -- Positive test cases
 -- Create a table with some testing data.
 DROP TABLE IF EXISTS t1;
+DROP TABLE IF EXISTS hll_binary_test;
+DROP TABLE IF EXISTS hll_string_test;
+
 CREATE TABLE t1 USING JSON AS VALUES (0), (1), (2), (2), (2), (3), (4) as tab(col);
+CREATE TABLE hll_binary_test (bytes BINARY) USING PARQUET;
+CREATE TABLE hll_string_test (s STRING) USING PARQUET;
+
+INSERT INTO hll_binary_test VALUES (X''), (CAST('  ' AS BINARY)), (X'e280'), (X'c1'), (X'c120');
+
+-- `\u030A` is the "combining ring above" Unicode character: https://www.compart.com/en/unicode/U+030A
+-- `\uFFFD` is the Unicode replacement character.
+-- `\xC1` is an invalid UTF-8 byte.
+-- `\x80` is a UTF-8 continuation byte, that is, it cannot be the first byte of a UTF-8 encoded character.
+-- All strings are different based on the UTF8_BINARY collation.
+-- The first and second strings are equal for any collation with the RTRIM modifier, and equal to the empty string.
+-- For any collation with the RTRIM modifier, each of the last three strings is equal to the corresponding string among the three strings before them.
+-- The strings "\xC1", "\x80" and "\uFFFD" are equal for all collations except UTF8_BINARY and UTF8_BINARY_RTRIM.
+-- The (sub)strings `å` and `a\u030A` are equal for the UNICODE family of collations.
+-- `å` is the lowercase version of `Å`.
+INSERT INTO hll_string_test VALUES (''), ('  '), (CAST(X'C1' AS STRING)), (CAST(X'80' AS STRING)), ('\uFFFD'), ('Å'), ('å'), ('a\u030A'), ('Å '), ('å  '), ('a\u030A   ');
 
 SELECT hll_sketch_estimate(hll_sketch_agg(col)) AS result FROM t1;
 
+SELECT hll_sketch_estimate(hll_sketch_agg(bytes)) FROM hll_binary_test;
+
+SELECT hll_sketch_estimate(hll_sketch_agg(s)) utf8_b FROM hll_string_test;
+SELECT hll_sketch_estimate(hll_sketch_agg(s COLLATE UTF8_LCASE)) utf8_lc FROM hll_string_test;
+SELECT hll_sketch_estimate(hll_sketch_agg(s COLLATE UNICODE)) unicode FROM hll_string_test;
+SELECT hll_sketch_estimate(hll_sketch_agg(s COLLATE UNICODE_CI)) unicode_ci FROM hll_string_test;
+SELECT hll_sketch_estimate(hll_sketch_agg(s COLLATE UTF8_BINARY_RTRIM)) utf8_b_rt FROM hll_string_test;
+SELECT hll_sketch_estimate(hll_sketch_agg(s COLLATE UTF8_LCASE_RTRIM)) utf8_lc_rt FROM hll_string_test;
+SELECT hll_sketch_estimate(hll_sketch_agg(s COLLATE UNICODE_RTRIM)) unicode_rt FROM hll_string_test;
+SELECT hll_sketch_estimate(hll_sketch_agg(s COLLATE UNICODE_CI_RTRIM)) unicode_ci_rt FROM hll_string_test;
+
 SELECT hll_sketch_estimate(hll_sketch_agg(col, 12))
 FROM VALUES (50), (60), (60), (60), (75), (100) tab(col);
 
@@ -74,3 +104,5 @@ FROM (SELECT CAST('abc' AS BINARY) AS buffer);
 
 -- Clean up
 DROP TABLE IF EXISTS t1;
+DROP TABLE IF EXISTS hll_binary_test;
+DROP TABLE IF EXISTS hll_string_test;
diff --git a/sql/core/src/test/resources/sql-tests/results/hll.sql.out b/sql/core/src/test/resources/sql-tests/results/hll.sql.out
@@ -7,6 +7,22 @@ struct<>
 
 
 
+-- !query
+DROP TABLE IF EXISTS hll_binary_test
+-- !query schema
+struct<>
+-- !query output
+
+
+
+-- !query
+DROP TABLE IF EXISTS hll_string_test
+-- !query schema
+struct<>
+-- !query output
+
+
+
 -- !query
 CREATE TABLE t1 USING JSON AS VALUES (0), (1), (2), (2), (2), (3), (4) as tab(col)
 -- !query schema
@@ -15,6 +31,38 @@ struct<>
 
 
 
+-- !query
+CREATE TABLE hll_binary_test (bytes BINARY) USING PARQUET
+-- !query schema
+struct<>
+-- !query output
+
+
+
+-- !query
+CREATE TABLE hll_string_test (s STRING) USING PARQUET
+-- !query schema
+struct<>
+-- !query output
+
+
+
+-- !query
+INSERT INTO hll_binary_test VALUES (X''), (CAST('  ' AS BINARY)), (X'e280'), (X'c1'), (X'c120')
+-- !query schema
+struct<>
+-- !query output
+
+
+
+-- !query
+INSERT INTO hll_string_test VALUES (''), ('  '), (CAST(X'C1' AS STRING)), (CAST(X'80' AS STRING)), ('\uFFFD'), ('Å'), ('å'), ('a\u030A'), ('Å '), ('å  '), ('a\u030A   ')
+-- !query schema
+struct<>
+-- !query output
+
+
+
 -- !query
 SELECT hll_sketch_estimate(hll_sketch_agg(col)) AS result FROM t1
 -- !query schema
@@ -23,6 +71,78 @@ struct<result:bigint>
 5
 
 
+-- !query
+SELECT hll_sketch_estimate(hll_sketch_agg(bytes)) FROM hll_binary_test
+-- !query schema
+struct<hll_sketch_estimate(hll_sketch_agg(bytes, 12)):bigint>
+-- !query output
+4
+
+
+-- !query
+SELECT hll_sketch_estimate(hll_sketch_agg(s)) utf8_b FROM hll_string_test
+-- !query schema
+struct<utf8_b:bigint>
+-- !query output
+10
+
+
+-- !query
+SELECT hll_sketch_estimate(hll_sketch_agg(s COLLATE UTF8_LCASE)) utf8_lc FROM hll_string_test
+-- !query schema
+struct<utf8_lc:bigint>
+-- !query output
+7
+
+
+-- !query
+SELECT hll_sketch_estimate(hll_sketch_agg(s COLLATE UNICODE)) unicode FROM hll_string_test
+-- !query schema
+struct<unicode:bigint>
+-- !query output
+7
+
+
+-- !query
+SELECT hll_sketch_estimate(hll_sketch_agg(s COLLATE UNICODE_CI)) unicode_ci FROM hll_string_test
+-- !query schema
+struct<unicode_ci:bigint>
+-- !query output
+6
+
+
+-- !query
+SELECT hll_sketch_estimate(hll_sketch_agg(s COLLATE UTF8_BINARY_RTRIM)) utf8_b_rt FROM hll_string_test
+-- !query schema
+struct<utf8_b_rt:bigint>
+-- !query output
+6
+
+
+-- !query
+SELECT hll_sketch_estimate(hll_sketch_agg(s COLLATE UTF8_LCASE_RTRIM)) utf8_lc_rt FROM hll_string_test
+-- !query schema
+struct<utf8_lc_rt:bigint>
+-- !query output
+3
+
+
+-- !query
+SELECT hll_sketch_estimate(hll_sketch_agg(s COLLATE UNICODE_RTRIM)) unicode_rt FROM hll_string_test
+-- !query schema
+struct<unicode_rt:bigint>
+-- !query output
+3
+
+
+-- !query
+SELECT hll_sketch_estimate(hll_sketch_agg(s COLLATE UNICODE_CI_RTRIM)) unicode_ci_rt FROM hll_string_test
+-- !query schema
+struct<unicode_ci_rt:bigint>
+-- !query output
+2
+
+
 -- !query
 SELECT hll_sketch_estimate(hll_sketch_agg(col, 12))
 FROM VALUES (50), (60), (60), (60), (75), (100) tab(col)
@@ -267,3 +387,19 @@ DROP TABLE IF EXISTS t1
 struct<>
 -- !query output
 
+
+
+-- !query
+DROP TABLE IF EXISTS hll_binary_test
+-- !query schema
+struct<>
+-- !query output
+
+
+
+-- !query
+DROP TABLE IF EXISTS hll_string_test
+-- !query schema
+struct<>
+-- !query output
+