Skip to content

Commit a7dc020

Browse files
uros-db and cloud-fan
authored and committed
[SPARK-48681][SQL] Use ICU in Lower/Upper expressions for UTF8_BINARY strings
### What changes were proposed in this pull request? Update `Lower` & `Upper` Spark expressions to use ICU case mappings for UTF8_BINARY collation, instead of the currently used JVM case mappings. This behaviour is put under the `ICU_CASE_MAPPINGS_ENABLED` flag in SQLConf, which is `true` by default. ### Why are the changes needed? To keep the consistency between collations - all collations should use ICU-based case mappings, including the UTF8_BINARY collation. ### Does this PR introduce _any_ user-facing change? Yes, the behaviour of `lower` & `upper` string functions for UTF8_BINARY will now rely on ICU-based case mappings. However, by turning the `ICU_CASE_MAPPINGS_ENABLED` flag off, users can get the old JVM-based case mappings. Note that the difference between the two is really subtle. ### How was this patch tested? Existing tests, with extended `CollationSupport` unit tests for Lower/Upper to verify both ICU and JVM behaviour. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #47043 from uros-db/change-lower-upper. Authored-by: Uros Bojanic <[email protected]> Signed-off-by: Wenchen Fan <[email protected]>
1 parent 4663b84 commit a7dc020

File tree

4 files changed

+47
-15
lines changed

4 files changed

+47
-15
lines changed

common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -206,21 +206,22 @@ public static boolean execICU(final UTF8String l, final UTF8String r,
206206
}
207207

208208
public static class Upper {
209-
public static UTF8String exec(final UTF8String v, final int collationId) {
209+
public static UTF8String exec(final UTF8String v, final int collationId, boolean useICU) {
210210
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
211211
if (collation.supportsBinaryEquality) {
212-
return execBinary(v);
212+
return useICU ? execBinaryICU(v) : execBinary(v);
213213
} else if (collation.supportsLowercaseEquality) {
214214
return execLowercase(v);
215215
} else {
216216
return execICU(v, collationId);
217217
}
218218
}
219-
public static String genCode(final String v, final int collationId) {
219+
public static String genCode(final String v, final int collationId, boolean useICU) {
220220
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
221221
String expr = "CollationSupport.Upper.exec";
222222
if (collation.supportsBinaryEquality) {
223-
return String.format(expr + "Binary(%s)", v);
223+
String funcName = useICU ? "BinaryICU" : "Binary";
224+
return String.format(expr + "%s(%s)", funcName, v);
224225
} else if (collation.supportsLowercaseEquality) {
225226
return String.format(expr + "Lowercase(%s)", v);
226227
} else {
@@ -230,6 +231,9 @@ public static String genCode(final String v, final int collationId) {
230231
public static UTF8String execBinary(final UTF8String v) {
231232
return v.toUpperCase();
232233
}
234+
public static UTF8String execBinaryICU(final UTF8String v) {
235+
return CollationAwareUTF8String.toUpperCase(v);
236+
}
233237
public static UTF8String execLowercase(final UTF8String v) {
234238
return CollationAwareUTF8String.toUpperCase(v);
235239
}
@@ -239,21 +243,22 @@ public static UTF8String execICU(final UTF8String v, final int collationId) {
239243
}
240244

241245
public static class Lower {
242-
public static UTF8String exec(final UTF8String v, final int collationId) {
246+
public static UTF8String exec(final UTF8String v, final int collationId, boolean useICU) {
243247
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
244248
if (collation.supportsBinaryEquality) {
245-
return execBinary(v);
249+
return useICU ? execBinaryICU(v) : execBinary(v);
246250
} else if (collation.supportsLowercaseEquality) {
247251
return execLowercase(v);
248252
} else {
249253
return execICU(v, collationId);
250254
}
251255
}
252-
public static String genCode(final String v, final int collationId) {
256+
public static String genCode(final String v, final int collationId, boolean useICU) {
253257
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
254-
String expr = "CollationSupport.Lower.exec";
258+
String expr = "CollationSupport.Lower.exec";
255259
if (collation.supportsBinaryEquality) {
256-
return String.format(expr + "Binary(%s)", v);
260+
String funcName = useICU ? "BinaryICU" : "Binary";
261+
return String.format(expr + "%s(%s)", funcName, v);
257262
} else if (collation.supportsLowercaseEquality) {
258263
return String.format(expr + "Lowercase(%s)", v);
259264
} else {
@@ -263,6 +268,9 @@ public static String genCode(final String v, final int collationId) {
263268
public static UTF8String execBinary(final UTF8String v) {
264269
return v.toLowerCase();
265270
}
271+
public static UTF8String execBinaryICU(final UTF8String v) {
272+
return CollationAwareUTF8String.toLowerCase(v);
273+
}
266274
public static UTF8String execLowercase(final UTF8String v) {
267275
return CollationAwareUTF8String.toLowerCase(v);
268276
}

common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -606,7 +606,11 @@ private void assertUpper(String target, String collationName, String expected)
606606
UTF8String target_utf8 = UTF8String.fromString(target);
607607
UTF8String expected_utf8 = UTF8String.fromString(expected);
608608
int collationId = CollationFactory.collationNameToId(collationName);
609-
assertEquals(expected_utf8, CollationSupport.Upper.exec(target_utf8, collationId));
609+
// Testing the new ICU-based implementation of the Upper function.
610+
assertEquals(expected_utf8, CollationSupport.Upper.exec(target_utf8, collationId, true));
611+
// Testing the old JVM-based implementation of the Upper function.
612+
assertEquals(expected_utf8, CollationSupport.Upper.exec(target_utf8, collationId, false));
613+
// Note: results should be the same in these tests for both ICU and JVM-based implementations.
610614
}
611615

612616
@Test
@@ -660,7 +664,11 @@ private void assertLower(String target, String collationName, String expected)
660664
UTF8String target_utf8 = UTF8String.fromString(target);
661665
UTF8String expected_utf8 = UTF8String.fromString(expected);
662666
int collationId = CollationFactory.collationNameToId(collationName);
663-
assertEquals(expected_utf8, CollationSupport.Lower.exec(target_utf8, collationId));
667+
// Testing the new ICU-based implementation of the Lower function.
668+
assertEquals(expected_utf8, CollationSupport.Lower.exec(target_utf8, collationId, true));
669+
// Testing the old JVM-based implementation of the Lower function.
670+
assertEquals(expected_utf8, CollationSupport.Lower.exec(target_utf8, collationId, false));
671+
// Note: results should be the same in these tests for both ICU and JVM-based implementations.
664672
}
665673

666674
@Test

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -455,12 +455,16 @@ case class Upper(child: Expression)
455455

456456
final lazy val collationId: Int = child.dataType.asInstanceOf[StringType].collationId
457457

458-
override def convert(v: UTF8String): UTF8String = CollationSupport.Upper.exec(v, collationId)
458+
// Flag to indicate whether to use ICU instead of JVM case mappings for UTF8_BINARY collation.
459+
private final lazy val useICU = SQLConf.get.getConf(SQLConf.ICU_CASE_MAPPINGS_ENABLED)
460+
461+
override def convert(v: UTF8String): UTF8String =
462+
CollationSupport.Upper.exec(v, collationId, useICU)
459463

460464
final override val nodePatterns: Seq[TreePattern] = Seq(UPPER_OR_LOWER)
461465

462466
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
463-
defineCodeGen(ctx, ev, c => CollationSupport.Upper.genCode(c, collationId))
467+
defineCodeGen(ctx, ev, c => CollationSupport.Upper.genCode(c, collationId, useICU))
464468
}
465469

466470
override protected def withNewChildInternal(newChild: Expression): Upper = copy(child = newChild)
@@ -483,12 +487,16 @@ case class Lower(child: Expression)
483487

484488
final lazy val collationId: Int = child.dataType.asInstanceOf[StringType].collationId
485489

486-
override def convert(v: UTF8String): UTF8String = CollationSupport.Lower.exec(v, collationId)
490+
// Flag to indicate whether to use ICU instead of JVM case mappings for UTF8_BINARY collation.
491+
private final lazy val useICU = SQLConf.get.getConf(SQLConf.ICU_CASE_MAPPINGS_ENABLED)
492+
493+
override def convert(v: UTF8String): UTF8String =
494+
CollationSupport.Lower.exec(v, collationId, useICU)
487495

488496
final override val nodePatterns: Seq[TreePattern] = Seq(UPPER_OR_LOWER)
489497

490498
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
491-
defineCodeGen(ctx, ev, c => CollationSupport.Lower.genCode(c, collationId))
499+
defineCodeGen(ctx, ev, c => CollationSupport.Lower.genCode(c, collationId, useICU))
492500
}
493501

494502
override def prettyName: String =

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -785,6 +785,14 @@ object SQLConf {
785785
_ => Map())
786786
.createWithDefault("UTF8_BINARY")
787787

788+
val ICU_CASE_MAPPINGS_ENABLED =
789+
buildConf("spark.sql.icu.caseMappings.enabled")
790+
.doc("When enabled we use the ICU library (instead of the JVM) to implement case mappings" +
791+
" for strings under UTF8_BINARY collation.")
792+
.version("4.0.0")
793+
.booleanConf
794+
.createWithDefault(true)
795+
788796
val FETCH_SHUFFLE_BLOCKS_IN_BATCH =
789797
buildConf("spark.sql.adaptive.fetchShuffleBlocksInBatch")
790798
.internal()

0 commit comments

Comments
 (0)