From b5a37e7b80db96927b8b2ee6e8f96510b219daa3 Mon Sep 17 00:00:00 2001 From: Jovan Pavlovic Date: Fri, 20 Sep 2024 16:24:05 +0200 Subject: [PATCH 01/19] introduce space trimming flag in collation factory. --- .../sql/catalyst/util/CollationFactory.java | 240 +++++++++++++----- 1 file changed, 174 insertions(+), 66 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index d5dbca7eb89b..fb28d9e6c86a 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -99,7 +99,8 @@ public record CollationMeta( String icuVersion, String padAttribute, boolean accentSensitivity, - boolean caseSensitivity) { } + boolean caseSensitivity, + String spaceTrimming) { } /** * Entry encapsulating all information about a collation. @@ -200,6 +201,7 @@ public Collation( * bit 28-24: Reserved. * bit 23-22: Reserved for version. * bit 21-18: Reserved for space trimming. + * 0000 = none, 0001 = left trim, 0010 = right trim, 0011 = trim. * bit 17-0: Depend on collation family. * --- * INDETERMINATE collation ID binary layout: @@ -214,7 +216,8 @@ public Collation( * UTF8_BINARY collation ID binary layout: * bit 31-24: Zeroes. * bit 23-22: Zeroes, reserved for version. - * bit 21-18: Zeroes, reserved for space trimming. + * bit 21-18: Reserved for space trimming. + * 0000 = none, 0001 = left trim, 0010 = right trim, 0011 = trim. * bit 17-3: Zeroes. * bit 2: 0, reserved for accent sensitivity. * bit 1: 0, reserved for uppercase and case-insensitive. @@ -225,7 +228,8 @@ public Collation( * bit 29: 1 * bit 28-24: Zeroes. * bit 23-22: Zeroes, reserved for version. - * bit 21-18: Zeroes, reserved for space trimming. + * bit 21-18: Reserved for space trimming. + * 0000 = none, 0001 = left trim, 0010 = right trim, 0011 = trim. * bit 17: 0 = case-sensitive, 1 = case-insensitive. * bit 16: 0 = accent-sensitive, 1 = accent-insensitive. * bit 15-14: Zeroes, reserved for punctuation sensitivity. @@ -238,7 +242,13 @@ public Collation( * - UNICODE -> 0x20000000 * - UNICODE_AI -> 0x20010000 * - UNICODE_CI -> 0x20020000 + * - UNICODE_LTRIM -> 0x20040000 + * - UNICODE_RTRIM -> 0x20080000 + * - UNICODE_TRIM -> 0x200C0000 * - UNICODE_CI_AI -> 0x20030000 + * - UNICODE_CI_TRIM -> 0x200E0000 + * - UNICODE_AI_TRIM -> 0x200D0000 + * - UNICODE_CI_AI_TRIM-> 0x200F0000 * - af -> 0x20000001 * - af_CI_AI -> 0x20030001 */ @@ -259,6 +269,14 @@ protected enum ImplementationProvider { UTF8_BINARY, ICU } + /** + * Bits 19-18 having value 00 for no space trimming, 01 for left space trimming + * 10 for right space trimming and 11 for both sides space trimming. + */ + protected enum SpaceTrimming { + NONE, LTRIM, RTRIM, TRIM + } + /** * Offset in binary collation ID layout. */ @@ -279,6 +297,17 @@ protected enum ImplementationProvider { */ protected static final int IMPLEMENTATION_PROVIDER_MASK = 0b1; + + /** + * Offset in binary collation ID layout. + */ + protected static final int SPACE_TRIMMING_OFFSET = 18; + + /** + * Bitmask corresponding to width in bits in binary collation ID layout. + */ + protected static final int SPACE_TRIMMING_MASK = 0b11; + private static final int INDETERMINATE_COLLATION_ID = -1; /** @@ -303,6 +332,14 @@ private static DefinitionOrigin getDefinitionOrigin(int collationId) { DEFINITION_ORIGIN_OFFSET, DEFINITION_ORIGIN_MASK)]; } + /** + * Utility function to retrieve `SpaceTrimming` enum instance from collation ID. + */ + protected static SpaceTrimming getSpaceTrimming(int collationId) { + return SpaceTrimming.values()[SpecifierUtils.getSpecValue(collationId, + SPACE_TRIMMING_OFFSET, SPACE_TRIMMING_MASK)]; + } + /** * Main entry point for retrieving `Collation` instance from collation ID. */ @@ -398,26 +435,30 @@ private enum CaseSensitivity { private static final String UTF8_LCASE_COLLATION_NAME = "UTF8_LCASE"; private static final int UTF8_BINARY_COLLATION_ID = - new CollationSpecUTF8(CaseSensitivity.UNSPECIFIED).collationId; + new CollationSpecUTF8(CaseSensitivity.UNSPECIFIED, SpaceTrimming.NONE).collationId; private static final int UTF8_LCASE_COLLATION_ID = - new CollationSpecUTF8(CaseSensitivity.LCASE).collationId; + new CollationSpecUTF8(CaseSensitivity.LCASE, SpaceTrimming.NONE).collationId; protected static Collation UTF8_BINARY_COLLATION = - new CollationSpecUTF8(CaseSensitivity.UNSPECIFIED).buildCollation(); + new CollationSpecUTF8(CaseSensitivity.UNSPECIFIED, SpaceTrimming.NONE).buildCollation(); protected static Collation UTF8_LCASE_COLLATION = - new CollationSpecUTF8(CaseSensitivity.LCASE).buildCollation(); + new CollationSpecUTF8(CaseSensitivity.LCASE, SpaceTrimming.NONE).buildCollation(); private final int collationId; - private CollationSpecUTF8(CaseSensitivity caseSensitivity) { - this.collationId = + private CollationSpecUTF8( + CaseSensitivity caseSensitivity, + SpaceTrimming spaceTrimming) { + int collationId = SpecifierUtils.setSpecValue(0, CASE_SENSITIVITY_OFFSET, caseSensitivity); + this.collationId = + SpecifierUtils.setSpecValue(collationId, SPACE_TRIMMING_OFFSET, spaceTrimming); } private static int collationNameToId(String originalName, String collationName) throws SparkException { - if (UTF8_BINARY_COLLATION.collationName.equals(collationName)) { + if (collationName.startsWith(UTF8_BINARY_COLLATION.collationName)) { return UTF8_BINARY_COLLATION_ID; - } else if (UTF8_LCASE_COLLATION.collationName.equals(collationName)) { + } else if (collationName.startsWith(UTF8_LCASE_COLLATION.collationName)) { return UTF8_LCASE_COLLATION_ID; } else { // Throw exception with original (before case conversion) collation name. @@ -429,15 +470,17 @@ private static CollationSpecUTF8 fromCollationId(int collationId) { // Extract case sensitivity from collation ID. int caseConversionOrdinal = SpecifierUtils.getSpecValue(collationId, CASE_SENSITIVITY_OFFSET, CASE_SENSITIVITY_MASK); - // Verify only case sensitivity bits were set settable in UTF8_BINARY family of collations. - assert (SpecifierUtils.removeSpec(collationId, - CASE_SENSITIVITY_OFFSET, CASE_SENSITIVITY_MASK) == 0); - return new CollationSpecUTF8(CaseSensitivity.values()[caseConversionOrdinal]); + // Extract space trimming from collation ID. + int spaceTrimmingOrdinal = getSpaceTrimming(collationId).ordinal(); + return new CollationSpecUTF8( + CaseSensitivity.values()[caseConversionOrdinal], + SpaceTrimming.values()[spaceTrimmingOrdinal]); } @Override protected Collation buildCollation() { - if (collationId == UTF8_BINARY_COLLATION_ID) { + // At this transient moment, collationId with space trimming might be used but should be ignored + if (SpecifierUtils.getSpecValue(collationId, CASE_SENSITIVITY_OFFSET, CaseSensitivity.LCASE.ordinal()) == 0) { return new Collation( UTF8_BINARY_COLLATION_NAME, PROVIDER_SPARK, @@ -474,7 +517,8 @@ protected CollationMeta buildCollationMeta() { /* icuVersion = */ null, COLLATION_PAD_ATTRIBUTE, /* accentSensitivity = */ true, - /* caseSensitivity = */ true); + /* caseSensitivity = */ true, + /* spaceTrimming = */ SpaceTrimming.NONE.toString()); } else { return new CollationMeta( CATALOG, @@ -485,7 +529,8 @@ protected CollationMeta buildCollationMeta() { /* icuVersion = */ null, COLLATION_PAD_ATTRIBUTE, /* accentSensitivity = */ true, - /* caseSensitivity = */ false); + /* caseSensitivity = */ false, + /* spaceTrimming = */ SpaceTrimming.NONE.toString()); } } @@ -620,18 +665,28 @@ private enum AccentSensitivity { } } - private static final int UNICODE_COLLATION_ID = - new CollationSpecICU("UNICODE", CaseSensitivity.CS, AccentSensitivity.AS).collationId; - private static final int UNICODE_CI_COLLATION_ID = - new CollationSpecICU("UNICODE", CaseSensitivity.CI, AccentSensitivity.AS).collationId; + private static final int UNICODE_COLLATION_ID = new CollationSpecICU( + "UNICODE", + CaseSensitivity.CS, + AccentSensitivity.AS, + SpaceTrimming.NONE).collationId; + + private static final int UNICODE_CI_COLLATION_ID = new CollationSpecICU( + "UNICODE", + CaseSensitivity.CI, + AccentSensitivity.AS, + SpaceTrimming.NONE).collationId; private final CaseSensitivity caseSensitivity; private final AccentSensitivity accentSensitivity; private final String locale; private final int collationId; - private CollationSpecICU(String locale, CaseSensitivity caseSensitivity, - AccentSensitivity accentSensitivity) { + private CollationSpecICU( + String locale, + CaseSensitivity caseSensitivity, + AccentSensitivity accentSensitivity, + SpaceTrimming spaceTrimming) { this.locale = locale; this.caseSensitivity = caseSensitivity; this.accentSensitivity = accentSensitivity; @@ -644,6 +699,8 @@ private CollationSpecICU(String locale, CaseSensitivity caseSensitivity, caseSensitivity); collationId = SpecifierUtils.setSpecValue(collationId, ACCENT_SENSITIVITY_OFFSET, accentSensitivity); + collationId = SpecifierUtils.setSpecValue(collationId, SPACE_TRIMMING_OFFSET, + spaceTrimming); this.collationId = collationId; } @@ -663,56 +720,102 @@ private static int collationNameToId( throw collationInvalidNameException(originalName); } else { String locale = collationName.substring(0, lastPos); + String remainingSpecifiers = collationName.substring(lastPos); + + // Initialize default specifier flags. + boolean isCaseSpecifierSet = false; + boolean isAccentSpecifierSet = false; + boolean isSpaceTrimmingSpecifierSet = false; + CaseSensitivity caseSensitivity = CaseSensitivity.CS; // Default: Case Sensitive + AccentSensitivity accentSensitivity = AccentSensitivity.AS; // Default: Accent Sensitive + SpaceTrimming spaceTrimming = SpaceTrimming.TRIM; // Default: No Trim + + String[] specifiers = remainingSpecifiers.split("_"); + int collationId = ICULocaleToId.get(ICULocaleMapUppercase.get(locale)); - // Try all combinations of AS/AI and CS/CI. - CaseSensitivity caseSensitivity; - AccentSensitivity accentSensitivity; - if (collationName.equals(locale) || - collationName.equals(locale + "_AS") || - collationName.equals(locale + "_CS") || - collationName.equals(locale + "_AS_CS") || - collationName.equals(locale + "_CS_AS") - ) { - caseSensitivity = CaseSensitivity.CS; - accentSensitivity = AccentSensitivity.AS; - } else if (collationName.equals(locale + "_CI") || - collationName.equals(locale + "_AS_CI") || - collationName.equals(locale + "_CI_AS")) { - caseSensitivity = CaseSensitivity.CI; - accentSensitivity = AccentSensitivity.AS; - } else if (collationName.equals(locale + "_AI") || - collationName.equals(locale + "_CS_AI") || - collationName.equals(locale + "_AI_CS")) { - caseSensitivity = CaseSensitivity.CS; - accentSensitivity = AccentSensitivity.AI; - } else if (collationName.equals(locale + "_AI_CI") || - collationName.equals(locale + "_CI_AI")) { - caseSensitivity = CaseSensitivity.CI; - accentSensitivity = AccentSensitivity.AI; - } else { - throw collationInvalidNameException(originalName); + // Iterate through specifiers and set corresponding flags + for (String specifier : specifiers) { + switch (specifier) { + case "CI": + if (isCaseSpecifierSet) { + throw collationInvalidNameException(originalName); + } + caseSensitivity = CaseSensitivity.CI; + isCaseSpecifierSet = true; + break; + case "CS": + if (isCaseSpecifierSet) { + throw collationInvalidNameException(originalName); + } + caseSensitivity = CaseSensitivity.CS; + isCaseSpecifierSet = true; + break; + case "AI": + if (isAccentSpecifierSet) { + throw collationInvalidNameException(originalName); + } + accentSensitivity = AccentSensitivity.AI; + isAccentSpecifierSet = true; + break; + case "AS": + if (isAccentSpecifierSet) { + throw collationInvalidNameException(originalName); + } + accentSensitivity = AccentSensitivity.AS; + isAccentSpecifierSet = true; + break; + case "LTRIM": + if (isSpaceTrimmingSpecifierSet) { + throw collationInvalidNameException(originalName); + } + spaceTrimming = SpaceTrimming.LTRIM; + isSpaceTrimmingSpecifierSet = true; + break; + case "RTRIM": + if (isSpaceTrimmingSpecifierSet) { + throw collationInvalidNameException(originalName); + } + spaceTrimming = SpaceTrimming.RTRIM; + isSpaceTrimmingSpecifierSet = true; + break; + case "TRIM": + if (isSpaceTrimmingSpecifierSet) { + throw collationInvalidNameException(originalName); + } + spaceTrimming = SpaceTrimming.TRIM; + isSpaceTrimmingSpecifierSet = true; + break; + default: + throw collationInvalidNameException(originalName); + } } // Build collation ID from computed specifiers. collationId = SpecifierUtils.setSpecValue(collationId, - IMPLEMENTATION_PROVIDER_OFFSET, ImplementationProvider.ICU); + IMPLEMENTATION_PROVIDER_OFFSET, ImplementationProvider.ICU); + collationId = SpecifierUtils.setSpecValue(collationId, + CASE_SENSITIVITY_OFFSET, caseSensitivity); collationId = SpecifierUtils.setSpecValue(collationId, - CASE_SENSITIVITY_OFFSET, caseSensitivity); + ACCENT_SENSITIVITY_OFFSET, accentSensitivity); collationId = SpecifierUtils.setSpecValue(collationId, - ACCENT_SENSITIVITY_OFFSET, accentSensitivity); + SPACE_TRIMMING_OFFSET, spaceTrimming); return collationId; } } private static CollationSpecICU fromCollationId(int collationId) { // Parse specifiers from collation ID. + int spaceTrimmingOrdinal = SpecifierUtils.getSpecValue(collationId, + SPACE_TRIMMING_OFFSET, SPACE_TRIMMING_MASK); int caseSensitivityOrdinal = SpecifierUtils.getSpecValue(collationId, CASE_SENSITIVITY_OFFSET, CASE_SENSITIVITY_MASK); int accentSensitivityOrdinal = SpecifierUtils.getSpecValue(collationId, ACCENT_SENSITIVITY_OFFSET, ACCENT_SENSITIVITY_MASK); collationId = SpecifierUtils.removeSpec(collationId, IMPLEMENTATION_PROVIDER_OFFSET, IMPLEMENTATION_PROVIDER_MASK); + collationId = SpecifierUtils.removeSpec(collationId, + SPACE_TRIMMING_OFFSET, SPACE_TRIMMING_MASK); collationId = SpecifierUtils.removeSpec(collationId, CASE_SENSITIVITY_OFFSET, CASE_SENSITIVITY_MASK); collationId = SpecifierUtils.removeSpec(collationId, @@ -723,8 +826,9 @@ private static CollationSpecICU fromCollationId(int collationId) { assert(localeId >= 0 && localeId < ICULocaleNames.length); CaseSensitivity caseSensitivity = CaseSensitivity.values()[caseSensitivityOrdinal]; AccentSensitivity accentSensitivity = AccentSensitivity.values()[accentSensitivityOrdinal]; + SpaceTrimming spaceTrimming = SpaceTrimming.values()[spaceTrimmingOrdinal]; String locale = ICULocaleNames[localeId]; - return new CollationSpecICU(locale, caseSensitivity, accentSensitivity); + return new CollationSpecICU(locale, caseSensitivity, accentSensitivity, spaceTrimming); } @Override @@ -774,15 +878,17 @@ protected CollationMeta buildCollationMeta() { VersionInfo.ICU_VERSION.toString(), COLLATION_PAD_ATTRIBUTE, accentSensitivity == AccentSensitivity.AS, - caseSensitivity == CaseSensitivity.CS); + caseSensitivity == CaseSensitivity.CS, + getSpaceTrimming(collationId).toString()); } /** * Compute normalized collation name. Components of collation name are given in order: * - Locale name + * - Optional space trimming when non-default preceded by underscore * - Optional case sensitivity when non-default preceded by underscore * - Optional accent sensitivity when non-default preceded by underscore - * Examples: en, en_USA_CI_AI, sr_Cyrl_SRB_AI. + * Examples: en, en_USA_CI_LTRIM, en_USA_CI_AI, en_USA_CI_AI_TRIM, sr_Cyrl_SRB_AI. */ private String collationName() { StringBuilder builder = new StringBuilder(); @@ -795,20 +901,22 @@ private String collationName() { builder.append('_'); builder.append(accentSensitivity.toString()); } + SpaceTrimming spaceTrimming = getSpaceTrimming(collationId); + if(spaceTrimming != SpaceTrimming.NONE) { + builder.append('_'); + builder.append(spaceTrimming.toString()); + } return builder.toString(); } private static List allCollationNames() { List collationNames = new ArrayList<>(); - for (String locale: ICULocaleToId.keySet()) { - // CaseSensitivity.CS + AccentSensitivity.AS - collationNames.add(locale); - // CaseSensitivity.CS + AccentSensitivity.AI - collationNames.add(locale + "_AI"); - // CaseSensitivity.CI + AccentSensitivity.AS - collationNames.add(locale + "_CI"); - // CaseSensitivity.CI + AccentSensitivity.AI - collationNames.add(locale + "_CI_AI"); + List caseAccentSpecifiers = Arrays.asList("", "_AI", "_CI", "_CI_AI"); + for (String locale : ICULocaleToId.keySet()) { + for (String caseAccent : caseAccentSpecifiers) { + String collationName = locale + caseAccent; + collationNames.add(collationName); + } } return collationNames.stream().sorted().toList(); } From 954c6c15ec9e44547641336ddd151290b0a83527 Mon Sep 17 00:00:00 2001 From: Jovan Pavlovic Date: Sun, 22 Sep 2024 18:07:06 +0200 Subject: [PATCH 02/19] refactor CollationFactory. --- .../sql/catalyst/util/CollationFactory.java | 87 +++++++++++++++---- 1 file changed, 72 insertions(+), 15 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index fb28d9e6c86a..55294c334f40 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -395,6 +395,8 @@ private static int collationNameToId(String collationName) throws SparkException protected abstract CollationMeta buildCollationMeta(); + protected abstract String normalizedCollationName(); + static List listCollations() { return Stream.concat( CollationSpecUTF8.listCollations().stream(), @@ -443,11 +445,16 @@ private enum CaseSensitivity { protected static Collation UTF8_LCASE_COLLATION = new CollationSpecUTF8(CaseSensitivity.LCASE, SpaceTrimming.NONE).buildCollation(); + private final CaseSensitivity caseSensitivity; + private final SpaceTrimming spaceTrimming; private final int collationId; private CollationSpecUTF8( CaseSensitivity caseSensitivity, SpaceTrimming spaceTrimming) { + this.caseSensitivity = caseSensitivity; + this.spaceTrimming = spaceTrimming; + int collationId = SpecifierUtils.setSpecValue(0, CASE_SENSITIVITY_OFFSET, caseSensitivity); this.collationId = @@ -456,14 +463,42 @@ private CollationSpecUTF8( private static int collationNameToId(String originalName, String collationName) throws SparkException { + + int baseId; + String collationNamePrefix; + if (collationName.startsWith(UTF8_BINARY_COLLATION.collationName)) { - return UTF8_BINARY_COLLATION_ID; + baseId = UTF8_BINARY_COLLATION_ID; + collationNamePrefix = UTF8_BINARY_COLLATION.collationName; } else if (collationName.startsWith(UTF8_LCASE_COLLATION.collationName)) { - return UTF8_LCASE_COLLATION_ID; + baseId = UTF8_LCASE_COLLATION_ID; + collationNamePrefix = UTF8_LCASE_COLLATION.collationName; } else { // Throw exception with original (before case conversion) collation name. throw collationInvalidNameException(originalName); } + + String remainingSpecifiers = collationName.substring(collationNamePrefix.length()); + if(remainingSpecifiers.isEmpty()) { + return baseId; + } + if(!remainingSpecifiers.startsWith("_")){ + throw collationInvalidNameException(originalName); + } + + SpaceTrimming spaceTrimming = SpaceTrimming.NONE; + String remainingSpec = remainingSpecifiers.substring(1); + if (remainingSpec.equals("LTRIM")) { + spaceTrimming = SpaceTrimming.LTRIM; + } else if (remainingSpec.equals("RTRIM")) { + spaceTrimming = SpaceTrimming.RTRIM; + } else if(remainingSpec.equals("TRIM")) { + spaceTrimming = SpaceTrimming.TRIM; + } else { + throw collationInvalidNameException(originalName); + } + + return SpecifierUtils.setSpecValue(baseId, SPACE_TRIMMING_OFFSET, spaceTrimming); } private static CollationSpecUTF8 fromCollationId(int collationId) { @@ -479,8 +514,7 @@ private static CollationSpecUTF8 fromCollationId(int collationId) { @Override protected Collation buildCollation() { - // At this transient moment, collationId with space trimming might be used but should be ignored - if (SpecifierUtils.getSpecValue(collationId, CASE_SENSITIVITY_OFFSET, CaseSensitivity.LCASE.ordinal()) == 0) { + if (caseSensitivity == CaseSensitivity.UNSPECIFIED) { return new Collation( UTF8_BINARY_COLLATION_NAME, PROVIDER_SPARK, @@ -507,33 +541,54 @@ protected Collation buildCollation() { @Override protected CollationMeta buildCollationMeta() { - if (collationId == UTF8_BINARY_COLLATION_ID) { + if (caseSensitivity == CaseSensitivity.UNSPECIFIED) { return new CollationMeta( CATALOG, SCHEMA, - UTF8_BINARY_COLLATION_NAME, + normalizedCollationName(), /* language = */ null, /* country = */ null, /* icuVersion = */ null, COLLATION_PAD_ATTRIBUTE, /* accentSensitivity = */ true, /* caseSensitivity = */ true, - /* spaceTrimming = */ SpaceTrimming.NONE.toString()); + spaceTrimming.toString()); } else { return new CollationMeta( CATALOG, SCHEMA, - UTF8_LCASE_COLLATION_NAME, + normalizedCollationName(), /* language = */ null, /* country = */ null, /* icuVersion = */ null, COLLATION_PAD_ATTRIBUTE, /* accentSensitivity = */ true, /* caseSensitivity = */ false, - /* spaceTrimming = */ SpaceTrimming.NONE.toString()); + spaceTrimming.toString()); } } + /** + * Compute normalized collation name. Components of collation name are given in order: + * - Base collation name (UTF8_BINARY or UTF8_LCASE) + * - Optional space trimming when non-default preceded by underscore + * Examples: UTF8_BINARY, UTF8_BINARY_LCASE_LTRIM, UTF8_BINARY_TRIM. + */ + @Override + protected String normalizedCollationName() { + StringBuilder builder = new StringBuilder(); + if(SpecifierUtils.getSpecValue(collationId, CASE_SENSITIVITY_OFFSET, CaseSensitivity.LCASE.ordinal()) == 0){ + builder.append(UTF8_BINARY_COLLATION_NAME); + }else{ + builder.append(UTF8_LCASE_COLLATION_NAME); + } + if (spaceTrimming != SpaceTrimming.NONE) { + builder.append('_'); + builder.append(spaceTrimming.toString()); + } + return builder.toString(); + } + static List listCollations() { CollationIdentifier UTF8_BINARY_COLLATION_IDENT = new CollationIdentifier(PROVIDER_SPARK, UTF8_BINARY_COLLATION_NAME, "1.0"); @@ -679,6 +734,7 @@ private enum AccentSensitivity { private final CaseSensitivity caseSensitivity; private final AccentSensitivity accentSensitivity; + private final SpaceTrimming spaceTrimming; private final String locale; private final int collationId; @@ -690,6 +746,7 @@ private CollationSpecICU( this.locale = locale; this.caseSensitivity = caseSensitivity; this.accentSensitivity = accentSensitivity; + this.spaceTrimming = spaceTrimming; // Construct collation ID from locale, case-sensitivity and accent-sensitivity specifiers. int collationId = ICULocaleToId.get(locale); // Mandatory ICU implementation provider. @@ -856,7 +913,7 @@ protected Collation buildCollation() { // Freeze ICU collator to ensure thread safety. collator.freeze(); return new Collation( - collationName(), + normalizedCollationName(), PROVIDER_ICU, collator, (s1, s2) -> collator.compare(s1.toValidString(), s2.toValidString()), @@ -872,25 +929,26 @@ protected CollationMeta buildCollationMeta() { return new CollationMeta( CATALOG, SCHEMA, - collationName(), + normalizedCollationName(), ICULocaleMap.get(locale).getDisplayLanguage(), ICULocaleMap.get(locale).getDisplayCountry(), VersionInfo.ICU_VERSION.toString(), COLLATION_PAD_ATTRIBUTE, accentSensitivity == AccentSensitivity.AS, caseSensitivity == CaseSensitivity.CS, - getSpaceTrimming(collationId).toString()); + spaceTrimming.toString()); } /** * Compute normalized collation name. Components of collation name are given in order: * - Locale name - * - Optional space trimming when non-default preceded by underscore * - Optional case sensitivity when non-default preceded by underscore * - Optional accent sensitivity when non-default preceded by underscore + * - Optional space trimming when non-default preceded by underscore * Examples: en, en_USA_CI_LTRIM, en_USA_CI_AI, en_USA_CI_AI_TRIM, sr_Cyrl_SRB_AI. */ - private String collationName() { + @Override + protected String normalizedCollationName() { StringBuilder builder = new StringBuilder(); builder.append(locale); if (caseSensitivity != CaseSensitivity.CS) { @@ -901,7 +959,6 @@ private String collationName() { builder.append('_'); builder.append(accentSensitivity.toString()); } - SpaceTrimming spaceTrimming = getSpaceTrimming(collationId); if(spaceTrimming != SpaceTrimming.NONE) { builder.append('_'); builder.append(spaceTrimming.toString()); From 803d187d608058f44858bc6c7c8be337d8f60c0f Mon Sep 17 00:00:00 2001 From: Jovan Pavlovic Date: Sun, 22 Sep 2024 22:57:22 +0200 Subject: [PATCH 03/19] fix collation name to id. --- .../spark/sql/catalyst/util/CollationFactory.java | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index 55294c334f40..83c41afa7976 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -777,7 +777,18 @@ private static int collationNameToId( throw collationInvalidNameException(originalName); } else { String locale = collationName.substring(0, lastPos); - String remainingSpecifiers = collationName.substring(lastPos); + int collationId = ICULocaleToId.get(ICULocaleMapUppercase.get(locale)); + + // No other specifiers present. + if(collationName.equals(locale)){ + return collationId; + } + + if(collationName.charAt(locale.length()) != '_'){ + throw collationInvalidNameException(originalName); + } + // Extract remaining specifiers and trim "_" separator. + String remainingSpecifiers = collationName.substring(lastPos + 1); // Initialize default specifier flags. boolean isCaseSpecifierSet = false; @@ -789,8 +800,6 @@ private static int collationNameToId( String[] specifiers = remainingSpecifiers.split("_"); - int collationId = ICULocaleToId.get(ICULocaleMapUppercase.get(locale)); - // Iterate through specifiers and set corresponding flags for (String specifier : specifiers) { switch (specifier) { From a880e02223c04c9e65d3d3424814fe0d64b29178 Mon Sep 17 00:00:00 2001 From: Jovan Pavlovic Date: Mon, 23 Sep 2024 09:31:05 +0200 Subject: [PATCH 04/19] minor fix. --- .../sql/catalyst/util/CollationFactory.java | 176 +++++++++--------- 1 file changed, 87 insertions(+), 89 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index 83c41afa7976..c4e59711ac4f 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -775,99 +775,97 @@ private static int collationNameToId( } if (lastPos == -1) { throw collationInvalidNameException(originalName); - } else { - String locale = collationName.substring(0, lastPos); - int collationId = ICULocaleToId.get(ICULocaleMapUppercase.get(locale)); - - // No other specifiers present. - if(collationName.equals(locale)){ - return collationId; - } + } + String locale = collationName.substring(0, lastPos); + int collationId = ICULocaleToId.get(ICULocaleMapUppercase.get(locale)); + collationId = SpecifierUtils.setSpecValue(collationId, + IMPLEMENTATION_PROVIDER_OFFSET, ImplementationProvider.ICU); - if(collationName.charAt(locale.length()) != '_'){ - throw collationInvalidNameException(originalName); - } - // Extract remaining specifiers and trim "_" separator. - String remainingSpecifiers = collationName.substring(lastPos + 1); - - // Initialize default specifier flags. - boolean isCaseSpecifierSet = false; - boolean isAccentSpecifierSet = false; - boolean isSpaceTrimmingSpecifierSet = false; - CaseSensitivity caseSensitivity = CaseSensitivity.CS; // Default: Case Sensitive - AccentSensitivity accentSensitivity = AccentSensitivity.AS; // Default: Accent Sensitive - SpaceTrimming spaceTrimming = SpaceTrimming.TRIM; // Default: No Trim - - String[] specifiers = remainingSpecifiers.split("_"); - - // Iterate through specifiers and set corresponding flags - for (String specifier : specifiers) { - switch (specifier) { - case "CI": - if (isCaseSpecifierSet) { - throw collationInvalidNameException(originalName); - } - caseSensitivity = CaseSensitivity.CI; - isCaseSpecifierSet = true; - break; - case "CS": - if (isCaseSpecifierSet) { - throw collationInvalidNameException(originalName); - } - caseSensitivity = CaseSensitivity.CS; - isCaseSpecifierSet = true; - break; - case "AI": - if (isAccentSpecifierSet) { - throw collationInvalidNameException(originalName); - } - accentSensitivity = AccentSensitivity.AI; - isAccentSpecifierSet = true; - break; - case "AS": - if (isAccentSpecifierSet) { - throw collationInvalidNameException(originalName); - } - accentSensitivity = AccentSensitivity.AS; - isAccentSpecifierSet = true; - break; - case "LTRIM": - if (isSpaceTrimmingSpecifierSet) { - throw collationInvalidNameException(originalName); - } - spaceTrimming = SpaceTrimming.LTRIM; - isSpaceTrimmingSpecifierSet = true; - break; - case "RTRIM": - if (isSpaceTrimmingSpecifierSet) { - throw collationInvalidNameException(originalName); - } - spaceTrimming = SpaceTrimming.RTRIM; - isSpaceTrimmingSpecifierSet = true; - break; - case "TRIM": - if (isSpaceTrimmingSpecifierSet) { - throw collationInvalidNameException(originalName); - } - spaceTrimming = SpaceTrimming.TRIM; - isSpaceTrimmingSpecifierSet = true; - break; - default: + // No other specifiers present. + if(collationName.equals(locale)){ + return collationId; + } + if(collationName.charAt(locale.length()) != '_'){ + throw collationInvalidNameException(originalName); + } + // Extract remaining specifiers and trim "_" separator. + String remainingSpecifiers = collationName.substring(lastPos + 1); + + // Initialize default specifier flags. + boolean isCaseSpecifierSet = false; + boolean isAccentSpecifierSet = false; + boolean isSpaceTrimmingSpecifierSet = false; + CaseSensitivity caseSensitivity = CaseSensitivity.CS; // Default: Case Sensitive + AccentSensitivity accentSensitivity = AccentSensitivity.AS; // Default: Accent Sensitive + SpaceTrimming spaceTrimming = SpaceTrimming.TRIM; // Default: No Trim + + String[] specifiers = remainingSpecifiers.split("_"); + + // Iterate through specifiers and set corresponding flags + for (String specifier : specifiers) { + switch (specifier) { + case "CI": + if (isCaseSpecifierSet) { throw collationInvalidNameException(originalName); - } + } + caseSensitivity = CaseSensitivity.CI; + isCaseSpecifierSet = true; + break; + case "CS": + if (isCaseSpecifierSet) { + throw collationInvalidNameException(originalName); + } + caseSensitivity = CaseSensitivity.CS; + isCaseSpecifierSet = true; + break; + case "AI": + if (isAccentSpecifierSet) { + throw collationInvalidNameException(originalName); + } + accentSensitivity = AccentSensitivity.AI; + isAccentSpecifierSet = true; + break; + case "AS": + if (isAccentSpecifierSet) { + throw collationInvalidNameException(originalName); + } + accentSensitivity = AccentSensitivity.AS; + isAccentSpecifierSet = true; + break; + case "LTRIM": + if (isSpaceTrimmingSpecifierSet) { + throw collationInvalidNameException(originalName); + } + spaceTrimming = SpaceTrimming.LTRIM; + isSpaceTrimmingSpecifierSet = true; + break; + case "RTRIM": + if (isSpaceTrimmingSpecifierSet) { + throw collationInvalidNameException(originalName); + } + spaceTrimming = SpaceTrimming.RTRIM; + isSpaceTrimmingSpecifierSet = true; + break; + case "TRIM": + if (isSpaceTrimmingSpecifierSet) { + throw collationInvalidNameException(originalName); + } + spaceTrimming = SpaceTrimming.TRIM; + isSpaceTrimmingSpecifierSet = true; + break; + default: + throw collationInvalidNameException(originalName); } - - // Build collation ID from computed specifiers. - collationId = SpecifierUtils.setSpecValue(collationId, - IMPLEMENTATION_PROVIDER_OFFSET, ImplementationProvider.ICU); - collationId = SpecifierUtils.setSpecValue(collationId, - CASE_SENSITIVITY_OFFSET, caseSensitivity); - collationId = SpecifierUtils.setSpecValue(collationId, - ACCENT_SENSITIVITY_OFFSET, accentSensitivity); - collationId = SpecifierUtils.setSpecValue(collationId, - SPACE_TRIMMING_OFFSET, spaceTrimming); - return collationId; } + + // Build collation ID from computed specifiers. + collationId = SpecifierUtils.setSpecValue(collationId, + CASE_SENSITIVITY_OFFSET, caseSensitivity); + collationId = SpecifierUtils.setSpecValue(collationId, + ACCENT_SENSITIVITY_OFFSET, accentSensitivity); + collationId = SpecifierUtils.setSpecValue(collationId, + SPACE_TRIMMING_OFFSET, spaceTrimming); + return collationId; } private static CollationSpecICU fromCollationId(int collationId) { From 01628d69bca8c4539443062fc6ea499aa404e346 Mon Sep 17 00:00:00 2001 From: Jovan Pavlovic Date: Mon, 23 Sep 2024 09:38:05 +0200 Subject: [PATCH 05/19] fix default space trimming. --- .../org/apache/spark/sql/catalyst/util/CollationFactory.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index c4e59711ac4f..c2d86a2c50f4 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -797,7 +797,7 @@ private static int collationNameToId( boolean isSpaceTrimmingSpecifierSet = false; CaseSensitivity caseSensitivity = CaseSensitivity.CS; // Default: Case Sensitive AccentSensitivity accentSensitivity = AccentSensitivity.AS; // Default: Accent Sensitive - SpaceTrimming spaceTrimming = SpaceTrimming.TRIM; // Default: No Trim + SpaceTrimming spaceTrimming = SpaceTrimming.NONE; // Default: No Trim String[] specifiers = remainingSpecifiers.split("_"); From 5192476531e3265495a2da11c14ac04e122f7d22 Mon Sep 17 00:00:00 2001 From: Jovan Pavlovic Date: Mon, 23 Sep 2024 10:56:45 +0200 Subject: [PATCH 06/19] feature flag implementation --- .../sql/catalyst/util/CollationFactory.java | 7 ++++++ .../expressions/collationExpressions.scala | 10 ++++++++ .../sql/errors/QueryCompilationErrors.scala | 6 +++++ .../apache/spark/sql/internal/SQLConf.scala | 10 ++++++++ .../sql/execution/datasources/rules.scala | 25 ++++++++++++++++--- 5 files changed, 55 insertions(+), 3 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index c2d86a2c50f4..502d5442259c 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -1105,6 +1105,13 @@ public static boolean isCaseSensitiveAndAccentInsensitive(int collationId) { Collation.CollationSpecICU.AccentSensitivity.AI; } + /** + * Returns whether the collation uses trim collation for the given collation id. + */ + public static boolean usesTrimCollation(int collationId) { + return Collation.CollationSpec.getSpaceTrimming(collationId) != Collation.CollationSpec.SpaceTrimming.NONE; + } + public static void assertValidProvider(String provider) throws SparkException { if (!SUPPORTED_PROVIDERS.contains(provider.toLowerCase())) { Map params = Map.of( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala index d45ca533f939..e80e74be388c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala @@ -48,6 +48,11 @@ object CollateExpressionBuilder extends ExpressionBuilder { case Seq(e: Expression, collationExpr: Expression) => (collationExpr.dataType, collationExpr.foldable) match { case (_: StringType, true) => + val collationId = collationExpr.dataType.asInstanceOf[StringType].collationId + if(!SQLConf.get.trimCollationEnabled && + CollationFactory.usesTrimCollation(collationId)) { + throw QueryCompilationErrors.trimCollationNotEnabledError() + } val evalCollation = collationExpr.eval() if (evalCollation == null) { throw QueryCompilationErrors.unexpectedNullError("collation", collationExpr) @@ -86,6 +91,8 @@ case class Collate(child: Expression, collationName: String) override def sql: String = s"$prettyName(${child.sql}, $collationName)" override def toString: String = s"$prettyName($child, $collationName)" + + val usesTrimCollation = CollationFactory.usesTrimCollation(collationId) } // scalastyle:off line.contains.tab @@ -112,4 +119,7 @@ case class Collation(child: Expression) Literal.create(collationName, SQLConf.get.defaultStringType) } override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation) + + val usesTrimCollation = CollationFactory.usesTrimCollation( + child.dataType.asInstanceOf[StringType].collationId) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index 0b5255e95f07..c7bf632ef027 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -351,6 +351,12 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat ) } + def trimCollationNotEnabledError() : Throwable = { + new AnalysisException( + errorClass = "UNSUPPORTED_FEATURE.TRIM_COLLATION", + messageParameters = Map.empty) + } + def unresolvedUsingColForJoinError( colName: String, suggestion: String, side: String): Throwable = { new AnalysisException( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 2eaafde52228..18223cfd7049 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -759,6 +759,14 @@ object SQLConf { .checkValue(_ > 0, "The initial number of partitions must be positive.") .createOptional + lazy val TRIM_COLLATION_ENABLED = + buildConf("spark.sql.trimCollation.enabled") + .doc("Trim collation feature is under development and its use should be done under this" + + "feature flag.") + .version("4.0.0") + .booleanConf + .createWithDefault(Utils.isTesting) + val DEFAULT_COLLATION = buildConf(SqlApiConfHelper.DEFAULT_COLLATION) .doc("Sets default collation to use for string literals, parameter markers or the string" + @@ -5456,6 +5464,8 @@ class SQLConf extends Serializable with Logging with SqlApiConf { } } + def trimCollationEnabled: Boolean = getConf(TRIM_COLLATION_ENABLED) + override def defaultStringType: StringType = { if (getConf(DEFAULT_COLLATION).toUpperCase(Locale.ROOT) == "UTF8_BINARY") { StringType diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index 29385904a752..89871c3bd324 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -18,15 +18,13 @@ package org.apache.spark.sql.execution.datasources import java.util.Locale - import scala.collection.mutable.{HashMap, HashSet} import scala.jdk.CollectionConverters._ - import org.apache.spark.SparkIllegalArgumentException import org.apache.spark.sql.{AnalysisException, SaveMode, SparkSession} import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.catalog._ -import org.apache.spark.sql.catalyst.expressions.{Expression, InputFileBlockLength, InputFileBlockStart, InputFileName, RowOrdering} +import org.apache.spark.sql.catalyst.expressions.{Collate, Collation, Expression, InputFileBlockLength, InputFileBlockStart, InputFileName, RowOrdering} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.types.DataTypeUtils.toAttributes @@ -37,6 +35,7 @@ import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.command.ViewHelper.generateViewProperties import org.apache.spark.sql.execution.datasources.{CreateTable => CreateTableV1} import org.apache.spark.sql.execution.datasources.v2.FileDataSourceV2 +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.InsertableRelation import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.sql.util.PartitioningUtils.normalizePartitionSpec @@ -640,6 +639,26 @@ case class QualifyLocationWithWarehouse(catalog: SessionCatalog) extends Rule[Lo } } +object CollationCheck extends (LogicalPlan => Unit) { + def apply(plan: LogicalPlan): Unit = { + plan.foreach { + operator: LogicalPlan => + operator.expressions.foreach(_.foreach( + e => + if (isCollationExpressionAndUsesTrimCollation(e) && !SQLConf.get.trimCollationEnabled) { + throw QueryCompilationErrors.trimCollationNotEnabledError() + } + ) + ) + } + } + + private def isCollationExpressionAndUsesTrimCollation(expression: Expression): Boolean = { + (expression.isInstanceOf[Collation] && expression.asInstanceOf[Collation].usesTrimCollation) || + (expression.isInstanceOf[Collate] && expression.asInstanceOf[Collate].usesTrimCollation) + } +} + /** * This rule checks for references to views WITH SCHEMA [TYPE] EVOLUTION and synchronizes the * catalog if evolution was detected. From a3c5e8cb4f025204984c7bcec0ebfd3a0c99e287 Mon Sep 17 00:00:00 2001 From: Jovan Pavlovic Date: Mon, 23 Sep 2024 14:44:22 +0200 Subject: [PATCH 07/19] Trim collation banned with collate builder --- .../src/main/resources/error/error-conditions.json | 10 +++++----- .../expressions/collationExpressions.scala | 5 ----- .../spark/sql/execution/datasources/rules.scala | 2 +- .../sql/internal/BaseSessionStateBuilder.scala | 1 + .../sql/errors/QueryCompilationErrorsSuite.scala | 14 ++++++++++++++ 5 files changed, 21 insertions(+), 11 deletions(-) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index e83202d9e5ee..9a78c29dc959 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -4886,11 +4886,6 @@ "Catalog does not support ." ] }, - "COLLATION" : { - "message" : [ - "Collation is not yet supported." - ] - }, "COMBINATION_QUERY_RESULT_CLAUSES" : { "message" : [ "Combination of ORDER BY/SORT BY/DISTRIBUTE BY/CLUSTER BY." @@ -5117,6 +5112,11 @@ "message" : [ "TRANSFORM with SERDE is only supported in hive mode." ] + }, + "TRIM_COLLATION" : { + "message" : [ + "TRIM specifier in the collation." + ] } }, "sqlState" : "0A000" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala index e80e74be388c..760bd33944ce 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala @@ -48,11 +48,6 @@ object CollateExpressionBuilder extends ExpressionBuilder { case Seq(e: Expression, collationExpr: Expression) => (collationExpr.dataType, collationExpr.foldable) match { case (_: StringType, true) => - val collationId = collationExpr.dataType.asInstanceOf[StringType].collationId - if(!SQLConf.get.trimCollationEnabled && - CollationFactory.usesTrimCollation(collationId)) { - throw QueryCompilationErrors.trimCollationNotEnabledError() - } val evalCollation = collationExpr.eval() if (evalCollation == null) { throw QueryCompilationErrors.unexpectedNullError("collation", collationExpr) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index 89871c3bd324..a1962a749550 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -639,7 +639,7 @@ case class QualifyLocationWithWarehouse(catalog: SessionCatalog) extends Rule[Lo } } -object CollationCheck extends (LogicalPlan => Unit) { +object TrimCollationCheck extends (LogicalPlan => Unit) { def apply(plan: LogicalPlan): Unit = { plan.foreach { operator: LogicalPlan => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala index 0d0258f11efb..1efad47bdb87 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala @@ -225,6 +225,7 @@ abstract class BaseSessionStateBuilder( HiveOnlyCheck +: TableCapabilityCheck +: CommandCheck +: + TrimCollationCheck +: ViewSyncSchemaToMetaStore +: customCheckRules } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala index 832e1873af6a..80053a66b32b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala @@ -868,6 +868,20 @@ class QueryCompilationErrorsSuite "inputTypes" -> "[\"INT\", \"STRING\", \"STRING\"]")) } + test("SPARK-49666: the trim collation feature is off without collate builder call") { + withSQLConf(SQLConf.TRIM_COLLATION_ENABLED.key -> "false") { + Seq( + "SELECT collate('aaa', 'UNICODE_TRIM')", + "SELECT collate('aaa', 'UTF8_BINARY_TRIM')", + "SELECT collate('aaa', 'EN_AI_RTRIM')", + ).foreach { sqlText => + checkError( + exception = intercept[AnalysisException](sql(sqlText)), + condition = "UNSUPPORTED_FEATURE.TRIM_COLLATION") + } + } + } + test("UNSUPPORTED_CALL: call the unsupported method update()") { checkError( exception = intercept[SparkUnsupportedOperationException] { From e65232e056ff8d67009f7d74bf7b0b271dd3b54e Mon Sep 17 00:00:00 2001 From: Jovan Pavlovic Date: Mon, 23 Sep 2024 15:55:34 +0200 Subject: [PATCH 08/19] implement collation trim block without collate builder call. --- .../spark/sql/catalyst/parser/AstBuilder.scala | 7 +++++++ .../sql/errors/QueryCompilationErrorsSuite.scala | 14 ++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 52529bb4b789..0376d36e1c42 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -2557,6 +2557,13 @@ class AstBuilder extends DataTypeAstBuilder } override def visitCollateClause(ctx: CollateClauseContext): String = withOrigin(ctx) { + val collationName = ctx.collationName.getText + if (!SQLConf.get.trimCollationEnabled && + (collationName.toUpperCase().contains("TRIM") || + collationName.toUpperCase().contains("LTRIM") || + collationName.toUpperCase().contains("RTRIM"))) { + throw QueryCompilationErrors.trimCollationNotEnabledError() + } ctx.identifier.getText } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala index 80053a66b32b..c387670b2de5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala @@ -869,6 +869,20 @@ class QueryCompilationErrorsSuite } test("SPARK-49666: the trim collation feature is off without collate builder call") { + withSQLConf(SQLConf.TRIM_COLLATION_ENABLED.key -> "false") { + Seq( + "CREATE TABLE t(col STRING COLLATE EN_TRIM_CI) USING parquet", + "CREATE TABLE t(col STRING COLLATE UTF8_LCASE_TRIM) USING parquet", + "SELECT 'aaa' COLLATE UNICODE_LTRIM_CI", + ).foreach { sqlText => + checkError( + exception = intercept[AnalysisException](sql(sqlText)), + condition = "UNSUPPORTED_FEATURE.TRIM_COLLATION") + } + } + } + + test("SPARK-49666: the trim collation feature is off with collate builder call") { withSQLConf(SQLConf.TRIM_COLLATION_ENABLED.key -> "false") { Seq( "SELECT collate('aaa', 'UNICODE_TRIM')", From 9635ab7a5dcee110d8c22c6c6d94fedeb01ce45b Mon Sep 17 00:00:00 2001 From: Jovan Pavlovic Date: Mon, 23 Sep 2024 17:19:04 +0200 Subject: [PATCH 09/19] add test for set collation. --- .../scala/org/apache/spark/sql/internal/SQLConfSuite.scala | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala index 82795e551b6b..094c65c63bfd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala @@ -517,6 +517,13 @@ class SQLConfSuite extends QueryTest with SharedSparkSession { "confName" -> "spark.sql.session.collation.default", "proposals" -> "UNICODE" )) + + withSQLConf(SQLConf.TRIM_COLLATION_ENABLED.key -> "false") { + checkError( + exception = intercept[AnalysisException](sql(s"SET COLLATION UNICODE_CI_TRIM")), + condition = "UNSUPPORTED_FEATURE.TRIM_COLLATION" + ) + } } test("SPARK-43028: config not found error") { From 87cc3447eca3ab075890653fff75d27432c452b3 Mon Sep 17 00:00:00 2001 From: Jovan Pavlovic Date: Tue, 24 Sep 2024 09:00:43 +0200 Subject: [PATCH 10/19] add tests for recognition of space trimming. --- .../spark/sql/execution/SparkSqlParser.scala | 7 +++ .../org/apache/spark/sql/CollationSuite.scala | 55 ++++++++++++++----- 2 files changed, 49 insertions(+), 13 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index a8261e5d98ba..52382a579146 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -149,6 +149,13 @@ class SparkSqlAstBuilder extends AstBuilder { * }}} */ override def visitSetCollation(ctx: SetCollationContext): LogicalPlan = withOrigin(ctx) { + val collationName = ctx.collationName.getText + if (!SQLConf.get.trimCollationEnabled && + (collationName.toUpperCase().contains("TRIM") || + collationName.toUpperCase().contains("LTRIM") || + collationName.toUpperCase().contains("RTRIM"))) { + throw QueryCompilationErrors.trimCollationNotEnabledError() + } val key = SQLConf.DEFAULT_COLLATION.key SetCommand(Some(key -> Some(ctx.identifier.getText.toUpperCase(Locale.ROOT)))) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala index 73fd897e91f5..a2cbc5b25d70 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala @@ -44,27 +44,54 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { private val allFileBasedDataSources = collationPreservingSources ++ collationNonPreservingSources test("collate returns proper type") { - Seq("utf8_binary", "utf8_lcase", "unicode", "unicode_ci").foreach { collationName => - checkAnswer(sql(s"select 'aaa' collate $collationName"), Row("aaa")) - val collationId = CollationFactory.collationNameToId(collationName) - assert(sql(s"select 'aaa' collate $collationName").schema(0).dataType - == StringType(collationId)) + Seq( + "utf8_binary", + "utf8_lcase", + "unicode", + "unicode_ci", + "unicode_ltrim_ci", + "utf8_lcase_trim", + "utf8_binary_rtrim" + ).foreach { + collationName => + checkAnswer(sql(s"select 'aaa' collate $collationName"), Row("aaa")) + val collationId = CollationFactory.collationNameToId(collationName) + assert(sql(s"select 'aaa' collate $collationName").schema(0).dataType + == StringType(collationId)) } } test("collation name is case insensitive") { - Seq("uTf8_BiNaRy", "utf8_lcase", "uNicOde", "UNICODE_ci").foreach { collationName => - checkAnswer(sql(s"select 'aaa' collate $collationName"), Row("aaa")) - val collationId = CollationFactory.collationNameToId(collationName) - assert(sql(s"select 'aaa' collate $collationName").schema(0).dataType - == StringType(collationId)) + Seq( + "uTf8_BiNaRy", + "utf8_lcase", + "uNicOde", + "UNICODE_ci", + "uNiCoDE_ltRIm_cI", + "UtF8_lCaSE_tRIM", + "utf8_biNAry_RtRiM" + ).foreach { + collationName => + checkAnswer(sql(s"select 'aaa' collate $collationName"), Row("aaa")) + val collationId = CollationFactory.collationNameToId(collationName) + assert(sql(s"select 'aaa' collate $collationName").schema(0).dataType + == StringType(collationId)) } } test("collation expression returns name of collation") { - Seq("utf8_binary", "utf8_lcase", "unicode", "unicode_ci").foreach { collationName => - checkAnswer( - sql(s"select collation('aaa' collate $collationName)"), Row(collationName.toUpperCase())) + Seq( + "utf8_binary", + "utf8_lcase", + "unicode", + "unicode_ci", + "unicode_ltrim_ci", + "utf8_lcase_trim", + "utf8_binary_rtrim" + ).foreach { + collationName => + checkAnswer( + sql(s"select collation('aaa' collate $collationName)"), Row(collationName.toUpperCase())) } } @@ -80,6 +107,8 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { assert(sql(s"select collate('aaa', 'utf8_lcase')").schema(0).dataType == StringType("UTF8_LCASE")) assert(sql(s"select collate('aaa', 'UNICODE')").schema(0).dataType == StringType("UNICODE")) + assert(sql(s"select collate('aaa', 'UNICODE_TRIM')").schema(0).dataType == + StringType("UNICODE_TRIM")) } } From 587cd2b18c37b35f641320eb176d7638b2e52b42 Mon Sep 17 00:00:00 2001 From: Jovan Pavlovic Date: Tue, 24 Sep 2024 09:39:43 +0200 Subject: [PATCH 11/19] minor fix --- .../sql/catalyst/util/CollationFactory.java | 22 +++++++++---------- .../expressions/collationExpressions.scala | 4 ++-- .../org/apache/spark/sql/CollationSuite.scala | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index 502d5442259c..477884a757c4 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -516,7 +516,7 @@ private static CollationSpecUTF8 fromCollationId(int collationId) { protected Collation buildCollation() { if (caseSensitivity == CaseSensitivity.UNSPECIFIED) { return new Collation( - UTF8_BINARY_COLLATION_NAME, + normalizedCollationName(), PROVIDER_SPARK, null, UTF8String::binaryCompare, @@ -527,7 +527,7 @@ protected Collation buildCollation() { /* supportsLowercaseEquality = */ false); } else { return new Collation( - UTF8_LCASE_COLLATION_NAME, + normalizedCollationName(), PROVIDER_SPARK, null, CollationAwareUTF8String::compareLowerCase, @@ -577,7 +577,7 @@ protected CollationMeta buildCollationMeta() { @Override protected String normalizedCollationName() { StringBuilder builder = new StringBuilder(); - if(SpecifierUtils.getSpecValue(collationId, CASE_SENSITIVITY_OFFSET, CaseSensitivity.LCASE.ordinal()) == 0){ + if(caseSensitivity == CaseSensitivity.UNSPECIFIED){ builder.append(UTF8_BINARY_COLLATION_NAME); }else{ builder.append(UTF8_LCASE_COLLATION_NAME); @@ -721,16 +721,16 @@ private enum AccentSensitivity { } private static final int UNICODE_COLLATION_ID = new CollationSpecICU( - "UNICODE", - CaseSensitivity.CS, - AccentSensitivity.AS, - SpaceTrimming.NONE).collationId; + "UNICODE", + CaseSensitivity.CS, + AccentSensitivity.AS, + SpaceTrimming.NONE).collationId; private static final int UNICODE_CI_COLLATION_ID = new CollationSpecICU( - "UNICODE", - CaseSensitivity.CI, - AccentSensitivity.AS, - SpaceTrimming.NONE).collationId; + "UNICODE", + CaseSensitivity.CI, + AccentSensitivity.AS, + SpaceTrimming.NONE).collationId; private final CaseSensitivity caseSensitivity; private final AccentSensitivity accentSensitivity; diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala index 760bd33944ce..1125c16d13be 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala @@ -87,7 +87,7 @@ case class Collate(child: Expression, collationName: String) override def toString: String = s"$prettyName($child, $collationName)" - val usesTrimCollation = CollationFactory.usesTrimCollation(collationId) + val usesTrimCollation : Boolean = CollationFactory.usesTrimCollation(collationId) } // scalastyle:off line.contains.tab @@ -115,6 +115,6 @@ case class Collation(child: Expression) } override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation) - val usesTrimCollation = CollationFactory.usesTrimCollation( + val usesTrimCollation: Boolean = CollationFactory.usesTrimCollation( child.dataType.asInstanceOf[StringType].collationId) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala index a2cbc5b25d70..1613a21d9f37 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala @@ -85,7 +85,7 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { "utf8_lcase", "unicode", "unicode_ci", - "unicode_ltrim_ci", + "unicode_ci_ltrim", "utf8_lcase_trim", "utf8_binary_rtrim" ).foreach { From 27a30580adfec9a36248096afab45817ac6fda73 Mon Sep 17 00:00:00 2001 From: Jovan Pavlovic Date: Tue, 24 Sep 2024 12:22:32 +0200 Subject: [PATCH 12/19] address comments --- .../apache/spark/sql/catalyst/util/CollationFactory.java | 7 ++++--- .../apache/spark/unsafe/types/CollationFactorySuite.scala | 4 ---- .../org/apache/spark/sql/catalyst/parser/AstBuilder.scala | 5 +---- .../main/scala/org/apache/spark/sql/internal/SQLConf.scala | 2 +- .../org/apache/spark/sql/execution/SparkSqlParser.scala | 5 +---- .../org/apache/spark/sql/execution/datasources/rules.scala | 2 ++ 6 files changed, 9 insertions(+), 16 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index 477884a757c4..ba0cb4c0cc2d 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -792,12 +792,13 @@ private static int collationNameToId( String remainingSpecifiers = collationName.substring(lastPos + 1); // Initialize default specifier flags. + // Case sensitive, accent sensitive, no space trimming. boolean isCaseSpecifierSet = false; boolean isAccentSpecifierSet = false; boolean isSpaceTrimmingSpecifierSet = false; - CaseSensitivity caseSensitivity = CaseSensitivity.CS; // Default: Case Sensitive - AccentSensitivity accentSensitivity = AccentSensitivity.AS; // Default: Accent Sensitive - SpaceTrimming spaceTrimming = SpaceTrimming.NONE; // Default: No Trim + CaseSensitivity caseSensitivity = CaseSensitivity.CS; + AccentSensitivity accentSensitivity = AccentSensitivity.AS; + SpaceTrimming spaceTrimming = SpaceTrimming.NONE; String[] specifiers = remainingSpecifiers.split("_"); diff --git a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala index 321d1ccd700f..ca681c1e571f 100644 --- a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala +++ b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala @@ -369,8 +369,6 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig 1 << 15, // UTF8_BINARY mandatory zero bit 15 breach. 1 << 16, // UTF8_BINARY mandatory zero bit 16 breach. 1 << 17, // UTF8_BINARY mandatory zero bit 17 breach. - 1 << 18, // UTF8_BINARY mandatory zero bit 18 breach. - 1 << 19, // UTF8_BINARY mandatory zero bit 19 breach. 1 << 20, // UTF8_BINARY mandatory zero bit 20 breach. 1 << 23, // UTF8_BINARY mandatory zero bit 23 breach. 1 << 24, // UTF8_BINARY mandatory zero bit 24 breach. @@ -382,8 +380,6 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig (1 << 29) | (1 << 13), // ICU mandatory zero bit 13 breach. (1 << 29) | (1 << 14), // ICU mandatory zero bit 14 breach. (1 << 29) | (1 << 15), // ICU mandatory zero bit 15 breach. - (1 << 29) | (1 << 18), // ICU mandatory zero bit 18 breach. - (1 << 29) | (1 << 19), // ICU mandatory zero bit 19 breach. (1 << 29) | (1 << 20), // ICU mandatory zero bit 20 breach. (1 << 29) | (1 << 21), // ICU mandatory zero bit 21 breach. (1 << 29) | (1 << 22), // ICU mandatory zero bit 22 breach. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 0376d36e1c42..2bed13134189 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -2558,10 +2558,7 @@ class AstBuilder extends DataTypeAstBuilder override def visitCollateClause(ctx: CollateClauseContext): String = withOrigin(ctx) { val collationName = ctx.collationName.getText - if (!SQLConf.get.trimCollationEnabled && - (collationName.toUpperCase().contains("TRIM") || - collationName.toUpperCase().contains("LTRIM") || - collationName.toUpperCase().contains("RTRIM"))) { + if (!SQLConf.get.trimCollationEnabled && collationName.toUpperCase().contains("TRIM")) { throw QueryCompilationErrors.trimCollationNotEnabledError() } ctx.identifier.getText diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 18223cfd7049..0f900df44e73 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -760,7 +760,7 @@ object SQLConf { .createOptional lazy val TRIM_COLLATION_ENABLED = - buildConf("spark.sql.trimCollation.enabled") + buildConf("spark.sql.collation.trim.enabled") .doc("Trim collation feature is under development and its use should be done under this" + "feature flag.") .version("4.0.0") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 52382a579146..ee646f718062 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -150,10 +150,7 @@ class SparkSqlAstBuilder extends AstBuilder { */ override def visitSetCollation(ctx: SetCollationContext): LogicalPlan = withOrigin(ctx) { val collationName = ctx.collationName.getText - if (!SQLConf.get.trimCollationEnabled && - (collationName.toUpperCase().contains("TRIM") || - collationName.toUpperCase().contains("LTRIM") || - collationName.toUpperCase().contains("RTRIM"))) { + if (!SQLConf.get.trimCollationEnabled && collationName.toUpperCase().contains("TRIM")) { throw QueryCompilationErrors.trimCollationNotEnabledError() } val key = SQLConf.DEFAULT_COLLATION.key diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index a1962a749550..f8b8cb43f68f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -18,8 +18,10 @@ package org.apache.spark.sql.execution.datasources import java.util.Locale + import scala.collection.mutable.{HashMap, HashSet} import scala.jdk.CollectionConverters._ + import org.apache.spark.SparkIllegalArgumentException import org.apache.spark.sql.{AnalysisException, SaveMode, SparkSession} import org.apache.spark.sql.catalyst.analysis._ From 58aab954fa487bc0034c249c9d9cdc2a29136320 Mon Sep 17 00:00:00 2001 From: Jovan Pavlovic Date: Tue, 24 Sep 2024 14:12:35 +0200 Subject: [PATCH 13/19] simpfly switch statement. --- .../sql/catalyst/util/CollationFactory.java | 30 ++----------------- 1 file changed, 3 insertions(+), 27 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index ba0cb4c0cc2d..aa0e2f61129b 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -806,52 +806,28 @@ private static int collationNameToId( for (String specifier : specifiers) { switch (specifier) { case "CI": - if (isCaseSpecifierSet) { - throw collationInvalidNameException(originalName); - } - caseSensitivity = CaseSensitivity.CI; - isCaseSpecifierSet = true; - break; case "CS": if (isCaseSpecifierSet) { throw collationInvalidNameException(originalName); } - caseSensitivity = CaseSensitivity.CS; + caseSensitivity = CaseSensitivity.valueOf(specifier); isCaseSpecifierSet = true; break; case "AI": - if (isAccentSpecifierSet) { - throw collationInvalidNameException(originalName); - } - accentSensitivity = AccentSensitivity.AI; - isAccentSpecifierSet = true; - break; case "AS": if (isAccentSpecifierSet) { throw collationInvalidNameException(originalName); } - accentSensitivity = AccentSensitivity.AS; + accentSensitivity = AccentSensitivity.valueOf(specifier); isAccentSpecifierSet = true; break; case "LTRIM": - if (isSpaceTrimmingSpecifierSet) { - throw collationInvalidNameException(originalName); - } - spaceTrimming = SpaceTrimming.LTRIM; - isSpaceTrimmingSpecifierSet = true; - break; case "RTRIM": - if (isSpaceTrimmingSpecifierSet) { - throw collationInvalidNameException(originalName); - } - spaceTrimming = SpaceTrimming.RTRIM; - isSpaceTrimmingSpecifierSet = true; - break; case "TRIM": if (isSpaceTrimmingSpecifierSet) { throw collationInvalidNameException(originalName); } - spaceTrimming = SpaceTrimming.TRIM; + spaceTrimming = SpaceTrimming.valueOf(specifier); isSpaceTrimmingSpecifierSet = true; break; default: From 16c1674adf437ab4bf8016afe02164ca3d5031b4 Mon Sep 17 00:00:00 2001 From: Jovan Pavlovic Date: Tue, 24 Sep 2024 17:39:39 +0200 Subject: [PATCH 14/19] fix test, address rest of the comments. --- .../sql/catalyst/util/CollationFactory.java | 7 ++++++ .../expressions/collationExpressions.scala | 9 ++++---- .../sql/execution/datasources/rules.scala | 23 +------------------ .../internal/BaseSessionStateBuilder.scala | 1 - .../errors/QueryCompilationErrorsSuite.scala | 5 +++- 5 files changed, 16 insertions(+), 29 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index aa0e2f61129b..566d00b2b4b5 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -507,11 +507,18 @@ private static CollationSpecUTF8 fromCollationId(int collationId) { CASE_SENSITIVITY_OFFSET, CASE_SENSITIVITY_MASK); // Extract space trimming from collation ID. int spaceTrimmingOrdinal = getSpaceTrimming(collationId).ordinal(); + assert(checkCollationId(collationId)); return new CollationSpecUTF8( CaseSensitivity.values()[caseConversionOrdinal], SpaceTrimming.values()[spaceTrimmingOrdinal]); } + private static boolean checkCollationId(int collationId) { + collationId = SpecifierUtils.removeSpec(collationId, SPACE_TRIMMING_OFFSET, SPACE_TRIMMING_MASK); + collationId = SpecifierUtils.removeSpec(collationId, CASE_SENSITIVITY_OFFSET, CASE_SENSITIVITY_MASK); + return collationId == 0; + } + @Override protected Collation buildCollation() { if (caseSensitivity == CaseSensitivity.UNSPECIFIED) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala index 1125c16d13be..baace414fd9b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collationExpressions.scala @@ -52,6 +52,10 @@ object CollateExpressionBuilder extends ExpressionBuilder { if (evalCollation == null) { throw QueryCompilationErrors.unexpectedNullError("collation", collationExpr) } else { + if (!SQLConf.get.trimCollationEnabled && + evalCollation.toString.toUpperCase().contains("TRIM")) { + throw QueryCompilationErrors.trimCollationNotEnabledError() + } Collate(e, evalCollation.toString) } case (_: StringType, false) => throw QueryCompilationErrors.nonFoldableArgumentError( @@ -86,8 +90,6 @@ case class Collate(child: Expression, collationName: String) override def sql: String = s"$prettyName(${child.sql}, $collationName)" override def toString: String = s"$prettyName($child, $collationName)" - - val usesTrimCollation : Boolean = CollationFactory.usesTrimCollation(collationId) } // scalastyle:off line.contains.tab @@ -114,7 +116,4 @@ case class Collation(child: Expression) Literal.create(collationName, SQLConf.get.defaultStringType) } override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation) - - val usesTrimCollation: Boolean = CollationFactory.usesTrimCollation( - child.dataType.asInstanceOf[StringType].collationId) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index f8b8cb43f68f..29385904a752 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -26,7 +26,7 @@ import org.apache.spark.SparkIllegalArgumentException import org.apache.spark.sql.{AnalysisException, SaveMode, SparkSession} import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.catalog._ -import org.apache.spark.sql.catalyst.expressions.{Collate, Collation, Expression, InputFileBlockLength, InputFileBlockStart, InputFileName, RowOrdering} +import org.apache.spark.sql.catalyst.expressions.{Expression, InputFileBlockLength, InputFileBlockStart, InputFileName, RowOrdering} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.types.DataTypeUtils.toAttributes @@ -37,7 +37,6 @@ import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.command.ViewHelper.generateViewProperties import org.apache.spark.sql.execution.datasources.{CreateTable => CreateTableV1} import org.apache.spark.sql.execution.datasources.v2.FileDataSourceV2 -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.InsertableRelation import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.sql.util.PartitioningUtils.normalizePartitionSpec @@ -641,26 +640,6 @@ case class QualifyLocationWithWarehouse(catalog: SessionCatalog) extends Rule[Lo } } -object TrimCollationCheck extends (LogicalPlan => Unit) { - def apply(plan: LogicalPlan): Unit = { - plan.foreach { - operator: LogicalPlan => - operator.expressions.foreach(_.foreach( - e => - if (isCollationExpressionAndUsesTrimCollation(e) && !SQLConf.get.trimCollationEnabled) { - throw QueryCompilationErrors.trimCollationNotEnabledError() - } - ) - ) - } - } - - private def isCollationExpressionAndUsesTrimCollation(expression: Expression): Boolean = { - (expression.isInstanceOf[Collation] && expression.asInstanceOf[Collation].usesTrimCollation) || - (expression.isInstanceOf[Collate] && expression.asInstanceOf[Collate].usesTrimCollation) - } -} - /** * This rule checks for references to views WITH SCHEMA [TYPE] EVOLUTION and synchronizes the * catalog if evolution was detected. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala index 1efad47bdb87..0d0258f11efb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala @@ -225,7 +225,6 @@ abstract class BaseSessionStateBuilder( HiveOnlyCheck +: TableCapabilityCheck +: CommandCheck +: - TrimCollationCheck +: ViewSyncSchemaToMetaStore +: customCheckRules } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala index c387670b2de5..83dcef910d48 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala @@ -891,7 +891,10 @@ class QueryCompilationErrorsSuite ).foreach { sqlText => checkError( exception = intercept[AnalysisException](sql(sqlText)), - condition = "UNSUPPORTED_FEATURE.TRIM_COLLATION") + condition = "UNSUPPORTED_FEATURE.TRIM_COLLATION", + parameters = Map.empty, + context = ExpectedContext( + fragment = sqlText.substring(7), start = 7, stop = sqlText.length - 1)) } } } From bc99ecbb29ca776d516cbfd8a083ae1a3835a5e7 Mon Sep 17 00:00:00 2001 From: Jovan Pavlovic Date: Wed, 25 Sep 2024 10:29:35 +0200 Subject: [PATCH 15/19] nit fixes. --- .../org/apache/spark/sql/catalyst/util/CollationFactory.java | 4 ++-- .../main/scala/org/apache/spark/sql/internal/SQLConf.scala | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index 566d00b2b4b5..8c14f5672849 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -507,13 +507,13 @@ private static CollationSpecUTF8 fromCollationId(int collationId) { CASE_SENSITIVITY_OFFSET, CASE_SENSITIVITY_MASK); // Extract space trimming from collation ID. int spaceTrimmingOrdinal = getSpaceTrimming(collationId).ordinal(); - assert(checkCollationId(collationId)); + assert(isValidCollationId(collationId)); return new CollationSpecUTF8( CaseSensitivity.values()[caseConversionOrdinal], SpaceTrimming.values()[spaceTrimmingOrdinal]); } - private static boolean checkCollationId(int collationId) { + private static boolean isValidCollationId(int collationId) { collationId = SpecifierUtils.removeSpec(collationId, SPACE_TRIMMING_OFFSET, SPACE_TRIMMING_MASK); collationId = SpecifierUtils.removeSpec(collationId, CASE_SENSITIVITY_OFFSET, CASE_SENSITIVITY_MASK); return collationId == 0; diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 0f900df44e73..db729b91f73c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -762,7 +762,8 @@ object SQLConf { lazy val TRIM_COLLATION_ENABLED = buildConf("spark.sql.collation.trim.enabled") .doc("Trim collation feature is under development and its use should be done under this" + - "feature flag.") + "feature flag. Trim collation trims leading, trailing or both spaces depending of" + + "specifier (LTRIM, RTRIM, TRIM).") .version("4.0.0") .booleanConf .createWithDefault(Utils.isTesting) From 5328ab7141d5d61ee5802700676879287c87f94f Mon Sep 17 00:00:00 2001 From: Jovan Pavlovic Date: Wed, 25 Sep 2024 11:14:01 +0200 Subject: [PATCH 16/19] fix scala style. --- .../sql/errors/QueryCompilationErrors.scala | 5 +- .../apache/spark/sql/internal/SQLConf.scala | 6 ++- .../org/apache/spark/sql/CollationSuite.scala | 47 +++++++++++-------- .../errors/QueryCompilationErrorsSuite.scala | 12 +++-- 4 files changed, 41 insertions(+), 29 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index c7bf632ef027..0d27f7bedbd3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -351,10 +351,11 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat ) } - def trimCollationNotEnabledError() : Throwable = { + def trimCollationNotEnabledError(): Throwable = { new AnalysisException( errorClass = "UNSUPPORTED_FEATURE.TRIM_COLLATION", - messageParameters = Map.empty) + messageParameters = Map.empty + ) } def unresolvedUsingColForJoinError( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index db729b91f73c..ccf56433530b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -761,9 +761,11 @@ object SQLConf { lazy val TRIM_COLLATION_ENABLED = buildConf("spark.sql.collation.trim.enabled") - .doc("Trim collation feature is under development and its use should be done under this" + + .doc( + "Trim collation feature is under development and its use should be done under this" + "feature flag. Trim collation trims leading, trailing or both spaces depending of" + - "specifier (LTRIM, RTRIM, TRIM).") + "specifier (LTRIM, RTRIM, TRIM)." + ) .version("4.0.0") .booleanConf .createWithDefault(Utils.isTesting) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala index 1613a21d9f37..e15b8cfbf3f0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala @@ -52,12 +52,13 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { "unicode_ltrim_ci", "utf8_lcase_trim", "utf8_binary_rtrim" - ).foreach { - collationName => - checkAnswer(sql(s"select 'aaa' collate $collationName"), Row("aaa")) - val collationId = CollationFactory.collationNameToId(collationName) - assert(sql(s"select 'aaa' collate $collationName").schema(0).dataType - == StringType(collationId)) + ).foreach { collationName => + checkAnswer(sql(s"select 'aaa' collate $collationName"), Row("aaa")) + val collationId = CollationFactory.collationNameToId(collationName) + assert( + sql(s"select 'aaa' collate $collationName").schema(0).dataType + == StringType(collationId) + ) } } @@ -70,12 +71,13 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { "uNiCoDE_ltRIm_cI", "UtF8_lCaSE_tRIM", "utf8_biNAry_RtRiM" - ).foreach { - collationName => - checkAnswer(sql(s"select 'aaa' collate $collationName"), Row("aaa")) - val collationId = CollationFactory.collationNameToId(collationName) - assert(sql(s"select 'aaa' collate $collationName").schema(0).dataType - == StringType(collationId)) + ).foreach { collationName => + checkAnswer(sql(s"select 'aaa' collate $collationName"), Row("aaa")) + val collationId = CollationFactory.collationNameToId(collationName) + assert( + sql(s"select 'aaa' collate $collationName").schema(0).dataType + == StringType(collationId) + ) } } @@ -88,10 +90,11 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { "unicode_ci_ltrim", "utf8_lcase_trim", "utf8_binary_rtrim" - ).foreach { - collationName => - checkAnswer( - sql(s"select collation('aaa' collate $collationName)"), Row(collationName.toUpperCase())) + ).foreach { collationName => + checkAnswer( + sql(s"select collation('aaa' collate $collationName)"), + Row(collationName.toUpperCase()) + ) } } @@ -104,11 +107,15 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { test("collate function syntax with default collation set") { withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UTF8_LCASE") { - assert(sql(s"select collate('aaa', 'utf8_lcase')").schema(0).dataType == - StringType("UTF8_LCASE")) + assert( + sql(s"select collate('aaa', 'utf8_lcase')").schema(0).dataType == + StringType("UTF8_LCASE") + ) assert(sql(s"select collate('aaa', 'UNICODE')").schema(0).dataType == StringType("UNICODE")) - assert(sql(s"select collate('aaa', 'UNICODE_TRIM')").schema(0).dataType == - StringType("UNICODE_TRIM")) + assert( + sql(s"select collate('aaa', 'UNICODE_TRIM')").schema(0).dataType == + StringType("UNICODE_TRIM") + ) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala index 83dcef910d48..5abdca326f2f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala @@ -873,11 +873,12 @@ class QueryCompilationErrorsSuite Seq( "CREATE TABLE t(col STRING COLLATE EN_TRIM_CI) USING parquet", "CREATE TABLE t(col STRING COLLATE UTF8_LCASE_TRIM) USING parquet", - "SELECT 'aaa' COLLATE UNICODE_LTRIM_CI", + "SELECT 'aaa' COLLATE UNICODE_LTRIM_CI" ).foreach { sqlText => checkError( exception = intercept[AnalysisException](sql(sqlText)), - condition = "UNSUPPORTED_FEATURE.TRIM_COLLATION") + condition = "UNSUPPORTED_FEATURE.TRIM_COLLATION" + ) } } } @@ -887,14 +888,15 @@ class QueryCompilationErrorsSuite Seq( "SELECT collate('aaa', 'UNICODE_TRIM')", "SELECT collate('aaa', 'UTF8_BINARY_TRIM')", - "SELECT collate('aaa', 'EN_AI_RTRIM')", + "SELECT collate('aaa', 'EN_AI_RTRIM')" ).foreach { sqlText => checkError( exception = intercept[AnalysisException](sql(sqlText)), condition = "UNSUPPORTED_FEATURE.TRIM_COLLATION", parameters = Map.empty, - context = ExpectedContext( - fragment = sqlText.substring(7), start = 7, stop = sqlText.length - 1)) + context = + ExpectedContext(fragment = sqlText.substring(7), start = 7, stop = sqlText.length - 1) + ) } } } From 156728c4ae94b9e0869011de5bae13ac56fa9594 Mon Sep 17 00:00:00 2001 From: Jovan Pavlovic Date: Wed, 25 Sep 2024 11:42:43 +0200 Subject: [PATCH 17/19] fix java style. --- .../spark/sql/catalyst/util/CollationFactory.java | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index 8c14f5672849..bc92b3cbb251 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -514,8 +514,14 @@ private static CollationSpecUTF8 fromCollationId(int collationId) { } private static boolean isValidCollationId(int collationId) { - collationId = SpecifierUtils.removeSpec(collationId, SPACE_TRIMMING_OFFSET, SPACE_TRIMMING_MASK); - collationId = SpecifierUtils.removeSpec(collationId, CASE_SENSITIVITY_OFFSET, CASE_SENSITIVITY_MASK); + collationId = SpecifierUtils.removeSpec( + collationId, + SPACE_TRIMMING_OFFSET, + SPACE_TRIMMING_MASK); + collationId = SpecifierUtils.removeSpec( + collationId, + CASE_SENSITIVITY_OFFSET, + CASE_SENSITIVITY_MASK); return collationId == 0; } @@ -1093,7 +1099,8 @@ public static boolean isCaseSensitiveAndAccentInsensitive(int collationId) { * Returns whether the collation uses trim collation for the given collation id. */ public static boolean usesTrimCollation(int collationId) { - return Collation.CollationSpec.getSpaceTrimming(collationId) != Collation.CollationSpec.SpaceTrimming.NONE; + return Collation.CollationSpec.getSpaceTrimming(collationId) != + Collation.CollationSpec.SpaceTrimming.NONE; } public static void assertValidProvider(String provider) throws SparkException { From 9a1bc8b87246c1c1d16f4dd47f351d3f9da39982 Mon Sep 17 00:00:00 2001 From: Jovan Pavlovic Date: Wed, 25 Sep 2024 18:05:09 +0200 Subject: [PATCH 18/19] addressing comments. --- .../org/apache/spark/sql/catalyst/util/CollationFactory.java | 5 +++-- .../apache/spark/unsafe/types/CollationFactorySuite.scala | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index bc92b3cbb251..e368e2479a3a 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -271,7 +271,8 @@ protected enum ImplementationProvider { /** * Bits 19-18 having value 00 for no space trimming, 01 for left space trimming - * 10 for right space trimming and 11 for both sides space trimming. + * 10 for right space trimming and 11 for both sides space trimming. Bits 21, 20 + * remained reserved (and fixed to 0) for future use. */ protected enum SpaceTrimming { NONE, LTRIM, RTRIM, TRIM @@ -592,7 +593,7 @@ protected String normalizedCollationName() { StringBuilder builder = new StringBuilder(); if(caseSensitivity == CaseSensitivity.UNSPECIFIED){ builder.append(UTF8_BINARY_COLLATION_NAME); - }else{ + } else{ builder.append(UTF8_LCASE_COLLATION_NAME); } if (spaceTrimming != SpaceTrimming.NONE) { diff --git a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala index ca681c1e571f..054c44f7286b 100644 --- a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala +++ b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala @@ -370,6 +370,7 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig 1 << 16, // UTF8_BINARY mandatory zero bit 16 breach. 1 << 17, // UTF8_BINARY mandatory zero bit 17 breach. 1 << 20, // UTF8_BINARY mandatory zero bit 20 breach. + 1 << 21, // UTF8_BINARY mandatory zero bit 21 breach. 1 << 23, // UTF8_BINARY mandatory zero bit 23 breach. 1 << 24, // UTF8_BINARY mandatory zero bit 24 breach. 1 << 25, // UTF8_BINARY mandatory zero bit 25 breach. From d4fb3633f829f73b1f5e21190a47eb9c2e8cf793 Mon Sep 17 00:00:00 2001 From: Jovan Pavlovic Date: Thu, 26 Sep 2024 09:44:23 +0200 Subject: [PATCH 19/19] mark feature flag with internal. --- .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index ccf56433530b..560c48f58098 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -761,6 +761,7 @@ object SQLConf { lazy val TRIM_COLLATION_ENABLED = buildConf("spark.sql.collation.trim.enabled") + .internal() .doc( "Trim collation feature is under development and its use should be done under this" + "feature flag. Trim collation trims leading, trailing or both spaces depending of" +