Skip to content
Closed
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,9 @@ public static boolean lowercaseMatchFrom(
* @param target the string to be searched in
* @param lowercasePattern the string to be searched for
* @param startPos the start position for searching (in the target string)
* @return length of the target substring that starts with the specified prefix in lowercase
*/
public static int lowercaseMatchLengthFrom(
private static int lowercaseMatchLengthFrom(
final UTF8String target,
final UTF8String lowercasePattern,
int startPos) {
Expand All @@ -99,7 +99,7 @@ public static int lowercaseMatchLengthFrom(
* @param startPos the start position for searching (in the target string)
* @return the position of the first occurrence of pattern in target
*/
public static int lowercaseFind(
private static int lowercaseFind(
final UTF8String target,
final UTF8String lowercasePattern,
int startPos) {
Expand Down Expand Up @@ -145,7 +145,7 @@ public static boolean lowercaseMatchUntil(
* @param endPos the end position for searching (in the target string)
* @return length of the target substring that ends with the specified suffix in lowercase
*/
public static int lowercaseMatchLengthUntil(
private static int lowercaseMatchLengthUntil(
final UTF8String target,
final UTF8String lowercasePattern,
int endPos) {
Expand All @@ -170,7 +170,7 @@ public static int lowercaseMatchLengthUntil(
* @param endPos the end position for searching (in the target string)
* @return the position of the last occurrence of pattern in target
*/
public static int lowercaseRFind(
private static int lowercaseRFind(
final UTF8String target,
final UTF8String lowercasePattern,
int endPos) {
Expand Down Expand Up @@ -287,32 +287,28 @@ public static String toTitleCase(final String target, final int collationId) {
}

public static int findInSet(final UTF8String match, final UTF8String set, int collationId) {
// If the "word" string contains a comma, FindInSet should return 0.
if (match.contains(UTF8String.fromString(","))) {
return 0;
}

String setString = set.toString();
StringSearch stringSearch = CollationFactory.getStringSearch(setString, match.toString(),
collationId);

int wordStart = 0;
while ((wordStart = stringSearch.next()) != StringSearch.DONE) {
boolean isValidStart = wordStart == 0 || setString.charAt(wordStart - 1) == ',';
boolean isValidEnd = wordStart + stringSearch.getMatchLength() == setString.length()
|| setString.charAt(wordStart + stringSearch.getMatchLength()) == ',';

if (isValidStart && isValidEnd) {
int pos = 0;
for (int i = 0; i < setString.length() && i < wordStart; i++) {
if (setString.charAt(i) == ',') {
pos++;
}
// Otherwise, search for commas in "set" and compare each substring with "word".
int byteIndex = 0, charIndex = 0, wordCount = 1, lastComma = -1;
while (byteIndex < set.numBytes()) {
byte nextByte = set.getByte(byteIndex);
if (nextByte == (byte) ',') {
if (set.substring(lastComma + 1, charIndex).semanticEquals(match, collationId)) {
return wordCount;
}

return pos + 1;
lastComma = charIndex;
++wordCount;
}
byteIndex += UTF8String.numBytesForFirstByte(nextByte);
++charIndex;
}

if (set.substring(lastComma + 1, set.numBytes()).semanticEquals(match, collationId)) {
return wordCount;
}
// If no match is found, return 0.
return 0;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -297,31 +297,24 @@ public static int exec(final UTF8String word, final UTF8String set, final int co
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
if (collation.supportsBinaryEquality) {
return execBinary(word, set);
} else if (collation.supportsLowercaseEquality) {
return execLowercase(word, set);
} else {
return execICU(word, set, collationId);
return execCollationAware(word, set, collationId);
}
}
/**
 * Builds the Java source expression that invokes the FindInSet implementation matching the
 * given collation.
 *
 * @param word source text of the expression that yields the word to look for
 * @param set source text of the expression that yields the comma-separated set
 * @param collationId id of the collation; it selects the implementation and is embedded in
 *                    the generated call for the collation-aware variant
 * @return a call to {@code execBinary} when the collation supports binary equality,
 *         otherwise a call to {@code execCollationAware}
 */
public static String genCode(final String word, final String set, final int collationId) {
  CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
  // The prefix already ends in "exec"; each branch appends only the variant suffix.
  String expr = "CollationSupport.FindInSet.exec";
  if (collation.supportsBinaryEquality) {
    return String.format(expr + "Binary(%s, %s)", word, set);
  } else {
    return String.format(expr + "CollationAware(%s, %s, %d)", word, set, collationId);
  }
}
/**
 * Binary-equality variant: looks up {@code word} in {@code set} by delegating to
 * {@code UTF8String.findInSet}.
 */
public static int execBinary(final UTF8String word, final UTF8String set) {
  final int position = set.findInSet(word);
  return position;
}
/**
 * Lowercase variant: lowercases both {@code set} and {@code word}, then looks the word up
 * with {@code UTF8String.findInSet}.
 */
public static int execLowercase(final UTF8String word, final UTF8String set) {
  final UTF8String lowercaseSet = set.toLowerCase();
  final UTF8String lowercaseWord = word.toLowerCase();
  return lowercaseSet.findInSet(lowercaseWord);
}
public static int execICU(final UTF8String word, final UTF8String set,
final int collationId) {
public static int execCollationAware(final UTF8String word, final UTF8String set,
final int collationId) {
return CollationAwareUTF8String.findInSet(word, set, collationId);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -639,81 +639,93 @@ public void testStringInstr() throws SparkException {
assertStringInstr("abi̇o12", "İo", "UNICODE_CI", 3);
}

private void assertFindInSet(String word, String set, String collationName,
Integer expected) throws SparkException {
private void assertFindInSet(String word, UTF8String set, String collationName,
Integer expected) throws SparkException {
UTF8String w = UTF8String.fromString(word);
UTF8String s = UTF8String.fromString(set);
int collationId = CollationFactory.collationNameToId(collationName);
assertEquals(expected, CollationSupport.FindInSet.exec(w, s, collationId));
assertEquals(expected, CollationSupport.FindInSet.exec(w, set, collationId));
}

/**
 * Exercises FindInSet across the UTF8_BINARY, UTF8_BINARY_LCASE, UNICODE and UNICODE_CI
 * collations, including dotted-i edge cases and a set that is not valid UTF-8.
 */
@Test
public void testFindInSet() throws SparkException {
  assertFindInSet("AB", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0);
  assertFindInSet("abc", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 1);
  assertFindInSet("def", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 5);
  assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0);
  assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0);
  assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0);
  assertFindInSet("c", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 4);
  assertFindInSet("AB", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 3);
  assertFindInSet("AbC", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 1);
  assertFindInSet("abcd", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0);
  assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0);
  assertFindInSet("XX", UTF8String.fromString("xx"), "UTF8_BINARY_LCASE", 1);
  assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0);
  assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UTF8_BINARY_LCASE", 4);
  assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0);
  assertFindInSet("ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 3);
  assertFindInSet("Ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0);
  assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0);
  assertFindInSet("xx", UTF8String.fromString("xx"), "UNICODE", 1);
  assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE", 0);
  assertFindInSet("大", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE", 5);
  assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0);
  assertFindInSet("C", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 4);
  assertFindInSet("DeF", UTF8String.fromString("abc,b,ab,c,dEf"), "UNICODE_CI", 5);
  assertFindInSet("DEFG", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0);
  assertFindInSet("XX", UTF8String.fromString("xx"), "UNICODE_CI", 1);
  assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE_CI", 4);
  assertFindInSet("界x", UTF8String.fromString("test,大千,界Xx,世,界X,大,千,世界"), "UNICODE_CI", 5);
  assertFindInSet("大", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE_CI", 5);
  assertFindInSet("i̇", UTF8String.fromString("İ"), "UNICODE_CI", 1);
  assertFindInSet("i", UTF8String.fromString("İ"), "UNICODE_CI", 0);
  assertFindInSet("i̇", UTF8String.fromString("i̇"), "UNICODE_CI", 1);
  assertFindInSet("i", UTF8String.fromString("i̇"), "UNICODE_CI", 0);
  assertFindInSet("i̇", UTF8String.fromString("İ,"), "UNICODE_CI", 1);
  assertFindInSet("i", UTF8String.fromString("İ,"), "UNICODE_CI", 0);
  assertFindInSet("i̇", UTF8String.fromString("i̇,"), "UNICODE_CI", 1);
  assertFindInSet("i", UTF8String.fromString("i̇,"), "UNICODE_CI", 0);
  assertFindInSet("i̇", UTF8String.fromString("ab,İ"), "UNICODE_CI", 2);
  assertFindInSet("i", UTF8String.fromString("ab,İ"), "UNICODE_CI", 0);
  assertFindInSet("i̇", UTF8String.fromString("ab,i̇"), "UNICODE_CI", 2);
  assertFindInSet("i", UTF8String.fromString("ab,i̇"), "UNICODE_CI", 0);
  assertFindInSet("i̇", UTF8String.fromString("ab,İ,12"), "UNICODE_CI", 2);
  assertFindInSet("i", UTF8String.fromString("ab,İ,12"), "UNICODE_CI", 0);
  assertFindInSet("i̇", UTF8String.fromString("ab,i̇,12"), "UNICODE_CI", 2);
  assertFindInSet("i", UTF8String.fromString("ab,i̇,12"), "UNICODE_CI", 0);
  assertFindInSet("i̇o", UTF8String.fromString("ab,İo,12"), "UNICODE_CI", 2);
  assertFindInSet("İo", UTF8String.fromString("ab,i̇o,12"), "UNICODE_CI", 2);
  assertFindInSet("i̇", UTF8String.fromString("İ"), "UTF8_BINARY_LCASE", 1);
  assertFindInSet("i", UTF8String.fromString("İ"), "UTF8_BINARY_LCASE", 0);
  assertFindInSet("i̇", UTF8String.fromString("i̇"), "UTF8_BINARY_LCASE", 1);
  assertFindInSet("i", UTF8String.fromString("i̇"), "UTF8_BINARY_LCASE", 0);
  assertFindInSet("i̇", UTF8String.fromString("İ,"), "UTF8_BINARY_LCASE", 1);
  assertFindInSet("i", UTF8String.fromString("İ,"), "UTF8_BINARY_LCASE", 0);
  assertFindInSet("i̇", UTF8String.fromString("i̇,"), "UTF8_BINARY_LCASE", 1);
  assertFindInSet("i", UTF8String.fromString("i̇,"), "UTF8_BINARY_LCASE", 0);
  assertFindInSet("i̇", UTF8String.fromString("ab,İ"), "UTF8_BINARY_LCASE", 2);
  assertFindInSet("i", UTF8String.fromString("ab,İ"), "UTF8_BINARY_LCASE", 0);
  assertFindInSet("i̇", UTF8String.fromString("ab,i̇"), "UTF8_BINARY_LCASE", 2);
  assertFindInSet("i", UTF8String.fromString("ab,i̇"), "UTF8_BINARY_LCASE", 0);
  assertFindInSet("i̇", UTF8String.fromString("ab,İ,12"), "UTF8_BINARY_LCASE", 2);
  assertFindInSet("i", UTF8String.fromString("ab,İ,12"), "UTF8_BINARY_LCASE", 0);
  assertFindInSet("i̇", UTF8String.fromString("ab,i̇,12"), "UTF8_BINARY_LCASE", 2);
  assertFindInSet("i", UTF8String.fromString("ab,i̇,12"), "UTF8_BINARY_LCASE", 0);
  assertFindInSet("i̇o", UTF8String.fromString("ab,İo,12"), "UTF8_BINARY_LCASE", 2);
  assertFindInSet("İo", UTF8String.fromString("ab,i̇o,12"), "UTF8_BINARY_LCASE", 2);
  // Invalid UTF8 strings: the bytes are 'A', 0xC2, ',', 'B', ',', 'C', ',', 'C', ',', 'V'.
  // 0xC2 is a UTF-8 lead byte that is followed here by a comma instead of a continuation byte.
  UTF8String invalidSet = UTF8String.fromBytes(
    new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 });
  assertFindInSet("C", invalidSet, "UTF8_BINARY", 3);
  assertFindInSet("c", invalidSet, "UTF8_BINARY_LCASE", 2);
  assertFindInSet("C", invalidSet, "UNICODE", 3);
  assertFindInSet("c", invalidSet, "UNICODE_CI", 2);
}

private void assertReplace(String source, String search, String replace, String collationName,
Expand Down Expand Up @@ -758,12 +770,13 @@ public void testReplace() throws SparkException {
assertReplace("İi̇İi̇İi̇", "i", "x", "UNICODE_CI", "İi̇İi̇İi̇");
assertReplace("abİo12i̇o", "i̇o", "xx", "UNICODE_CI", "abxx12xx");
assertReplace("abi̇o12i̇o", "İo", "yy", "UNICODE_CI", "abyy12yy");
assertReplace("abi̇12", "i", "X", "UTF8_BINARY_LCASE", "abẊ12"); // != UNICODE_CI
assertReplace("abi̇12", "i", "X", "UTF8_BINARY_LCASE", "abX\u030712"); // != UNICODE_CI
assertReplace("abi̇12", "\u0307", "X", "UTF8_BINARY_LCASE", "abiX12"); // != UNICODE_CI
assertReplace("abi̇12", "İ", "X", "UTF8_BINARY_LCASE", "abX12");
assertReplace("abİ12", "i", "X", "UTF8_BINARY_LCASE", "abİ12");
assertReplace("İi̇İi̇İi̇", "i̇", "x", "UTF8_BINARY_LCASE", "xxxxxx");
assertReplace("İi̇İi̇İi̇", "i", "x", "UTF8_BINARY_LCASE", "İẋİẋİẋ"); // != UNICODE_CI
assertReplace("İi̇İi̇İi̇", "i", "x", "UTF8_BINARY_LCASE",
"İx\u0307İx\u0307İx\u0307"); // != UNICODE_CI
assertReplace("abİo12i̇o", "i̇o", "xx", "UTF8_BINARY_LCASE", "abxx12xx");
assertReplace("abi̇o12i̇o", "İo", "yy", "UTF8_BINARY_LCASE", "abyy12yy");
}
Expand Down