From 6426fddd77319730a893acf20b7b658d9cabc3c7 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Wed, 19 Jun 2024 10:41:18 +0800 Subject: [PATCH 01/11] [SPARK-48658][SQL] Encode/Decode functions report coding error instead of mojibake --- .../resources/error/error-conditions.json | 6 + .../expressions/stringExpressions.scala | 125 ++++++++++-------- .../sql/errors/QueryExecutionErrors.scala | 8 ++ .../apache/spark/sql/internal/SQLConf.scala | 10 ++ .../ansi/string-functions.sql.out | 106 +++++++++++++-- .../analyzer-results/string-functions.sql.out | 106 +++++++++++++-- .../sql-tests/inputs/string-functions.sql | 12 ++ .../results/ansi/string-functions.sql.out | 112 ++++++++++++++++ .../results/string-functions.sql.out | 112 ++++++++++++++++ 9 files changed, 521 insertions(+), 76 deletions(-) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index 35dfa7a6c349..1888b669dcc9 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -3030,6 +3030,12 @@ ], "sqlState" : "22023" }, + "MALFORMED_CHARACTER_CODING" : { + "message" : [ + "Invalid value found when performing with " + ], + "sqlState" : "22000" + }, "MERGE_CARDINALITY_VIOLATION" : { "message" : [ "The ON search condition of the MERGE statement matched a single row from the target table with multiple rows of the source table.", diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index ac23962f41ed..442c61af7009 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -17,7 +17,8 @@ package org.apache.spark.sql.catalyst.expressions -import java.io.UnsupportedEncodingException +import java.nio.{ByteBuffer, CharBuffer} +import java.nio.charset.{CharacterCodingException, Charset, CodingErrorAction, IllegalCharsetNameException, UnsupportedCharsetException} import java.text.{BreakIterator, DecimalFormat, DecimalFormatSymbols} import java.util.{Base64 => JBase64} import java.util.{HashMap, Locale, Map => JMap} @@ -25,6 +26,7 @@ import java.util.{HashMap, Locale, Map => JMap} import scala.collection.mutable.ArrayBuffer import org.apache.spark.QueryContext +import org.apache.spark.network.util.JavaUtils import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.{ExpressionBuilder, FunctionRegistry, TypeCheckResult} import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch @@ -2708,11 +2710,14 @@ case class Decode(params: Seq[Expression], replacement: Expression) since = "1.5.0", group = "string_funcs") // scalastyle:on line.size.limit -case class StringDecode(bin: Expression, charset: Expression, legacyCharsets: Boolean) +case class StringDecode( + bin: Expression, + charset: Expression, + legacyCharsets: Boolean, legacyErrorAction: Boolean) extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { def this(bin: Expression, charset: Expression) = - this(bin, charset, SQLConf.get.legacyJavaCharsets) + this(bin, charset, SQLConf.get.legacyJavaCharsets, SQLConf.get.legacyCodingErrorAction) override def left: Expression = bin override def right: Expression = charset @@ -2724,35 +2729,38 @@ case class 
StringDecode(bin: Expression, charset: Expression, legacyCharsets: Bo protected override def nullSafeEval(input1: Any, input2: Any): Any = { val fromCharset = input2.asInstanceOf[UTF8String].toString - try { - if (legacyCharsets || supportedCharsets.contains(fromCharset.toUpperCase(Locale.ROOT))) { - UTF8String.fromString(new String(input1.asInstanceOf[Array[Byte]], fromCharset)) - } else throw new UnsupportedEncodingException - } catch { - case _: UnsupportedEncodingException => - throw QueryExecutionErrors.invalidCharsetError(prettyName, fromCharset) + if (legacyCharsets || supportedCharsets.contains(fromCharset.toUpperCase(Locale.ROOT))) { + val decoder = try { + val codingErrorAction = if (legacyErrorAction) { + CodingErrorAction.REPLACE + } else { + CodingErrorAction.REPORT + } + Charset.forName(fromCharset) + .newDecoder() + .onMalformedInput(codingErrorAction) + .onUnmappableCharacter(codingErrorAction) + } catch { + case _: IllegalCharsetNameException | + _: UnsupportedCharsetException | + _: IllegalArgumentException => + throw QueryExecutionErrors.invalidCharsetError(prettyName, fromCharset) + } + try { + val cb = decoder.decode(ByteBuffer.wrap(input1.asInstanceOf[Array[Byte]])) + UTF8String.fromString(cb.toString) + } catch { + case _: CharacterCodingException => + throw QueryExecutionErrors.malformedCharacterCoding(prettyName, fromCharset) + } + } else { + throw QueryExecutionErrors.invalidCharsetError(prettyName, fromCharset) } } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - nullSafeCodeGen(ctx, ev, (bytes, charset) => { - val fromCharset = ctx.freshName("fromCharset") - val sc = JavaCode.global( - ctx.addReferenceObj("supportedCharsets", supportedCharsets), - supportedCharsets.getClass) - s""" - String $fromCharset = $charset.toString(); - try { - if ($legacyCharsets || $sc.contains($fromCharset.toUpperCase(java.util.Locale.ROOT))) { - ${ev.value} = UTF8String.fromString(new String($bytes, $fromCharset)); - } else { - throw new java.io.UnsupportedEncodingException(); - } - } catch (java.io.UnsupportedEncodingException e) { - throw QueryExecutionErrors.invalidCharsetError("$prettyName", $fromCharset); - } - """ - }) + val expr = ctx.addReferenceObj("this", this) + defineCodeGen(ctx, ev, (bin, charset) => s"(UTF8String) $expr.nullSafeEval($bin, $charset)") } override protected def withNewChildrenInternal( @@ -2785,11 +2793,15 @@ object StringDecode { since = "1.5.0", group = "string_funcs") // scalastyle:on line.size.limit -case class Encode(str: Expression, charset: Expression, legacyCharsets: Boolean) +case class Encode( + str: Expression, + charset: Expression, + legacyCharsets: Boolean, + legacyErrorAction: Boolean) extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { def this(value: Expression, charset: Expression) = - this(value, charset, SQLConf.get.legacyJavaCharsets) + this(value, charset, SQLConf.get.legacyJavaCharsets, SQLConf.get.legacyCodingErrorAction) override def left: Expression = str override def right: Expression = charset @@ -2800,36 +2812,41 @@ case class Encode(str: Expression, charset: Expression, legacyCharsets: Boolean) private val supportedCharsets = Set( "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16", "UTF-32") + protected override def nullSafeEval(input1: Any, input2: Any): Any = { val toCharset = input2.asInstanceOf[UTF8String].toString - try { - if (legacyCharsets || supportedCharsets.contains(toCharset.toUpperCase(Locale.ROOT))) { - 
input1.asInstanceOf[UTF8String].toString.getBytes(toCharset) - } else throw new UnsupportedEncodingException - } catch { - case _: UnsupportedEncodingException => - throw QueryExecutionErrors.invalidCharsetError(prettyName, toCharset) + if (legacyCharsets || supportedCharsets.contains(toCharset.toUpperCase(Locale.ROOT))) { + val encoder = try { + val codingErrorAction = if (legacyErrorAction) { + CodingErrorAction.REPLACE + } else { + CodingErrorAction.REPORT + } + Charset.forName(toCharset) + .newEncoder() + .onMalformedInput(codingErrorAction) + .onUnmappableCharacter(codingErrorAction) + } catch { + case _: IllegalCharsetNameException | + _: UnsupportedCharsetException | + _: IllegalArgumentException => + throw QueryExecutionErrors.invalidCharsetError(prettyName, toCharset) + } + try { + val bb = encoder.encode(CharBuffer.wrap(input1.asInstanceOf[UTF8String].toString)) + JavaUtils.bufferToArray(bb) + } catch { + case _: CharacterCodingException => + throw QueryExecutionErrors.malformedCharacterCoding(prettyName, toCharset) + } + } else { + throw QueryExecutionErrors.invalidCharsetError(prettyName, toCharset) } } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - nullSafeCodeGen(ctx, ev, (string, charset) => { - val toCharset = ctx.freshName("toCharset") - val sc = JavaCode.global( - ctx.addReferenceObj("supportedCharsets", supportedCharsets), - supportedCharsets.getClass) - s""" - String $toCharset = $charset.toString(); - try { - if ($legacyCharsets || $sc.contains($toCharset.toUpperCase(java.util.Locale.ROOT))) { - ${ev.value} = $string.toString().getBytes($toCharset); - } else { - throw new java.io.UnsupportedEncodingException(); - } - } catch (java.io.UnsupportedEncodingException e) { - throw QueryExecutionErrors.invalidCharsetError("$prettyName", $toCharset); - }""" - }) + val expr = ctx.addReferenceObj("this", this) + defineCodeGen(ctx, ev, (str, charset) => s"(byte[]) $expr.nullSafeEval($str, $charset)") } override protected def withNewChildrenInternal( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala index 30e53f146982..8af931976b2e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala @@ -2741,6 +2741,14 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase with ExecutionE "charset" -> charset)) } + def malformedCharacterCoding(functionName: String, charset: String): RuntimeException = { + new SparkRuntimeException( + errorClass = "MALFORMED_CHARACTER_CODING", + messageParameters = Map( + "function" -> toSQLId(functionName), + "charset" -> charset)) + } + def invalidWriterCommitMessageError(details: String): Throwable = { new SparkRuntimeException( errorClass = "INVALID_WRITER_COMMIT_MESSAGE", diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 25a2441e05fe..d7a8c5c590c5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -5002,6 +5002,14 @@ object SQLConf { .booleanConf .createWithDefault(false) + val LEGACY_CODING_ERROR_ACTION = buildConf("spark.sql.legacy.codingErrorAction") + .internal() + .doc("When set to true, encode/decode functions replace unmappable 
characters with mojibake " + + "instead of reporting coding errors.") + .version("4.0.0") + .booleanConf + .createWithDefault(false) + val LEGACY_EVAL_CURRENT_TIME = buildConf("spark.sql.legacy.earlyEvalCurrentTime") .internal() .doc("When set to true, evaluation and constant folding will happen for now() and " + @@ -5976,6 +5984,8 @@ class SQLConf extends Serializable with Logging with SqlApiConf { def legacyJavaCharsets: Boolean = getConf(SQLConf.LEGACY_JAVA_CHARSETS) + def legacyCodingErrorAction: Boolean = getConf(SQLConf.LEGACY_CODING_ERROR_ACTION) + def legacyEvalCurrentTime: Boolean = getConf(SQLConf.LEGACY_EVAL_CURRENT_TIME) /** ********************** SQLConf functionality methods ************ */ diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out index c9b451187356..275a6e7900f6 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out @@ -384,21 +384,21 @@ Project [btrim(xyxtrimyyx, xy) AS btrim(xyxtrimyyx, xy)#x] -- !query SELECT btrim(encode(" xyz ", 'utf-8')) -- !query analysis -Project [btrim(encode( xyz , utf-8, false)) AS btrim(encode( xyz , utf-8))#x] +Project [btrim(encode( xyz , utf-8, false, false)) AS btrim(encode( xyz , utf-8))#x] +- OneRowRelation -- !query SELECT btrim(encode('yxTomxx', 'utf-8'), encode('xyz', 'utf-8')) -- !query analysis -Project [btrim(encode(yxTomxx, utf-8, false), encode(xyz, utf-8, false)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x] +Project [btrim(encode(yxTomxx, utf-8, false, false), encode(xyz, utf-8, false, false)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x] +- OneRowRelation -- !query SELECT btrim(encode('xxxbarxxx', 'utf-8'), encode('x', 'utf-8')) -- !query analysis -Project [btrim(encode(xxxbarxxx, utf-8, false), encode(x, utf-8, false)) AS btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x] +Project [btrim(encode(xxxbarxxx, utf-8, false, false), encode(x, utf-8, false, false)) AS btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x] +- OneRowRelation @@ -649,14 +649,14 @@ SetCommand (spark.sql.legacy.javaCharsets,Some(true)) -- !query select encode('hello', 'WINDOWS-1252') -- !query analysis -Project [encode(hello, WINDOWS-1252, true) AS encode(hello, WINDOWS-1252)#x] +Project [encode(hello, WINDOWS-1252, true, false) AS encode(hello, WINDOWS-1252)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, true) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x, true, false) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -670,14 +670,14 @@ SetCommand (spark.sql.legacy.javaCharsets,Some(false)) -- !query select encode('hello', 'WINDOWS-1252') -- !query analysis -Project [encode(hello, WINDOWS-1252, false) AS encode(hello, WINDOWS-1252)#x] +Project [encode(hello, WINDOWS-1252, false, false) AS encode(hello, WINDOWS-1252)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x, false, false) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -685,14 +685,56 @@ Project [encode(scol#x, ecol#x, false) AS encode(scol, 
ecol)#x] -- !query select encode('hello', 'Windows-xxx') -- !query analysis -Project [encode(hello, Windows-xxx, false) AS encode(hello, Windows-xxx)#x] +Project [encode(hello, Windows-xxx, false, false) AS encode(hello, Windows-xxx)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x, false, false) AS encode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + +-- !query +set spark.sql.legacy.codingErrorAction=true +-- !query analysis +SetCommand (spark.sql.legacy.codingErrorAction,Some(true)) + + +-- !query +select encode('渭城朝雨浥轻尘', 'US-ASCII') +-- !query analysis +Project [encode(渭城朝雨浥轻尘, US-ASCII, false, true) AS encode(渭城朝雨浥轻尘, US-ASCII)#x] ++- OneRowRelation + + +-- !query +select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol) +-- !query analysis +Project [encode(scol#x, ecol#x, false, true) AS encode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + +-- !query +set spark.sql.legacy.codingErrorAction=false +-- !query analysis +SetCommand (spark.sql.legacy.codingErrorAction,Some(false)) + + +-- !query +select encode('客舍青青柳色新', 'US-ASCII') +-- !query analysis +Project [encode(客舍青青柳色新, US-ASCII, false, false) AS encode(客舍青青柳色新, US-ASCII)#x] ++- OneRowRelation + + +-- !query +select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol) +-- !query analysis +Project [encode(scol#x, ecol#x, false, false) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -746,14 +788,14 @@ org.apache.spark.sql.AnalysisException -- !query select decode(encode('abc', 'utf-8'), 'utf-8') -- !query analysis -Project [decode(encode(abc, utf-8, false), utf-8) AS decode(encode(abc, utf-8), utf-8)#x] +Project [decode(encode(abc, utf-8, false, false), utf-8) AS decode(encode(abc, utf-8), utf-8)#x] +- OneRowRelation -- !query select decode(encode('大千世界', 'utf-32'), 'utf-32') -- !query analysis -Project [decode(encode(大千世界, utf-32, false), utf-32) AS decode(encode(大千世界, utf-32), utf-32)#x] +Project [decode(encode(大千世界, utf-32, false, false), utf-32) AS decode(encode(大千世界, utf-32), utf-32)#x] +- OneRowRelation @@ -863,6 +905,48 @@ Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] +- LocalRelation [scol#x, ecol#x] +-- !query +set spark.sql.legacy.codingErrorAction=true +-- !query analysis +SetCommand (spark.sql.legacy.codingErrorAction,Some(true)) + + +-- !query +select decode('E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') +-- !query analysis +Project [decode(E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592, US-ASCII) AS decode(E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592, US-ASCII)#x] ++- OneRowRelation + + +-- !query +select decode(scol, ecol) from values('E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol, ecol) +-- !query analysis +Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + +-- !query +set spark.sql.legacy.codingErrorAction=false +-- !query analysis +SetCommand (spark.sql.legacy.codingErrorAction,Some(false)) + + +-- !query +select decode('E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') +-- !query analysis +Project [decode(E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA, US-ASCII) AS decode(E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA, US-ASCII)#x] ++- OneRowRelation + + +-- !query +select decode(scol, ecol) from 
values('E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol, ecol) +-- !query analysis +Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + -- !query SELECT CONTAINS(null, 'Spark') -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out index c9b451187356..275a6e7900f6 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out @@ -384,21 +384,21 @@ Project [btrim(xyxtrimyyx, xy) AS btrim(xyxtrimyyx, xy)#x] -- !query SELECT btrim(encode(" xyz ", 'utf-8')) -- !query analysis -Project [btrim(encode( xyz , utf-8, false)) AS btrim(encode( xyz , utf-8))#x] +Project [btrim(encode( xyz , utf-8, false, false)) AS btrim(encode( xyz , utf-8))#x] +- OneRowRelation -- !query SELECT btrim(encode('yxTomxx', 'utf-8'), encode('xyz', 'utf-8')) -- !query analysis -Project [btrim(encode(yxTomxx, utf-8, false), encode(xyz, utf-8, false)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x] +Project [btrim(encode(yxTomxx, utf-8, false, false), encode(xyz, utf-8, false, false)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x] +- OneRowRelation -- !query SELECT btrim(encode('xxxbarxxx', 'utf-8'), encode('x', 'utf-8')) -- !query analysis -Project [btrim(encode(xxxbarxxx, utf-8, false), encode(x, utf-8, false)) AS btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x] +Project [btrim(encode(xxxbarxxx, utf-8, false, false), encode(x, utf-8, false, false)) AS btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x] +- OneRowRelation @@ -649,14 +649,14 @@ SetCommand (spark.sql.legacy.javaCharsets,Some(true)) -- !query select encode('hello', 'WINDOWS-1252') -- !query analysis -Project [encode(hello, WINDOWS-1252, true) AS encode(hello, WINDOWS-1252)#x] +Project [encode(hello, WINDOWS-1252, true, false) AS encode(hello, WINDOWS-1252)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, true) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x, true, false) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -670,14 +670,14 @@ SetCommand (spark.sql.legacy.javaCharsets,Some(false)) -- !query select encode('hello', 'WINDOWS-1252') -- !query analysis -Project [encode(hello, WINDOWS-1252, false) AS encode(hello, WINDOWS-1252)#x] +Project [encode(hello, WINDOWS-1252, false, false) AS encode(hello, WINDOWS-1252)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x, false, false) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -685,14 +685,56 @@ Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x] -- !query select encode('hello', 'Windows-xxx') -- !query analysis -Project [encode(hello, Windows-xxx, false) AS encode(hello, Windows-xxx)#x] +Project [encode(hello, Windows-xxx, false, false) AS encode(hello, Windows-xxx)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x] +Project [encode(scol#x, 
ecol#x, false, false) AS encode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + +-- !query +set spark.sql.legacy.codingErrorAction=true +-- !query analysis +SetCommand (spark.sql.legacy.codingErrorAction,Some(true)) + + +-- !query +select encode('渭城朝雨浥轻尘', 'US-ASCII') +-- !query analysis +Project [encode(渭城朝雨浥轻尘, US-ASCII, false, true) AS encode(渭城朝雨浥轻尘, US-ASCII)#x] ++- OneRowRelation + + +-- !query +select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol) +-- !query analysis +Project [encode(scol#x, ecol#x, false, true) AS encode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + +-- !query +set spark.sql.legacy.codingErrorAction=false +-- !query analysis +SetCommand (spark.sql.legacy.codingErrorAction,Some(false)) + + +-- !query +select encode('客舍青青柳色新', 'US-ASCII') +-- !query analysis +Project [encode(客舍青青柳色新, US-ASCII, false, false) AS encode(客舍青青柳色新, US-ASCII)#x] ++- OneRowRelation + + +-- !query +select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol) +-- !query analysis +Project [encode(scol#x, ecol#x, false, false) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -746,14 +788,14 @@ org.apache.spark.sql.AnalysisException -- !query select decode(encode('abc', 'utf-8'), 'utf-8') -- !query analysis -Project [decode(encode(abc, utf-8, false), utf-8) AS decode(encode(abc, utf-8), utf-8)#x] +Project [decode(encode(abc, utf-8, false, false), utf-8) AS decode(encode(abc, utf-8), utf-8)#x] +- OneRowRelation -- !query select decode(encode('大千世界', 'utf-32'), 'utf-32') -- !query analysis -Project [decode(encode(大千世界, utf-32, false), utf-32) AS decode(encode(大千世界, utf-32), utf-32)#x] +Project [decode(encode(大千世界, utf-32, false, false), utf-32) AS decode(encode(大千世界, utf-32), utf-32)#x] +- OneRowRelation @@ -863,6 +905,48 @@ Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] +- LocalRelation [scol#x, ecol#x] +-- !query +set spark.sql.legacy.codingErrorAction=true +-- !query analysis +SetCommand (spark.sql.legacy.codingErrorAction,Some(true)) + + +-- !query +select decode('E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') +-- !query analysis +Project [decode(E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592, US-ASCII) AS decode(E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592, US-ASCII)#x] ++- OneRowRelation + + +-- !query +select decode(scol, ecol) from values('E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol, ecol) +-- !query analysis +Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + +-- !query +set spark.sql.legacy.codingErrorAction=false +-- !query analysis +SetCommand (spark.sql.legacy.codingErrorAction,Some(false)) + + +-- !query +select decode('E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') +-- !query analysis +Project [decode(E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA, US-ASCII) AS decode(E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA, US-ASCII)#x] ++- OneRowRelation + + +-- !query +select decode(scol, ecol) from values('E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol, ecol) +-- !query analysis +Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + -- !query SELECT CONTAINS(null, 'Spark') -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql index 733720a7e21b..4bda487a8863 100644 --- 
a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql @@ -126,6 +126,12 @@ select encode('hello', 'WINDOWS-1252'); select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol); select encode('hello', 'Windows-xxx'); select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol); +set spark.sql.legacy.codingErrorAction=true; +select encode('渭城朝雨浥轻尘', 'US-ASCII'); +select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol); +set spark.sql.legacy.codingErrorAction=false; +select encode('客舍青青柳色新', 'US-ASCII'); +select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol); -- decode select decode(); @@ -147,6 +153,12 @@ select decode(scol, ecol) from values(X'68656c6c6f', 'WINDOWS-1252') as t(scol, set spark.sql.legacy.javaCharsets=false; select decode(X'68656c6c6f', 'WINDOWS-1252'); select decode(scol, ecol) from values(X'68656c6c6f', 'WINDOWS-1252') as t(scol, ecol); +set spark.sql.legacy.codingErrorAction=true; +select decode('E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII'); +select decode(scol, ecol) from values('E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol, ecol); +set spark.sql.legacy.codingErrorAction=false; +select decode('E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII'); +select decode(scol, ecol) from values('E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol, ecol); -- contains SELECT CONTAINS(null, 'Spark'); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out index 09d4f8892fa4..24339dcad8e5 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out @@ -903,6 +903,70 @@ org.apache.spark.SparkIllegalArgumentException } +-- !query +set spark.sql.legacy.codingErrorAction=true +-- !query schema +struct +-- !query output +spark.sql.legacy.codingErrorAction true + + +-- !query +select encode('渭城朝雨浥轻尘', 'US-ASCII') +-- !query schema +struct +-- !query output +??????? + + +-- !query +select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol) +-- !query schema +struct +-- !query output +??????? 
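
The contrast above is the point of the change: under `spark.sql.legacy.codingErrorAction=true` the encoder uses `CodingErrorAction.REPLACE`, so every unmappable character silently degrades to `?`, while the new default (`REPORT`) raises `MALFORMED_CHARACTER_CODING`. A minimal, Spark-independent sketch of the JDK mechanism the new code relies on (the object name `EncodeSketch` and helper `encodeWith` are illustrative, not part of the patch):

```scala
import java.nio.CharBuffer
import java.nio.charset.{CharacterCodingException, Charset, CodingErrorAction}

object EncodeSketch {
  // Encode a string as US-ASCII with the given error action, mirroring Encode.nullSafeEval.
  def encodeWith(s: String, action: CodingErrorAction): Either[String, Array[Byte]] = {
    val encoder = Charset.forName("US-ASCII")
      .newEncoder()
      .onMalformedInput(action)
      .onUnmappableCharacter(action)
    try {
      val bb = encoder.encode(CharBuffer.wrap(s)) // throws CharacterCodingException under REPORT
      val out = new Array[Byte](bb.remaining())
      bb.get(out)
      Right(out)
    } catch {
      case _: CharacterCodingException => Left("MALFORMED_CHARACTER_CODING")
    }
  }

  def main(args: Array[String]): Unit = {
    val s = "渭城朝雨浥轻尘" // 7 characters, none representable in US-ASCII
    // Legacy behavior: each character is replaced, producing the "???????" seen above.
    println(encodeWith(s, CodingErrorAction.REPLACE).map(new String(_, "US-ASCII")))
    // New default: the coding error is reported instead of being silently replaced.
    println(encodeWith(s, CodingErrorAction.REPORT))
  }
}
```
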
+ + +-- !query +set spark.sql.legacy.codingErrorAction=false +-- !query schema +struct +-- !query output +spark.sql.legacy.codingErrorAction false + + +-- !query +select encode('客舍青青柳色新', 'US-ASCII') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "MALFORMED_CHARACTER_CODING", + "sqlState" : "22000", + "messageParameters" : { + "charset" : "US-ASCII", + "function" : "`encode`" + } +} + + +-- !query +select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "MALFORMED_CHARACTER_CODING", + "sqlState" : "22000", + "messageParameters" : { + "charset" : "US-ASCII", + "function" : "`encode`" + } +} + + -- !query select decode() -- !query schema @@ -1125,6 +1189,54 @@ org.apache.spark.SparkIllegalArgumentException } +-- !query +set spark.sql.legacy.codingErrorAction=true +-- !query schema +struct +-- !query output +spark.sql.legacy.codingErrorAction true + + +-- !query +select decode('E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') +-- !query schema +struct +-- !query output +E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592 + + +-- !query +select decode(scol, ecol) from values('E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol, ecol) +-- !query schema +struct +-- !query output +E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592 + + +-- !query +set spark.sql.legacy.codingErrorAction=false +-- !query schema +struct +-- !query output +spark.sql.legacy.codingErrorAction false + + +-- !query +select decode('E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') +-- !query schema +struct +-- !query output +E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA + + +-- !query +select decode(scol, ecol) from values('E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol, ecol) +-- !query schema +struct +-- !query output +E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA + + -- !query SELECT CONTAINS(null, 'Spark') -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index 506524840f10..fef8b325d71b 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -835,6 +835,70 @@ org.apache.spark.SparkIllegalArgumentException } +-- !query +set spark.sql.legacy.codingErrorAction=true +-- !query schema +struct +-- !query output +spark.sql.legacy.codingErrorAction true + + +-- !query +select encode('渭城朝雨浥轻尘', 'US-ASCII') +-- !query schema +struct +-- !query output +??????? + + +-- !query +select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol) +-- !query schema +struct +-- !query output +??????? 
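
The decode path is symmetric: with `REPLACE` a `CharsetDecoder` turns each malformed byte into U+FFFD, which is the run of `�` characters in the decode results further below, and with `REPORT` it raises the same `MALFORMED_CHARACTER_CODING` error. A hedged sketch under the same assumptions (the object name `DecodeSketch`, the helper `decodeWith`, and the sample string are illustrative):

```scala
import java.nio.ByteBuffer
import java.nio.charset.{CharacterCodingException, Charset, CodingErrorAction}

object DecodeSketch {
  // Decode bytes as US-ASCII with the given error action, mirroring StringDecode.nullSafeEval.
  def decodeWith(bytes: Array[Byte], action: CodingErrorAction): Either[String, String] = {
    val decoder = Charset.forName("US-ASCII")
      .newDecoder()
      .onMalformedInput(action)
      .onUnmappableCharacter(action)
    try Right(decoder.decode(ByteBuffer.wrap(bytes)).toString)
    catch { case _: CharacterCodingException => Left("MALFORMED_CHARACTER_CODING") }
  }

  def main(args: Array[String]): Unit = {
    // UTF-8 bytes of a 7-character string: 21 bytes, all outside the ASCII range,
    // comparable to the hex literals used in the decode tests.
    val bytes = "劝君更尽一杯酒".getBytes("UTF-8")
    // Legacy behavior: every malformed byte becomes the replacement character U+FFFD.
    println(decodeWith(bytes, CodingErrorAction.REPLACE))
    // New default: report the malformed input instead of producing mojibake.
    println(decodeWith(bytes, CodingErrorAction.REPORT))
  }
}
```
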
+ + +-- !query +set spark.sql.legacy.codingErrorAction=false +-- !query schema +struct +-- !query output +spark.sql.legacy.codingErrorAction false + + +-- !query +select encode('客舍青青柳色新', 'US-ASCII') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "MALFORMED_CHARACTER_CODING", + "sqlState" : "22000", + "messageParameters" : { + "charset" : "US-ASCII", + "function" : "`encode`" + } +} + + +-- !query +select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "MALFORMED_CHARACTER_CODING", + "sqlState" : "22000", + "messageParameters" : { + "charset" : "US-ASCII", + "function" : "`encode`" + } +} + + -- !query select decode() -- !query schema @@ -1057,6 +1121,54 @@ org.apache.spark.SparkIllegalArgumentException } +-- !query +set spark.sql.legacy.codingErrorAction=true +-- !query schema +struct +-- !query output +spark.sql.legacy.codingErrorAction true + + +-- !query +select decode('E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') +-- !query schema +struct +-- !query output +E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592 + + +-- !query +select decode(scol, ecol) from values('E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol, ecol) +-- !query schema +struct +-- !query output +E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592 + + +-- !query +set spark.sql.legacy.codingErrorAction=false +-- !query schema +struct +-- !query output +spark.sql.legacy.codingErrorAction false + + +-- !query +select decode('E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') +-- !query schema +struct +-- !query output +E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA + + +-- !query +select decode(scol, ecol) from values('E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol, ecol) +-- !query schema +struct +-- !query output +E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA + + -- !query SELECT CONTAINS(null, 'Spark') -- !query schema From aee78a588f1af1bba30cc6840a37eb67fb9d5693 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Wed, 19 Jun 2024 11:10:17 +0800 Subject: [PATCH 02/11] [SPARK-48658][SQL] Encode/Decode functions report coding error instead of mojibake --- .../src/main/resources/error/error-conditions.json | 12 ++++++------ .../analyzer-results/ansi/string-functions.sql.out | 2 +- .../analyzer-results/string-functions.sql.out | 2 +- .../resources/sql-tests/inputs/string-functions.sql | 2 +- .../sql-tests/results/ansi/string-functions.sql.out | 2 +- .../sql-tests/results/string-functions.sql.out | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index 1888b669dcc9..f83fde124919 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -2988,6 +2988,12 @@ ], "sqlState" : "42710" }, + "MALFORMED_CHARACTER_CODING" : { + "message" : [ + "Invalid value found when performing with " + ], + "sqlState" : "22000" + }, "MALFORMED_CSV_RECORD" : { "message" : [ "Malformed CSV record: " @@ -3030,12 +3036,6 @@ ], "sqlState" : "22023" }, - "MALFORMED_CHARACTER_CODING" : { - "message" : [ - "Invalid value found when performing with " - ], - "sqlState" : "22000" - }, "MERGE_CARDINALITY_VIOLATION" : { "message" : [ "The ON search condition of the MERGE statement matched a single row from the target table with 
multiple rows of the source table.", diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out index 275a6e7900f6..1d99de0c0d1f 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out @@ -732,7 +732,7 @@ Project [encode(客舍青青柳色新, US-ASCII, false, false) AS encode(客舍 -- !query -select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol) +select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol) -- !query analysis Project [encode(scol#x, ecol#x, false, false) AS encode(scol, ecol)#x] +- SubqueryAlias t diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out index 275a6e7900f6..1d99de0c0d1f 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out @@ -732,7 +732,7 @@ Project [encode(客舍青青柳色新, US-ASCII, false, false) AS encode(客舍 -- !query -select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol) +select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol) -- !query analysis Project [encode(scol#x, ecol#x, false, false) AS encode(scol, ecol)#x] +- SubqueryAlias t diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql index 4bda487a8863..523e09d55889 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql @@ -131,7 +131,7 @@ select encode('渭城朝雨浥轻尘', 'US-ASCII'); select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol); set spark.sql.legacy.codingErrorAction=false; select encode('客舍青青柳色新', 'US-ASCII'); -select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol); +select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol); -- decode select decode(); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out index 24339dcad8e5..f48e4dd6280c 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out @@ -952,7 +952,7 @@ org.apache.spark.SparkRuntimeException -- !query -select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol) +select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol) -- !query schema struct<> -- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index fef8b325d71b..05f1cf5af882 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -884,7 +884,7 @@ org.apache.spark.SparkRuntimeException -- !query -select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol) +select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol) -- !query schema struct<> -- !query output From 
afb2d08a794edd56465c602e86ff1bf5f8618ae8 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Wed, 19 Jun 2024 11:19:32 +0800 Subject: [PATCH 03/11] [SPARK-48658][SQL] Encode/Decode functions report coding error instead of mojibake --- .../ansi/string-functions.sql.out | 12 +++--- .../analyzer-results/string-functions.sql.out | 12 +++--- .../sql-tests/inputs/string-functions.sql | 8 ++-- .../results/ansi/string-functions.sql.out | 38 +++++++++++++------ .../results/string-functions.sql.out | 38 +++++++++++++------ 5 files changed, 70 insertions(+), 38 deletions(-) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out index 1d99de0c0d1f..8f77dee6039c 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out @@ -912,14 +912,14 @@ SetCommand (spark.sql.legacy.codingErrorAction,Some(true)) -- !query -select decode('E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') +select decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') -- !query analysis -Project [decode(E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592, US-ASCII) AS decode(E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592, US-ASCII)#x] +Project [decode(0xE58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592, US-ASCII) AS decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', US-ASCII)#x] +- OneRowRelation -- !query -select decode(scol, ecol) from values('E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol, ecol) +select decode(scol, ecol) from values(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol, ecol) -- !query analysis Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] +- SubqueryAlias t @@ -933,14 +933,14 @@ SetCommand (spark.sql.legacy.codingErrorAction,Some(false)) -- !query -select decode('E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') +select decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') -- !query analysis -Project [decode(E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA, US-ASCII) AS decode(E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA, US-ASCII)#x] +Project [decode(0xE8A5BFE587BAE998B3E585B3E697A0E69585E4BABA, US-ASCII) AS decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', US-ASCII)#x] +- OneRowRelation -- !query -select decode(scol, ecol) from values('E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol, ecol) +select decode(scol, ecol) from values(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol, ecol) -- !query analysis Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] +- SubqueryAlias t diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out index 1d99de0c0d1f..8f77dee6039c 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out @@ -912,14 +912,14 @@ SetCommand (spark.sql.legacy.codingErrorAction,Some(true)) -- !query -select decode('E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') +select decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') -- !query analysis -Project [decode(E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592, US-ASCII) AS decode(E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592, US-ASCII)#x] +Project 
[decode(0xE58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592, US-ASCII) AS decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', US-ASCII)#x] +- OneRowRelation -- !query -select decode(scol, ecol) from values('E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol, ecol) +select decode(scol, ecol) from values(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol, ecol) -- !query analysis Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] +- SubqueryAlias t @@ -933,14 +933,14 @@ SetCommand (spark.sql.legacy.codingErrorAction,Some(false)) -- !query -select decode('E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') +select decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') -- !query analysis -Project [decode(E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA, US-ASCII) AS decode(E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA, US-ASCII)#x] +Project [decode(0xE8A5BFE587BAE998B3E585B3E697A0E69585E4BABA, US-ASCII) AS decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', US-ASCII)#x] +- OneRowRelation -- !query -select decode(scol, ecol) from values('E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol, ecol) +select decode(scol, ecol) from values(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol, ecol) -- !query analysis Project [decode(scol#x, ecol#x) AS decode(scol, ecol)#x] +- SubqueryAlias t diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql index 523e09d55889..0d9c0f3a6a14 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql @@ -154,11 +154,11 @@ set spark.sql.legacy.javaCharsets=false; select decode(X'68656c6c6f', 'WINDOWS-1252'); select decode(scol, ecol) from values(X'68656c6c6f', 'WINDOWS-1252') as t(scol, ecol); set spark.sql.legacy.codingErrorAction=true; -select decode('E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII'); -select decode(scol, ecol) from values('E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol, ecol); +select decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII'); +select decode(scol, ecol) from values(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol, ecol); set spark.sql.legacy.codingErrorAction=false; -select decode('E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII'); -select decode(scol, ecol) from values('E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol, ecol); +select decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII'); +select decode(scol, ecol) from values(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol, ecol); -- contains SELECT CONTAINS(null, 'Spark'); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out index f48e4dd6280c..9f72e215ea54 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out @@ -1198,19 +1198,19 @@ spark.sql.legacy.codingErrorAction true -- !query -select decode('E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') +select decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') -- !query schema -struct +struct -- !query output -E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592 +��������������������� -- !query -select decode(scol, ecol) from 
values('E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol, ecol) +select decode(scol, ecol) from values(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol, ecol) -- !query schema struct -- !query output -E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592 +��������������������� -- !query @@ -1222,19 +1222,35 @@ spark.sql.legacy.codingErrorAction false -- !query -select decode('E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') +select decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') -- !query schema -struct +struct<> -- !query output -E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "MALFORMED_CHARACTER_CODING", + "sqlState" : "22000", + "messageParameters" : { + "charset" : "US-ASCII", + "function" : "`decode`" + } +} -- !query -select decode(scol, ecol) from values('E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol, ecol) +select decode(scol, ecol) from values(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol, ecol) -- !query schema -struct +struct<> -- !query output -E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "MALFORMED_CHARACTER_CODING", + "sqlState" : "22000", + "messageParameters" : { + "charset" : "US-ASCII", + "function" : "`decode`" + } +} -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index 05f1cf5af882..e6778cb539bd 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -1130,19 +1130,19 @@ spark.sql.legacy.codingErrorAction true -- !query -select decode('E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') +select decode(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') -- !query schema -struct +struct -- !query output -E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592 +��������������������� -- !query -select decode(scol, ecol) from values('E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol, ecol) +select decode(scol, ecol) from values(X'E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592', 'US-ASCII') as t(scol, ecol) -- !query schema struct -- !query output -E58A9DE5909BE69BB4E5B0BDE4B880E69DAFE98592 +��������������������� -- !query @@ -1154,19 +1154,35 @@ spark.sql.legacy.codingErrorAction false -- !query -select decode('E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') +select decode(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') -- !query schema -struct +struct<> -- !query output -E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "MALFORMED_CHARACTER_CODING", + "sqlState" : "22000", + "messageParameters" : { + "charset" : "US-ASCII", + "function" : "`decode`" + } +} -- !query -select decode(scol, ecol) from values('E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol, ecol) +select decode(scol, ecol) from values(X'E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA', 'US-ASCII') as t(scol, ecol) -- !query schema -struct +struct<> -- !query output -E8A5BFE587BAE998B3E585B3E697A0E69585E4BABA +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "MALFORMED_CHARACTER_CODING", + "sqlState" : "22000", + "messageParameters" : { + "charset" : "US-ASCII", + "function" : "`decode`" + } +} -- !query From f6dd4fa985085a080da8d70645ea89be92dfa363 Mon Sep 17 00:00:00 2001 
From: Kent Yao Date: Wed, 19 Jun 2024 13:15:33 +0800 Subject: [PATCH 04/11] fix ExplainSuite --- .../test/scala/org/apache/spark/sql/ExplainSuite.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala index b2aaaceb26ab..0113854ab0c7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala @@ -193,8 +193,8 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite """.stripMargin) checkKeywordsExistsInExplain(df2, "Project [concat(cast(id#xL as string), cast((id#xL + 1) as string), " + - "cast(encode(cast((id#xL + 2) as string), utf-8, false) as string), " + - "cast(encode(cast((id#xL + 3) as string), utf-8, false) as string)) AS col#x]") + "cast(encode(cast((id#xL + 2) as string), utf-8, false, false) as string), " + + "cast(encode(cast((id#xL + 3) as string), utf-8, false, false) as string)) AS col#x]") val df3 = sql( """ @@ -209,8 +209,8 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite """.stripMargin) checkKeywordsExistsInExplain(df3, "Project [concat(cast(id#xL as string), " + - "cast(encode(cast((id#xL + 2) as string), utf-8, false) as string), " + - "cast(encode(cast((id#xL + 3) as string), utf-8, false) as string)) AS col#x]") + "cast(encode(cast((id#xL + 2) as string), utf-8, false, false) as string), " + + "cast(encode(cast((id#xL + 3) as string), utf-8, false, false) as string)) AS col#x]") } } From 851135c9b1ecb5172d5cd7cc3dba5fee76d92f26 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Wed, 19 Jun 2024 13:20:49 +0800 Subject: [PATCH 05/11] fix golden file tests --- .../typeCoercion/native/concat.sql.out | 18 +++++++++--------- .../typeCoercion/native/elt.sql.out | 8 ++++---- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out index b3c5034656e2..2d0d6b4e7c95 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out @@ -11,7 +11,7 @@ FROM ( -- !query analysis Project [concat(concat(cast(col1#xL as string), col2#x), cast(col3#x as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [id#xL AS col1#xL, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x] + +- Project [id#xL AS col1#xL, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false, false) AS col3#x] +- Range (0, 10, step=1) @@ -29,7 +29,7 @@ FROM ( -- !query analysis Project [concat(concat(concat(col1#x, cast(col2#xL as string)), concat(col3#x, cast(col4#x as string))), cast(col5#x as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col4#x, cast(id#xL as double) AS col5#x] + +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, 
false, false) AS col4#x, cast(id#xL as double) AS col5#x] +- Range (0, 10, step=1) @@ -46,7 +46,7 @@ FROM ( -- !query analysis Project [concat(concat(col1#x, col2#x), cast(concat(col3#x, col4#x) as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] + +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false, false) AS col4#x] +- Range (0, 10, step=1) @@ -67,7 +67,7 @@ FROM ( -- !query analysis Project [concat(cast(col1#x as string), cast(col2#x as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x] + +- Project [encode(cast(id#xL as string), utf-8, false, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false, false) AS col2#x] +- Range (0, 10, step=1) @@ -84,7 +84,7 @@ FROM ( -- !query analysis Project [concat(concat(concat(cast(col1#x as string), cast(col2#x as string)), cast(col3#x as string)), cast(col4#x as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] + +- Project [encode(cast(id#xL as string), utf-8, false, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false, false) AS col4#x] +- Range (0, 10, step=1) @@ -101,7 +101,7 @@ FROM ( -- !query analysis Project [concat(concat(cast(col1#x as string), cast(col2#x as string)), concat(cast(col3#x as string), cast(col4#x as string))) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] + +- Project [encode(cast(id#xL as string), utf-8, false, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false, false) AS col4#x] +- Range (0, 10, step=1) @@ -122,7 +122,7 @@ FROM ( -- !query analysis Project [concat(col1#x, col2#x) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x] + +- Project [encode(cast(id#xL as string), utf-8, false, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false, false) AS col2#x] +- Range (0, 10, step=1) @@ 
-139,7 +139,7 @@ FROM ( -- !query analysis Project [concat(concat(concat(col1#x, col2#x), col3#x), col4#x) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] + +- Project [encode(cast(id#xL as string), utf-8, false, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false, false) AS col4#x] +- Range (0, 10, step=1) @@ -156,7 +156,7 @@ FROM ( -- !query analysis Project [concat(concat(col1#x, col2#x), concat(col3#x, col4#x)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] + +- Project [encode(cast(id#xL as string), utf-8, false, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false, false) AS col4#x] +- Range (0, 10, step=1) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out index 60b7fa711791..a45f36b86d24 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out @@ -13,7 +13,7 @@ FROM ( -- !query analysis Project [elt(2, col1#x, cast(col2#xL as string), col3#x, cast(col4#x as string), cast(col5#x as string), false) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col4#x, cast(id#xL as double) AS col5#x] + +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false, false) AS col4#x, cast(id#xL as double) AS col5#x] +- Range (0, 10, step=1) @@ -30,7 +30,7 @@ FROM ( -- !query analysis Project [elt(3, col1#x, col2#x, cast(col3#x as string), cast(col4#x as string), false) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] + +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false, false) AS col4#x] +- Range (0, 10, step=1) @@ -51,7 +51,7 @@ FROM ( -- !query analysis Project [elt(1, cast(col1#x as 
string), cast(col2#x as string), false) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x] + +- Project [encode(cast(id#xL as string), utf-8, false, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false, false) AS col2#x] +- Range (0, 10, step=1) @@ -72,5 +72,5 @@ FROM ( -- !query analysis Project [elt(2, col1#x, col2#x, false) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x] + +- Project [encode(cast(id#xL as string), utf-8, false, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false, false) AS col2#x] +- Range (0, 10, step=1) From d3473a45342d455ac3c4551b8862808e05647d08 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Wed, 19 Jun 2024 13:26:03 +0800 Subject: [PATCH 06/11] fix golden file tests --- .../query-tests/explain-results/function_decode.explain | 2 +- .../query-tests/explain-results/function_encode.explain | 2 +- .../explain-results/function_to_binary_with_format.explain | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain index 165be9b9e12f..af7e2065d1d2 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain @@ -1,2 +1,2 @@ -Project [decode(cast(g#0 as binary), UTF-8, false) AS decode(g, UTF-8)#0] +Project [decode(cast(g#0 as binary), UTF-8, false, false) AS decode(g, UTF-8)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain index 2f6543605923..583169f976e9 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain @@ -1,2 +1,2 @@ -Project [encode(g#0, UTF-8, false) AS encode(g, UTF-8)#0] +Project [encode(g#0, UTF-8, false, false) AS encode(g, UTF-8)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain index b62ccccc0c15..43a3a0f42675 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain @@ -1,2 +1,2 @@ -Project [encode(g#0, UTF-8, false) AS to_binary(g, utf-8)#0] +Project [encode(g#0, UTF-8, false, false) AS to_binary(g, utf-8)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] From 3e90976d93d76fb7be5c9ef262d985810eb05b21 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Thu, 20 Jun 2024 13:59:28 +0800 Subject: [PATCH 07/11] Encode RuntimeReplaceable with StaticInvoke --- 
.../expressions/stringExpressions.scala | 50 +++++++++++-------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 442c61af7009..cfbe464dc014 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -2798,7 +2798,7 @@ case class Encode( charset: Expression, legacyCharsets: Boolean, legacyErrorAction: Boolean) - extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { + extends BinaryExpression with RuntimeReplaceable with ImplicitCastInputTypes with NullIntolerant { def this(value: Expression, charset: Expression) = this(value, charset, SQLConf.get.legacyJavaCharsets, SQLConf.get.legacyCodingErrorAction) @@ -2809,13 +2809,31 @@ case class Encode( override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation, StringTypeAnyCollation) - private val supportedCharsets = Set( - "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16", "UTF-32") + override protected def withNewChildrenInternal( + newLeft: Expression, newRight: Expression): Encode = copy(str = newLeft, charset = newRight) + + override val replacement: Expression = StaticInvoke( + classOf[Encode], + BinaryType, + "encode", + Seq( + str, charset, Literal(legacyCharsets, BooleanType), Literal(legacyErrorAction, BooleanType)), + Seq(StringTypeAnyCollation, StringTypeAnyCollation, BooleanType, BooleanType)) +} +object Encode { + def apply(value: Expression, charset: Expression): Encode = new Encode(value, charset) - protected override def nullSafeEval(input1: Any, input2: Any): Any = { - val toCharset = input2.asInstanceOf[UTF8String].toString - if (legacyCharsets || supportedCharsets.contains(toCharset.toUpperCase(Locale.ROOT))) { + private final lazy val VALID_CHARSETS = Set( + "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16", "UTF-32") + + def encode( + input: UTF8String, + charset: UTF8String, + legacyCharsets: Boolean, + legacyErrorAction: Boolean): Array[Byte] = { + val toCharset = charset.toString + if (legacyCharsets || VALID_CHARSETS.contains(toCharset.toUpperCase(Locale.ROOT))) { val encoder = try { val codingErrorAction = if (legacyErrorAction) { CodingErrorAction.REPLACE @@ -2830,31 +2848,19 @@ case class Encode( case _: IllegalCharsetNameException | _: UnsupportedCharsetException | _: IllegalArgumentException => - throw QueryExecutionErrors.invalidCharsetError(prettyName, toCharset) + throw QueryExecutionErrors.invalidCharsetError("encode", toCharset) } try { - val bb = encoder.encode(CharBuffer.wrap(input1.asInstanceOf[UTF8String].toString)) + val bb = encoder.encode(CharBuffer.wrap(input.toString)) JavaUtils.bufferToArray(bb) } catch { case _: CharacterCodingException => - throw QueryExecutionErrors.malformedCharacterCoding(prettyName, toCharset) + throw QueryExecutionErrors.malformedCharacterCoding("encode", toCharset) } } else { - throw QueryExecutionErrors.invalidCharsetError(prettyName, toCharset) + throw QueryExecutionErrors.invalidCharsetError("encode", toCharset) } } - - override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val expr = ctx.addReferenceObj("this", this) - defineCodeGen(ctx, ev, (str, charset) => s"(byte[]) $expr.nullSafeEval($str, $charset)") - } - - override 
protected def withNewChildrenInternal( - newLeft: Expression, newRight: Expression): Encode = copy(str = newLeft, charset = newRight) -} - -object Encode { - def apply(value: Expression, charset: Expression): Encode = new Encode(value, charset) } /** From b0cf6eebf4673879be6e0c655c75f3cf8d4b8214 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Thu, 20 Jun 2024 14:51:13 +0800 Subject: [PATCH 08/11] Decode RuntimeReplaceable with StaticInvoke --- .../explain-results/function_decode.explain | 2 +- .../explain-results/function_encode.explain | 2 +- .../function_to_binary_with_format.explain | 2 +- .../expressions/stringExpressions.scala | 75 ++++++++++--------- .../ansi/string-functions.sql.out | 30 ++++---- .../analyzer-results/string-functions.sql.out | 30 ++++---- .../typeCoercion/native/concat.sql.out | 18 ++--- .../typeCoercion/native/elt.sql.out | 8 +- .../org/apache/spark/sql/ExplainSuite.scala | 8 +- 9 files changed, 89 insertions(+), 86 deletions(-) diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain index af7e2065d1d2..e1a445120c13 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain @@ -1,2 +1,2 @@ -Project [decode(cast(g#0 as binary), UTF-8, false, false) AS decode(g, UTF-8)#0] +Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.StringDecode, StringType, decode, cast(g#0 as binary), UTF-8, false, false, BinaryType, StringTypeAnyCollation, BooleanType, BooleanType, true, true, true) AS decode(g, UTF-8)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain index 583169f976e9..7ce8776d754d 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain @@ -1,2 +1,2 @@ -Project [encode(g#0, UTF-8, false, false) AS encode(g, UTF-8)#0] +Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.Encode, BinaryType, encode, g#0, UTF-8, false, false, StringTypeAnyCollation, StringTypeAnyCollation, BooleanType, BooleanType, true, true, true) AS encode(g, UTF-8)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain index 43a3a0f42675..d999697a4c9e 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain @@ -1,2 +1,2 @@ -Project [encode(g#0, UTF-8, false, false) AS to_binary(g, utf-8)#0] +Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.Encode, BinaryType, encode, g#0, UTF-8, false, false, StringTypeAnyCollation, StringTypeAnyCollation, BooleanType, BooleanType, true, true, true) AS to_binary(g, utf-8)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff 
--git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index cfbe464dc014..660c93de807a 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -2713,23 +2713,39 @@ case class Decode(params: Seq[Expression], replacement: Expression) case class StringDecode( bin: Expression, charset: Expression, - legacyCharsets: Boolean, legacyErrorAction: Boolean) - extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { + legacyCharsets: Boolean, + legacyErrorAction: Boolean) + extends RuntimeReplaceable with ImplicitCastInputTypes { def this(bin: Expression, charset: Expression) = this(bin, charset, SQLConf.get.legacyJavaCharsets, SQLConf.get.legacyCodingErrorAction) - override def left: Expression = bin - override def right: Expression = charset override def dataType: DataType = SQLConf.get.defaultStringType override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType, StringTypeAnyCollation) + override def prettyName: String = "decode" + override def toString: String = s"$prettyName($bin, $charset)" + + override def replacement: Expression = StaticInvoke( + classOf[StringDecode], + SQLConf.get.defaultStringType, + "decode", + Seq(bin, charset, Literal(legacyCharsets), Literal(legacyErrorAction)), + Seq(BinaryType, StringTypeAnyCollation, BooleanType, BooleanType)) - private val supportedCharsets = Set( - "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16", "UTF-32") + override def children: Seq[Expression] = Seq(bin, charset) + override protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = + copy(bin = newChildren(0), charset = newChildren(1)) +} - protected override def nullSafeEval(input1: Any, input2: Any): Any = { - val fromCharset = input2.asInstanceOf[UTF8String].toString - if (legacyCharsets || supportedCharsets.contains(fromCharset.toUpperCase(Locale.ROOT))) { +object StringDecode { + def apply(bin: Expression, charset: Expression): StringDecode = new StringDecode(bin, charset) + def decode( + input: Array[Byte], + charset: UTF8String, + legacyCharsets: Boolean, + legacyErrorAction: Boolean): UTF8String = { + val fromCharset = charset.toString + if (legacyCharsets || Encode.VALID_CHARSETS.contains(fromCharset.toUpperCase(Locale.ROOT))) { val decoder = try { val codingErrorAction = if (legacyErrorAction) { CodingErrorAction.REPLACE @@ -2744,34 +2760,19 @@ case class StringDecode( case _: IllegalCharsetNameException | _: UnsupportedCharsetException | _: IllegalArgumentException => - throw QueryExecutionErrors.invalidCharsetError(prettyName, fromCharset) + throw QueryExecutionErrors.invalidCharsetError("decode", fromCharset) } try { - val cb = decoder.decode(ByteBuffer.wrap(input1.asInstanceOf[Array[Byte]])) + val cb = decoder.decode(ByteBuffer.wrap(input)) UTF8String.fromString(cb.toString) } catch { case _: CharacterCodingException => - throw QueryExecutionErrors.malformedCharacterCoding(prettyName, fromCharset) + throw QueryExecutionErrors.malformedCharacterCoding("decode", fromCharset) } } else { - throw QueryExecutionErrors.invalidCharsetError(prettyName, fromCharset) + throw QueryExecutionErrors.invalidCharsetError("decode", fromCharset) } } - - override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val expr 
= ctx.addReferenceObj("this", this) - defineCodeGen(ctx, ev, (bin, charset) => s"(UTF8String) $expr.nullSafeEval($bin, $charset)") - } - - override protected def withNewChildrenInternal( - newLeft: Expression, newRight: Expression): StringDecode = - copy(bin = newLeft, charset = newRight) - - override def prettyName: String = "decode" -} - -object StringDecode { - def apply(bin: Expression, charset: Expression): StringDecode = new StringDecode(bin, charset) } /** @@ -2798,20 +2799,15 @@ case class Encode( charset: Expression, legacyCharsets: Boolean, legacyErrorAction: Boolean) - extends BinaryExpression with RuntimeReplaceable with ImplicitCastInputTypes with NullIntolerant { + extends RuntimeReplaceable with ImplicitCastInputTypes { def this(value: Expression, charset: Expression) = this(value, charset, SQLConf.get.legacyJavaCharsets, SQLConf.get.legacyCodingErrorAction) - override def left: Expression = str - override def right: Expression = charset override def dataType: DataType = BinaryType override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation, StringTypeAnyCollation) - override protected def withNewChildrenInternal( - newLeft: Expression, newRight: Expression): Encode = copy(str = newLeft, charset = newRight) - override val replacement: Expression = StaticInvoke( classOf[Encode], BinaryType, @@ -2819,13 +2815,20 @@ case class Encode( Seq( str, charset, Literal(legacyCharsets, BooleanType), Literal(legacyErrorAction, BooleanType)), Seq(StringTypeAnyCollation, StringTypeAnyCollation, BooleanType, BooleanType)) + + override def toString: String = s"$prettyName($str, $charset)" + + override def children: Seq[Expression] = Seq(str, charset) + + override protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = + copy(str = newChildren.head, charset = newChildren(1)) } object Encode { def apply(value: Expression, charset: Expression): Encode = new Encode(value, charset) - private final lazy val VALID_CHARSETS = Set( - "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16", "UTF-32") + private[expressions] final lazy val VALID_CHARSETS = + Set("US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16", "UTF-32") def encode( input: UTF8String, diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out index 8f77dee6039c..c7675b16384f 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out @@ -384,21 +384,21 @@ Project [btrim(xyxtrimyyx, xy) AS btrim(xyxtrimyyx, xy)#x] -- !query SELECT btrim(encode(" xyz ", 'utf-8')) -- !query analysis -Project [btrim(encode( xyz , utf-8, false, false)) AS btrim(encode( xyz , utf-8))#x] +Project [btrim(encode( xyz , utf-8)) AS btrim(encode( xyz , utf-8))#x] +- OneRowRelation -- !query SELECT btrim(encode('yxTomxx', 'utf-8'), encode('xyz', 'utf-8')) -- !query analysis -Project [btrim(encode(yxTomxx, utf-8, false, false), encode(xyz, utf-8, false, false)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x] +Project [btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x] +- OneRowRelation -- !query SELECT btrim(encode('xxxbarxxx', 'utf-8'), encode('x', 'utf-8')) -- !query analysis -Project [btrim(encode(xxxbarxxx, utf-8, false, false), encode(x, utf-8, false, false)) AS 
btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x] +Project [btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8)) AS btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x] +- OneRowRelation @@ -649,14 +649,14 @@ SetCommand (spark.sql.legacy.javaCharsets,Some(true)) -- !query select encode('hello', 'WINDOWS-1252') -- !query analysis -Project [encode(hello, WINDOWS-1252, true, false) AS encode(hello, WINDOWS-1252)#x] +Project [encode(hello, WINDOWS-1252) AS encode(hello, WINDOWS-1252)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, true, false) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -670,14 +670,14 @@ SetCommand (spark.sql.legacy.javaCharsets,Some(false)) -- !query select encode('hello', 'WINDOWS-1252') -- !query analysis -Project [encode(hello, WINDOWS-1252, false, false) AS encode(hello, WINDOWS-1252)#x] +Project [encode(hello, WINDOWS-1252) AS encode(hello, WINDOWS-1252)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, false, false) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -685,14 +685,14 @@ Project [encode(scol#x, ecol#x, false, false) AS encode(scol, ecol)#x] -- !query select encode('hello', 'Windows-xxx') -- !query analysis -Project [encode(hello, Windows-xxx, false, false) AS encode(hello, Windows-xxx)#x] +Project [encode(hello, Windows-xxx) AS encode(hello, Windows-xxx)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, false, false) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -706,14 +706,14 @@ SetCommand (spark.sql.legacy.codingErrorAction,Some(true)) -- !query select encode('渭城朝雨浥轻尘', 'US-ASCII') -- !query analysis -Project [encode(渭城朝雨浥轻尘, US-ASCII, false, true) AS encode(渭城朝雨浥轻尘, US-ASCII)#x] +Project [encode(渭城朝雨浥轻尘, US-ASCII) AS encode(渭城朝雨浥轻尘, US-ASCII)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, false, true) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -727,14 +727,14 @@ SetCommand (spark.sql.legacy.codingErrorAction,Some(false)) -- !query select encode('客舍青青柳色新', 'US-ASCII') -- !query analysis -Project [encode(客舍青青柳色新, US-ASCII, false, false) AS encode(客舍青青柳色新, US-ASCII)#x] +Project [encode(客舍青青柳色新, US-ASCII) AS encode(客舍青青柳色新, US-ASCII)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, false, false) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -788,14 +788,14 @@ org.apache.spark.sql.AnalysisException -- !query select decode(encode('abc', 'utf-8'), 'utf-8') -- !query analysis -Project [decode(encode(abc, utf-8, false, false), utf-8) AS decode(encode(abc, utf-8), utf-8)#x] +Project [decode(encode(abc, utf-8), utf-8) AS decode(encode(abc, utf-8), 
utf-8)#x] +- OneRowRelation -- !query select decode(encode('大千世界', 'utf-32'), 'utf-32') -- !query analysis -Project [decode(encode(大千世界, utf-32, false, false), utf-32) AS decode(encode(大千世界, utf-32), utf-32)#x] +Project [decode(encode(大千世界, utf-32), utf-32) AS decode(encode(大千世界, utf-32), utf-32)#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out index 8f77dee6039c..c7675b16384f 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out @@ -384,21 +384,21 @@ Project [btrim(xyxtrimyyx, xy) AS btrim(xyxtrimyyx, xy)#x] -- !query SELECT btrim(encode(" xyz ", 'utf-8')) -- !query analysis -Project [btrim(encode( xyz , utf-8, false, false)) AS btrim(encode( xyz , utf-8))#x] +Project [btrim(encode( xyz , utf-8)) AS btrim(encode( xyz , utf-8))#x] +- OneRowRelation -- !query SELECT btrim(encode('yxTomxx', 'utf-8'), encode('xyz', 'utf-8')) -- !query analysis -Project [btrim(encode(yxTomxx, utf-8, false, false), encode(xyz, utf-8, false, false)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x] +Project [btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x] +- OneRowRelation -- !query SELECT btrim(encode('xxxbarxxx', 'utf-8'), encode('x', 'utf-8')) -- !query analysis -Project [btrim(encode(xxxbarxxx, utf-8, false, false), encode(x, utf-8, false, false)) AS btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x] +Project [btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8)) AS btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x] +- OneRowRelation @@ -649,14 +649,14 @@ SetCommand (spark.sql.legacy.javaCharsets,Some(true)) -- !query select encode('hello', 'WINDOWS-1252') -- !query analysis -Project [encode(hello, WINDOWS-1252, true, false) AS encode(hello, WINDOWS-1252)#x] +Project [encode(hello, WINDOWS-1252) AS encode(hello, WINDOWS-1252)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, true, false) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -670,14 +670,14 @@ SetCommand (spark.sql.legacy.javaCharsets,Some(false)) -- !query select encode('hello', 'WINDOWS-1252') -- !query analysis -Project [encode(hello, WINDOWS-1252, false, false) AS encode(hello, WINDOWS-1252)#x] +Project [encode(hello, WINDOWS-1252) AS encode(hello, WINDOWS-1252)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, false, false) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -685,14 +685,14 @@ Project [encode(scol#x, ecol#x, false, false) AS encode(scol, ecol)#x] -- !query select encode('hello', 'Windows-xxx') -- !query analysis -Project [encode(hello, Windows-xxx, false, false) AS encode(hello, Windows-xxx)#x] +Project [encode(hello, Windows-xxx) AS encode(hello, Windows-xxx)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, false, false) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x) AS 
encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -706,14 +706,14 @@ SetCommand (spark.sql.legacy.codingErrorAction,Some(true)) -- !query select encode('渭城朝雨浥轻尘', 'US-ASCII') -- !query analysis -Project [encode(渭城朝雨浥轻尘, US-ASCII, false, true) AS encode(渭城朝雨浥轻尘, US-ASCII)#x] +Project [encode(渭城朝雨浥轻尘, US-ASCII) AS encode(渭城朝雨浥轻尘, US-ASCII)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, false, true) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -727,14 +727,14 @@ SetCommand (spark.sql.legacy.codingErrorAction,Some(false)) -- !query select encode('客舍青青柳色新', 'US-ASCII') -- !query analysis -Project [encode(客舍青青柳色新, US-ASCII, false, false) AS encode(客舍青青柳色新, US-ASCII)#x] +Project [encode(客舍青青柳色新, US-ASCII) AS encode(客舍青青柳色新, US-ASCII)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x, false, false) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -788,14 +788,14 @@ org.apache.spark.sql.AnalysisException -- !query select decode(encode('abc', 'utf-8'), 'utf-8') -- !query analysis -Project [decode(encode(abc, utf-8, false, false), utf-8) AS decode(encode(abc, utf-8), utf-8)#x] +Project [decode(encode(abc, utf-8), utf-8) AS decode(encode(abc, utf-8), utf-8)#x] +- OneRowRelation -- !query select decode(encode('大千世界', 'utf-32'), 'utf-32') -- !query analysis -Project [decode(encode(大千世界, utf-32, false, false), utf-32) AS decode(encode(大千世界, utf-32), utf-32)#x] +Project [decode(encode(大千世界, utf-32), utf-32) AS decode(encode(大千世界, utf-32), utf-32)#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out index 2d0d6b4e7c95..62e3a8747326 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out @@ -11,7 +11,7 @@ FROM ( -- !query analysis Project [concat(concat(cast(col1#xL as string), col2#x), cast(col3#x as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [id#xL AS col1#xL, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false, false) AS col3#x] + +- Project [id#xL AS col1#xL, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x] +- Range (0, 10, step=1) @@ -29,7 +29,7 @@ FROM ( -- !query analysis Project [concat(concat(concat(col1#x, cast(col2#xL as string)), concat(col3#x, cast(col4#x as string))), cast(col5#x as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false, false) AS col4#x, cast(id#xL as double) AS col5#x] + +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col4#x, cast(id#xL as double) AS col5#x] +- Range (0, 10, 
step=1) @@ -46,7 +46,7 @@ FROM ( -- !query analysis Project [concat(concat(col1#x, col2#x), cast(concat(col3#x, col4#x) as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false, false) AS col4#x] + +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] +- Range (0, 10, step=1) @@ -67,7 +67,7 @@ FROM ( -- !query analysis Project [concat(cast(col1#x as string), cast(col2#x as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false, false) AS col2#x] + +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x] +- Range (0, 10, step=1) @@ -84,7 +84,7 @@ FROM ( -- !query analysis Project [concat(concat(concat(cast(col1#x as string), cast(col2#x as string)), cast(col3#x as string)), cast(col4#x as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false, false) AS col4#x] + +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] +- Range (0, 10, step=1) @@ -101,7 +101,7 @@ FROM ( -- !query analysis Project [concat(concat(cast(col1#x as string), cast(col2#x as string)), concat(cast(col3#x as string), cast(col4#x as string))) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false, false) AS col4#x] + +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] +- Range (0, 10, step=1) @@ -122,7 +122,7 @@ FROM ( -- !query analysis Project [concat(col1#x, col2#x) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false, false) AS col2#x] + +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x] +- Range (0, 10, step=1) @@ -139,7 +139,7 @@ FROM ( -- !query analysis Project [concat(concat(concat(col1#x, col2#x), col3#x), col4#x) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- 
Project [encode(cast(id#xL as string), utf-8, false, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false, false) AS col4#x] + +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] +- Range (0, 10, step=1) @@ -156,7 +156,7 @@ FROM ( -- !query analysis Project [concat(concat(col1#x, col2#x), concat(col3#x, col4#x)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false, false) AS col4#x] + +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] +- Range (0, 10, step=1) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out index a45f36b86d24..f4902012f0f9 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out @@ -13,7 +13,7 @@ FROM ( -- !query analysis Project [elt(2, col1#x, cast(col2#xL as string), col3#x, cast(col4#x as string), cast(col5#x as string), false) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false, false) AS col4#x, cast(id#xL as double) AS col5#x] + +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col4#x, cast(id#xL as double) AS col5#x] +- Range (0, 10, step=1) @@ -30,7 +30,7 @@ FROM ( -- !query analysis Project [elt(3, col1#x, col2#x, cast(col3#x as string), cast(col4#x as string), false) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false, false) AS col4#x] + +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] +- Range (0, 10, step=1) @@ -51,7 +51,7 @@ FROM ( -- !query analysis Project [elt(1, cast(col1#x as string), cast(col2#x as string), false) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false, false) AS 
col2#x] + +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x] +- Range (0, 10, step=1) @@ -72,5 +72,5 @@ FROM ( -- !query analysis Project [elt(2, col1#x, col2#x, false) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8, false, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false, false) AS col2#x] + +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x] +- Range (0, 10, step=1) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala index 0113854ab0c7..cae63c999a8e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala @@ -193,8 +193,8 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite """.stripMargin) checkKeywordsExistsInExplain(df2, "Project [concat(cast(id#xL as string), cast((id#xL + 1) as string), " + - "cast(encode(cast((id#xL + 2) as string), utf-8, false, false) as string), " + - "cast(encode(cast((id#xL + 3) as string), utf-8, false, false) as string)) AS col#x]") + "cast(encode(cast((id#xL + 2) as string), utf-8) as string), " + + "cast(encode(cast((id#xL + 3) as string), utf-8) as string)) AS col#x]") val df3 = sql( """ @@ -209,8 +209,8 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite """.stripMargin) checkKeywordsExistsInExplain(df3, "Project [concat(cast(id#xL as string), " + - "cast(encode(cast((id#xL + 2) as string), utf-8, false, false) as string), " + - "cast(encode(cast((id#xL + 3) as string), utf-8, false, false) as string)) AS col#x]") + "cast(encode(cast((id#xL + 2) as string), utf-8) as string), " + + "cast(encode(cast((id#xL + 3) as string), utf-8) as string)) AS col#x]") } } From 64f3c3936cdcdf7543e2ea4101cd8ccd8c5b7f94 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Thu, 20 Jun 2024 17:47:23 +0800 Subject: [PATCH 09/11] fix tests --- .../sql/catalyst/expressions/CodeGenerationSuite.scala | 2 +- .../sql/catalyst/expressions/ExpressionEvalHelper.scala | 7 ++++++- .../sql/catalyst/expressions/StringExpressionsSuite.scala | 4 ++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala index 4df8d87074fc..ec15b4873bfd 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala @@ -107,7 +107,7 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper { strExpr = StringDecode(Encode(strExpr, "utf-8"), "utf-8") } - val expressions = Seq(If(EqualTo(strExpr, strExpr), strExpr, strExpr)) + val expressions = Seq(If(EqualTo(strExpr, strExpr), strExpr, strExpr)).map(replace) val plan = GenerateMutableProjection.generate(expressions) val actual = plan(null).toSeq(expressions.map(_.dataType)) assert(actual.length == 1) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala index 
21e6b8692911..a063e53486ad 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala @@ -71,10 +71,15 @@ trait ExpressionEvalHelper extends ScalaCheckDrivenPropertyChecks with PlanTestB new ArrayBasedMapData(keyArray, valueArray) } + protected def replace(expr: Expression): Expression = expr match { + case r: RuntimeReplaceable => replace(r.replacement) + case _ => expr.mapChildren(replace) + } + private def prepareEvaluation(expression: Expression): Expression = { val serializer = new JavaSerializer(new SparkConf()).newInstance() val resolver = ResolveTimeZone - val expr = resolver.resolveTimeZones(expression) + val expr = resolver.resolveTimeZones(replace(expression)) assert(expr.resolved) serializer.deserialize(serializer.serialize(expr)) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index 51de44d8dfd9..ebd724543481 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -505,8 +505,8 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(StringDecode(b, Literal.create(null, StringType)), null, create_row(null)) // Test escaping of charset - GenerateUnsafeProjection.generate(Encode(a, Literal("\"quote")) :: Nil) - GenerateUnsafeProjection.generate(StringDecode(b, Literal("\"quote")) :: Nil) + GenerateUnsafeProjection.generate(Encode(a, Literal("\"quote")).replacement :: Nil) + GenerateUnsafeProjection.generate(StringDecode(b, Literal("\"quote")).replacement :: Nil) } test("initcap unit test") { From 8f5a2360758abc9ec6dbbe2b0416d6590c1699a6 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Fri, 21 Jun 2024 00:15:09 +0800 Subject: [PATCH 10/11] fix --- .../scala/org/apache/spark/sql/ExplainSuite.scala | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala index cae63c999a8e..22fdd96ce6ba 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala @@ -192,9 +192,11 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite |) """.stripMargin) checkKeywordsExistsInExplain(df2, - "Project [concat(cast(id#xL as string), cast((id#xL + 1) as string), " + - "cast(encode(cast((id#xL + 2) as string), utf-8) as string), " + - "cast(encode(cast((id#xL + 3) as string), utf-8) as string)) AS col#x]") + "Project [concat(concat(col1#x, col2#x), cast(concat(col3#x, col4#x) as string)) AS col#x]", + "Project [cast(id#xL as string) AS col1#x, " + + "cast((id#xL + cast(1 as bigint)) as string) AS col2#x, " + + "encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, " + + "encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x]") val df3 = sql( """ @@ -208,9 +210,10 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite |) """.stripMargin) checkKeywordsExistsInExplain(df3, - "Project [concat(cast(id#xL as string), " + - "cast(encode(cast((id#xL + 2) as string), utf-8) as 
string), " + - "cast(encode(cast((id#xL + 3) as string), utf-8) as string)) AS col#x]") + "Project [concat(col1#x, cast(concat(col3#x, col4#x) as string)) AS col#x]", + "Project [cast(id#xL as string) AS col1#x, " + + "encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, " + + "encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x]") } } From d7a4199f212f42b37e126d7c00c9bcc44a2d7480 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Mon, 24 Jun 2024 17:17:35 +0800 Subject: [PATCH 11/11] address comments --- .../spark/sql/catalyst/expressions/CodeGenerationSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala index ec15b4873bfd..4c045f9fda73 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala @@ -104,10 +104,10 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper { test("SPARK-22543: split large if expressions into blocks due to JVM code size limit") { var strExpr: Expression = Literal("abc") for (_ <- 1 to 150) { - strExpr = StringDecode(Encode(strExpr, "utf-8"), "utf-8") + strExpr = StringTrimRight(StringTrimLeft(strExpr)) } - val expressions = Seq(If(EqualTo(strExpr, strExpr), strExpr, strExpr)).map(replace) + val expressions = Seq(If(EqualTo(strExpr, strExpr), strExpr, strExpr)) val plan = GenerateMutableProjection.generate(expressions) val actual = plan(null).toSeq(expressions.map(_.dataType)) assert(actual.length == 1)