From e886603b2e6a59b31248127510199c22b5805ec3 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Wed, 12 Jun 2024 20:08:57 +0800 Subject: [PATCH] [SPARK-48602][SQL] Make csv generator support different output style with spark.sql.binaryOutputStyle --- .../sql/catalyst/csv/UnivocityGenerator.scala | 8 ++-- .../sql/catalyst/parser/AstBuilder.scala | 2 +- .../sql-tests/analyzer-results/binary.sql.out | 7 ++++ .../analyzer-results/binary_base64.sql.out | 7 ++++ .../analyzer-results/binary_basic.sql.out | 7 ++++ .../analyzer-results/binary_hex.sql.out | 7 ++++ .../binary_hex_discrete.sql.out | 34 ++++++++++++++++ .../resources/sql-tests/inputs/binary.sql | 1 + .../sql-tests/inputs/binary_hex_discrete.sql | 3 ++ .../sql-tests/results/binary.sql.out | 8 ++++ .../sql-tests/results/binary_base64.sql.out | 8 ++++ .../sql-tests/results/binary_basic.sql.out | 8 ++++ .../sql-tests/results/binary_hex.sql.out | 8 ++++ .../results/binary_hex_discrete.sql.out | 39 +++++++++++++++++++ .../ThriftServerQueryTestSuite.scala | 1 + 15 files changed, 144 insertions(+), 4 deletions(-) create mode 100644 sql/core/src/test/resources/sql-tests/analyzer-results/binary_hex_discrete.sql.out create mode 100644 sql/core/src/test/resources/sql-tests/inputs/binary_hex_discrete.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/binary_hex_discrete.sql.out diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala index f10a53bde5dd..e6e964ac90b3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala @@ -22,8 +22,8 @@ import java.io.Writer import com.univocity.parsers.csv.CsvWriter import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.SpecializedGetters -import 
org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils, IntervalStringStyles, IntervalUtils, SparkStringUtils, TimestampFormatter} +import org.apache.spark.sql.catalyst.expressions.{SpecializedGetters, ToStringBase} +import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils, IntervalStringStyles, IntervalUtils, TimestampFormatter} import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ @@ -65,9 +65,11 @@ class UnivocityGenerator( private val nullAsQuotedEmptyString = SQLConf.get.getConf(SQLConf.LEGACY_NULL_VALUE_WRITTEN_AS_QUOTED_EMPTY_STRING_CSV) + private val binaryFormatter = ToStringBase.getBinaryFormatter + private def makeConverter(dataType: DataType): ValueConverter = dataType match { case BinaryType => - (getter, ordinal) => SparkStringUtils.getHexString(getter.getBinary(ordinal)) + (getter, ordinal) => binaryFormatter(getter.getBinary(ordinal)).toString case DateType => (getter, ordinal) => dateFormatter.format(getter.getInt(ordinal)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 0e0946668197..15c623235ccc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -2790,7 +2790,7 @@ class AstBuilder extends DataTypeAstBuilder with SQLConfHelper with Logging { case BINARY_HEX => val padding = if (value.length % 2 != 0) "0" else "" try { - Literal(Hex.decodeHex(padding + value)) + Literal(Hex.decodeHex(padding + value), BinaryType) } catch { case e: DecoderException => val ex = QueryParsingErrors.cannotParseValueTypeError("X", value, ctx) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/binary.sql.out 
b/sql/core/src/test/resources/sql-tests/analyzer-results/binary.sql.out index 4be8fabf2346..fe61e684a7ff 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/binary.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/binary.sql.out @@ -25,3 +25,10 @@ SELECT array( X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333' -- !query analysis Project [array(0x, 0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333, cast(Spark as binary)) AS array(X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST(Spark AS BINARY))#x] +- OneRowRelation + + +-- !query +SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')) +-- !query analysis +Project [to_csv(named_struct(n, 1, info, 0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333), Some(America/Los_Angeles)) AS to_csv(named_struct(n, 1, info, X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333'))#x] ++- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/binary_base64.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/binary_base64.sql.out index 4be8fabf2346..fe61e684a7ff 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/binary_base64.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/binary_base64.sql.out @@ -25,3 +25,10 @@ SELECT array( X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333' -- !query analysis Project [array(0x, 0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333, cast(Spark as binary)) AS array(X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST(Spark AS BINARY))#x] +- OneRowRelation + + +-- !query +SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')) +-- !query analysis +Project [to_csv(named_struct(n, 1, info, 0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333), Some(America/Los_Angeles)) 
AS to_csv(named_struct(n, 1, info, X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333'))#x] ++- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/binary_basic.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/binary_basic.sql.out index 4be8fabf2346..fe61e684a7ff 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/binary_basic.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/binary_basic.sql.out @@ -25,3 +25,10 @@ SELECT array( X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333' -- !query analysis Project [array(0x, 0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333, cast(Spark as binary)) AS array(X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST(Spark AS BINARY))#x] +- OneRowRelation + + +-- !query +SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')) +-- !query analysis +Project [to_csv(named_struct(n, 1, info, 0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333), Some(America/Los_Angeles)) AS to_csv(named_struct(n, 1, info, X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333'))#x] ++- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/binary_hex.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/binary_hex.sql.out index 4be8fabf2346..fe61e684a7ff 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/binary_hex.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/binary_hex.sql.out @@ -25,3 +25,10 @@ SELECT array( X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333' -- !query analysis Project [array(0x, 0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333, cast(Spark as binary)) AS array(X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST(Spark AS BINARY))#x] +- OneRowRelation + + +-- !query +SELECT to_csv(named_struct('n', 1, 'info', 
X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')) +-- !query analysis +Project [to_csv(named_struct(n, 1, info, 0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333), Some(America/Los_Angeles)) AS to_csv(named_struct(n, 1, info, X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333'))#x] ++- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/binary_hex_discrete.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/binary_hex_discrete.sql.out new file mode 100644 index 000000000000..fe61e684a7ff --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/binary_hex_discrete.sql.out @@ -0,0 +1,34 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +SELECT X'' +-- !query analysis +Project [0x AS X''#x] ++- OneRowRelation + + +-- !query +SELECT X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333' +-- !query analysis +Project [0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333 AS X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333'#x] ++- OneRowRelation + + +-- !query +SELECT CAST('Spark' as BINARY) +-- !query analysis +Project [cast(Spark as binary) AS CAST(Spark AS BINARY)#x] ++- OneRowRelation + + +-- !query +SELECT array( X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST('Spark' as BINARY)) +-- !query analysis +Project [array(0x, 0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333, cast(Spark as binary)) AS array(X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST(Spark AS BINARY))#x] ++- OneRowRelation + + +-- !query +SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')) +-- !query analysis +Project [to_csv(named_struct(n, 1, info, 0x4561736F6E2059616F20323031382D31312D31373A31333A33333A3333), Some(America/Los_Angeles)) AS to_csv(named_struct(n, 1, info, X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333'))#x] ++- 
OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/inputs/binary.sql b/sql/core/src/test/resources/sql-tests/inputs/binary.sql index bffd97103409..8e9e90872374 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/binary.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/binary.sql @@ -4,3 +4,4 @@ SELECT X''; SELECT X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333'; SELECT CAST('Spark' as BINARY); SELECT array( X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST('Spark' as BINARY)); +SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')); \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/inputs/binary_hex_discrete.sql b/sql/core/src/test/resources/sql-tests/inputs/binary_hex_discrete.sql new file mode 100644 index 000000000000..ba7796ca4e2f --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/binary_hex_discrete.sql @@ -0,0 +1,3 @@ +--IMPORT binary.sql + +--SET spark.sql.binaryOutputStyle=HEX_DISCRETE; diff --git a/sql/core/src/test/resources/sql-tests/results/binary.sql.out b/sql/core/src/test/resources/sql-tests/results/binary.sql.out index 3d58e6d7346b..050f05271411 100644 --- a/sql/core/src/test/resources/sql-tests/results/binary.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/binary.sql.out @@ -29,3 +29,11 @@ SELECT array( X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333' struct<array(X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST(Spark AS BINARY)):array<binary>> -- !query output [,Eason Yao 2018-11-17:13:33:33,Spark] + + +-- !query +SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')) +-- !query schema +struct<to_csv(named_struct(n, 1, info, X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')):string> +-- !query output +1,Eason Yao 2018-11-17:13:33:33 diff --git a/sql/core/src/test/resources/sql-tests/results/binary_base64.sql.out b/sql/core/src/test/resources/sql-tests/results/binary_base64.sql.out index 526642a81cfc..8724e8620b48 100644 ---
a/sql/core/src/test/resources/sql-tests/results/binary_base64.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/binary_base64.sql.out @@ -29,3 +29,11 @@ SELECT array( X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333' struct<array(X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST(Spark AS BINARY)):array<binary>> -- !query output [,RWFzb24gWWFvIDIwMTgtMTEtMTc6MTM6MzM6MzM,U3Bhcms] + + +-- !query +SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')) +-- !query schema +struct<to_csv(named_struct(n, 1, info, X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')):string> +-- !query output +1,RWFzb24gWWFvIDIwMTgtMTEtMTc6MTM6MzM6MzM diff --git a/sql/core/src/test/resources/sql-tests/results/binary_basic.sql.out b/sql/core/src/test/resources/sql-tests/results/binary_basic.sql.out index e8ff324e4d2e..0c543a7b4547 100644 --- a/sql/core/src/test/resources/sql-tests/results/binary_basic.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/binary_basic.sql.out @@ -29,3 +29,11 @@ SELECT array( X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333' struct<array(X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST(Spark AS BINARY)):array<binary>> -- !query output [[],[69, 97, 115, 111, 110, 32, 89, 97, 111, 32, 50, 48, 49, 56, 45, 49, 49, 45, 49, 55, 58, 49, 51, 58, 51, 51, 58, 51, 51],[83, 112, 97, 114, 107]] + + +-- !query +SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')) +-- !query schema +struct<to_csv(named_struct(n, 1, info, X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')):string> +-- !query output +1,"[69, 97, 115, 111, 110, 32, 89, 97, 111, 32, 50, 48, 49, 56, 45, 49, 49, 45, 49, 55, 58, 49, 51, 58, 51, 51, 58, 51, 51]" diff --git a/sql/core/src/test/resources/sql-tests/results/binary_hex.sql.out b/sql/core/src/test/resources/sql-tests/results/binary_hex.sql.out index e2e997a38135..d977301f98e0 100644 --- a/sql/core/src/test/resources/sql-tests/results/binary_hex.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/binary_hex.sql.out @@ -29,3 +29,11 @@ SELECT array( X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333' struct<array(X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST(Spark AS BINARY)):array<binary>> -- !query output [,4561736F6E2059616F20323031382D31312D31373A31333A33333A3333,537061726B] + + +-- !query +SELECT
to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')) +-- !query schema +struct<to_csv(named_struct(n, 1, info, X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')):string> +-- !query output +1,4561736F6E2059616F20323031382D31312D31373A31333A33333A3333 diff --git a/sql/core/src/test/resources/sql-tests/results/binary_hex_discrete.sql.out b/sql/core/src/test/resources/sql-tests/results/binary_hex_discrete.sql.out new file mode 100644 index 000000000000..3fc6c0f53cc5 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/binary_hex_discrete.sql.out @@ -0,0 +1,39 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +SELECT X'' +-- !query schema +struct<X'':binary> +-- !query output +[] + + +-- !query +SELECT X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333' +-- !query schema +struct<X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333':binary> +-- !query output +[45 61 73 6F 6E 20 59 61 6F 20 32 30 31 38 2D 31 31 2D 31 37 3A 31 33 3A 33 33 3A 33 33] + + +-- !query +SELECT CAST('Spark' as BINARY) +-- !query schema +struct<CAST(Spark AS BINARY):binary> +-- !query output +[53 70 61 72 6B] + + +-- !query +SELECT array( X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST('Spark' as BINARY)) +-- !query schema +struct<array(X'', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333', CAST(Spark AS BINARY)):array<binary>> +-- !query output +[[],[45 61 73 6F 6E 20 59 61 6F 20 32 30 31 38 2D 31 31 2D 31 37 3A 31 33 3A 33 33 3A 33 33],[53 70 61 72 6B]] + + +-- !query +SELECT to_csv(named_struct('n', 1, 'info', X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')) +-- !query schema +struct<to_csv(named_struct(n, 1, info, X'4561736F6E2059616F20323031382D31312D31373A31333A33333A3333')):string> +-- !query output +1,[45 61 73 6F 6E 20 59 61 6F 20 32 30 31 38 2D 31 31 2D 31 37 3A 31 33 3A 33 33 3A 33 33] diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala index d7c85f647ae6..627e5c4950a9 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala +++
b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala @@ -105,6 +105,7 @@ class ThriftServerQueryTestSuite extends SQLQueryTestSuite with SharedThriftServ // SPARK-47264 "collations.sql", "binary_hex.sql", + "binary_hex_discrete.sql", "binary_basic.sql", "binary_base64.sql" )