diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 476b18fac310..70122d7a99a8 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -3027,6 +3027,9 @@ object Encode { legacyCharsets: Boolean, legacyErrorAction: Boolean): Array[Byte] = { val toCharset = charset.toString + if (input.numBytes == 0 || "UTF-8".equalsIgnoreCase(toCharset)) { + return input.getBytes + } if (legacyCharsets || VALID_CHARSETS.contains(toCharset.toUpperCase(Locale.ROOT))) { val encoder = try { val codingErrorAction = if (legacyErrorAction) { diff --git a/sql/core/benchmarks/EncodeBenchmark-jdk21-results.txt b/sql/core/benchmarks/EncodeBenchmark-jdk21-results.txt new file mode 100644 index 000000000000..0a6164bc652e --- /dev/null +++ b/sql/core/benchmarks/EncodeBenchmark-jdk21-results.txt @@ -0,0 +1,8 @@ +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure +AMD EPYC 7763 64-Core Processor +encode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF-32 47469 47482 19 0.2 4746.9 1.0X +UTF-16 57463 57487 35 0.2 5746.3 0.8X +UTF-8 2803 2805 3 3.6 280.3 16.9X + diff --git a/sql/core/benchmarks/EncodeBenchmark-results.txt b/sql/core/benchmarks/EncodeBenchmark-results.txt new file mode 100644 index 000000000000..404138db7d36 --- /dev/null +++ b/sql/core/benchmarks/EncodeBenchmark-results.txt @@ -0,0 +1,8 @@ +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure +AMD EPYC 7763 64-Core Processor +encode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------ +UTF-32 31107 31205 138 0.3 3110.7 1.0X +UTF-16 47904 47934 43 0.2 4790.4 0.6X +UTF-8 2957 2978 30 3.4 295.7 10.5X + diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out index c4f002d84ea6..98664dedf820 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out @@ -739,6 +739,41 @@ Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +- LocalRelation [scol#x, ecol#x] +-- !query +select encode(decode(encode('白日依山尽,黄河入海流。欲穷千里目,更上一层楼。', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query analysis +Project [encode(decode(encode(白日依山尽,黄河入海流。欲穷千里目,更上一层楼。, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(白日依山尽,黄河入海流。欲穷千里目,更上一层楼。, UTF-16), UTF-16), UTF-8)#x] ++- OneRowRelation + + +-- !query +select encode(decode(encode('南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query analysis +Project [encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。, UTF-16), UTF-16), UTF-8)#x] ++- OneRowRelation + + +-- !query +select encode(decode(encode('세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query analysis +Project [encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark, UTF-16), UTF-16), UTF-8)#x] ++- OneRowRelation + + +-- !query +select encode(decode(encode('το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query analysis +Project [encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως, UTF-16), 
UTF-16), UTF-8) AS encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως, UTF-16), UTF-16), UTF-8)#x] ++- OneRowRelation + + +-- !query +select encode(decode(encode('Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query analysis +Project [encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。, UTF-16), UTF-16), UTF-8)#x] ++- OneRowRelation + + -- !query select decode() -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out index c4f002d84ea6..98664dedf820 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out @@ -739,6 +739,41 @@ Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +- LocalRelation [scol#x, ecol#x] +-- !query +select encode(decode(encode('白日依山尽,黄河入海流。欲穷千里目,更上一层楼。', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query analysis +Project [encode(decode(encode(白日依山尽,黄河入海流。欲穷千里目,更上一层楼。, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(白日依山尽,黄河入海流。欲穷千里目,更上一层楼。, UTF-16), UTF-16), UTF-8)#x] ++- OneRowRelation + + +-- !query +select encode(decode(encode('南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query analysis +Project [encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。, UTF-16), UTF-16), UTF-8)#x] ++- OneRowRelation + + +-- !query +select encode(decode(encode('세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query analysis +Project [encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark, UTF-16), UTF-16), UTF-8)#x] ++- OneRowRelation + + +-- !query 
+select encode(decode(encode('το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query analysis +Project [encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως, UTF-16), UTF-16), UTF-8)#x] ++- OneRowRelation + + +-- !query +select encode(decode(encode('Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query analysis +Project [encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。, UTF-16), UTF-16), UTF-8)#x] ++- OneRowRelation + + -- !query select decode() -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql index 256b8e0d49fa..c108f7c76f76 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql @@ -132,6 +132,11 @@ select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t( set spark.sql.legacy.codingErrorAction=false; select encode('客舍青青柳色新', 'US-ASCII'); select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol); +select encode(decode(encode('白日依山尽,黄河入海流。欲穷千里目,更上一层楼。', 'UTF-16'), 'UTF-16'), 'UTF-8'); +select encode(decode(encode('南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。', 'UTF-16'), 'UTF-16'), 'UTF-8'); +select encode(decode(encode('세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark', 'UTF-16'), 'UTF-16'), 'UTF-8'); +select encode(decode(encode('το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως', 'UTF-16'), 'UTF-16'), 'UTF-8'); +select encode(decode(encode('Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。', 'UTF-16'), 'UTF-16'), 'UTF-8'); -- decode select decode(); diff --git 
a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out index 24d4cfa74b5a..da2fa9ca0c18 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out @@ -967,6 +967,46 @@ org.apache.spark.SparkRuntimeException } +-- !query +select encode(decode(encode('白日依山尽,黄河入海流。欲穷千里目,更上一层楼。', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query schema +struct +-- !query output +白日依山尽,黄河入海流。欲穷千里目,更上一层楼。 + + +-- !query +select encode(decode(encode('南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query schema +struct +-- !query output +南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。 + + +-- !query +select encode(decode(encode('세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query schema +struct +-- !query output +세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark + + +-- !query +select encode(decode(encode('το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query schema +struct +-- !query output +το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως + + +-- !query +select encode(decode(encode('Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query schema +struct +-- !query output +Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。 + + -- !query select decode() -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index 53f516dac03c..d42c387c8057 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -899,6 +899,46 @@ org.apache.spark.SparkRuntimeException } +-- !query +select encode(decode(encode('白日依山尽,黄河入海流。欲穷千里目,更上一层楼。', 'UTF-16'), 'UTF-16'), 'UTF-8') 
+-- !query schema +struct +-- !query output +白日依山尽,黄河入海流。欲穷千里目,更上一层楼。 + + +-- !query +select encode(decode(encode('南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query schema +struct +-- !query output +南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。 + + +-- !query +select encode(decode(encode('세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query schema +struct +-- !query output +세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark + + +-- !query +select encode(decode(encode('το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query schema +struct +-- !query output +το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως + + +-- !query +select encode(decode(encode('Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。', 'UTF-16'), 'UTF-16'), 'UTF-8') +-- !query schema +struct +-- !query output +Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。 + + -- !query select decode() -- !query schema diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/EncodeBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/EncodeBenchmark.scala new file mode 100644 index 000000000000..76ebd7f41677 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/EncodeBenchmark.scala @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.benchmark + +import org.apache.spark.benchmark.Benchmark + +/** + * Benchmark for the `encode` SQL function with the UTF-32, UTF-16 and UTF-8 charsets. + * To run this benchmark: + * {{{ + * 1. without sbt: + * bin/spark-submit --class <this class> --jars <spark core test jar> <spark sql test jar> + * 2. build/sbt "sql/Test/runMain <this class>" + * 3. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain <this class>" + * Results will be written to "benchmarks/EncodeBenchmark-results.txt". + * }}} + */ +object EncodeBenchmark extends SqlBasedBenchmark { + import spark.implicits._ + private val N = 10L * 1000 * 1000 + + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + withTempPath { path => + // scalastyle:off nonascii + val exprs = Seq( + "", + "Spark", + "白日依山尽,黄河入海流。欲穷千里目,更上一层楼。", + "το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως", + "세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark", + "Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。") + // scalastyle:on nonascii + + spark.range(N).map { i => + val idx = (i % 6).toInt + val str = exprs(idx) + (str, str * 3, str * 5, str * 9, "") + }.write.parquet(path.getCanonicalPath) + + val benchmark = new Benchmark("encode", N, output = output) + def addBenchmarkCase(charset: String): Unit = { + benchmark.addCase(charset) { _ => + spark.read.parquet(path.getCanonicalPath).selectExpr( + s"encode(_1, '$charset')", + s"encode(_2, '$charset')", + s"encode(_3, '$charset')", + s"encode(_4, '$charset')", + s"encode(_5, '$charset')").noop() + } + } + addBenchmarkCase("UTF-32") + addBenchmarkCase("UTF-16") + addBenchmarkCase("UTF-8") + benchmark.run() + } +
} +}