Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3027,6 +3027,9 @@ object Encode {
legacyCharsets: Boolean,
legacyErrorAction: Boolean): Array[Byte] = {
val toCharset = charset.toString
if (input.numBytes == 0 || "UTF-8".equalsIgnoreCase(toCharset)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is actually a behavior change. If the input bytes are not valid UTF-8, the result previously differed from the input bytes, but now it is identical to them.

We should either remove this UTF-8 shortcut, or first check whether the input bytes are valid UTF-8.

cc @yaooqinn

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean that before this PR we encoded unmappable characters to mojibake, but now we return the input bytes unchanged?

Do you think we can call input.isValid to check here?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I think so. On the happy path it is still faster than doing the actual encoding, and invalid UTF-8 bytes should be rare, so an extra isValid call is acceptable.

return input.getBytes
}
if (legacyCharsets || VALID_CHARSETS.contains(toCharset.toUpperCase(Locale.ROOT))) {
val encoder = try {
val codingErrorAction = if (legacyErrorAction) {
Expand Down
8 changes: 8 additions & 0 deletions sql/core/benchmarks/EncodeBenchmark-jdk21-results.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1022-azure
AMD EPYC 7763 64-Core Processor
encode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
UTF-32 47469 47482 19 0.2 4746.9 1.0X
UTF-16 57463 57487 35 0.2 5746.3 0.8X
UTF-8 2803 2805 3 3.6 280.3 16.9X

8 changes: 8 additions & 0 deletions sql/core/benchmarks/EncodeBenchmark-results.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1022-azure
AMD EPYC 7763 64-Core Processor
encode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
UTF-32 31107 31205 138 0.3 3110.7 1.0X
UTF-16 47904 47934 43 0.2 4790.4 0.6X
UTF-8 2957 2978 30 3.4 295.7 10.5X

Original file line number Diff line number Diff line change
Expand Up @@ -739,6 +739,41 @@ Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
+- LocalRelation [scol#x, ecol#x]


-- !query
select encode(decode(encode('白日依山尽,黄河入海流。欲穷千里目,更上一层楼。', 'UTF-16'), 'UTF-16'), 'UTF-8')
-- !query analysis
Project [encode(decode(encode(白日依山尽,黄河入海流。欲穷千里目,更上一层楼。, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(白日依山尽,黄河入海流。欲穷千里目,更上一层楼。, UTF-16), UTF-16), UTF-8)#x]
+- OneRowRelation


-- !query
select encode(decode(encode('南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。', 'UTF-16'), 'UTF-16'), 'UTF-8')
-- !query analysis
Project [encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。, UTF-16), UTF-16), UTF-8)#x]
+- OneRowRelation


-- !query
select encode(decode(encode('세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark', 'UTF-16'), 'UTF-16'), 'UTF-8')
-- !query analysis
Project [encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark, UTF-16), UTF-16), UTF-8)#x]
+- OneRowRelation


-- !query
select encode(decode(encode('το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως', 'UTF-16'), 'UTF-16'), 'UTF-8')
-- !query analysis
Project [encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως, UTF-16), UTF-16), UTF-8)#x]
+- OneRowRelation


-- !query
select encode(decode(encode('Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。', 'UTF-16'), 'UTF-16'), 'UTF-8')
-- !query analysis
Project [encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。, UTF-16), UTF-16), UTF-8)#x]
+- OneRowRelation


-- !query
select decode()
-- !query analysis
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -739,6 +739,41 @@ Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
+- LocalRelation [scol#x, ecol#x]


-- !query
select encode(decode(encode('白日依山尽,黄河入海流。欲穷千里目,更上一层楼。', 'UTF-16'), 'UTF-16'), 'UTF-8')
-- !query analysis
Project [encode(decode(encode(白日依山尽,黄河入海流。欲穷千里目,更上一层楼。, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(白日依山尽,黄河入海流。欲穷千里目,更上一层楼。, UTF-16), UTF-16), UTF-8)#x]
+- OneRowRelation


-- !query
select encode(decode(encode('南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。', 'UTF-16'), 'UTF-16'), 'UTF-8')
-- !query analysis
Project [encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。, UTF-16), UTF-16), UTF-8)#x]
+- OneRowRelation


-- !query
select encode(decode(encode('세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark', 'UTF-16'), 'UTF-16'), 'UTF-8')
-- !query analysis
Project [encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark, UTF-16), UTF-16), UTF-8)#x]
+- OneRowRelation


-- !query
select encode(decode(encode('το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως', 'UTF-16'), 'UTF-16'), 'UTF-8')
-- !query analysis
Project [encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως, UTF-16), UTF-16), UTF-8)#x]
+- OneRowRelation


-- !query
select encode(decode(encode('Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。', 'UTF-16'), 'UTF-16'), 'UTF-8')
-- !query analysis
Project [encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。, UTF-16), UTF-16), UTF-8) AS encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。, UTF-16), UTF-16), UTF-8)#x]
+- OneRowRelation


-- !query
select decode()
-- !query analysis
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,11 @@ select encode(scol, ecol) from values('渭城朝雨浥轻尘', 'US-ASCII') as t(
set spark.sql.legacy.codingErrorAction=false;
select encode('客舍青青柳色新', 'US-ASCII');
select encode(scol, ecol) from values('客舍青青柳色新', 'US-ASCII') as t(scol, ecol);
select encode(decode(encode('白日依山尽,黄河入海流。欲穷千里目,更上一层楼。', 'UTF-16'), 'UTF-16'), 'UTF-8');
select encode(decode(encode('南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。', 'UTF-16'), 'UTF-16'), 'UTF-8');
select encode(decode(encode('세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark', 'UTF-16'), 'UTF-16'), 'UTF-8');
select encode(decode(encode('το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως', 'UTF-16'), 'UTF-16'), 'UTF-8');
select encode(decode(encode('Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。', 'UTF-16'), 'UTF-16'), 'UTF-8');

-- decode
select decode();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -967,6 +967,46 @@ org.apache.spark.SparkRuntimeException
}


-- !query
select encode(decode(encode('白日依山尽,黄河入海流。欲穷千里目,更上一层楼。', 'UTF-16'), 'UTF-16'), 'UTF-8')
-- !query schema
struct<encode(decode(encode(白日依山尽,黄河入海流。欲穷千里目,更上一层楼。, UTF-16), UTF-16), UTF-8):binary>
-- !query output
白日依山尽,黄河入海流。欲穷千里目,更上一层楼。


-- !query
select encode(decode(encode('南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。', 'UTF-16'), 'UTF-16'), 'UTF-8')
-- !query schema
struct<encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。, UTF-16), UTF-16), UTF-8):binary>
-- !query output
南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。


-- !query
select encode(decode(encode('세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark', 'UTF-16'), 'UTF-16'), 'UTF-8')
-- !query schema
struct<encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark, UTF-16), UTF-16), UTF-8):binary>
-- !query output
세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark


-- !query
select encode(decode(encode('το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως', 'UTF-16'), 'UTF-16'), 'UTF-8')
-- !query schema
struct<encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως, UTF-16), UTF-16), UTF-8):binary>
-- !query output
το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως


-- !query
select encode(decode(encode('Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。', 'UTF-16'), 'UTF-16'), 'UTF-8')
-- !query schema
struct<encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。, UTF-16), UTF-16), UTF-8):binary>
-- !query output
Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。


-- !query
select decode()
-- !query schema
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -899,6 +899,46 @@ org.apache.spark.SparkRuntimeException
}


-- !query
select encode(decode(encode('白日依山尽,黄河入海流。欲穷千里目,更上一层楼。', 'UTF-16'), 'UTF-16'), 'UTF-8')
-- !query schema
struct<encode(decode(encode(白日依山尽,黄河入海流。欲穷千里目,更上一层楼。, UTF-16), UTF-16), UTF-8):binary>
-- !query output
白日依山尽,黄河入海流。欲穷千里目,更上一层楼。


-- !query
select encode(decode(encode('南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。', 'UTF-16'), 'UTF-16'), 'UTF-8')
-- !query schema
struct<encode(decode(encode(南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。, UTF-16), UTF-16), UTF-8):binary>
-- !query output
南山經之首曰䧿山。其首曰招搖之山,臨於西海之上。


-- !query
select encode(decode(encode('세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark', 'UTF-16'), 'UTF-16'), 'UTF-8')
-- !query schema
struct<encode(decode(encode(세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark, UTF-16), UTF-16), UTF-8):binary>
-- !query output
세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark


-- !query
select encode(decode(encode('το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως', 'UTF-16'), 'UTF-16'), 'UTF-8')
-- !query schema
struct<encode(decode(encode(το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως, UTF-16), UTF-16), UTF-8):binary>
-- !query output
το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως


-- !query
select encode(decode(encode('Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。', 'UTF-16'), 'UTF-16'), 'UTF-8')
-- !query schema
struct<encode(decode(encode(Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。, UTF-16), UTF-16), UTF-8):binary>
-- !query output
Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。


-- !query
select decode()
-- !query schema
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.execution.benchmark

import org.apache.spark.benchmark.Benchmark

/**
* Benchmark for encode
* To run this benchmark:
* {{{
* 1. without sbt:
* bin/spark-submit --class <this class> --jars <spark core test jar> <sql core test jar>
* 2. build/sbt "sql/Test/runMain <this class>"
* 3. generate result:
* SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain <this class>"
* Results will be written to "benchmarks/EncodeBenchmark-results.txt".
* }}}
*/
object EncodeBenchmark extends SqlBasedBenchmark {
  import spark.implicits._

  // Number of rows written to the temporary Parquet dataset; also passed to Benchmark
  // as the per-case value count.
  private val N = 10L * 1000 * 1000

  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
    withTempPath { path =>
      // Sample inputs: an empty string, an ASCII-only string, and several non-ASCII texts.
      // scalastyle:off nonascii
      val exprs = Seq(
        "",
        "Spark",
        "白日依山尽,黄河入海流。欲穷千里目,更上一层楼。",
        "το Spark είναι το πιο δημοφιλές πλαίσιο επεξεργασίας μεγάλων δεδομένων παγκοσμίως",
        "세계에서 가장 인기 있는 빅데이터 처리 프레임워크인 Spark",
        "Sparkは世界で最も人気のあるビッグデータ処理フレームワークである。")
      // scalastyle:on nonascii

      // Derived from the list itself so that adding or removing a sample cannot leave
      // the modulus below out of sync with the number of samples.
      val numExprs = exprs.length

      // Build the input once: each row cycles through the samples and carries five
      // string columns (_1 to _5) - the sample, it repeated 3, 5 and 9 times, and "".
      spark.range(N).map { i =>
        val idx = (i % numExprs).toInt
        val str = exprs(idx)
        (str, str * 3, str * 5, str * 9, "")
      }.write.parquet(path.getCanonicalPath)

      val benchmark = new Benchmark("encode", N, output = output)

      // Registers one case that reads the Parquet data back and applies `encode` with
      // the given target charset to all five columns.
      def addBenchmarkCase(charset: String): Unit = {
        benchmark.addCase(charset) { _ =>
          spark.read.parquet(path.getCanonicalPath).selectExpr(
            s"encode(_1, '$charset')",
            s"encode(_2, '$charset')",
            s"encode(_3, '$charset')",
            s"encode(_4, '$charset')",
            s"encode(_5, '$charset')").noop()
        }
      }

      addBenchmarkCase("UTF-32")
      addBenchmarkCase("UTF-16")
      addBenchmarkCase("UTF-8")
      benchmark.run()
    }
  }
}