
Commit ddd8d5f

MaxGekk authored and cloud-fan committed
[SPARK-31932][SQL][TESTS] Add date/timestamp benchmarks for HiveResult.hiveResultString()
### What changes were proposed in this pull request?

Add benchmarks for `HiveResult.hiveResultString()`/`toHiveString()` to measure the throughput of `toHiveString` for the date/timestamp types:
- java.sql.Date/Timestamp
- java.time.Instant
- java.time.LocalDate

Benchmark results were generated in the following environment:

| Item | Description |
| ---- | ---- |
| Region | us-west-2 (Oregon) |
| Instance | r3.xlarge |
| AMI | ubuntu/images/hvm-ssd/ubuntu-bionic-18.04-amd64-server-20190722.1 (ami-06f2f779464715dc5) |
| Java | OpenJDK 64-Bit Server VM 1.8.0_242 and OpenJDK 64-Bit Server VM 11.0.6+10 |

### Why are the changes needed?

To detect performance regressions of `toHiveString` in the future.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

By running `DateTimeBenchmark` and checking the dataset content.

Closes apache#28757 from MaxGekk/benchmark-toHiveString.

Authored-by: Max Gekk <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent 8305b77 · commit ddd8d5f
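For context, the call path the new benchmark cases exercise is `HiveResult.hiveResultString`, which formats every row of an executed physical plan the way Hive's CLI would print it. Below is a minimal sketch of that call, assuming a local SparkSession; the object name and toy 5-row Dataset are illustrative, while the `hiveResultString(df.queryExecution.executedPlan)` invocation mirrors the helper added by this commit.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.HiveResult

// Minimal sketch (not part of the commit): format a small Dataset of
// java.sql.Date values as Hive-style strings, mirroring the toHiveString
// helper added in DateTimeBenchmark.scala.
object HiveResultStringSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("sketch").getOrCreate()
    import spark.implicits._

    // Same shape as the benchmark's `dates` dataset, but only 5 rows.
    val df = spark.range(0, 5, 1, 1).map(millis => new java.sql.Date(millis))

    // hiveResultString renders each row of the executed plan as Hive would
    // print it (dates as yyyy-MM-dd strings).
    val rows: Seq[String] = HiveResult.hiveResultString(df.queryExecution.executedPlan)
    rows.foreach(println)
    spark.stop()
  }
}
```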

File tree

3 files changed: +46 −8 lines changed


sql/core/benchmarks/DateTimeBenchmark-jdk11-results.txt

Lines changed: 4 additions & 0 deletions
@@ -453,5 +453,9 @@ From java.time.Instant                               325            328
 Collect longs                                       1300           1321          25          3.8         260.0       0.3X
 Collect java.sql.Timestamp                          1450           1557         102          3.4         290.0       0.3X
 Collect java.time.Instant                           1499           1599          87          3.3         299.9       0.3X
+java.sql.Date to Hive string                       17536          18367        1059          0.3        3507.2       0.0X
+java.time.LocalDate to Hive string                 12089          12897         725          0.4        2417.8       0.0X
+java.sql.Timestamp to Hive string                  48014          48625         752          0.1        9602.9       0.0X
+java.time.Instant to Hive string                   37346          37445          93          0.1        7469.1       0.0X
 
 

sql/core/benchmarks/DateTimeBenchmark-results.txt

Lines changed: 4 additions & 0 deletions
@@ -453,5 +453,9 @@ From java.time.Instant                               236            243
 Collect longs                                       1280           1337          79          3.9         256.1       0.3X
 Collect java.sql.Timestamp                          1485           1501          15          3.4         297.0       0.3X
 Collect java.time.Instant                           1441           1465          37          3.5         288.1       0.3X
+java.sql.Date to Hive string                       18745          20895        1364          0.3        3749.0       0.0X
+java.time.LocalDate to Hive string                 15296          15450         143          0.3        3059.2       0.0X
+java.sql.Timestamp to Hive string                  46421          47210         946          0.1        9284.2       0.0X
+java.time.Instant to Hive string                   34747          35187         382          0.1        6949.4       0.0X
 
 
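A note on reading these tables: the columns in Spark's generated benchmark files are Best Time (ms), Avg Time (ms), Stdev (ms), Rate (M rows/s), Per Row (ns), and a Relative speed against the table's first case. The rate and per-row columns follow arithmetically from the best time and the row count. A quick sanity check in Scala, assuming the 5,000,000-row count implied by the existing rows (e.g. 1300 ms best time at 260.0 ns per row); the row count is an inference from the table, not stated in the diff:

```scala
// Back-of-the-envelope check of the derived columns, using the
// "java.sql.Date to Hive string" row from the JDK 8 table above.
// The 5,000,000-row count is an assumption inferred from the table.
object BenchmarkColumnCheck {
  def main(args: Array[String]): Unit = {
    val rows = 5000000L
    val bestTimeMs = 18745.0
    val perRowNs = bestTimeMs * 1e6 / rows           // 3749.0 ns, matches Per Row(ns)
    val rateMPerS = rows / (bestTimeMs / 1000) / 1e6 // ~0.27 M/s, printed rounded as 0.3
    println(f"Per Row(ns) = $perRowNs%.1f, Rate(M/s) = $rateMPerS%.2f")
  }
}
```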

sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala

Lines changed: 38 additions & 8 deletions
@@ -21,8 +21,10 @@ import java.sql.{Date, Timestamp}
 import java.time.{Instant, LocalDate}
 
 import org.apache.spark.benchmark.Benchmark
+import org.apache.spark.sql.Dataset
 import org.apache.spark.sql.catalyst.util.DateTimeConstants.MILLIS_PER_DAY
 import org.apache.spark.sql.catalyst.util.DateTimeTestUtils.{withDefaultTimeZone, LA}
+import org.apache.spark.sql.execution.HiveResult
 import org.apache.spark.sql.internal.SQLConf
 
 /**
@@ -182,14 +184,19 @@ object DateTimeBenchmark extends SqlBasedBenchmark {
       benchmark.addCase("From java.time.LocalDate", numIters) { _ =>
         spark.range(rowsNum).map(millis => LocalDate.ofEpochDay(millis / MILLIS_PER_DAY)).noop()
       }
+      def dates = {
+        spark.range(0, rowsNum, 1, 1).map(millis => new Date(millis))
+      }
       benchmark.addCase("Collect java.sql.Date", numIters) { _ =>
-        spark.range(0, rowsNum, 1, 1).map(millis => new Date(millis)).collect()
+        dates.collect()
+      }
+      def localDates = {
+        spark.range(0, rowsNum, 1, 1)
+          .map(millis => LocalDate.ofEpochDay(millis / MILLIS_PER_DAY))
       }
       benchmark.addCase("Collect java.time.LocalDate", numIters) { _ =>
         withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true") {
-          spark.range(0, rowsNum, 1, 1)
-            .map(millis => LocalDate.ofEpochDay(millis / MILLIS_PER_DAY))
-            .collect()
+          localDates.collect()
         }
       }
       benchmark.addCase("From java.sql.Timestamp", numIters) { _ =>
@@ -202,14 +209,37 @@ object DateTimeBenchmark extends SqlBasedBenchmark {
         spark.range(0, rowsNum, 1, 1)
           .collect()
       }
+      def timestamps = {
+        spark.range(0, rowsNum, 1, 1).map(millis => new Timestamp(millis))
+      }
       benchmark.addCase("Collect java.sql.Timestamp", numIters) { _ =>
-        spark.range(0, rowsNum, 1, 1).map(millis => new Timestamp(millis)).collect()
+        timestamps.collect()
+      }
+      def instants = {
+        spark.range(0, rowsNum, 1, 1).map(millis => Instant.ofEpochMilli(millis))
       }
       benchmark.addCase("Collect java.time.Instant", numIters) { _ =>
         withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true") {
-          spark.range(0, rowsNum, 1, 1)
-            .map(millis => Instant.ofEpochMilli(millis))
-            .collect()
+          instants.collect()
+        }
+      }
+      def toHiveString(df: Dataset[_]): Unit = {
+        HiveResult.hiveResultString(df.queryExecution.executedPlan)
+      }
+      benchmark.addCase("java.sql.Date to Hive string", numIters) { _ =>
+        toHiveString(dates)
+      }
+      benchmark.addCase("java.time.LocalDate to Hive string", numIters) { _ =>
+        withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true") {
+          toHiveString(localDates)
+        }
+      }
+      benchmark.addCase("java.sql.Timestamp to Hive string", numIters) { _ =>
+        toHiveString(timestamps)
+      }
+      benchmark.addCase("java.time.Instant to Hive string", numIters) { _ =>
+        withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true") {
+          toHiveString(instants)
         }
       }
       benchmark.run()
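For orientation, the cases above all follow the `org.apache.spark.benchmark.Benchmark` pattern: each `addCase` registers a labeled closure, and `run()` warms up, times every case, and prints the table captured in the results files above. A minimal, self-contained sketch of that pattern (the object name and stand-in workload are illustrative, not part of the commit):

```scala
import org.apache.spark.benchmark.Benchmark

// Minimal sketch of the addCase/run pattern used by DateTimeBenchmark.
object MiniBenchmarkSketch {
  def main(args: Array[String]): Unit = {
    val cardinality = 5000000L
    val numIters = 3
    val benchmark = new Benchmark("To Hive string (sketch)", cardinality, minNumIters = numIters)

    benchmark.addCase("format java.sql.Date", numIters) { _ =>
      var i = 0L
      while (i < cardinality) {
        // Per-row string formatting: the kind of work toHiveString performs.
        new java.sql.Date(i).toString
        i += 1
      }
    }
    benchmark.run() // prints the Best/Avg/Stdev/Rate/Per Row/Relative columns
  }
}
```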
