From e217139ee63c7755c6630354847e8c5b3d447aa7 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 27 Mar 2020 23:51:19 +0300 Subject: [PATCH 01/13] Add benchmarks for rebasing date-time in parquet --- .../benchmark/DateTimeRebaseBenchmark.scala | 172 ++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala new file mode 100644 index 000000000000..0c14cdb9044e --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.benchmark + +import java.time.{LocalDateTime, ZoneOffset} + +import org.apache.spark.benchmark.Benchmark +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.internal.SQLConf + +/** + * Synthetic benchmark for rebasing of date and timestamp in read/write. + * To run this benchmark: + * {{{ + * 1. 
without sbt: + * bin/spark-submit --class <this class> --jars <spark core test jar> <sql core test jar> + * 2. build/sbt "sql/test:runMain <this class>" + * 3. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>" + * Results will be written to "benchmarks/DateTimeRebaseBenchmark-results.txt". + * }}} + */ +object DateTimeRebaseBenchmark extends SqlBasedBenchmark { + import spark.implicits._ + + private def genTs(cardinality: Int, start: LocalDateTime, end: LocalDateTime): DataFrame = { + val startSec = start.toEpochSecond(ZoneOffset.UTC) + val endSec = end.toEpochSecond(ZoneOffset.UTC) + spark.range(0, cardinality, 1, 1) + .select((($"id" % (endSec - startSec)) + startSec).as("seconds")) + .select($"seconds".cast("timestamp").as("ts")) + } + + private def genTsAfter1582(cardinality: Int): DataFrame = { + val start = LocalDateTime.of(1582, 10, 15, 0, 0, 0) + val end = LocalDateTime.of(3000, 1, 1, 0, 0, 0) + genTs(cardinality, start, end) + } + + private def genTsBefore1582(cardinality: Int): DataFrame = { + val start = LocalDateTime.of(10, 1, 1, 0, 0, 0) + val end = LocalDateTime.of(1580, 1, 1, 0, 0, 0) + genTs(cardinality, start, end) + + } + + private def save(df: DataFrame, path: String, format: String = "parquet"): Unit = { + df.write.mode("overwrite").format(format).save(path) + } + + private def load(path: String, format: String = "parquet"): Unit = { + spark.read.format(format).load(path).noop() + } + + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + withTempPath { path => + runBenchmark("Parquet read/write") { + val rowsNum = 100000000 + var numIters = 1 + var benchmark = new Benchmark("Save timestamps to parquet", rowsNum, output = output) + benchmark.addCase("after 1582, noop", numIters) { _ => + genTsAfter1582(rowsNum).noop() + } + val ts_after_1582_off = path.getAbsolutePath + "/ts_after_1582_off" + benchmark.addCase("after 1582, rebase off", numIters) { _ => + withSQLConf(SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> "false") { + save(genTsAfter1582(rowsNum), 
ts_after_1582_off) + } + } + val ts_after_1582_on = path.getAbsolutePath + "/ts_after_1582_on" + benchmark.addCase("after 1582, rebase on", numIters) { _ => + withSQLConf(SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> "true") { + save(genTsAfter1582(rowsNum), ts_after_1582_on) + } + } + benchmark.addCase("before 1582, noop", numIters) { _ => + genTsBefore1582(rowsNum).noop() + } + val ts_before_1582_off = path.getAbsolutePath + "/ts_before_1582_off" + benchmark.addCase("before 1582, rebase off", numIters) { _ => + withSQLConf(SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> "false") { + save(genTsBefore1582(rowsNum), ts_before_1582_off) + } + } + val ts_before_1582_on = path.getAbsolutePath + "/ts_before_1582_on" + benchmark.addCase("before 1582, rebase on", numIters) { _ => + withSQLConf(SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> "true") { + save(genTsBefore1582(rowsNum), ts_before_1582_on) + } + } + benchmark.run() + + numIters = 3 + benchmark = new Benchmark("Load timestamps from parquet", rowsNum, output = output) + benchmark.addCase("after 1582, vec off, rebase off", numIters) { _ => + withSQLConf( + SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false", + SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> "false") { + load(ts_after_1582_off) + } + } + benchmark.addCase("after 1582, vec off, rebase on", numIters) { _ => + withSQLConf( + SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false", + SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> "true") { + load(ts_after_1582_on) + } + } + benchmark.addCase("after 1582, vec on, rebase off", numIters) { _ => + withSQLConf( + SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true", + SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> "false") { + load(ts_after_1582_off) + } + } + benchmark.addCase("after 1582, vec on, rebase on", numIters) { _ => + withSQLConf( + SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true", + SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> "true") { + load(ts_after_1582_on) + } + } + + 
benchmark.addCase("before 1582, vec off, rebase off", numIters) { _ => + withSQLConf( + SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false", + SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> "false") { + load(ts_before_1582_off) + } + } + benchmark.addCase("before 1582, vec off, rebase on", numIters) { _ => + withSQLConf( + SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false", + SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> "true") { + load(ts_before_1582_on) + } + } + benchmark.addCase("before 1582, vec on, rebase off", numIters) { _ => + withSQLConf( + SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true", + SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> "false") { + load(ts_before_1582_off) + } + } + benchmark.addCase("after 1582, vec on, rebase on", numIters) { _ => + withSQLConf( + SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true", + SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> "true") { + load(ts_before_1582_on) + } + } + + benchmark.run() + } + } + } +} From 24476a3e3ebb6091d25d7e4606d395b93a7742d7 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sat, 28 Mar 2020 00:03:59 +0300 Subject: [PATCH 02/13] Minor change: `after` -> `before` --- .../spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala index 0c14cdb9044e..471daad5dfeb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala @@ -157,7 +157,7 @@ object DateTimeRebaseBenchmark extends SqlBasedBenchmark { load(ts_before_1582_off) } } - benchmark.addCase("after 1582, vec on, rebase on", numIters) { _ => + benchmark.addCase("before 1582, vec on, rebase on", numIters) { _ => 
withSQLConf( SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true", SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> "true") { From f3c30b7de2abdadcb14bd9d878700f07a4e59566 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sat, 28 Mar 2020 19:26:37 +0300 Subject: [PATCH 03/13] Benchmarks for dates --- .../benchmark/DateTimeRebaseBenchmark.scala | 159 ++++++++---------- 1 file changed, 67 insertions(+), 92 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala index 471daad5dfeb..cf52543cc3b0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution.benchmark -import java.time.{LocalDateTime, ZoneOffset} +import java.time.{LocalDate, LocalDateTime, ZoneOffset} import org.apache.spark.benchmark.Benchmark import org.apache.spark.sql.DataFrame @@ -59,113 +59,88 @@ object DateTimeRebaseBenchmark extends SqlBasedBenchmark { } - private def save(df: DataFrame, path: String, format: String = "parquet"): Unit = { - df.write.mode("overwrite").format(format).save(path) + private def genDateAfter1582(cardinality: Int): DataFrame = { + genTsAfter1582(cardinality).select($"ts".cast("date").as("date")) } - private def load(path: String, format: String = "parquet"): Unit = { - spark.read.format(format).load(path).noop() + private def genDateBefore1582(cardinality: Int): DataFrame = { + genTsBefore1582(cardinality).select($"ts".cast("date").as("date")) + } + + private def genDF(cardinality: Int, dateTime: String, after1582: Boolean): DataFrame = { + (dateTime, after1582) match { + case ("date", true) => genDateAfter1582(cardinality) + case ("date", false) => genDateBefore1582(cardinality) + case ("timestamp", true) => 
genTsAfter1582(cardinality) + case ("timestamp", false) => genTsBefore1582(cardinality) + case _ => throw new IllegalArgumentException( + s"cardinality = $cardinality dateTime = $dateTime after1582 = $after1582") + } } override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { withTempPath { path => runBenchmark("Parquet read/write") { val rowsNum = 100000000 - var numIters = 1 - var benchmark = new Benchmark("Save timestamps to parquet", rowsNum, output = output) - benchmark.addCase("after 1582, noop", numIters) { _ => - genTsAfter1582(rowsNum).noop() - } - val ts_after_1582_off = path.getAbsolutePath + "/ts_after_1582_off" - benchmark.addCase("after 1582, rebase off", numIters) { _ => - withSQLConf(SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> "false") { - save(genTsAfter1582(rowsNum), ts_after_1582_off) - } - } - val ts_after_1582_on = path.getAbsolutePath + "/ts_after_1582_on" - benchmark.addCase("after 1582, rebase on", numIters) { _ => - withSQLConf(SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> "true") { - save(genTsAfter1582(rowsNum), ts_after_1582_on) - } - } - benchmark.addCase("before 1582, noop", numIters) { _ => - genTsBefore1582(rowsNum).noop() - } - val ts_before_1582_off = path.getAbsolutePath + "/ts_before_1582_off" - benchmark.addCase("before 1582, rebase off", numIters) { _ => - withSQLConf(SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> "false") { - save(genTsBefore1582(rowsNum), ts_before_1582_off) - } - } - val ts_before_1582_on = path.getAbsolutePath + "/ts_before_1582_on" - benchmark.addCase("before 1582, rebase on", numIters) { _ => - withSQLConf(SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> "true") { - save(genTsBefore1582(rowsNum), ts_before_1582_on) - } - } - benchmark.run() - - numIters = 3 - benchmark = new Benchmark("Load timestamps from parquet", rowsNum, output = output) - benchmark.addCase("after 1582, vec off, rebase off", numIters) { _ => - withSQLConf( - SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false", - 
SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> "false") { - load(ts_after_1582_off) - } - } - benchmark.addCase("after 1582, vec off, rebase on", numIters) { _ => - withSQLConf( - SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false", - SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> "true") { - load(ts_after_1582_on) - } - } - benchmark.addCase("after 1582, vec on, rebase off", numIters) { _ => - withSQLConf( - SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true", - SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> "false") { - load(ts_after_1582_off) + Seq("date", "timestamp").foreach { dateTime => + val benchmark = new Benchmark(s"Save ${dateTime}s to parquet", rowsNum, output = output) + benchmark.addCase("after 1582, noop", 1) { _ => + genDF(rowsNum, dateTime, after1582 = true).noop() } - } - benchmark.addCase("after 1582, vec on, rebase on", numIters) { _ => - withSQLConf( - SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true", - SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> "true") { - load(ts_after_1582_on) + benchmark.addCase("before 1582, noop", 1) { _ => + genDF(rowsNum, dateTime, after1582 = false).noop() } - } - benchmark.addCase("before 1582, vec off, rebase off", numIters) { _ => - withSQLConf( - SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false", - SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> "false") { - load(ts_before_1582_off) + def save(after1582: Boolean, rebase: Boolean): Unit = { + val period = if (after1582) "after" else "before" + val rebaseFlag = if (rebase) "on" else "off" + val caseName = s"$period 1582, rebase $rebaseFlag" + benchmark.addCase(caseName, 1) { _ => + withSQLConf(SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> rebase.toString) { + val df = genDF(rowsNum, dateTime, after1582) + val pathToWrite = path.getAbsolutePath + s"/${dateTime}_${period}_1582_$rebaseFlag" + df.write + .mode("overwrite") + .format("parquet") + .save(pathToWrite) + } + } } - } - benchmark.addCase("before 1582, vec off, rebase on", numIters) { _ => 
- withSQLConf( - SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false", - SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> "true") { - load(ts_before_1582_on) + + Seq(true, false).foreach { after1582 => + Seq(false, true).foreach { rebase => + save(after1582, rebase) + } } - } - benchmark.addCase("before 1582, vec on, rebase off", numIters) { _ => - withSQLConf( - SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true", - SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> "false") { - load(ts_before_1582_off) + benchmark.run() + + val benchmark2 = new Benchmark(s"Load $dateTime from parquet", rowsNum, output = output) + + def load(after1582: Boolean, vec: Boolean, rebase: Boolean): Unit = { + val period = if (after1582) "after" else "before" + val rebaseFlag = if (rebase) "on" else "off" + val vecFlag = if (vec) "on" else "off" + val caseName = s"$period 1582, vec $vecFlag, rebase $rebaseFlag" + benchmark2.addCase(caseName, 3) { _ => + withSQLConf( + SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vec.toString, + SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> rebase.toString) { + val pathToRead = path.getAbsolutePath + s"/${dateTime}_${period}_1582_$rebaseFlag" + spark.read.format("parquet").load(pathToRead).noop() + } + } } - } - benchmark.addCase("before 1582, vec on, rebase on", numIters) { _ => - withSQLConf( - SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true", - SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key -> "true") { - load(ts_before_1582_on) + + Seq(true, false).foreach { after1582 => + Seq(false, true).foreach { vec => + Seq(false, true).foreach { rebase => + load(after1582, vec, rebase) + } + } } - } - benchmark.run() + benchmark2.run() + } } } } From 0734e469b35dbf1a20d0b89f3afed4be0e8b78a7 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sat, 28 Mar 2020 19:33:19 +0300 Subject: [PATCH 04/13] Remove LocalDate --- .../spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala index cf52543cc3b0..fb7f6557aeba 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution.benchmark -import java.time.{LocalDate, LocalDateTime, ZoneOffset} +import java.time.{LocalDateTime, ZoneOffset} import org.apache.spark.benchmark.Benchmark import org.apache.spark.sql.DataFrame From 417001b0e372f86c4ac74574eecb9d585e53ecbb Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sat, 28 Mar 2020 10:15:38 -0700 Subject: [PATCH 05/13] Re-gen results of DateTimeRebaseBenchmark on JDK 8 --- .../DateTimeRebaseBenchmark-results.txt | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt diff --git a/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt b/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt new file mode 100644 index 000000000000..b9a07e24d6ae --- /dev/null +++ b/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt @@ -0,0 +1,53 @@ +================================================================================================ +Parquet read/write +================================================================================================ + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.3 +Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz +Save dates to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +after 1582, noop 8873 8873 0 11.3 88.7 1.0X +before 1582, noop 8543 8543 0 11.7 85.4 1.0X +after 1582, rebase off 
20145 20145 0 5.0 201.4 0.4X +after 1582, rebase on 51934 51934 0 1.9 519.3 0.2X +before 1582, rebase off 20253 20253 0 4.9 202.5 0.4X +before 1582, rebase on 58564 58564 0 1.7 585.6 0.2X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.3 +Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz +Load date from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +after 1582, vec off, rebase off 9260 9308 48 10.8 92.6 1.0X +after 1582, vec off, rebase on 27658 28050 348 3.6 276.6 0.3X +after 1582, vec on, rebase off 2689 2799 115 37.2 26.9 3.4X +after 1582, vec on, rebase on 2558 2564 6 39.1 25.6 3.6X +before 1582, vec off, rebase off 8990 9009 20 11.1 89.9 1.0X +before 1582, vec off, rebase on 29806 30053 268 3.4 298.1 0.3X +before 1582, vec on, rebase off 2550 2567 15 39.2 25.5 3.6X +before 1582, vec on, rebase on 2578 2586 13 38.8 25.8 3.6X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.3 +Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz +Save timestamps to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +after 1582, noop 2099 2099 0 47.6 21.0 1.0X +before 1582, noop 2080 2080 0 48.1 20.8 1.0X +after 1582, rebase off 12626 12626 0 7.9 126.3 0.2X +after 1582, rebase on 76490 76490 0 1.3 764.9 0.0X +before 1582, rebase off 12548 12548 0 8.0 125.5 0.2X +before 1582, rebase on 81903 81903 0 1.2 819.0 0.0X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.3 +Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz +Load timestamp from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +after 1582, vec off, 
rebase off 12780 12890 126 7.8 127.8 1.0X +after 1582, vec off, rebase on 53386 53418 35 1.9 533.9 0.2X +after 1582, vec on, rebase off 4000 4055 84 25.0 40.0 3.2X +after 1582, vec on, rebase on 45193 45277 125 2.2 451.9 0.3X +before 1582, vec off, rebase off 12806 12913 101 7.8 128.1 1.0X +before 1582, vec off, rebase on 54768 54976 355 1.8 547.7 0.2X +before 1582, vec on, rebase off 4017 4300 294 24.9 40.2 3.2X +before 1582, vec on, rebase on 46892 47118 200 2.1 468.9 0.3X + + From 50247e53e297349c99b04216733451db557425a3 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sat, 28 Mar 2020 11:04:06 -0700 Subject: [PATCH 06/13] Re-gen results of DateTimeRebaseBenchmark on JDK 11 --- .../DateTimeRebaseBenchmark-jdk11-results.txt | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt diff --git a/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt b/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt new file mode 100644 index 000000000000..7c6c844f32c3 --- /dev/null +++ b/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt @@ -0,0 +1,53 @@ +================================================================================================ +Parquet read/write +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.5+10 on Mac OS X 10.15.3 +Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz +Save dates to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +after 1582, noop 9182 9182 0 10.9 91.8 1.0X +before 1582, noop 8665 8665 0 11.5 86.7 1.1X +after 1582, rebase off 19332 19332 0 5.2 193.3 0.5X +after 1582, rebase on 53228 53228 0 1.9 532.3 0.2X +before 1582, rebase off 19840 19840 0 5.0 198.4 0.5X +before 1582, rebase on 59134 59134 0 1.7 591.3 0.2X 
+ +OpenJDK 64-Bit Server VM 11.0.5+10 on Mac OS X 10.15.3 +Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz +Load date from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +after 1582, vec off, rebase off 11272 11324 45 8.9 112.7 1.0X +after 1582, vec off, rebase on 29902 29974 110 3.3 299.0 0.4X +after 1582, vec on, rebase off 2867 2872 5 34.9 28.7 3.9X +after 1582, vec on, rebase on 2671 2687 18 37.4 26.7 4.2X +before 1582, vec off, rebase off 11487 11749 333 8.7 114.9 1.0X +before 1582, vec off, rebase on 31816 32090 250 3.1 318.2 0.4X +before 1582, vec on, rebase off 2664 2667 2 37.5 26.6 4.2X +before 1582, vec on, rebase on 2670 2712 66 37.4 26.7 4.2X + +OpenJDK 64-Bit Server VM 11.0.5+10 on Mac OS X 10.15.3 +Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz +Save timestamps to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +after 1582, noop 2171 2171 0 46.1 21.7 1.0X +before 1582, noop 2126 2126 0 47.0 21.3 1.0X +after 1582, rebase off 13745 13745 0 7.3 137.4 0.2X +after 1582, rebase on 65073 65073 0 1.5 650.7 0.0X +before 1582, rebase off 13875 13875 0 7.2 138.7 0.2X +before 1582, rebase on 71456 71456 0 1.4 714.6 0.0X + +OpenJDK 64-Bit Server VM 11.0.5+10 on Mac OS X 10.15.3 +Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz +Load timestamp from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +after 1582, vec off, rebase off 14472 14544 65 6.9 144.7 1.0X +after 1582, vec off, rebase on 46600 47025 623 2.1 466.0 0.3X +after 1582, vec on, rebase off 3927 3977 47 25.5 39.3 3.7X +after 1582, vec on, rebase on 35369 35493 108 2.8 353.7 
0.4X +before 1582, vec off, rebase off 14604 14664 68 6.8 146.0 1.0X +before 1582, vec off, rebase on 48112 48222 127 2.1 481.1 0.3X +before 1582, vec on, rebase off 3894 3997 91 25.7 38.9 3.7X +before 1582, vec on, rebase on 36938 37024 130 2.7 369.4 0.4X + + From 27a75a6856b9b4a16bec2a0e9d943536bf2b913a Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sat, 28 Mar 2020 21:06:56 +0300 Subject: [PATCH 07/13] Minor change of load bench name --- .../sql/execution/benchmark/DateTimeRebaseBenchmark.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala index fb7f6557aeba..01b49cc3da71 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala @@ -114,7 +114,8 @@ object DateTimeRebaseBenchmark extends SqlBasedBenchmark { } benchmark.run() - val benchmark2 = new Benchmark(s"Load $dateTime from parquet", rowsNum, output = output) + val benchmark2 = new Benchmark( + s"Load ${dateTime}s from parquet", rowsNum, output = output) def load(after1582: Boolean, vec: Boolean, rebase: Boolean): Unit = { val period = if (after1582) "after" else "before" From 69bb4ed39e55ebe11f230205ff070ab9e4e17410 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sat, 28 Mar 2020 21:47:27 +0300 Subject: [PATCH 08/13] Fix date generation --- .../benchmark/DateTimeRebaseBenchmark.scala | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala index 01b49cc3da71..bf2e4adfcb79 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala @@ -17,10 +17,11 @@ package org.apache.spark.sql.execution.benchmark -import java.time.{LocalDateTime, ZoneOffset} +import java.time.{LocalDate, LocalDateTime, LocalTime, ZoneOffset} import org.apache.spark.benchmark.Benchmark import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.catalyst.util.DateTimeConstants.SECONDS_PER_DAY import org.apache.spark.sql.internal.SQLConf /** @@ -56,15 +57,27 @@ object DateTimeRebaseBenchmark extends SqlBasedBenchmark { val start = LocalDateTime.of(10, 1, 1, 0, 0, 0) val end = LocalDateTime.of(1580, 1, 1, 0, 0, 0) genTs(cardinality, start, end) + } + private def genDate(cardinality: Int, start: LocalDate, end: LocalDate): DataFrame = { + val startSec = LocalDateTime.of(start, LocalTime.MIDNIGHT).toEpochSecond(ZoneOffset.UTC) + val endSec = LocalDateTime.of(end, LocalTime.MIDNIGHT).toEpochSecond(ZoneOffset.UTC) + spark.range(0, cardinality * SECONDS_PER_DAY, SECONDS_PER_DAY, 1) + .select((($"id" % (endSec - startSec)) + startSec).as("seconds")) + .select($"seconds".cast("timestamp").as("ts")) + .select($"ts".cast("date").as("date")) } private def genDateAfter1582(cardinality: Int): DataFrame = { - genTsAfter1582(cardinality).select($"ts".cast("date").as("date")) + val start = LocalDate.of(1582, 10, 15) + val end = LocalDate.of(3000, 1, 1) + genDate(cardinality, start, end) } private def genDateBefore1582(cardinality: Int): DataFrame = { - genTsBefore1582(cardinality).select($"ts".cast("date").as("date")) + val start = LocalDate.of(10, 1, 1) + val end = LocalDate.of(1580, 1, 1) + genDate(cardinality, start, end) } private def genDF(cardinality: Int, dateTime: String, after1582: Boolean): DataFrame = { From 10382e2039b3e110d1385748ea0aeb4126c3764a Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sat, 28 Mar 2020 12:34:14 -0700 
Subject: [PATCH 09/13] Re-gen results of DateTimeRebaseBenchmark on JDK 11 --- .../DateTimeRebaseBenchmark-jdk11-results.txt | 60 +++++++++---------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt b/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt index 7c6c844f32c3..b239c8c6dabf 100644 --- a/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt @@ -6,48 +6,48 @@ OpenJDK 64-Bit Server VM 11.0.5+10 on Mac OS X 10.15.3 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz Save dates to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 9182 9182 0 10.9 91.8 1.0X -before 1582, noop 8665 8665 0 11.5 86.7 1.1X -after 1582, rebase off 19332 19332 0 5.2 193.3 0.5X -after 1582, rebase on 53228 53228 0 1.9 532.3 0.2X -before 1582, rebase off 19840 19840 0 5.0 198.4 0.5X -before 1582, rebase on 59134 59134 0 1.7 591.3 0.2X +after 1582, noop 15986 15986 0 6.3 159.9 1.0X +before 1582, noop 8463 8463 0 11.8 84.6 1.9X +after 1582, rebase off 25404 25404 0 3.9 254.0 0.6X +after 1582, rebase on 54458 54458 0 1.8 544.6 0.3X +before 1582, rebase off 17023 17023 0 5.9 170.2 0.9X +before 1582, rebase on 54942 54942 0 1.8 549.4 0.3X OpenJDK 64-Bit Server VM 11.0.5+10 on Mac OS X 10.15.3 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz -Load date from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +Load dates from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase off 11272 11324 45 8.9 112.7 1.0X -after 1582, vec off, rebase on 29902 29974 110 3.3 299.0 0.4X -after 1582, vec 
on, rebase off 2867 2872 5 34.9 28.7 3.9X -after 1582, vec on, rebase on 2671 2687 18 37.4 26.7 4.2X -before 1582, vec off, rebase off 11487 11749 333 8.7 114.9 1.0X -before 1582, vec off, rebase on 31816 32090 250 3.1 318.2 0.4X -before 1582, vec on, rebase off 2664 2667 2 37.5 26.6 4.2X -before 1582, vec on, rebase on 2670 2712 66 37.4 26.7 4.2X +after 1582, vec off, rebase off 10840 11008 151 9.2 108.4 1.0X +after 1582, vec off, rebase on 27635 27820 234 3.6 276.4 0.4X +after 1582, vec on, rebase off 2939 2981 51 34.0 29.4 3.7X +after 1582, vec on, rebase on 20195 20206 9 5.0 202.0 0.5X +before 1582, vec off, rebase off 10571 10641 82 9.5 105.7 1.0X +before 1582, vec off, rebase on 30889 30975 116 3.2 308.9 0.4X +before 1582, vec on, rebase off 2848 2916 83 35.1 28.5 3.8X +before 1582, vec on, rebase on 22721 22770 73 4.4 227.2 0.5X OpenJDK 64-Bit Server VM 11.0.5+10 on Mac OS X 10.15.3 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz Save timestamps to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 2171 2171 0 46.1 21.7 1.0X -before 1582, noop 2126 2126 0 47.0 21.3 1.0X -after 1582, rebase off 13745 13745 0 7.3 137.4 0.2X -after 1582, rebase on 65073 65073 0 1.5 650.7 0.0X -before 1582, rebase off 13875 13875 0 7.2 138.7 0.2X -before 1582, rebase on 71456 71456 0 1.4 714.6 0.0X +after 1582, noop 2341 2341 0 42.7 23.4 1.0X +before 1582, noop 2327 2327 0 43.0 23.3 1.0X +after 1582, rebase off 12859 12859 0 7.8 128.6 0.2X +after 1582, rebase on 64691 64691 0 1.5 646.9 0.0X +before 1582, rebase off 13093 13093 0 7.6 130.9 0.2X +before 1582, rebase on 71913 71913 0 1.4 719.1 0.0X OpenJDK 64-Bit Server VM 11.0.5+10 on Mac OS X 10.15.3 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz -Load timestamp from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +Load timestamps from parquet: Best 
Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase off 14472 14544 65 6.9 144.7 1.0X -after 1582, vec off, rebase on 46600 47025 623 2.1 466.0 0.3X -after 1582, vec on, rebase off 3927 3977 47 25.5 39.3 3.7X -after 1582, vec on, rebase on 35369 35493 108 2.8 353.7 0.4X -before 1582, vec off, rebase off 14604 14664 68 6.8 146.0 1.0X -before 1582, vec off, rebase on 48112 48222 127 2.1 481.1 0.3X -before 1582, vec on, rebase off 3894 3997 91 25.7 38.9 3.7X -before 1582, vec on, rebase on 36938 37024 130 2.7 369.4 0.4X +after 1582, vec off, rebase off 12307 12547 224 8.1 123.1 1.0X +after 1582, vec off, rebase on 42650 43006 433 2.3 426.5 0.3X +after 1582, vec on, rebase off 3822 3865 65 26.2 38.2 3.2X +after 1582, vec on, rebase on 33704 33749 79 3.0 337.0 0.4X +before 1582, vec off, rebase off 12299 12348 84 8.1 123.0 1.0X +before 1582, vec off, rebase on 44533 44972 662 2.2 445.3 0.3X +before 1582, vec on, rebase off 4245 4519 258 23.6 42.5 2.9X +before 1582, vec on, rebase on 35629 35856 252 2.8 356.3 0.3X From 912dee41526ac5d7ae9dd44a790a961d2f04b54f Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sat, 28 Mar 2020 13:37:25 -0700 Subject: [PATCH 10/13] Re-gen results of DateTimeRebaseBenchmark on JDK 8 --- .../DateTimeRebaseBenchmark-results.txt | 60 +++++++++---------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt b/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt index b9a07e24d6ae..24b89339cdee 100644 --- a/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt +++ b/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt @@ -6,48 +6,48 @@ Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.3 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz Save dates to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per 
Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 8873 8873 0 11.3 88.7 1.0X -before 1582, noop 8543 8543 0 11.7 85.4 1.0X -after 1582, rebase off 20145 20145 0 5.0 201.4 0.4X -after 1582, rebase on 51934 51934 0 1.9 519.3 0.2X -before 1582, rebase off 20253 20253 0 4.9 202.5 0.4X -before 1582, rebase on 58564 58564 0 1.7 585.6 0.2X +after 1582, noop 19105 19105 0 5.2 191.0 1.0X +before 1582, noop 8289 8289 0 12.1 82.9 2.3X +after 1582, rebase off 29998 29998 0 3.3 300.0 0.6X +after 1582, rebase on 61779 61779 0 1.6 617.8 0.3X +before 1582, rebase off 18777 18777 0 5.3 187.8 1.0X +before 1582, rebase on 58391 58391 0 1.7 583.9 0.3X Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.3 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz -Load date from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +Load dates from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase off 9260 9308 48 10.8 92.6 1.0X -after 1582, vec off, rebase on 27658 28050 348 3.6 276.6 0.3X -after 1582, vec on, rebase off 2689 2799 115 37.2 26.9 3.4X -after 1582, vec on, rebase on 2558 2564 6 39.1 25.6 3.6X -before 1582, vec off, rebase off 8990 9009 20 11.1 89.9 1.0X -before 1582, vec off, rebase on 29806 30053 268 3.4 298.1 0.3X -before 1582, vec on, rebase off 2550 2567 15 39.2 25.5 3.6X -before 1582, vec on, rebase on 2578 2586 13 38.8 25.8 3.6X +after 1582, vec off, rebase off 11302 11569 328 8.8 113.0 1.0X +after 1582, vec off, rebase on 31014 31140 198 3.2 310.1 0.4X +after 1582, vec on, rebase off 3187 3357 261 31.4 31.9 3.5X +after 1582, vec on, rebase on 22647 22781 115 4.4 226.5 0.5X +before 1582, vec off, rebase off 11099 11209 96 9.0 111.0 1.0X +before 1582, vec off, rebase 
on 33300 33428 119 3.0 333.0 0.3X +before 1582, vec on, rebase off 3255 3366 131 30.7 32.6 3.5X +before 1582, vec on, rebase on 24548 24993 386 4.1 245.5 0.5X Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.3 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz Save timestamps to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 2099 2099 0 47.6 21.0 1.0X -before 1582, noop 2080 2080 0 48.1 20.8 1.0X -after 1582, rebase off 12626 12626 0 7.9 126.3 0.2X -after 1582, rebase on 76490 76490 0 1.3 764.9 0.0X -before 1582, rebase off 12548 12548 0 8.0 125.5 0.2X -before 1582, rebase on 81903 81903 0 1.2 819.0 0.0X +after 1582, noop 2094 2094 0 47.8 20.9 1.0X +before 1582, noop 2073 2073 0 48.2 20.7 1.0X +after 1582, rebase off 13400 13400 0 7.5 134.0 0.2X +after 1582, rebase on 77886 77886 0 1.3 778.9 0.0X +before 1582, rebase off 13378 13378 0 7.5 133.8 0.2X +before 1582, rebase on 90746 90746 0 1.1 907.5 0.0X Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.3 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz -Load timestamp from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +Load timestamps from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase off 12780 12890 126 7.8 127.8 1.0X -after 1582, vec off, rebase on 53386 53418 35 1.9 533.9 0.2X -after 1582, vec on, rebase off 4000 4055 84 25.0 40.0 3.2X -after 1582, vec on, rebase on 45193 45277 125 2.2 451.9 0.3X -before 1582, vec off, rebase off 12806 12913 101 7.8 128.1 1.0X -before 1582, vec off, rebase on 54768 54976 355 1.8 547.7 0.2X -before 1582, vec on, rebase off 4017 4300 294 24.9 40.2 3.2X -before 1582, vec on, rebase on 46892 
47118 200 2.1 468.9 0.3X +after 1582, vec off, rebase off 13037 13180 186 7.7 130.4 1.0X +after 1582, vec off, rebase on 53351 53398 41 1.9 533.5 0.2X +after 1582, vec on, rebase off 4027 4076 49 24.8 40.3 3.2X +after 1582, vec on, rebase on 46889 47083 326 2.1 468.9 0.3X +before 1582, vec off, rebase off 13720 13913 219 7.3 137.2 1.0X +before 1582, vec off, rebase on 57410 57779 320 1.7 574.1 0.2X +before 1582, vec on, rebase off 4045 4183 146 24.7 40.4 3.2X +before 1582, vec on, rebase on 49008 49458 397 2.0 490.1 0.3X From c2cc3859db4d2e149378dca97258428adbfb4d47 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Sun, 29 Mar 2020 07:12:48 +0000 Subject: [PATCH 11/13] Regen benchmark results on Linux JDK 8 --- .../DateTimeRebaseBenchmark-results.txt | 72 +++++++++---------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt b/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt index 24b89339cdee..dd0c32c196a7 100644 --- a/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt +++ b/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt @@ -2,52 +2,52 @@ Parquet read/write ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.3 -Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz +OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1058-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save dates to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 19105 19105 0 5.2 191.0 1.0X -before 1582, noop 8289 8289 0 12.1 82.9 2.3X -after 1582, rebase off 29998 29998 0 3.3 300.0 0.6X -after 1582, rebase on 61779 61779 0 1.6 617.8 0.3X -before 1582, rebase off 18777 18777 0 5.3 187.8 1.0X -before 1582, rebase on 58391 
58391 0 1.7 583.9 0.3X +after 1582, noop 9472 9472 0 10.6 94.7 1.0X +before 1582, noop 9226 9226 0 10.8 92.3 1.0X +after 1582, rebase off 21201 21201 0 4.7 212.0 0.4X +after 1582, rebase on 56471 56471 0 1.8 564.7 0.2X +before 1582, rebase off 20179 20179 0 5.0 201.8 0.5X +before 1582, rebase on 65717 65717 0 1.5 657.2 0.1X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.3 -Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz +OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1058-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load dates from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase off 11302 11569 328 8.8 113.0 1.0X -after 1582, vec off, rebase on 31014 31140 198 3.2 310.1 0.4X -after 1582, vec on, rebase off 3187 3357 261 31.4 31.9 3.5X -after 1582, vec on, rebase on 22647 22781 115 4.4 226.5 0.5X -before 1582, vec off, rebase off 11099 11209 96 9.0 111.0 1.0X -before 1582, vec off, rebase on 33300 33428 119 3.0 333.0 0.3X -before 1582, vec on, rebase off 3255 3366 131 30.7 32.6 3.5X -before 1582, vec on, rebase on 24548 24993 386 4.1 245.5 0.5X +after 1582, vec off, rebase off 12294 12434 205 8.1 122.9 1.0X +after 1582, vec off, rebase on 36959 36967 12 2.7 369.6 0.3X +after 1582, vec on, rebase off 3644 3691 49 27.4 36.4 3.4X +after 1582, vec on, rebase on 26764 26852 92 3.7 267.6 0.5X +before 1582, vec off, rebase off 12830 12917 85 7.8 128.3 1.0X +before 1582, vec off, rebase on 38897 39053 229 2.6 389.0 0.3X +before 1582, vec on, rebase off 3638 3693 85 27.5 36.4 3.4X +before 1582, vec on, rebase on 28956 29007 44 3.5 289.6 0.4X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.3 -Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz +OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1058-aws +Intel(R) 
Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save timestamps to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 2094 2094 0 47.8 20.9 1.0X -before 1582, noop 2073 2073 0 48.2 20.7 1.0X -after 1582, rebase off 13400 13400 0 7.5 134.0 0.2X -after 1582, rebase on 77886 77886 0 1.3 778.9 0.0X -before 1582, rebase off 13378 13378 0 7.5 133.8 0.2X -before 1582, rebase on 90746 90746 0 1.1 907.5 0.0X +after 1582, noop 2952 2952 0 33.9 29.5 1.0X +before 1582, noop 2880 2880 0 34.7 28.8 1.0X +after 1582, rebase off 15928 15928 0 6.3 159.3 0.2X +after 1582, rebase on 82816 82816 0 1.2 828.2 0.0X +before 1582, rebase off 15988 15988 0 6.3 159.9 0.2X +before 1582, rebase on 92636 92636 0 1.1 926.4 0.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.3 -Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz +OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1058-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load timestamps from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase off 13037 13180 186 7.7 130.4 1.0X -after 1582, vec off, rebase on 53351 53398 41 1.9 533.5 0.2X -after 1582, vec on, rebase off 4027 4076 49 24.8 40.3 3.2X -after 1582, vec on, rebase on 46889 47083 326 2.1 468.9 0.3X -before 1582, vec off, rebase off 13720 13913 219 7.3 137.2 1.0X -before 1582, vec off, rebase on 57410 57779 320 1.7 574.1 0.2X -before 1582, vec on, rebase off 4045 4183 146 24.7 40.4 3.2X -before 1582, vec on, rebase on 49008 49458 397 2.0 490.1 0.3X +after 1582, vec off, rebase off 14863 14917 78 6.7 148.6 1.0X +after 1582, vec off, rebase on 54819 54939 140 1.8 548.2 0.3X +after 1582, vec on, rebase off 4905 4941 32 20.4 
49.0 3.0X +after 1582, vec on, rebase on 44914 45008 124 2.2 449.1 0.3X +before 1582, vec off, rebase off 14928 14970 48 6.7 149.3 1.0X +before 1582, vec off, rebase on 59752 59996 245 1.7 597.5 0.2X +before 1582, vec on, rebase off 4892 4916 33 20.4 48.9 3.0X +before 1582, vec on, rebase on 46854 46977 198 2.1 468.5 0.3X From c89f2c9a0dd717e4ed12101a05236a2c3bd7252a Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Sun, 29 Mar 2020 08:23:07 +0000 Subject: [PATCH 12/13] Regen benchmark results on Linux JDK 11 --- .../DateTimeRebaseBenchmark-jdk11-results.txt | 72 +++++++++---------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt b/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt index b239c8c6dabf..36ab52606264 100644 --- a/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt @@ -2,52 +2,52 @@ Parquet read/write ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.5+10 on Mac OS X 10.15.3 -Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1058-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save dates to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 15986 15986 0 6.3 159.9 1.0X -before 1582, noop 8463 8463 0 11.8 84.6 1.9X -after 1582, rebase off 25404 25404 0 3.9 254.0 0.6X -after 1582, rebase on 54458 54458 0 1.8 544.6 0.3X -before 1582, rebase off 17023 17023 0 5.9 170.2 0.9X -before 1582, rebase on 54942 54942 0 1.8 549.4 0.3X +after 1582, noop 9272 9272 0 10.8 92.7 1.0X +before 1582, noop 9142 9142 0 10.9 91.4 1.0X +after 1582, rebase off 21841 21841 0 4.6 218.4 0.4X +after 1582, rebase on 
58245 58245 0 1.7 582.4 0.2X +before 1582, rebase off 19813 19813 0 5.0 198.1 0.5X +before 1582, rebase on 63737 63737 0 1.6 637.4 0.1X -OpenJDK 64-Bit Server VM 11.0.5+10 on Mac OS X 10.15.3 -Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1058-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load dates from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase off 10840 11008 151 9.2 108.4 1.0X -after 1582, vec off, rebase on 27635 27820 234 3.6 276.4 0.4X -after 1582, vec on, rebase off 2939 2981 51 34.0 29.4 3.7X -after 1582, vec on, rebase on 20195 20206 9 5.0 202.0 0.5X -before 1582, vec off, rebase off 10571 10641 82 9.5 105.7 1.0X -before 1582, vec off, rebase on 30889 30975 116 3.2 308.9 0.4X -before 1582, vec on, rebase off 2848 2916 83 35.1 28.5 3.8X -before 1582, vec on, rebase on 22721 22770 73 4.4 227.2 0.5X +after 1582, vec off, rebase off 13004 13063 67 7.7 130.0 1.0X +after 1582, vec off, rebase on 36224 36253 26 2.8 362.2 0.4X +after 1582, vec on, rebase off 3596 3654 54 27.8 36.0 3.6X +after 1582, vec on, rebase on 26144 26253 112 3.8 261.4 0.5X +before 1582, vec off, rebase off 12872 12914 51 7.8 128.7 1.0X +before 1582, vec off, rebase on 37762 37904 153 2.6 377.6 0.3X +before 1582, vec on, rebase off 3522 3592 94 28.4 35.2 3.7X +before 1582, vec on, rebase on 27580 27615 59 3.6 275.8 0.5X -OpenJDK 64-Bit Server VM 11.0.5+10 on Mac OS X 10.15.3 -Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1058-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save timestamps to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 2341 2341 0 42.7 23.4 1.0X -before 1582, noop 2327 2327 0 43.0 23.3 1.0X -after 1582, rebase off 12859 12859 0 7.8 128.6 0.2X -after 1582, rebase on 64691 64691 0 1.5 646.9 0.0X -before 1582, rebase off 13093 13093 0 7.6 130.9 0.2X -before 1582, rebase on 71913 71913 0 1.4 719.1 0.0X +after 1582, noop 3113 3113 0 32.1 31.1 1.0X +before 1582, noop 3078 3078 0 32.5 30.8 1.0X +after 1582, rebase off 15749 15749 0 6.3 157.5 0.2X +after 1582, rebase on 69106 69106 0 1.4 691.1 0.0X +before 1582, rebase off 15967 15967 0 6.3 159.7 0.2X +before 1582, rebase on 76843 76843 0 1.3 768.4 0.0X -OpenJDK 64-Bit Server VM 11.0.5+10 on Mac OS X 10.15.3 -Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1058-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load timestamps from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase off 12307 12547 224 8.1 123.1 1.0X -after 1582, vec off, rebase on 42650 43006 433 2.3 426.5 0.3X -after 1582, vec on, rebase off 3822 3865 65 26.2 38.2 3.2X -after 1582, vec on, rebase on 33704 33749 79 3.0 337.0 0.4X -before 1582, vec off, rebase off 12299 12348 84 8.1 123.0 1.0X -before 1582, vec off, rebase on 44533 44972 662 2.2 445.3 0.3X -before 1582, vec on, rebase off 4245 4519 258 23.6 42.5 2.9X -before 1582, vec on, rebase on 35629 35856 252 2.8 356.3 0.3X +after 1582, vec off, rebase off 15070 15172 94 6.6 150.7 1.0X +after 1582, vec off, rebase on 43748 43867 157 2.3 437.5 0.3X +after 1582, vec on, rebase off 4805 4859 60 20.8 48.1 3.1X +after 1582, vec on, rebase on 33960 34027 61 2.9 339.6 0.4X +before 1582, vec off, rebase off 15037 15071 52 6.7 150.4 1.0X +before 
1582, vec off, rebase on 44590 44749 156 2.2 445.9 0.3X +before 1582, vec on, rebase off 4831 4852 30 20.7 48.3 3.1X +before 1582, vec on, rebase on 35460 35481 18 2.8 354.6 0.4X From e0aedf5dbf363477ca88b6c6a7fb0038bafe8261 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Mon, 30 Mar 2020 09:08:38 +0300 Subject: [PATCH 13/13] Rename benchmark --- sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt | 2 +- sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt | 2 +- .../spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt b/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt index 36ab52606264..52522f8f88c7 100644 --- a/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt @@ -1,5 +1,5 @@ ================================================================================================ -Parquet read/write +Rebasing dates/timestamps in Parquet datasource ================================================================================================ OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1058-aws diff --git a/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt b/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt index dd0c32c196a7..c9320cfe660f 100644 --- a/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt +++ b/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt @@ -1,5 +1,5 @@ ================================================================================================ -Parquet read/write +Rebasing dates/timestamps in Parquet datasource ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1058-aws diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala index bf2e4adfcb79..983d9b4f709f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala @@ -93,7 +93,7 @@ object DateTimeRebaseBenchmark extends SqlBasedBenchmark { override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { withTempPath { path => - runBenchmark("Parquet read/write") { + runBenchmark("Rebasing dates/timestamps in Parquet datasource") { val rowsNum = 100000000 Seq("date", "timestamp").foreach { dateTime => val benchmark = new Benchmark(s"Save ${dateTime}s to parquet", rowsNum, output = output)