
Commit ddd8d5f

MaxGekk authored and cloud-fan committed
[SPARK-31932][SQL][TESTS] Add date/timestamp benchmarks for HiveResult.hiveResultString()
### What changes were proposed in this pull request?

Add benchmarks for `HiveResult.hiveResultString()`/`toHiveString()` to measure the throughput of `toHiveString` for the date/timestamp types:
- java.sql.Date/Timestamp
- java.time.Instant
- java.time.LocalDate

Benchmark results were generated in the following environment:

| Item | Description |
| ---- | ---- |
| Region | us-west-2 (Oregon) |
| Instance | r3.xlarge |
| AMI | ubuntu/images/hvm-ssd/ubuntu-bionic-18.04-amd64-server-20190722.1 (ami-06f2f779464715dc5) |
| Java | OpenJDK 64-Bit Server VM 1.8.0_242 and OpenJDK 64-Bit Server VM 11.0.6+10 |

### Why are the changes needed?

To detect performance regressions of `toHiveString` in the future.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

By running `DateTimeBenchmark` and checking the dataset content.

Closes apache#28757 from MaxGekk/benchmark-toHiveString.

Authored-by: Max Gekk <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent 8305b77 · commit ddd8d5f
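For context, the call path the new benchmark cases exercise is `HiveResult.hiveResultString`, which formats every row of an executed physical plan the way Hive's CLI would print it. Below is a minimal sketch of that call, assuming a local SparkSession; the object name and toy 5-row Dataset are illustrative, while the `hiveResultString(df.queryExecution.executedPlan)` invocation mirrors the helper added by this commit.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.HiveResult

// Minimal sketch (not part of the commit): format a small Dataset of
// java.sql.Date values as Hive-style strings, mirroring the toHiveString
// helper added in DateTimeBenchmark.scala.
object HiveResultStringSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("sketch").getOrCreate()
    import spark.implicits._

    // Same shape as the benchmark's `dates` dataset, but only 5 rows.
    val df = spark.range(0, 5, 1, 1).map(millis => new java.sql.Date(millis))

    // hiveResultString renders each row of the executed plan as Hive would
    // print it (dates as yyyy-MM-dd strings).
    val rows: Seq[String] = HiveResult.hiveResultString(df.queryExecution.executedPlan)
    rows.foreach(println)
    spark.stop()
  }
}
```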

File tree

3 files changed: +46 −8 lines changed


sql/core/benchmarks/DateTimeBenchmark-jdk11-results.txt

Lines changed: 4 additions & 0 deletions
@@ -453,5 +453,9 @@ From java.time.Instant                               325            328
 Collect longs                                       1300           1321          25          3.8         260.0       0.3X
 Collect java.sql.Timestamp                          1450           1557         102          3.4         290.0       0.3X
 Collect java.time.Instant                           1499           1599          87          3.3         299.9       0.3X
+java.sql.Date to Hive string                       17536          18367        1059          0.3        3507.2       0.0X
+java.time.LocalDate to Hive string                 12089          12897         725          0.4        2417.8       0.0X
+java.sql.Timestamp to Hive string                  48014          48625         752          0.1        9602.9       0.0X
+java.time.Instant to Hive string                   37346          37445          93          0.1        7469.1       0.0X
 
 

sql/core/benchmarks/DateTimeBenchmark-results.txt

Lines changed: 4 additions & 0 deletions
@@ -453,5 +453,9 @@ From java.time.Instant                               236            243
 Collect longs                                       1280           1337          79          3.9         256.1       0.3X
 Collect java.sql.Timestamp                          1485           1501          15          3.4         297.0       0.3X
 Collect java.time.Instant                           1441           1465          37          3.5         288.1       0.3X
+java.sql.Date to Hive string                       18745          20895        1364          0.3        3749.0       0.0X
+java.time.LocalDate to Hive string                 15296          15450         143          0.3        3059.2       0.0X
+java.sql.Timestamp to Hive string                  46421          47210         946          0.1        9284.2       0.0X
+java.time.Instant to Hive string                   34747          35187         382          0.1        6949.4       0.0X
 
 
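A note on reading these tables: the columns in Spark's generated benchmark files are Best Time (ms), Avg Time (ms), Stdev (ms), Rate (M rows/s), Per Row (ns), and a Relative speed against the table's first case. The rate and per-row columns follow arithmetically from the best time and the row count. A quick sanity check in Scala, assuming the 5,000,000-row count implied by the existing rows (e.g. 1300 ms best time at 260.0 ns per row); the row count is an inference from the table, not stated in the diff:

```scala
// Back-of-the-envelope check of the derived columns, using the
// "java.sql.Date to Hive string" row from the JDK 8 table above.
// The 5,000,000-row count is an assumption inferred from the table.
object BenchmarkColumnCheck {
  def main(args: Array[String]): Unit = {
    val rows = 5000000L
    val bestTimeMs = 18745.0
    val perRowNs = bestTimeMs * 1e6 / rows           // 3749.0 ns, matches Per Row(ns)
    val rateMPerS = rows / (bestTimeMs / 1000) / 1e6 // ~0.27 M/s, printed rounded as 0.3
    println(f"Per Row(ns) = $perRowNs%.1f, Rate(M/s) = $rateMPerS%.2f")
  }
}
```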

sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala

Lines changed: 38 additions & 8 deletions
@@ -21,8 +21,10 @@ import java.sql.{Date, Timestamp}
 import java.time.{Instant, LocalDate}
 
 import org.apache.spark.benchmark.Benchmark
+import org.apache.spark.sql.Dataset
 import org.apache.spark.sql.catalyst.util.DateTimeConstants.MILLIS_PER_DAY
 import org.apache.spark.sql.catalyst.util.DateTimeTestUtils.{withDefaultTimeZone, LA}
+import org.apache.spark.sql.execution.HiveResult
 import org.apache.spark.sql.internal.SQLConf
 
 /**
@@ -182,14 +184,19 @@ object DateTimeBenchmark extends SqlBasedBenchmark {
       benchmark.addCase("From java.time.LocalDate", numIters) { _ =>
         spark.range(rowsNum).map(millis => LocalDate.ofEpochDay(millis / MILLIS_PER_DAY)).noop()
       }
+      def dates = {
+        spark.range(0, rowsNum, 1, 1).map(millis => new Date(millis))
+      }
       benchmark.addCase("Collect java.sql.Date", numIters) { _ =>
-        spark.range(0, rowsNum, 1, 1).map(millis => new Date(millis)).collect()
+        dates.collect()
+      }
+      def localDates = {
+        spark.range(0, rowsNum, 1, 1)
+          .map(millis => LocalDate.ofEpochDay(millis / MILLIS_PER_DAY))
       }
       benchmark.addCase("Collect java.time.LocalDate", numIters) { _ =>
         withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true") {
-          spark.range(0, rowsNum, 1, 1)
-            .map(millis => LocalDate.ofEpochDay(millis / MILLIS_PER_DAY))
-            .collect()
+          localDates.collect()
         }
       }
       benchmark.addCase("From java.sql.Timestamp", numIters) { _ =>
@@ -202,14 +209,37 @@ object DateTimeBenchmark extends SqlBasedBenchmark {
         spark.range(0, rowsNum, 1, 1)
           .collect()
       }
+      def timestamps = {
+        spark.range(0, rowsNum, 1, 1).map(millis => new Timestamp(millis))
+      }
       benchmark.addCase("Collect java.sql.Timestamp", numIters) { _ =>
-        spark.range(0, rowsNum, 1, 1).map(millis => new Timestamp(millis)).collect()
+        timestamps.collect()
+      }
+      def instants = {
+        spark.range(0, rowsNum, 1, 1).map(millis => Instant.ofEpochMilli(millis))
       }
       benchmark.addCase("Collect java.time.Instant", numIters) { _ =>
         withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true") {
-          spark.range(0, rowsNum, 1, 1)
-            .map(millis => Instant.ofEpochMilli(millis))
-            .collect()
+          instants.collect()
+        }
+      }
+      def toHiveString(df: Dataset[_]): Unit = {
+        HiveResult.hiveResultString(df.queryExecution.executedPlan)
+      }
+      benchmark.addCase("java.sql.Date to Hive string", numIters) { _ =>
+        toHiveString(dates)
+      }
+      benchmark.addCase("java.time.LocalDate to Hive string", numIters) { _ =>
+        withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true") {
+          toHiveString(localDates)
+        }
+      }
+      benchmark.addCase("java.sql.Timestamp to Hive string", numIters) { _ =>
+        toHiveString(timestamps)
+      }
+      benchmark.addCase("java.time.Instant to Hive string", numIters) { _ =>
+        withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true") {
+          toHiveString(instants)
         }
       }
       benchmark.run()
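For orientation, the cases above all follow the `org.apache.spark.benchmark.Benchmark` pattern: each `addCase` registers a labeled closure, and `run()` warms up, times every case, and prints the table captured in the results files above. A minimal, self-contained sketch of that pattern (the object name and stand-in workload are illustrative, not part of the commit):

```scala
import org.apache.spark.benchmark.Benchmark

// Minimal sketch of the addCase/run pattern used by DateTimeBenchmark.
object MiniBenchmarkSketch {
  def main(args: Array[String]): Unit = {
    val cardinality = 5000000L
    val numIters = 3
    val benchmark = new Benchmark("To Hive string (sketch)", cardinality, minNumIters = numIters)

    benchmark.addCase("format java.sql.Date", numIters) { _ =>
      var i = 0L
      while (i < cardinality) {
        // Per-row string formatting: the kind of work toHiveString performs.
        new java.sql.Date(i).toString
        i += 1
      }
    }
    benchmark.run() // prints the Best/Avg/Stdev/Rate/Per Row/Relative columns
  }
}
```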
