Skip to content
Closed
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
2745330
Add implicit class DatasetToBenchmark
MaxGekk Jan 2, 2020
6d13c37
Use noop() in CSVBenchmark
MaxGekk Jan 2, 2020
fb8c2c4
Use noop() in AggregateBenchmark
MaxGekk Jan 2, 2020
c28743a
Use noop() in AvroReadBenchmark
MaxGekk Jan 2, 2020
e3724b9
Use noop() in BloomFilterBenchmark
MaxGekk Jan 2, 2020
714ecfb
Use noop() in DataSourceReadBenchmark
MaxGekk Jan 2, 2020
3e88a53
Use noop() in DateTimeBenchmark
MaxGekk Jan 2, 2020
052dd0e
Use noop() in ExtractBenchmark
MaxGekk Jan 2, 2020
bd0f1ea
Use noop() in FilterPushdownBenchmark
MaxGekk Jan 2, 2020
0d2babc
Use noop() in InExpressionBenchmark
MaxGekk Jan 2, 2020
3db70cc
Use noop() in IntervalBenchmark
MaxGekk Jan 2, 2020
6f89a87
Use noop() in JoinBenchmark
MaxGekk Jan 2, 2020
de6cc74
Use noop() in JsonBenchmark
MaxGekk Jan 2, 2020
c6f7527
Use noop() in MakeDateTimeBenchmark
MaxGekk Jan 2, 2020
7307ad4
Use noop() in MiscBenchmark
MaxGekk Jan 2, 2020
4cbbff7
Use noop() in NestedSchemaPruningBenchmark
MaxGekk Jan 2, 2020
4fafd43
Use noop() in ObjectHashAggregateExecBenchmark
MaxGekk Jan 2, 2020
4414856
Use noop() in OrcReadBenchmark
MaxGekk Jan 2, 2020
3d22d83
Use noop() in RangeBenchmark
MaxGekk Jan 2, 2020
b9bed56
Use noop() in TPCDSQueryBenchmark
MaxGekk Jan 2, 2020
4858f93
Use noop() in UDFBenchmark
MaxGekk Jan 2, 2020
eee2948
Use noop() in WideSchemaBenchmark
MaxGekk Jan 2, 2020
6615d5a
Use noop() in WideTableBenchmark
MaxGekk Jan 2, 2020
c26164a
Add run-benchmarks.py
MaxGekk Jan 4, 2020
d6e519a
Fix run-benchmarks.py
MaxGekk Jan 5, 2020
1957c20
revert "Fix run-benchmarks.py"
MaxGekk Jan 5, 2020
1413425
Revert "Add run-benchmarks.py"
MaxGekk Jan 5, 2020
a9b2dd4
Regen benchmark results for JDK 8 & 11 on Linux
MaxGekk Jan 5, 2020
8dd23b7
Regen TPCDSQueryBenchmark for JDK 8 & 11
MaxGekk Jan 6, 2020
7a287f6
Re-gen JsonBenchmark result on JDK 11
MaxGekk Jan 6, 2020
18173e4
Regen JsonBenchmark results on JDK 8
MaxGekk Jan 6, 2020
2a141fd
Merge remote-tracking branch 'origin/master' into noop-in-benchmarks
MaxGekk Jan 8, 2020
677d3e1
Regen WideSchemaBenchmark results on JDK 8 & 11
MaxGekk Jan 8, 2020
9becd93
init
dongjoon-hyun Jan 10, 2020
ec8977e
Add filterpushdown
dongjoon-hyun Jan 10, 2020
4f63c3c
All
dongjoon-hyun Jan 10, 2020
2795c24
a
dongjoon-hyun Jan 10, 2020
e1a46e7
a
dongjoon-hyun Jan 10, 2020
0cfe42a
Merge pull request #24 from dongjoon-hyun/PR-27078
MaxGekk Jan 10, 2020
24dd096
jdk8
dongjoon-hyun Jan 11, 2020
39f79b0
jdk11
dongjoon-hyun Jan 11, 2020
eccde05
Merge pull request #25 from dongjoon-hyun/PR-27078-2
MaxGekk Jan 12, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions dev/run-benchmarks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/usr/bin/env python3

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
from sparktestsupport.shellutils import run_cmd

benchmarks = [
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey, this is nice, but why don't you make a separate PR for this file alone? :)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am running all those benchmarks on a separate EC2 instance. As soon as it is done (it has been running the whole night already), I will commit the results and revert the script from the PR.

['sql/test', 'org.apache.spark.sql.execution.benchmark.AggregateBenchmark'],
['avro/test', 'org.apache.spark.sql.execution.benchmark.AvroReadBenchmark'],
['sql/test', 'org.apache.spark.sql.execution.benchmark.BloomFilterBenchmark'],
['sql/test', 'org.apache.spark.sql.execution.benchmark.DataSourceReadBenchmark'],
['sql/test', 'org.apache.spark.sql.execution.benchmark.DateTimeBenchmark'],
['sql/test', 'org.apache.spark.sql.execution.benchmark.ExtractBenchmark'],
['sql/test', 'org.apache.spark.sql.execution.benchmark.FilterPushdownBenchmark'],
['sql/test', 'org.apache.spark.sql.execution.benchmark.InExpressionBenchmark'],
['sql/test', 'org.apache.spark.sql.execution.benchmark.IntervalBenchmark'],
['sql/test', 'org.apache.spark.sql.execution.benchmark.JoinBenchmark'],
['sql/test', 'org.apache.spark.sql.execution.benchmark.MakeDateTimeBenchmark'],
['sql/test', 'org.apache.spark.sql.execution.benchmark.MiscBenchmark'],
['hive/test', 'org.apache.spark.sql.execution.benchmark.ObjectHashAggregateExecBenchmark'],
['sql/test', 'org.apache.spark.sql.execution.benchmark.OrcNestedSchemaPruningBenchmark'],
['sql/test', 'org.apache.spark.sql.execution.benchmark.OrcV2NestedSchemaPruningBenchmark'],
['sql/test', 'org.apache.spark.sql.execution.benchmark.ParquetNestedSchemaPruningBenchmark'],
['sql/test', 'org.apache.spark.sql.execution.benchmark.RangeBenchmark'],
['sql/test', 'org.apache.spark.sql.execution.benchmark.TPCDSQueryBenchmark'],
['sql/test', 'org.apache.spark.sql.execution.benchmark.UDFBenchmark'],
['sql/test', 'org.apache.spark.sql.execution.benchmark.WideSchemaBenchmark'],
['sql/test', 'org.apache.spark.sql.execution.benchmark.WideTableBenchmark'],
['hive/test', 'org.apache.spark.sql.hive.orc.OrcReadBenchmark'],
['sql/test', 'org.apache.spark.sql.execution.datasources.csv.CSVBenchmark'],
['sql/test', 'org.apache.spark.sql.execution.datasources.json.JsonBenchmark']
]

# Announce and set SPARK_GENERATE_BENCHMARK_FILES in this process's environment
# before any benchmark is launched.
# NOTE(review): presumably the sbt child processes inherit this variable and the
# benchmark harness reads it -- confirm against run_cmd and the benchmark base class.
print('Set SPARK_GENERATE_BENCHMARK_FILES=1')
os.environ['SPARK_GENERATE_BENCHMARK_FILES'] = '1'

# Each entry of `benchmarks` is a two-element [sbt scope, benchmark main class]
# pair; process them in list order, one sbt invocation per benchmark.
for sbt_scope, main_class in benchmarks:
    print("Run benchmark: %s" % main_class)
    sbt_task = '%s:runMain %s' % (sbt_scope, main_class)
    run_cmd(['build/sbt', sbt_task])
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ object AvroReadBenchmark extends SqlBasedBenchmark {
prepareTable(dir, spark.sql(s"SELECT CAST(value as ${dataType.sql}) id FROM t1"))

benchmark.addCase("Sum") { _ =>
spark.sql("SELECT sum(id) FROM avroTable").collect()
spark.sql("SELECT sum(id) FROM avroTable").noop()
}

benchmark.run()
Expand All @@ -85,7 +85,7 @@ object AvroReadBenchmark extends SqlBasedBenchmark {
spark.sql("SELECT CAST(value AS INT) AS c1, CAST(value as STRING) AS c2 FROM t1"))

benchmark.addCase("Sum of columns") { _ =>
spark.sql("SELECT sum(c1), sum(length(c2)) FROM avroTable").collect()
spark.sql("SELECT sum(c1), sum(length(c2)) FROM avroTable").noop()
}

benchmark.run()
Expand All @@ -104,15 +104,15 @@ object AvroReadBenchmark extends SqlBasedBenchmark {
prepareTable(dir, spark.sql("SELECT value % 2 AS p, value AS id FROM t1"), Some("p"))

benchmark.addCase("Data column") { _ =>
spark.sql("SELECT sum(id) FROM avroTable").collect()
spark.sql("SELECT sum(id) FROM avroTable").noop()
}

benchmark.addCase("Partition column") { _ =>
spark.sql("SELECT sum(p) FROM avroTable").collect()
spark.sql("SELECT sum(p) FROM avroTable").noop()
}

benchmark.addCase("Both columns") { _ =>
spark.sql("SELECT sum(p), sum(id) FROM avroTable").collect()
spark.sql("SELECT sum(p), sum(id) FROM avroTable").noop()
}

benchmark.run()
Expand All @@ -130,7 +130,7 @@ object AvroReadBenchmark extends SqlBasedBenchmark {
prepareTable(dir, spark.sql("SELECT CAST((id % 200) + 10000 as STRING) AS c1 FROM t1"))

benchmark.addCase("Sum of string length") { _ =>
spark.sql("SELECT sum(length(c1)) FROM avroTable").collect()
spark.sql("SELECT sum(length(c1)) FROM avroTable").noop()
}

benchmark.run()
Expand All @@ -155,7 +155,7 @@ object AvroReadBenchmark extends SqlBasedBenchmark {

benchmark.addCase("Sum of string length") { _ =>
spark.sql("SELECT SUM(LENGTH(c2)) FROM avroTable " +
"WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect()
"WHERE c1 IS NOT NULL AND c2 IS NOT NULL").noop()
}

benchmark.run()
Expand All @@ -178,7 +178,7 @@ object AvroReadBenchmark extends SqlBasedBenchmark {
prepareTable(dir, spark.sql("SELECT * FROM t1"))

benchmark.addCase("Sum of single column") { _ =>
spark.sql(s"SELECT sum(c$middle) FROM avroTable").collect()
spark.sql(s"SELECT sum(c$middle) FROM avroTable").noop()
}

benchmark.run()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,19 +48,19 @@ object AggregateBenchmark extends SqlBasedBenchmark {
runBenchmark("aggregate without grouping") {
val N = 500L << 22
codegenBenchmark("agg w/o group", N) {
spark.range(N).selectExpr("sum(id)").collect()
spark.range(N).selectExpr("sum(id)").noop()
}
}

runBenchmark("stat functions") {
val N = 100L << 20

codegenBenchmark("stddev", N) {
spark.range(N).groupBy().agg("id" -> "stddev").collect()
spark.range(N).groupBy().agg("id" -> "stddev").noop()
}

codegenBenchmark("kurtosis", N) {
spark.range(N).groupBy().agg("id" -> "kurtosis").collect()
spark.range(N).groupBy().agg("id" -> "kurtosis").noop()
}
}

Expand All @@ -70,7 +70,7 @@ object AggregateBenchmark extends SqlBasedBenchmark {
val benchmark = new Benchmark("Aggregate w keys", N, output = output)

def f(): Unit = {
spark.range(N).selectExpr("(id & 65535) as k").groupBy("k").sum().collect()
spark.range(N).selectExpr("(id & 65535) as k").groupBy("k").sum().noop()
}

benchmark.addCase("codegen = F", numIters = 2) { _ =>
Expand Down Expand Up @@ -107,7 +107,7 @@ object AggregateBenchmark extends SqlBasedBenchmark {
spark.range(N).selectExpr("id", "floor(rand() * 10000) as k")
.createOrReplaceTempView("test")

def f(): Unit = spark.sql("select k, k, sum(id) from test group by k, k").collect()
def f(): Unit = spark.sql("select k, k, sum(id) from test group by k, k").noop()

benchmark.addCase("codegen = F", numIters = 2) { _ =>
withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") {
Expand Down Expand Up @@ -142,7 +142,7 @@ object AggregateBenchmark extends SqlBasedBenchmark {
val benchmark = new Benchmark("Aggregate w string key", N, output = output)

def f(): Unit = spark.range(N).selectExpr("id", "cast(id & 1023 as string) as k")
.groupBy("k").count().collect()
.groupBy("k").count().noop()

benchmark.addCase("codegen = F", numIters = 2) { _ =>
withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") {
Expand Down Expand Up @@ -177,7 +177,7 @@ object AggregateBenchmark extends SqlBasedBenchmark {
val benchmark = new Benchmark("Aggregate w decimal key", N, output = output)

def f(): Unit = spark.range(N).selectExpr("id", "cast(id & 65535 as decimal) as k")
.groupBy("k").count().collect()
.groupBy("k").count().noop()

benchmark.addCase("codegen = F") { _ =>
withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") {
Expand Down Expand Up @@ -222,7 +222,7 @@ object AggregateBenchmark extends SqlBasedBenchmark {
"id > 1023 as k6")
.groupBy("k1", "k2", "k3", "k4", "k5", "k6")
.sum()
.collect()
.noop()

benchmark.addCase("codegen = F") { _ =>
withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") {
Expand Down Expand Up @@ -282,7 +282,7 @@ object AggregateBenchmark extends SqlBasedBenchmark {
"case when id > 1800 and id <= 1900 then 1 else 0 end as v18")
.groupBy("k1", "k2", "k3")
.sum()
.collect()
.noop()

benchmark.addCase("codegen = F") { _ =>
withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") {
Expand Down Expand Up @@ -315,7 +315,7 @@ object AggregateBenchmark extends SqlBasedBenchmark {

codegenBenchmark("cube", N) {
spark.range(N).selectExpr("id", "id % 1000 as k1", "id & 256 as k2")
.cube("k1", "k2").sum("id").collect()
.cube("k1", "k2").sum("id").noop()
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,10 +70,10 @@ object BloomFilterBenchmark extends SqlBasedBenchmark {
runBenchmark(s"ORC Read") {
val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output)
benchmark.addCase("Without bloom filter") { _ =>
spark.read.orc(path + "/withoutBF").where("value = 0").count
spark.read.orc(path + "/withoutBF").where("value = 0").noop()
}
benchmark.addCase("With bloom filter") { _ =>
spark.read.orc(path + "/withBF").where("value = 0").count
spark.read.orc(path + "/withBF").where("value = 0").noop()
}
benchmark.run()
}
Expand Down
Loading