-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-21973][SQL] Add a new option to filter queries in TPC-DS #19188
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,6 +18,7 @@ | |
| package org.apache.spark.sql.execution.benchmark | ||
|
|
||
| import org.apache.spark.SparkConf | ||
| import org.apache.spark.internal.Logging | ||
| import org.apache.spark.sql.SparkSession | ||
| import org.apache.spark.sql.catalyst.TableIdentifier | ||
| import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation | ||
|
|
@@ -29,9 +30,9 @@ import org.apache.spark.util.Benchmark | |
| /** | ||
| * Benchmark to measure TPCDS query performance. | ||
| * To run this: | ||
| * spark-submit --class <this class> <spark sql test jar> <TPCDS data location> | ||
| * spark-submit --class <this class> <spark sql test jar> --data-location <TPCDS data location> | ||
| */ | ||
| object TPCDSQueryBenchmark { | ||
| object TPCDSQueryBenchmark extends Logging { | ||
| val conf = | ||
| new SparkConf() | ||
| .setMaster("local[1]") | ||
|
|
@@ -90,7 +91,9 @@ object TPCDSQueryBenchmark { | |
| benchmark.addCase(name) { i => | ||
| spark.sql(queryString).collect() | ||
| } | ||
| logInfo(s"\n\n===== TPCDS QUERY BENCHMARK OUTPUT FOR $name =====\n") | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See #19188 (comment) |
||
| benchmark.run() | ||
| logInfo(s"\n\n===== FINISHED $name =====\n") | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -110,6 +113,20 @@ object TPCDSQueryBenchmark { | |
| "q81", "q82", "q83", "q84", "q85", "q86", "q87", "q88", "q89", "q90", | ||
| "q91", "q92", "q93", "q94", "q95", "q96", "q97", "q98", "q99") | ||
|
|
||
| tpcdsAll(benchmarkArgs.dataLocation, queries = tpcdsQueries) | ||
| // If `--query-filter` defined, filters the queries that this option selects | ||
| val queriesToRun = if (benchmarkArgs.queryFilter.nonEmpty) { | ||
| val queries = tpcdsQueries.filter { case queryName => | ||
| benchmarkArgs.queryFilter.contains(queryName) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add case insensitive?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yea, I like the idea. |
||
| } | ||
| if (queries.isEmpty) { | ||
| throw new RuntimeException( | ||
| s"Empty queries to run. Bad query name filter: ${benchmarkArgs.queryFilter}") | ||
| } | ||
| queries | ||
| } else { | ||
| tpcdsQueries | ||
| } | ||
|
|
||
| tpcdsAll(benchmarkArgs.dataLocation, queries = queriesToRun) | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,8 +17,12 @@ | |
|
|
||
| package org.apache.spark.sql.execution.benchmark | ||
|
|
||
| import java.util.Locale | ||
|
|
||
|
|
||
| class TPCDSQueryBenchmarkArguments(val args: Array[String]) { | ||
| var dataLocation: String = null | ||
| var queryFilter: Set[String] = Set.empty | ||
|
|
||
| parseArgs(args.toList) | ||
| validateArguments() | ||
|
|
@@ -32,6 +36,10 @@ class TPCDSQueryBenchmarkArguments(val args: Array[String]) { | |
| dataLocation = value | ||
| args = tail | ||
|
|
||
| case ("--query-filter") :: value :: tail => | ||
| queryFilter = value.toLowerCase(Locale.ROOT).split(",").map(_.trim).toSet | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you also make
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. updated |
||
| args = tail | ||
|
|
||
| case _ => | ||
| // scalastyle:off println | ||
| System.err.println("Unknown/unsupported param " + args) | ||
|
|
@@ -47,6 +55,7 @@ class TPCDSQueryBenchmarkArguments(val args: Array[String]) { | |
| |Usage: spark-submit --class <this class> <spark sql test jar> [Options] | ||
| |Options: | ||
| | --data-location Path to TPCDS data | ||
| | --query-filter Queries to filter, e.g., q3,q5,q13 | ||
| | | ||
| |------------------------------------------------------------------------------------------------------------------ | ||
| |In order to run this benchmark, please follow the instructions at | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks incorrect?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sorry, but I missed your point. what's correct?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
--data-location <TPCDS data location> [--query-filter Queries to filter]?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
aha, thanks. better to add optional parameters here? I like a simple example here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK. I see.