-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-16525][SQL] Enable Row Based HashMap in HashAggregateExec #14176
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
4a5b81f
7194394
122cf18
def94cc
97bb7c1
b9a4268
e67ff5d
b32cb7b
a58314c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -282,13 +282,12 @@ case class HashAggregateExec( | |
|
|
||
| // The name for Fast HashMap | ||
| private var fastHashMapTerm: String = _ | ||
| // whether vectorized hashmap or row based hashmap is enabled | ||
| // we make sure that at most one of the two flags is true | ||
| // i.e., assertFalse(isVectorizedHashMapEnabled && isRowBasedHashMapEnabled) | ||
| private var isFastHashMapEnabled: Boolean = false | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This flag checks whether either of them is enabled. Since some of the generated code is the same for both hash maps, the flag makes the condition checking clearer.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, what I meant was that we can even initialize it with |
||
|
|
||
| // whether a vectorized hashmap is used instead | ||
| // we have decided to always use the row-based hashmap, | ||
| // but the vectorized hashmap can still be switched on for testing and benchmarking purposes. | ||
| private var isVectorizedHashMapEnabled: Boolean = false | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If it is only used for testing/benchmarking, is it worth putting this piece in the production code? |
||
| private var isRowBasedHashMapEnabled: Boolean = false | ||
| // auxiliary flag, true if any of two above is true | ||
| private var isFastHashMapEnabled: Boolean = isVectorizedHashMapEnabled || isRowBasedHashMapEnabled | ||
|
|
||
| // The name for UnsafeRow HashMap | ||
| private var hashMapTerm: String = _ | ||
|
|
@@ -499,63 +498,35 @@ case class HashAggregateExec( | |
| isSupported && isNotByteArrayDecimalType | ||
| } | ||
|
|
||
| /** | ||
| * Requirement check for vectorized hash map. | ||
| */ | ||
| private def enableVectorizedHashMap(ctx: CodegenContext): Boolean = { | ||
| checkIfFastHashMapSupported(ctx) | ||
| } | ||
|
|
||
| /** | ||
| * Requirement check for row-based hash map. | ||
| */ | ||
| private def enableRowBasedHashMap(ctx: CodegenContext): Boolean = { | ||
| checkIfFastHashMapSupported(ctx) | ||
| } | ||
| private def enableTwoLevelHashMap(ctx: CodegenContext) = { | ||
| if (!checkIfFastHashMapSupported(ctx)) { | ||
| if (modes.forall(mode => mode == Partial || mode == PartialMerge) && !Utils.isTesting) { | ||
| logInfo("spark.sql.codegen.aggregate.map.twolevel.enable is set to true, but" | ||
| + " current version of codegened fast hashmap does not support this aggregate.") | ||
| } | ||
| } else { | ||
| isFastHashMapEnabled = true | ||
|
|
||
| private def setFastHashMapImpl(ctx: CodegenContext) = { | ||
| sqlContext.conf.enforceFastAggHashMapImpl match { | ||
| case "rowbased" => | ||
| if (!enableRowBasedHashMap(ctx)) { | ||
| if (modes.forall(mode => mode == Partial || mode == PartialMerge) && !Utils.isTesting) { | ||
| logWarning("spark.sql.codegen.aggregate.map.enforce.impl is set to rowbased, but" | ||
| + " current version of codegened row-based hashmap does not support this aggregate.") | ||
| } | ||
| } else { | ||
| isRowBasedHashMapEnabled = true | ||
| } | ||
| case "vectorized" => | ||
| if (!enableVectorizedHashMap(ctx)) { | ||
| if (modes.forall(mode => mode == Partial || mode == PartialMerge) && !Utils.isTesting) { | ||
| logWarning("spark.sql.codegen.aggregate.map.enforce.impl is set to vectorized, but" | ||
| + " current version of codegened vectorized hashmap does not support this aggregate.") | ||
| } | ||
| } else { | ||
| isVectorizedHashMapEnabled = true | ||
| } | ||
| case "skip" => | ||
| // no need to do anything, default sets all flags to be false | ||
| case _ => | ||
| if (sqlContext.conf.enforceFastAggHashMapImpl != "auto") { | ||
| logWarning("spark.sql.codegen.aggregate.map.enforce.impl should be set to one of the " | ||
| + "following: rowbased, vectorized, skip, auto(default).") | ||
| } | ||
| if (enableRowBasedHashMap(ctx)) { | ||
| isRowBasedHashMapEnabled = true | ||
| } else if (enableVectorizedHashMap(ctx)) { | ||
| // Because enableVectorizedHashMap() and enableRowBasedHashMap() are identical currently, | ||
| // this should never be reached. We envision this codepath to be useful as our support for | ||
| // the two fast hash map extends. | ||
| isVectorizedHashMapEnabled = true | ||
| } | ||
| // This is for testing/benchmarking only. | ||
| // We force the first level to be a vectorized hashmap, instead of the default row-based one. | ||
| sqlContext.getConf("spark.sql.codegen.aggregate.map.vectorized.enable", null) match { | ||
| case "true" => isVectorizedHashMapEnabled = true | ||
| case null | "" | "false" => None } | ||
| } | ||
| isFastHashMapEnabled = isVectorizedHashMapEnabled || isRowBasedHashMapEnabled | ||
| } | ||
|
|
||
| private def doProduceWithKeys(ctx: CodegenContext): String = { | ||
| val initAgg = ctx.freshName("initAgg") | ||
| ctx.addMutableState("boolean", initAgg, s"$initAgg = false;") | ||
| setFastHashMapImpl(ctx) | ||
| if (sqlContext.conf.enableTwoLevelAggMap) { | ||
| enableTwoLevelHashMap(ctx) | ||
| } else { | ||
| sqlContext.getConf("spark.sql.codegen.aggregate.map.vectorized.enable", null) match { | ||
| case "true" => logWarning("Two level hashmap is disabled but vectorized hashmap is " + | ||
| "enabled.") | ||
| case null | "" | "false" => None | ||
| } | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe you want to wrap line 521-529 in a function like Current impl of |
||
| fastHashMapTerm = ctx.freshName("fastHashMap") | ||
| val fastHashMapClassName = ctx.freshName("FastHashMap") | ||
| val fastHashMapGenerator = | ||
|
|
||
This file was deleted.
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we use a more descriptive name than "fast"? There can always be a faster implementation.