[SPARK-20801] Record accurate size of blocks in MapStatus when it's above threshold. #18031
Changes from 1 commit
core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala

```diff
@@ -19,8 +19,13 @@ package org.apache.spark.scheduler

 import java.io.{Externalizable, ObjectInput, ObjectOutput}

+import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
+
 import org.roaringbitmap.RoaringBitmap

+import org.apache.spark.SparkEnv
+import org.apache.spark.internal.config
 import org.apache.spark.storage.BlockManagerId
 import org.apache.spark.util.Utils
```
```diff
@@ -121,48 +126,69 @@ private[spark] class CompressedMapStatus(
 }

 /**
- * A [[MapStatus]] implementation that only stores the average size of non-empty blocks,
- * plus a bitmap for tracking which blocks are empty.
+ * A [[MapStatus]] implementation that stores the accurate size of huge blocks, which are larger
+ * than both spark.shuffle.accurateBlockThreshold and
+ * spark.shuffle.accurateBlockThresholdByTimesAverage * averageSize. It stores the
+ * average size of other non-empty blocks, plus a bitmap for tracking which blocks are empty.
  *
  * @param loc location where the task is being executed
  * @param numNonEmptyBlocks the number of non-empty blocks
  * @param emptyBlocks a bitmap tracking which blocks are empty
  * @param avgSize average size of the non-empty blocks
+ * @param hugeBlockSizes sizes of huge blocks by their reduceId.
  */
 private[spark] class HighlyCompressedMapStatus private (
     private[this] var loc: BlockManagerId,
     private[this] var numNonEmptyBlocks: Int,
     private[this] var emptyBlocks: RoaringBitmap,
-    private[this] var avgSize: Long)
+    private[this] var avgSize: Long,
+    @transient private var hugeBlockSizes: Map[Int, Byte])
   extends MapStatus with Externalizable {

   // loc could be null when the default constructor is called during deserialization
   require(loc == null || avgSize > 0 || numNonEmptyBlocks == 0,
     "Average size can only be zero for map stages that produced no output")

-  protected def this() = this(null, -1, null, -1)  // For deserialization only
+  protected def this() = this(null, -1, null, -1, null)  // For deserialization only

   override def location: BlockManagerId = loc

   override def getSizeForBlock(reduceId: Int): Long = {
+    assert(hugeBlockSizes != null)
     if (emptyBlocks.contains(reduceId)) {
       0
     } else {
-      avgSize
+      hugeBlockSizes.get(reduceId) match {
+        case Some(size) => MapStatus.decompressSize(size)
+        case None => avgSize
+      }
     }
   }

   override def writeExternal(out: ObjectOutput): Unit = Utils.tryOrIOException {
     loc.writeExternal(out)
     emptyBlocks.writeExternal(out)
     out.writeLong(avgSize)
+    out.writeInt(hugeBlockSizes.size)
+    hugeBlockSizes.foreach { kv =>
+      out.writeInt(kv._1)
+      out.writeByte(kv._2)
+    }
   }

   override def readExternal(in: ObjectInput): Unit = Utils.tryOrIOException {
     loc = BlockManagerId(in)
     emptyBlocks = new RoaringBitmap()
     emptyBlocks.readExternal(in)
     avgSize = in.readLong()
+    val count = in.readInt()
+    val hugeBlockSizesArray = mutable.ArrayBuffer[Tuple2[Int, Byte]]()
+    (0 until count).foreach { _ =>
+      val block = in.readInt()
+      val size = in.readByte()
+      hugeBlockSizesArray += Tuple2(block, size)
+    }
+    hugeBlockSizes = hugeBlockSizesArray.toMap
   }
 }
```
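One detail worth keeping in mind when reading getSizeForBlock above: the entries in hugeBlockSizes are still single-byte compressed sizes produced by MapStatus.compressSize, so "accurate" means accurate up to the log-scale encoding (roughly 10% relative error, covering sizes up to a few tens of gigabytes), not exact byte counts. Below is a minimal sketch of that encoding; the object name is made up for this sketch, and the constants mirror my reading of the existing MapStatus helpers rather than being quoted from this diff.

```scala
// Sketch of the one-byte, log-scale size encoding used by MapStatus.compressSize /
// MapStatus.decompressSize: sizes are stored as ceil(log_1.1(size)), capped at 255.
object SizeEncodingSketch {
  private val LOG_BASE = 1.1

  def compressSize(size: Long): Byte = {
    if (size == 0) {
      0
    } else if (size <= 1L) {
      1
    } else {
      math.min(255, math.ceil(math.log(size) / math.log(LOG_BASE)).toInt).toByte
    }
  }

  def decompressSize(compressedSize: Byte): Long = {
    if (compressedSize == 0) {
      0
    } else {
      // Read the byte back as an unsigned value before exponentiating.
      math.pow(LOG_BASE, compressedSize & 0xFF).toLong
    }
  }

  def main(args: Array[String]): Unit = {
    val twoGb = 2L * 1024 * 1024 * 1024
    // ceil() rounds the exponent up, so the decompressed size is never an underestimate.
    println(decompressSize(compressSize(twoGb))) // roughly 2.26e9, slightly above the true size
  }
}
```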
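The practical effect is easiest to see with a round trip through the serializer. The following is a hypothetical, MapStatusSuite-style test-body sketch, not code from this diff: it assumes it lives inside the Spark source tree (MapStatus, BlockManagerId and JavaSerializer are private[spark] or internal APIs of this PR's vintage), and the block counts and sizes are illustrative. With more than 2000 blocks MapStatus.apply picks HighlyCompressedMapStatus, and the single skewed block keeps a realistic size after deserialization instead of collapsing to the tiny average.

```scala
// Hypothetical round-trip check: huge-block sizes now survive serialization because
// writeExternal/readExternal include hugeBlockSizes.
import org.apache.spark.SparkConf
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.storage.BlockManagerId

// 3000 small blocks plus one 512 MB block at reduceId 3000.
val sizes: Array[Long] = Array.fill[Long](3000)(1000L) :+ (512L * 1024 * 1024)
val status = MapStatus(BlockManagerId("exec-1", "host-1", 7337), sizes)

val ser = new JavaSerializer(new SparkConf).newInstance()
val restored = ser.deserialize[MapStatus](ser.serialize(status))

// The huge block is reported approximately accurately and is never underestimated,
// while the small blocks still report the average size.
assert(restored.getSizeForBlock(3000) >= sizes(3000))
assert(restored.getSizeForBlock(0) < 512L * 1024 * 1024)
```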
```diff
@@ -193,8 +219,27 @@ private[spark] object HighlyCompressedMapStatus {
     } else {
       0
     }
+    val threshold1 = Option(SparkEnv.get)
+      .map(_.conf.get(config.SHUFFLE_ACCURATE_BLOCK_THRESHOLD))
+      .getOrElse(config.SHUFFLE_ACCURATE_BLOCK_THRESHOLD.defaultValue.get)
+    val threshold2 = avgSize * Option(SparkEnv.get)
+      .map(_.conf.get(config.SHUFFLE_ACCURATE_BLOCK_THRESHOLD_BY_TIMES_AVERAGE))
+      .getOrElse(config.SHUFFLE_ACCURATE_BLOCK_THRESHOLD_BY_TIMES_AVERAGE.defaultValue.get)
+    val threshold = math.max(threshold1, threshold2)
+
+    val hugeBlockSizesArray = ArrayBuffer[Tuple2[Int, Byte]]()
+    if (numNonEmptyBlocks > 0) {
+      i = 0
+      while (i < totalNumBlocks) {
+        if (uncompressedSizes(i) > threshold) {
+          hugeBlockSizesArray += Tuple2(i, MapStatus.compressSize(uncompressedSizes(i)))
+        }
+        i += 1
+      }
+    }
     emptyBlocks.trim()
     emptyBlocks.runOptimize()
-    new HighlyCompressedMapStatus(loc, numNonEmptyBlocks, emptyBlocks, avgSize)
+    new HighlyCompressedMapStatus(loc, numNonEmptyBlocks, emptyBlocks, avgSize,
```
Member

It seems to me that this comment, https://github.com/apache/spark/pull/16989/files#r117174623, makes a good point: it would be good to have accurate sizes for the smaller blocks as well.

Contributor

+1

Author

@viirya Thanks a lot for taking the time to look into this PR :)
Yes, I think it is a really good idea to have accurate sizes for the smaller blocks too. But I'm proposing two configs (spark.shuffle.accurateBlockThreshold and spark.shuffle.accurateBlockThresholdByTimesAverage).

Member

With the current change, if almost all blocks are huge (that is, it is not a skew case), we won't mark the blocks as huge ones. Then will we still fetch them into memory?

Author

With the default values (spark.shuffle.accurateBlockThreshold=100M and spark.shuffle.accurateBlockThresholdByTimesAverage=2), yes.

Member

I'd tend to have just one flag and simplify the configuration.

Contributor

+1 for one flag; let's only keep spark.shuffle.accurateBlockThreshold.
```diff
+      hugeBlockSizesArray.toMap)
   }
 }
```
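For reference, the two entries referenced in the threshold computation above (config.SHUFFLE_ACCURATE_BLOCK_THRESHOLD and config.SHUFFLE_ACCURATE_BLOCK_THRESHOLD_BY_TIMES_AVERAGE) are defined outside this file, in the org.apache.spark.internal.config package object. The sketch below shows roughly what those declarations would look like, using the defaults mentioned in the discussion (100M and 2); the doc strings and exact builder choices are my assumptions, and per the review comments the second entry may be dropped in favor of the first.

```scala
package org.apache.spark.internal

import org.apache.spark.network.util.ByteUnit

// Sketch only: in Spark these entries would be added to the existing `config` package object.
package object config {

  // Shuffle blocks at least this large always have their size recorded accurately.
  private[spark] val SHUFFLE_ACCURATE_BLOCK_THRESHOLD =
    ConfigBuilder("spark.shuffle.accurateBlockThreshold")
      .doc("When shuffle block sizes are compressed into a HighlyCompressedMapStatus, sizes " +
        "above this threshold are recorded accurately instead of being replaced by the " +
        "average, so reducers do not underestimate huge blocks.")
      .bytesConf(ByteUnit.BYTE)
      .createWithDefault(100 * 1024 * 1024)

  // Blocks larger than this multiple of the average block size are also recorded accurately.
  // Reviewers above lean towards dropping this knob and keeping only the absolute threshold.
  private[spark] val SHUFFLE_ACCURATE_BLOCK_THRESHOLD_BY_TIMES_AVERAGE =
    ConfigBuilder("spark.shuffle.accurateBlockThresholdByTimesAverage")
      .doc("Multiplier of the average block size above which a block's size is recorded " +
        "accurately in HighlyCompressedMapStatus.")
      .intConf
      .createWithDefault(2)
}
```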
I went through part of the code in #16989. It seems to me that if what we want is to know which shuffle requests should go to disk instead of memory, do we really need to record the mapping from block ids to accurate sizes?

A simpler approach could be to add a bitmap for huge blocks, and simply fetch those blocks to disk. Another benefit of doing this is that it avoids introducing another config, REDUCER_MAX_REQ_SIZE_SHUFFLE_TO_MEM, to decide which blocks go to disk.

Yes, I think it makes sense to add a bitmap for huge blocks, but I'm a little bit hesitant. I still prefer to keep hugeBlockSizes more independent of the upper-level logic. In addition, the accurate sizes of blocks can also have a positive effect on pending requests (e.g. spark.reducer.maxSizeInFlight can control the size of pending requests better).

The control of spark.reducer.maxSizeInFlight is not a big problem. It seems to me that any block considered huge should break maxSizeInFlight and can't be fetched in parallel. We don't actually need to know the accurate size of a huge block; we just need to know that it is huge and therefore larger than maxSizeInFlight.

@viirya We had this discussion before in the earlier PR (which this is split from).
maxSizeInFlight is meant to control how much data can be fetched in parallel, and it is tuned based on network throughput, not memory (though currently the two are directly dependent due to an implementation detail).
In reality, it is fairly small compared to what can be held in memory (48 MB is the default, IIRC). Since the memory and IO subsystems have different characteristics, using the same config to control behavior in both will lead to suboptimal behavior (for example, on large-memory systems where large amounts can be held in memory but network bandwidth is not proportionally higher).
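To make the trade-off in this thread concrete, here is a hedged sketch of the reducer-side decision both sides are describing. Nothing below appears in this diff: the object and method names are invented, maxReqSizeShuffleToMem stands in for the proposed REDUCER_MAX_REQ_SIZE_SHUFFLE_TO_MEM config, and the bitmap variant is the simpler alternative suggested above.

```scala
import org.roaringbitmap.RoaringBitmap

object FetchPlacementSketch {
  final case class FetchRequest(blockIds: Seq[String], totalSize: Long)

  // Size-based variant: relies on MapStatus reporting accurate (non-underestimated) sizes,
  // plus a separate threshold for how large a request may be before it is streamed to disk
  // instead of being buffered in memory.
  def fetchToDisk(req: FetchRequest, maxReqSizeShuffleToMem: Long): Boolean =
    req.totalSize > maxReqSizeShuffleToMem

  // Bitmap-based variant: no sizes at all; any block the mapper flagged as huge goes to disk.
  def fetchToDisk(reduceId: Int, hugeBlocks: RoaringBitmap): Boolean =
    hugeBlocks.contains(reduceId)
}
```

The review comments above are essentially about which of these two shapes the follow-up work should commit to, and whether maxSizeInFlight can double as the memory threshold.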