-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-27734][CORE][SQL] Add memory based thresholds for shuffle spill #24618
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -74,6 +74,11 @@ public final class UnsafeExternalSorter extends MemoryConsumer { | |
| */ | ||
| private final int numElementsForSpillThreshold; | ||
|
|
||
| /** | ||
| * Force this sorter to spill when the size in memory is beyond this threshold. | ||
| */ | ||
| private final long maxRecordsSizeForSpillThreshold; | ||
|
|
||
| /** | ||
| * Memory pages that hold the records being sorted. The pages in this list are freed when | ||
| * spilling, although in principle we could recycle these pages across spills (on the other hand, | ||
|
|
@@ -86,6 +91,7 @@ public final class UnsafeExternalSorter extends MemoryConsumer { | |
|
|
||
| // These variables are reset after spilling: | ||
| @Nullable private volatile UnsafeInMemorySorter inMemSorter; | ||
| private long inMemRecordsSize = 0; | ||
|
|
||
| private MemoryBlock currentPage = null; | ||
| private long pageCursor = -1; | ||
|
|
@@ -104,10 +110,12 @@ public static UnsafeExternalSorter createWithExistingInMemorySorter( | |
| int initialSize, | ||
| long pageSizeBytes, | ||
| int numElementsForSpillThreshold, | ||
| long maxRecordsSizeForSpillThreshold, | ||
| UnsafeInMemorySorter inMemorySorter) throws IOException { | ||
| UnsafeExternalSorter sorter = new UnsafeExternalSorter(taskMemoryManager, blockManager, | ||
| serializerManager, taskContext, recordComparatorSupplier, prefixComparator, initialSize, | ||
| pageSizeBytes, numElementsForSpillThreshold, inMemorySorter, false /* ignored */); | ||
| serializerManager, taskContext, recordComparatorSupplier, prefixComparator, initialSize, | ||
| pageSizeBytes, numElementsForSpillThreshold, maxRecordsSizeForSpillThreshold, | ||
| inMemorySorter, false /* ignored */); | ||
| sorter.spill(Long.MAX_VALUE, sorter); | ||
| // The external sorter will be used to insert records, in-memory sorter is not needed. | ||
| sorter.inMemSorter = null; | ||
|
|
@@ -124,10 +132,11 @@ public static UnsafeExternalSorter create( | |
| int initialSize, | ||
| long pageSizeBytes, | ||
| int numElementsForSpillThreshold, | ||
| long maxRecordsSizeForSpillThreshold, | ||
| boolean canUseRadixSort) { | ||
| return new UnsafeExternalSorter(taskMemoryManager, blockManager, serializerManager, | ||
| taskContext, recordComparatorSupplier, prefixComparator, initialSize, pageSizeBytes, | ||
| numElementsForSpillThreshold, null, canUseRadixSort); | ||
| numElementsForSpillThreshold, maxRecordsSizeForSpillThreshold, null, canUseRadixSort); | ||
| } | ||
|
|
||
| private UnsafeExternalSorter( | ||
|
|
@@ -140,6 +149,7 @@ private UnsafeExternalSorter( | |
| int initialSize, | ||
| long pageSizeBytes, | ||
| int numElementsForSpillThreshold, | ||
| long maxRecordsSizeForSpillThreshold, | ||
| @Nullable UnsafeInMemorySorter existingInMemorySorter, | ||
| boolean canUseRadixSort) { | ||
| super(taskMemoryManager, pageSizeBytes, taskMemoryManager.getTungstenMemoryMode()); | ||
|
|
@@ -170,6 +180,7 @@ private UnsafeExternalSorter( | |
| } | ||
| this.peakMemoryUsedBytes = getMemoryUsage(); | ||
| this.numElementsForSpillThreshold = numElementsForSpillThreshold; | ||
| this.maxRecordsSizeForSpillThreshold = maxRecordsSizeForSpillThreshold; | ||
|
|
||
| // Register a cleanup task with TaskContext to ensure that memory is guaranteed to be freed at | ||
| // the end of the task. This is necessary to avoid memory leaks when the downstream operator | ||
|
|
@@ -228,7 +239,7 @@ public long spill(long size, MemoryConsumer trigger) throws IOException { | |
| // Reset the in-memory sorter's pointer array only after freeing up the memory pages holding the | ||
| // records. Otherwise, if the task is over allocated memory, then without freeing the memory | ||
| // pages, we might not be able to get memory for the pointer array. | ||
|
|
||
| inMemRecordsSize = 0; | ||
| taskContext.taskMetrics().incMemoryBytesSpilled(spillSize); | ||
| taskContext.taskMetrics().incDiskBytesSpilled(writeMetrics.bytesWritten()); | ||
| totalSpillBytes += spillSize; | ||
|
|
@@ -396,8 +407,11 @@ public void insertRecord( | |
| logger.info("Spilling data because number of spilledRecords crossed the threshold " + | ||
| numElementsForSpillThreshold); | ||
| spill(); | ||
| } else if (inMemRecordsSize >= maxRecordsSizeForSpillThreshold) { | ||
| logger.info("Spilling data because size of spilledRecords crossed the threshold " + | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we also include the number of records and threshold here?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| maxRecordsSizeForSpillThreshold); | ||
| spill(); | ||
| } | ||
|
|
||
| growPointerArrayIfNecessary(); | ||
| int uaoSize = UnsafeAlignedOffset.getUaoSize(); | ||
| // Need 4 or 8 bytes to store the record length. | ||
|
|
@@ -411,6 +425,7 @@ public void insertRecord( | |
| Platform.copyMemory(recordBase, recordOffset, base, pageCursor, length); | ||
| pageCursor += length; | ||
| inMemSorter.insertRecord(recordAddress, prefix, prefixIsNull); | ||
| inMemRecordsSize += length; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto, uaoSize |
||
| } | ||
|
|
||
| /** | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -998,6 +998,26 @@ package object config { | |
| .intConf | ||
| .createWithDefault(Integer.MAX_VALUE) | ||
|
|
||
| private[spark] val SHUFFLE_SPILL_MAP_MAX_SIZE_FORCE_SPILL_THRESHOLD = | ||
| ConfigBuilder("spark.shuffle.spill.map.maxRecordsSizeForSpillThreshold") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What does the "map" mean inside this config name?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is it necessary to have different threshold between map task and reduce task?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have the same question. What's the use case to separate them differently? |
||
| .internal() | ||
| .doc("The maximum size in memory before forcing the map-side shuffle sorter to spill. " + | ||
| "By default it is Long.MAX_VALUE, which means we never force the sorter to spill, " + | ||
| "until we reach some limitations, like the max page size limitation for the pointer " + | ||
| "array in the sorter.") | ||
| .bytesConf(ByteUnit.BYTE) | ||
| .createWithDefault(Long.MaxValue) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @amanomer, you can make this configuration optional via
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just a reminder, we need to attach |
||
|
|
||
| private[spark] val SHUFFLE_SPILL_REDUCE_MAX_SIZE_FORCE_SPILL_THRESHOLD = | ||
| ConfigBuilder("spark.shuffle.spill.reduce.maxRecordsSizeForSpillThreshold") | ||
| .internal() | ||
| .doc("The maximum size in memory before forcing the reduce-side to spill. " + | ||
| "By default it is Long.MAX_VALUE, which means we never force the sorter to spill, " + | ||
| "until we reach some limitations, like the max page size limitation for the pointer " + | ||
| "array in the sorter.") | ||
| .bytesConf(ByteUnit.BYTE) | ||
| .createWithDefault(Long.MaxValue) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto. |
||
|
|
||
| private[spark] val SHUFFLE_MAP_OUTPUT_PARALLEL_AGGREGATION_THRESHOLD = | ||
| ConfigBuilder("spark.shuffle.mapOutput.parallelAggregationThreshold") | ||
| .internal() | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -53,6 +53,10 @@ private[spark] abstract class Spillable[C](taskMemoryManager: TaskMemoryManager) | |
| private[this] val initialMemoryThreshold: Long = | ||
| SparkEnv.get.conf.get(SHUFFLE_SPILL_INITIAL_MEM_THRESHOLD) | ||
|
|
||
| // Force this collection to spill when its size is greater than this threshold | ||
| private[this] val maxSizeForceSpillThreshold: Long = | ||
| SparkEnv.get.conf.get(SHUFFLE_SPILL_REDUCE_MAX_SIZE_FORCE_SPILL_THRESHOLD) | ||
|
|
||
| // Force this collection to spill when there are this many elements in memory | ||
| // For testing only | ||
| private[this] val numElementsForceSpillThreshold: Int = | ||
|
|
@@ -81,7 +85,11 @@ private[spark] abstract class Spillable[C](taskMemoryManager: TaskMemoryManager) | |
| */ | ||
| protected def maybeSpill(collection: C, currentMemory: Long): Boolean = { | ||
| var shouldSpill = false | ||
| if (elementsRead % 32 == 0 && currentMemory >= myMemoryThreshold) { | ||
| // Check number of elements or memory usage limits, whichever is hit first | ||
| if (_elementsRead > numElementsForceSpillThreshold | ||
| || currentMemory > maxSizeForceSpillThreshold) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I just wonder if we need |
||
| shouldSpill = true | ||
| } else if (elementsRead % 32 == 0 && currentMemory >= myMemoryThreshold) { | ||
| // Claim up to double our current memory from the shuffle memory pool | ||
| val amountToRequest = 2 * currentMemory - myMemoryThreshold | ||
| val granted = acquireMemory(amountToRequest) | ||
|
|
@@ -90,11 +98,10 @@ private[spark] abstract class Spillable[C](taskMemoryManager: TaskMemoryManager) | |
| // or we already had more memory than myMemoryThreshold), spill the current collection | ||
| shouldSpill = currentMemory >= myMemoryThreshold | ||
| } | ||
| shouldSpill = shouldSpill || _elementsRead > numElementsForceSpillThreshold | ||
| // Actually spill | ||
| if (shouldSpill) { | ||
| _spillCount += 1 | ||
| logSpillage(currentMemory) | ||
| logSpillage(currentMemory, elementsRead) | ||
| spill(collection) | ||
| _elementsRead = 0 | ||
| _memoryBytesSpilled += currentMemory | ||
|
|
@@ -141,10 +148,10 @@ private[spark] abstract class Spillable[C](taskMemoryManager: TaskMemoryManager) | |
| * | ||
| * @param size number of bytes spilled | ||
| */ | ||
| @inline private def logSpillage(size: Long): Unit = { | ||
| @inline private def logSpillage(size: Long, elements: Int) { | ||
| val threadId = Thread.currentThread().getId | ||
| logInfo("Thread %d spilling in-memory map of %s to disk (%d time%s so far)" | ||
| .format(threadId, org.apache.spark.util.Utils.bytesToString(size), | ||
| logInfo("Thread %d spilling in-memory map of %s (elements: %d) to disk (%d time%s so far)" | ||
| .format(threadId, org.apache.spark.util.Utils.bytesToString(size), elements, | ||
| _spillCount, if (_spillCount > 1) "s" else "")) | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we also include the uaoSize?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1, the `pageCursor` is also increased by uaoSize and length