[SPARK-7081] Faster sort-based shuffle path using binary processing cache-aware sort #5868
Changes from 1 commit
UnsafeShuffleSpillWriter.java

@@ -17,30 +17,41 @@
 package org.apache.spark.shuffle.unsafe;
 
 import com.google.common.annotations.VisibleForTesting;
+import java.io.File;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.LinkedList;
+
+import org.apache.spark.storage.*;
+import scala.Tuple2;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import org.apache.spark.SparkConf;
 import org.apache.spark.TaskContext;
 import org.apache.spark.executor.ShuffleWriteMetrics;
 import org.apache.spark.serializer.SerializerInstance;
 import org.apache.spark.shuffle.ShuffleMemoryManager;
-import org.apache.spark.storage.BlockId;
-import org.apache.spark.storage.BlockManager;
-import org.apache.spark.storage.BlockObjectWriter;
-import org.apache.spark.storage.TempLocalBlockId;
 import org.apache.spark.unsafe.PlatformDependent;
 import org.apache.spark.unsafe.memory.MemoryBlock;
 import org.apache.spark.unsafe.memory.TaskMemoryManager;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import scala.Tuple2;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Iterator;
-import java.util.LinkedList;
 
 /**
- * External sorter based on {@link UnsafeShuffleSorter}.
+ * An external sorter that is specialized for sort-based shuffle.
+ * <p>
+ * Incoming records are appended to data pages. When all records have been inserted (or when the
+ * current thread's shuffle memory limit is reached), the in-memory records are sorted according to
+ * their partition ids (using a {@link UnsafeShuffleSorter}). The sorted records are then written
+ * to a single output file (or multiple files, if we've spilled). The format of the output files is
+ * the same as the format of the final output file written by
+ * {@link org.apache.spark.shuffle.sort.SortShuffleWriter}: each output partition's records are
+ * written as a single serialized, compressed stream that can be read with a new decompression and
+ * deserialization stream.
+ * <p>
+ * Unlike {@link org.apache.spark.util.collection.ExternalSorter}, this sorter does not merge its
+ * spill files. Instead, this merging is performed in {@link UnsafeShuffleWriter}, which uses a
+ * specialized merge procedure that avoids extra serialization/deserialization.
  */
 public final class UnsafeShuffleSpillWriter {
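The javadoc's key claim is that each partition's bytes form an independently decompressible stream, so per-partition streams can be concatenated into one file and later merged byte-for-byte without deserializing anything. A self-contained illustration of that property (GZIP stands in for Spark's configurable shuffle codec; all names here are invented for the example):

```java
import java.io.*;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

public class PerPartitionStreams {
  public static void main(String[] args) throws IOException {
    // Write two partitions as separately-compressed streams, back to back.
    ByteArrayOutputStream file = new ByteArrayOutputStream();
    long[] partitionLengths = new long[2];
    for (int p = 0; p < 2; p++) {
      long before = file.size();
      GZIPOutputStream gz = new GZIPOutputStream(file);  // fresh stream per partition
      gz.write(("records for partition " + p).getBytes("UTF-8"));
      gz.finish();  // complete this partition's compressed block without closing `file`
      partitionLengths[p] = file.size() - before;
    }
    // A reader can seek to a partition's offset and open a new decompression stream.
    byte[] bytes = file.toByteArray();
    InputStream slice = new ByteArrayInputStream(
        bytes, (int) partitionLengths[0], (int) partitionLengths[1]);
    BufferedReader r = new BufferedReader(new InputStreamReader(new GZIPInputStream(slice)));
    System.out.println(r.readLine());  // prints: records for partition 1
  }
}
```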
@@ -51,23 +62,31 @@ public final class UnsafeShuffleSpillWriter {
 
   private final int initialSize;
   private final int numPartitions;
-  private UnsafeShuffleSorter sorter;
 
   private final TaskMemoryManager memoryManager;
   private final ShuffleMemoryManager shuffleMemoryManager;
   private final BlockManager blockManager;
   private final TaskContext taskContext;
-  private final LinkedList<MemoryBlock> allocatedPages = new LinkedList<MemoryBlock>();
   private final boolean spillingEnabled;
-  private final int fileBufferSize;
   private ShuffleWriteMetrics writeMetrics;
 
-  private MemoryBlock currentPage = null;
-  private long currentPagePosition = -1;
+  /** The buffer size to use when writing spills using DiskBlockObjectWriter */
+  private final int fileBufferSize;
+
+  /**
+   * Memory pages that hold the records being sorted. The pages in this list are freed when
+   * spilling, although in principle we could recycle these pages across spills (on the other hand,
+   * this might not be necessary if we maintained a pool of re-usable pages in the TaskMemoryManager
+   * itself).
+   */
+  private final LinkedList<MemoryBlock> allocatedPages = new LinkedList<MemoryBlock>();
 
   private final LinkedList<SpillInfo> spills = new LinkedList<SpillInfo>();
 
+  // All three of these variables are reset after spilling:
+  private UnsafeShuffleSorter sorter;
+  private MemoryBlock currentPage = null;
+  private long currentPagePosition = -1;
+
   public UnsafeShuffleSpillWriter(
       TaskMemoryManager memoryManager,
       ShuffleMemoryManager shuffleMemoryManager,
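The comment on `allocatedPages` floats the idea of recycling pages across spills, possibly via a pool of re-usable pages in the TaskMemoryManager. Purely as a sketch of that alternative (not Spark code), with a plain `Page` class standing in for Spark's `MemoryBlock`:

```java
import java.util.ArrayDeque;

// Stand-in for Spark's MemoryBlock: a fixed-size chunk of memory.
final class Page {
  final byte[] data;
  Page(int size) { this.data = new byte[size]; }
}

// A minimal page pool: instead of freeing pages on spill, return them here
// and hand them back out on the next allocation.
final class PagePool {
  private final int pageSize;
  private final ArrayDeque<Page> free = new ArrayDeque<Page>();

  PagePool(int pageSize) { this.pageSize = pageSize; }

  Page acquire() {
    Page p = free.poll();
    return (p != null) ? p : new Page(pageSize);  // reuse a returned page if possible
  }

  void release(Page p) { free.push(p); }
}
```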
@@ -90,6 +109,10 @@ public UnsafeShuffleSpillWriter(
 
   // TODO: metrics tracking + integration with shuffle write metrics
 
+  /**
+   * Allocates a new sorter. Called when opening the spill writer for the first time and after
+   * each spill.
+   */
   private void openSorter() throws IOException {
     this.writeMetrics = new ShuffleWriteMetrics();
     // TODO: connect write metrics to task metrics?
@@ -106,22 +129,41 @@ private void openSorter() throws IOException {
     this.sorter = new UnsafeShuffleSorter(initialSize);
   }
 
+  /**
+   * Sorts the in-memory records, writes the sorted records to a spill file, and frees the in-memory
+   * data structures associated with this sort. New data structures are not automatically allocated.
+   */
   private SpillInfo writeSpillFile() throws IOException {
-    final UnsafeShuffleSorter.UnsafeShuffleSorterIterator sortedRecords = sorter.getSortedIterator();
+    // This call performs the actual sort.
+    final UnsafeShuffleSorter.UnsafeShuffleSorterIterator sortedRecords =
+      sorter.getSortedIterator();
 
-    int currentPartition = -1;
+    // Currently, we need to open a new DiskBlockObjectWriter for each partition; we can avoid this
+    // after SPARK-5581 is fixed.
     BlockObjectWriter writer = null;
 
+    // Small writes to DiskBlockObjectWriter will be fairly inefficient. Since there doesn't seem to
+    // be an API to directly transfer bytes from managed memory to the disk writer, we buffer
+    // records in a byte array. This array only needs to be big enough to hold a single record.
     final byte[] arr = new byte[SER_BUFFER_SIZE];
 
-    final Tuple2<TempLocalBlockId, File> spilledFileInfo =
-      blockManager.diskBlockManager().createTempLocalBlock();
+    // Because this output will be read during shuffle, its compression codec must be controlled by
+    // spark.shuffle.compress instead of spark.shuffle.spill.compress, so we need to use
+    // createTempShuffleBlock here; see SPARK-3426 for more details.
+    final Tuple2<TempShuffleBlockId, File> spilledFileInfo =
+      blockManager.diskBlockManager().createTempShuffleBlock();
     final File file = spilledFileInfo._2();
     final BlockId blockId = spilledFileInfo._1();
     final SpillInfo spillInfo = new SpillInfo(numPartitions, file, blockId);
 
+    // Unfortunately, we need a serializer instance in order to construct a DiskBlockObjectWriter.
+    // Our write path doesn't actually use this serializer (since we end up calling the `write()`
+    // OutputStream methods), but DiskBlockObjectWriter still calls some methods on it. To work
+    // around this, we pass a dummy no-op serializer.
    final SerializerInstance ser = new DummySerializerInstance();
     writer = blockManager.getDiskWriter(blockId, file, ser, fileBufferSize, writeMetrics);
 
+    int currentPartition = -1;
     while (sortedRecords.hasNext()) {
       sortedRecords.loadNext();
       final int partition = sortedRecords.packedRecordPointer.getPartitionId();
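To make the staging-buffer comment above concrete: the pattern is a bulk copy from page memory into a reusable `byte[]` that is at least one record long, followed by an ordinary `write()` call. A runnable analogy with a direct `ByteBuffer` standing in for a managed-memory page (the real code copies via Spark's PlatformDependent routines instead):

```java
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;

public class CopyThroughBuffer {
  public static void main(String[] args) throws IOException {
    // Stand-in for a managed-memory data page holding one record's bytes.
    ByteBuffer page = ByteBuffer.allocateDirect(64);
    byte[] record = "some serialized record".getBytes("UTF-8");
    page.put(record);

    // There is no API that writes directly from off-heap memory to the disk
    // writer, so the record is staged through a byte[] big enough for one record.
    byte[] staging = new byte[32];
    OutputStream writer = new ByteArrayOutputStream();  // stand-in for DiskBlockObjectWriter
    page.flip();
    int len = page.remaining();
    page.get(staging, 0, len);      // managed memory -> byte[]
    writer.write(staging, 0, len);  // byte[] -> disk writer
  }
}
```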
@@ -153,7 +195,9 @@ private SpillInfo writeSpillFile() throws IOException {
 
     if (writer != null) {
       writer.commitAndClose();
-      // TODO: comment and explain why our handling of empty spills, etc.
+      // If `writeSpillFile()` was called from `closeAndGetSpills()` and no records were inserted,
+      // then the spill file might be empty. Note that it might be better to avoid calling
+      // writeSpillFile() in that case.
       if (currentPartition != -1) {
         spillInfo.partitionLengths[currentPartition] = writer.fileSegment().length();
         spills.add(spillInfo);
@@ -162,24 +206,30 @@ private SpillInfo writeSpillFile() throws IOException {
     return spillInfo;
   }
 
-  @VisibleForTesting
-  public void spill() throws IOException {
-    final SpillInfo spillInfo = writeSpillFile();
+  /**
+   * Sort and spill the current records in response to memory pressure.
+   */
+  private void spill() throws IOException {
+    final long threadId = Thread.currentThread().getId();
+    logger.info("Thread " + threadId + " spilling sort data of " +
+      org.apache.spark.util.Utils.bytesToString(getMemoryUsage()) + " to disk (" +
+      (spills.size() + (spills.size() > 1 ? " times" : " time")) + " so far)");
+
+    final SpillInfo spillInfo = writeSpillFile();
     final long sorterMemoryUsage = sorter.getMemoryUsage();
     sorter = null;
     shuffleMemoryManager.release(sorterMemoryUsage);
     final long spillSize = freeMemory();
     taskContext.taskMetrics().incMemoryBytesSpilled(spillSize);
     taskContext.taskMetrics().incDiskBytesSpilled(spillInfo.file.length());
-    final long threadId = Thread.currentThread().getId();
-    // TODO: messy; log _before_ spill
-    logger.info("Thread " + threadId + " spilling in-memory map of " +
-      org.apache.spark.util.Utils.bytesToString(spillSize) + " to disk (" +
-      (spills.size() + ((spills.size() > 1) ? " times" : " time")) + " so far)");
 
     openSorter();
   }
 
+  private long getMemoryUsage() {
+    return sorter.getMemoryUsage() + (allocatedPages.size() * PAGE_SIZE);
+  }
+
   private long freeMemory() {
     long memoryFreed = 0;
     final Iterator<MemoryBlock> iter = allocatedPages.iterator();
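A detail worth noting in `spill()`: the two metrics deliberately measure different things. `incMemoryBytesSpilled(spillSize)` records the page memory returned by `freeMemory()`, i.e. the uncompressed in-memory footprint that was released, while `incDiskBytesSpilled(spillInfo.file.length())` records the size of the spill file itself, which is typically smaller when shuffle compression is enabled.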
@@ -194,7 +244,15 @@ private long freeMemory() {
     return memoryFreed;
   }
 
-  private void ensureSpaceInDataPage(int requiredSpace) throws Exception {
+  /**
+   * Checks whether there is enough space to insert a new record into the sorter. If there is
+   * insufficient space, either allocate more memory or spill the current sort data (if spilling
+   * is enabled), then insert the record.
+   */
+  private void ensureSpaceInDataPage(int requiredSpace) throws IOException {
+    // TODO: we should re-order the `if` cases in this function so that the most common case (there
+    // is enough space) appears first.
+
     // TODO: merge these steps to first calculate total memory requirements for this insert,
     // then try to acquire; no point in acquiring sort buffer only to spill due to no space in the
     // data page.
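A compilable sketch of the control flow the new javadoc describes, using stand-in fields and an assumed page size; the real method acquires and releases memory through the ShuffleMemoryManager rather than these stubs:

```java
import java.io.IOException;

// Stand-in fields and an assumed page size; illustrative only.
class SpaceCheckSketch {
  static final long PAGE_SIZE = 1L << 20;
  boolean sorterFull = false;
  boolean spillingEnabled = true;
  boolean havePage = false;
  long currentPagePosition = 0;

  void ensureSpaceInDataPage(int requiredSpace) throws IOException {
    // Step 1: make sure the sort array can take another record pointer.
    if (sorterFull) {
      spill();  // the real code first tries to grow the sort buffer in place
    }
    // Step 2: make sure the current data page can hold the record's bytes.
    long spaceInCurrentPage = havePage ? PAGE_SIZE - currentPagePosition : 0;
    if (requiredSpace > PAGE_SIZE) {
      throw new IOException("Required space " + requiredSpace + " exceeds the page size");
    } else if (requiredSpace > spaceInCurrentPage) {
      if (!tryAllocateNewPage()) {
        if (!spillingEnabled) {
          throw new IOException("Can't allocate memory!");
        }
        spill();  // release memory, then retry the allocation once
        if (!tryAllocateNewPage()) {
          throw new IOException("Can't allocate memory!");
        }
      }
    }
  }

  boolean tryAllocateNewPage() { havePage = true; currentPagePosition = 0; return true; }
  void spill() { sorterFull = false; havePage = false; }
}
```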
@@ -219,7 +277,7 @@ private void ensureSpaceInDataPage(int requiredSpace) throws Exception {
     }
     if (requiredSpace > PAGE_SIZE) {
       // TODO: throw a more specific exception?
-      throw new Exception("Required space " + requiredSpace + " is greater than page size (" +
+      throw new IOException("Required space " + requiredSpace + " is greater than page size (" +
         PAGE_SIZE + ")");
     } else if (requiredSpace > spaceInCurrentPage) {
       if (spillingEnabled) {
@@ -230,7 +288,7 @@ private void ensureSpaceInDataPage(int requiredSpace) throws Exception {
         final long memoryAcquiredAfterSpill = shuffleMemoryManager.tryToAcquire(PAGE_SIZE);
         if (memoryAcquiredAfterSpill != PAGE_SIZE) {
           shuffleMemoryManager.release(memoryAcquiredAfterSpill);
-          throw new Exception("Can't allocate memory!");
+          throw new IOException("Can't allocate memory!");
         }
       }
     }
@@ -241,11 +299,14 @@ private void ensureSpaceInDataPage(int requiredSpace) throws Exception {
     }
   }
 
+  /**
+   * Write a record to the shuffle sorter.
+   */
   public void insertRecord(
       Object recordBaseObject,
       long recordBaseOffset,
       int lengthInBytes,
-      int prefix) throws Exception {
+      int partitionId) throws IOException {
     // Need 4 bytes to store the record length.
     ensureSpaceInDataPage(lengthInBytes + 4);
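The `lengthInBytes + 4` reservation reflects the record layout: a 4-byte length header followed by the record's bytes, with the record's address handed to the sorter alongside its partition id. A self-contained illustration, with a `ByteBuffer` standing in for the data page:

```java
import java.nio.ByteBuffer;

public class RecordLayout {
  public static void main(String[] args) {
    // Stand-in for a data page: each record is stored as a 4-byte length
    // followed by the record's bytes, which is why insertRecord() reserves
    // lengthInBytes + 4 bytes of page space.
    ByteBuffer page = ByteBuffer.allocate(1024);
    byte[] record = {10, 20, 30};
    long recordAddress = page.position();  // the pointer handed to the sorter
    page.putInt(record.length);            // 4-byte length header
    page.put(record);                      // record payload

    // Reading it back: length first, then exactly that many bytes.
    page.flip();
    int len = page.getInt();
    byte[] out = new byte[len];
    page.get(out);
    System.out.println(len + " bytes read back from address " + recordAddress);
  }
}
```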
@@ -262,12 +323,20 @@ public void insertRecord(
       lengthInBytes);
     currentPagePosition += lengthInBytes;
 
-    sorter.insertRecord(recordAddress, prefix);
+    sorter.insertRecord(recordAddress, partitionId);
   }
 
+  /**
+   * Close the sorter, causing any buffered data to be sorted and written out to disk.
+   *
+   * @return metadata for the spill files written by this sorter. If no records were ever inserted
+   *         into this sorter, then this will return an empty array.
+   * @throws IOException
+   */
   public SpillInfo[] closeAndGetSpills() throws IOException {
     if (sorter != null) {
       writeSpillFile();
       freeMemory();
     }
     return spills.toArray(new SpillInfo[0]);
   }
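For orientation, this is roughly how a caller such as UnsafeShuffleWriter might drive the class. The constructor call is omitted because its full parameter list is not visible in this diff, so treat this as a sketch that only type-checks inside the PR's source tree:

```java
import java.io.IOException;

import org.apache.spark.shuffle.unsafe.SpillInfo;
import org.apache.spark.shuffle.unsafe.UnsafeShuffleSpillWriter;
import org.apache.spark.unsafe.PlatformDependent;

class SpillWriterUsageSketch {
  // `spillWriter` is assumed to be constructed elsewhere with the task's
  // memory, shuffle-memory, and block managers.
  static SpillInfo[] writeAll(
      UnsafeShuffleSpillWriter spillWriter,
      byte[][] serializedRecords,
      int[] partitionIds) throws IOException {
    for (int i = 0; i < serializedRecords.length; i++) {
      // Records are passed as a (base object, offset, length) triple plus the
      // record's target partition.
      spillWriter.insertRecord(
        serializedRecords[i],
        PlatformDependent.BYTE_ARRAY_OFFSET,
        serializedRecords[i].length,
        partitionIds[i]);
    }
    // Sorts and flushes anything still buffered; empty if nothing was inserted.
    return spillWriter.closeAndGetSpills();
  }
}
```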
Review comment:
nit: wrong import order
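Assuming the nit refers to Spark's usual import grouping (java first, then scala, then other third-party, then org.apache.spark), the reordered block would look roughly like this:

```java
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedList;

import scala.Tuple2;

import com.google.common.annotations.VisibleForTesting;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.spark.SparkConf;
import org.apache.spark.TaskContext;
import org.apache.spark.executor.ShuffleWriteMetrics;
import org.apache.spark.serializer.SerializerInstance;
import org.apache.spark.shuffle.ShuffleMemoryManager;
import org.apache.spark.storage.*;
import org.apache.spark.unsafe.PlatformDependent;
import org.apache.spark.unsafe.memory.MemoryBlock;
import org.apache.spark.unsafe.memory.TaskMemoryManager;
```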