[SPARK-25299] Implement default version of the API for shuffle writes #6
BypassMergeSortShuffleWriter.java
@@ -17,12 +17,10 @@ | |
|
|
||
| package org.apache.spark.shuffle.sort; | ||
|
|
||
| import java.io.File; | ||
| import java.io.FileInputStream; | ||
| import java.io.FileOutputStream; | ||
| import java.io.IOException; | ||
| import java.io.*; | ||
| import javax.annotation.Nullable; | ||
|
|
||
| import org.apache.spark.api.shuffle.ShufflePartitionWriter; | ||
| import scala.None$; | ||
| import scala.Option; | ||
| import scala.Product2; | ||
|
|
@@ -34,6 +32,8 @@ | |
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
|
|
||
| import org.apache.spark.api.shuffle.ShuffleExecutorComponents; | ||
| import org.apache.spark.api.shuffle.ShuffleMapOutputWriter; | ||
| import org.apache.spark.internal.config.package$; | ||
| import org.apache.spark.Partitioner; | ||
| import org.apache.spark.ShuffleDependency; | ||
|
|
@@ -79,9 +79,11 @@ final class BypassMergeSortShuffleWriter<K, V> extends ShuffleWriter<K, V> { | |
| private final BlockManager blockManager; | ||
| private final Partitioner partitioner; | ||
| private final ShuffleWriteMetricsReporter writeMetrics; | ||
| private final String appId; | ||
| private final int shuffleId; | ||
| private final int mapId; | ||
| private final Serializer serializer; | ||
| private final ShuffleExecutorComponents shuffleExecutorComponents; | ||
| private final IndexShuffleBlockResolver shuffleBlockResolver; | ||
|
|
||
| /** Array of file writers, one for each partition */ | ||
|
|
@@ -103,70 +105,76 @@ final class BypassMergeSortShuffleWriter<K, V> extends ShuffleWriter<K, V> { | |
| BypassMergeSortShuffleHandle<K, V> handle, | ||
| int mapId, | ||
| SparkConf conf, | ||
| ShuffleWriteMetricsReporter writeMetrics) { | ||
| ShuffleWriteMetricsReporter writeMetrics, | ||
| ShuffleExecutorComponents shuffleExecutorComponents) { | ||
|
||
| // Use getSizeAsKb (not bytes) to maintain backwards compatibility if no units are provided | ||
| this.fileBufferSize = (int) (long) conf.get(package$.MODULE$.SHUFFLE_FILE_BUFFER_SIZE()) * 1024; | ||
| this.transferToEnabled = conf.getBoolean("spark.file.transferTo", true); | ||
| this.blockManager = blockManager; | ||
| final ShuffleDependency<K, V, V> dep = handle.dependency(); | ||
| this.appId = conf.getAppId(); | ||
| this.mapId = mapId; | ||
| this.shuffleId = dep.shuffleId(); | ||
| this.partitioner = dep.partitioner(); | ||
| this.numPartitions = partitioner.numPartitions(); | ||
| this.writeMetrics = writeMetrics; | ||
| this.serializer = dep.serializer(); | ||
| this.shuffleBlockResolver = shuffleBlockResolver; | ||
| this.shuffleExecutorComponents = shuffleExecutorComponents; | ||
| } | ||
|
|
||
| @Override | ||
| public void write(Iterator<Product2<K, V>> records) throws IOException { | ||
| assert (partitionWriters == null); | ||
| if (!records.hasNext()) { | ||
| partitionLengths = new long[numPartitions]; | ||
| shuffleBlockResolver.writeIndexFileAndCommit(shuffleId, mapId, partitionLengths, null); | ||
| mapStatus = MapStatus$.MODULE$.apply(blockManager.shuffleServerId(), partitionLengths); | ||
| return; | ||
| } | ||
| final SerializerInstance serInstance = serializer.newInstance(); | ||
| final long openStartTime = System.nanoTime(); | ||
| partitionWriters = new DiskBlockObjectWriter[numPartitions]; | ||
| partitionWriterSegments = new FileSegment[numPartitions]; | ||
| for (int i = 0; i < numPartitions; i++) { | ||
| final Tuple2<TempShuffleBlockId, File> tempShuffleBlockIdPlusFile = | ||
| blockManager.diskBlockManager().createTempShuffleBlock(); | ||
| final File file = tempShuffleBlockIdPlusFile._2(); | ||
| final BlockId blockId = tempShuffleBlockIdPlusFile._1(); | ||
| partitionWriters[i] = | ||
| blockManager.getDiskWriter(blockId, file, serInstance, fileBufferSize, writeMetrics); | ||
| } | ||
| // Creating the file to write to and creating a disk writer both involve interacting with | ||
| // the disk, and can take a long time in aggregate when we open many files, so should be | ||
| // included in the shuffle write time. | ||
| writeMetrics.incWriteTime(System.nanoTime() - openStartTime); | ||
|
|
||
| while (records.hasNext()) { | ||
| final Product2<K, V> record = records.next(); | ||
| final K key = record._1(); | ||
| partitionWriters[partitioner.getPartition(key)].write(key, record._2()); | ||
| } | ||
| ShuffleMapOutputWriter mapOutputWriter = shuffleExecutorComponents.writes() | ||
| .createMapOutputWriter(appId, shuffleId, mapId, numPartitions); | ||
|
||
| try { | ||
| if (!records.hasNext()) { | ||
| partitionLengths = new long[numPartitions]; | ||
| mapOutputWriter.commitAllPartitions(); | ||
| mapStatus = MapStatus$.MODULE$.apply(blockManager.shuffleServerId(), partitionLengths); | ||
| return; | ||
| } | ||
| final SerializerInstance serInstance = serializer.newInstance(); | ||
| final long openStartTime = System.nanoTime(); | ||
| partitionWriters = new DiskBlockObjectWriter[numPartitions]; | ||
| partitionWriterSegments = new FileSegment[numPartitions]; | ||
| for (int i = 0; i < numPartitions; i++) { | ||
| final Tuple2<TempShuffleBlockId, File> tempShuffleBlockIdPlusFile = | ||
| blockManager.diskBlockManager().createTempShuffleBlock(); | ||
| final File file = tempShuffleBlockIdPlusFile._2(); | ||
| final BlockId blockId = tempShuffleBlockIdPlusFile._1(); | ||
| partitionWriters[i] = | ||
| blockManager.getDiskWriter(blockId, file, serInstance, fileBufferSize, writeMetrics); | ||
| } | ||
| // Creating the file to write to and creating a disk writer both involve interacting with | ||
| // the disk, and can take a long time in aggregate when we open many files, so should be | ||
| // included in the shuffle write time. | ||
| writeMetrics.incWriteTime(System.nanoTime() - openStartTime); | ||
|
|
||
| for (int i = 0; i < numPartitions; i++) { | ||
| try (DiskBlockObjectWriter writer = partitionWriters[i]) { | ||
| partitionWriterSegments[i] = writer.commitAndGet(); | ||
| while (records.hasNext()) { | ||
| final Product2<K, V> record = records.next(); | ||
| final K key = record._1(); | ||
| partitionWriters[partitioner.getPartition(key)].write(key, record._2()); | ||
| } | ||
| } | ||
|
|
||
| File output = shuffleBlockResolver.getDataFile(shuffleId, mapId); | ||
| File tmp = Utils.tempFileWith(output); | ||
| try { | ||
| partitionLengths = writePartitionedFile(tmp); | ||
| shuffleBlockResolver.writeIndexFileAndCommit(shuffleId, mapId, partitionLengths, tmp); | ||
| } finally { | ||
| if (tmp.exists() && !tmp.delete()) { | ||
| logger.error("Error while deleting temp file {}", tmp.getAbsolutePath()); | ||
| for (int i = 0; i < numPartitions; i++) { | ||
| try (DiskBlockObjectWriter writer = partitionWriters[i]) { | ||
| partitionWriterSegments[i] = writer.commitAndGet(); | ||
| } | ||
| } | ||
|
|
||
| partitionLengths = writePartitionedData(mapOutputWriter); | ||
| mapOutputWriter.commitAllPartitions(); | ||
| mapStatus = MapStatus$.MODULE$.apply(blockManager.shuffleServerId(), partitionLengths); | ||
| } catch (Exception e) { | ||
| try { | ||
| mapOutputWriter.abort(e); | ||
| } catch (Exception e2) { | ||
| logger.error("Failed to abort the writer after failing to write map output.", e2); | ||
| } | ||
| throw e; | ||
| } | ||
| mapStatus = MapStatus$.MODULE$.apply(blockManager.shuffleServerId(), partitionLengths); | ||
| } | ||
|
|
||
| @VisibleForTesting | ||
|
|
@@ -179,37 +187,38 @@ long[] getPartitionLengths() { | |
| * | ||
| * @return array of lengths, in bytes, of each partition of the file (used by map output tracker). | ||
| */ | ||
| private long[] writePartitionedFile(File outputFile) throws IOException { | ||
| private long[] writePartitionedData(ShuffleMapOutputWriter mapOutputWriter) throws IOException { | ||
| // Track location of the partition starts in the output file | ||
| final long[] lengths = new long[numPartitions]; | ||
| if (partitionWriters == null) { | ||
| // We were passed an empty iterator | ||
| return lengths; | ||
| } | ||
|
|
||
| final FileOutputStream out = new FileOutputStream(outputFile, true); | ||
| final long writeStartTime = System.nanoTime(); | ||
| boolean threwException = true; | ||
| try { | ||
| for (int i = 0; i < numPartitions; i++) { | ||
| final File file = partitionWriterSegments[i].file(); | ||
| if (file.exists()) { | ||
| final FileInputStream in = new FileInputStream(file); | ||
| boolean copyThrewException = true; | ||
| try { | ||
| lengths[i] = Utils.copyStream(in, out, false, transferToEnabled); | ||
| copyThrewException = false; | ||
| } finally { | ||
| Closeables.close(in, copyThrewException); | ||
| } | ||
| if (!file.delete()) { | ||
| logger.error("Unable to delete file for partition {}", i); | ||
| boolean copyThrewException = true; | ||
| // TODO: Enable transferTo | ||
| ShufflePartitionWriter writer = mapOutputWriter.getNextPartitionWriter(); | ||
| try (OutputStream tempOutputStream = writer.openStream()) { | ||
| if (file.exists()) { | ||
| FileInputStream in = new FileInputStream(file); | ||
| try { | ||
| Utils.copyStream(in, tempOutputStream, false, false); | ||
|
So using

Actually, on second thought, you probably just want to fork the code paths here into a "transferTo" block and an "outputstreams" block to allow more flexibility in the plugins to return whatever output streams they want.
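A rough sketch of what that fork could look like, building on the ShufflePartitionWriter interface this PR introduces (openStream() and openChannel(), both shown elsewhere in this diff). The helper class and method names, the buffer size, and the copy loops below are illustrative only, not the code this PR ends up with; closing of the plugin-provided stream or channel is also left out, since that part of the contract is still being discussed.

```java
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.channels.FileChannel;
import java.nio.channels.WritableByteChannel;

import org.apache.spark.api.shuffle.ShufflePartitionWriter;

final class PartitionCopySketch {

  /**
   * Copies one partition's temporary spill file into the plugin-provided
   * partition writer. When transferTo is enabled, the copy goes through a
   * byte channel so the OS can move bytes without a user-space buffer;
   * otherwise it falls back to a plain OutputStream copy, which works with
   * whatever stream the plugin hands back.
   */
  static long copyPartition(
      File partitionFile,
      ShufflePartitionWriter partitionWriter,
      boolean transferToEnabled) throws IOException {
    long copied = 0L;
    if (transferToEnabled) {
      // Channel-based path ("transferTo" block).
      WritableByteChannel out = partitionWriter.openChannel();
      try (FileChannel in = new FileInputStream(partitionFile).getChannel()) {
        long size = in.size();
        while (copied < size) {
          copied += in.transferTo(copied, size - copied, out);
        }
      }
    } else {
      // Stream-based path ("outputstreams" block).
      OutputStream out = partitionWriter.openStream();
      try (InputStream in = new FileInputStream(partitionFile)) {
        byte[] buffer = new byte[8192];
        int read;
        while ((read = in.read(buffer)) != -1) {
          out.write(buffer, 0, read);
          copied += read;
        }
      }
    }
    return copied;
  }
}
```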
||
| copyThrewException = false; | ||
| } finally { | ||
| Closeables.close(in, copyThrewException); | ||
| } | ||
| } | ||
| } | ||
| lengths[i] = writer.getLength(); | ||
| if (file.exists() && !file.delete()) { | ||
| logger.error("Unable to delete file for partition {}", i); | ||
| } | ||
| } | ||
| threwException = false; | ||
| } finally { | ||
| Closeables.close(out, threwException); | ||
| writeMetrics.incWriteTime(System.nanoTime() - writeStartTime); | ||
| } | ||
| partitionWriters = null; | ||
|
|
||
DefaultShuffleBlockOutputStream.java
@@ -20,7 +20,5 @@ | |
| import java.io.OutputStream; | ||
|
|
||
| public abstract class DefaultShuffleBlockOutputStream extends OutputStream { | ||
|
||
|
|
||
| public abstract int getCount(); | ||
|
|
||
| public abstract int getCount(); | ||
| } | ||
DefaultShuffleMapOutputWriter.java
@@ -17,10 +17,10 @@ | |
|
|
||
| package org.apache.spark.shuffle.sort.io; | ||
|
|
||
| import java.io.BufferedOutputStream; | ||
| import java.io.File; | ||
| import java.io.FileOutputStream; | ||
| import java.io.IOException; | ||
| import java.io.*; | ||
| import java.nio.channels.Channels; | ||
| import java.nio.channels.WritableByteChannel; | ||
|
|
||
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
|
|
||
|
|
@@ -40,13 +40,12 @@ public class DefaultShuffleMapOutputWriter implements ShuffleMapOutputWriter { | |
|
|
||
| private final int shuffleId; | ||
| private final int mapId; | ||
| private final int numPartitions; | ||
| private final ShuffleWriteMetricsReporter metrics; | ||
| private final IndexShuffleBlockResolver blockResolver; | ||
| private final long[] partitionLengths; | ||
| private ShufflePartitionWriter[] partitionWriters; | ||
| private final int bufferSize; | ||
| private int currPartitionId = 0; | ||
| private boolean successfulWrite = false; | ||
|
|
||
| private final File outputFile; | ||
| private final File outputTempFile; | ||
|
|
@@ -63,45 +62,38 @@ public DefaultShuffleMapOutputWriter( | |
| SparkConf sparkConf) { | ||
| this.shuffleId = shuffleId; | ||
| this.mapId = mapId; | ||
| this.numPartitions = numPartitions; | ||
| this.metrics = metrics; | ||
| this.blockResolver = blockResolver; | ||
| this.bufferSize = | ||
| (int) (long) sparkConf.get( | ||
| package$.MODULE$.SHUFFLE_UNSAFE_FILE_OUTPUT_BUFFER_SIZE()) * 1024; | ||
| this.partitionLengths = new long[numPartitions]; | ||
| this.partitionWriters = new ShufflePartitionWriter[numPartitions]; | ||
| this.outputFile = blockResolver.getDataFile(shuffleId, mapId); | ||
| this.outputTempFile = Utils.tempFileWith(outputFile); | ||
| } | ||
|
|
||
| @Override | ||
| public ShufflePartitionWriter getNextPartitionWriter() throws IOException { | ||
| initStream(); | ||
| ShufflePartitionWriter shufflePartitionWriter = | ||
| new DefaultShufflePartitionWriter( | ||
| new DefaultShuffleBlockOutputStreamImpl()); | ||
| partitionWriters[currPartitionId++] = shufflePartitionWriter; | ||
| return shufflePartitionWriter; | ||
| return new DefaultShufflePartitionWriter(currPartitionId++); | ||
| } | ||
|
|
||
| @Override | ||
| public void commitAllPartitions() throws IOException { | ||
|
||
| for (int pId = 0; pId < numPartitions; pId ++) { | ||
| partitionLengths[pId] = partitionWriters[pId].getLength(); | ||
| } | ||
| cleanUp(); | ||
| blockResolver.writeIndexFileAndCommit(shuffleId, mapId, partitionLengths, outputTempFile); | ||
| // TODO: Maybe monitor a little more intelligibly than checking files? | ||
| if (!outputFile.getParentFile().isDirectory() && !outputFile.getParentFile().mkdirs()) { | ||
| throw new IOException( | ||
| String.format( | ||
| "Failed to create shuffle file directory at %s.", | ||
| outputFile.getParentFile().getAbsolutePath())); | ||
| } | ||
| if (!outputFile.isFile() && !outputFile.createNewFile()) { | ||
| throw new IOException( | ||
| String.format( | ||
| "Failed to create empty shuffle file at %s.", outputFile.getAbsolutePath())); | ||
| if (!successfulWrite) { | ||
|
||
| if (!outputFile.getParentFile().isDirectory() && !outputFile.getParentFile().mkdirs()) { | ||
| throw new IOException( | ||
| String.format( | ||
| "Failed to create shuffle file directory at %s.", | ||
| outputFile.getParentFile().getAbsolutePath())); | ||
| } | ||
| if (!outputFile.isFile() && !outputFile.createNewFile()) { | ||
|
||
| throw new IOException( | ||
| String.format( | ||
| "Failed to create empty shuffle file at %s.", outputFile.getAbsolutePath())); | ||
| } | ||
| } | ||
|
||
| } | ||
|
|
||
|
|
@@ -138,10 +130,42 @@ private void initStream() throws IOException { | |
| } | ||
| } | ||
|
|
||
| private class DefaultShuffleBlockOutputStreamImpl extends DefaultShuffleBlockOutputStream { | ||
| private class DefaultShufflePartitionWriter implements ShufflePartitionWriter { | ||
|
|
||
| private boolean isClosed = false; | ||
| private final int partitionId; | ||
| private final DefaultShuffleBlockOutputStream stream; | ||
|
|
||
| private DefaultShufflePartitionWriter(int partitionId) { | ||
| this.partitionId = partitionId; | ||
| this.stream = new DefaultShuffleBlockOutputStreamImpl(); | ||
| } | ||
|
|
||
| @Override | ||
| public OutputStream openStream() throws IOException { | ||
| return stream; | ||
| } | ||
|
|
||
| @Override | ||
| public long getLength() { | ||
| try { | ||
| stream.close(); | ||
| } catch (Exception e) { | ||
| log.error("Error with closing stream for partition", e); | ||
| } | ||
| int length = stream.getCount(); | ||
| partitionLengths[partitionId] = length; | ||
| return length; | ||
| } | ||
|
|
||
| @Override | ||
| public WritableByteChannel openChannel() throws IOException { | ||
| return Channels.newChannel(outputFileStream); | ||
|
||
| } | ||
| } | ||
|
|
||
| private class DefaultShuffleBlockOutputStreamImpl extends DefaultShuffleBlockOutputStream { | ||
| private int count = 0; | ||
| private boolean isClosed = false; | ||
|
|
||
| @Override | ||
| public int getCount() { | ||
|
|
@@ -153,6 +177,7 @@ public void write(int b) throws IOException { | |
| if (isClosed) { | ||
| throw new IllegalStateException("Attempting to write to a closed block byte channel."); | ||
| } | ||
| successfulWrite = true; | ||
| outputBufferedFileStream.write(b); | ||
| count++; | ||
| } | ||
|
|
||
Don't use * import
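For reference, restoring explicit imports in place of the wildcard would look roughly like the list below, shown for BypassMergeSortShuffleWriter after this change; the exact set depends on which java.io classes each file still uses once the diff is final.

```java
// Explicit java.io imports instead of the wildcard `import java.io.*;`
// (illustrative list; verify against the final contents of each file)
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.OutputStream;
```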