[SPARK-8029][core] first successful shuffle task always wins #9214
**`BypassMergeSortShuffleWriter.java`**

```diff
@@ -21,6 +21,7 @@
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
 import java.io.IOException;
+import java.util.Arrays;
 import javax.annotation.Nullable;

 import scala.None$;
@@ -41,12 +42,15 @@
 import org.apache.spark.executor.ShuffleWriteMetrics;
 import org.apache.spark.scheduler.MapStatus;
 import org.apache.spark.scheduler.MapStatus$;
+import org.apache.spark.shuffle.IndexShuffleBlockResolver$;
 import org.apache.spark.serializer.Serializer;
 import org.apache.spark.serializer.SerializerInstance;
 import org.apache.spark.shuffle.IndexShuffleBlockResolver;
 import org.apache.spark.shuffle.ShuffleWriter;
 import org.apache.spark.storage.*;
 import org.apache.spark.util.Utils;
+import scala.collection.JavaConverters;
+import scala.collection.Seq;

 /**
  * This class implements sort-based shuffle's hash-style shuffle fallback path. This write path
@@ -121,13 +125,19 @@ public BypassMergeSortShuffleWriter(
   }

   @Override
-  public void write(Iterator<Product2<K, V>> records) throws IOException {
+  public Seq<Tuple2<File, File>> write(Iterator<Product2<K, V>> records) throws IOException {
     assert (partitionWriters == null);
+    final File indexFile = blockManager.diskBlockManager().getFile(new ShuffleIndexBlockId(
+      shuffleId, mapId, IndexShuffleBlockResolver$.MODULE$.NOOP_REDUCE_ID())
+    );
+    final File dataFile = shuffleBlockResolver.getDataFile(shuffleId, mapId);
     if (!records.hasNext()) {
       partitionLengths = new long[numPartitions];
-      shuffleBlockResolver.writeIndexFile(shuffleId, mapId, partitionLengths);
+      File tmpIndexFile = shuffleBlockResolver.writeIndexFile(shuffleId, mapId, partitionLengths);
       mapStatus = MapStatus$.MODULE$.apply(blockManager.shuffleServerId(), partitionLengths);
-      return;
+      return JavaConverters.asScalaBufferConverter(Arrays.asList(
+        new Tuple2<>(tmpIndexFile, indexFile)
+      )).asScala();
     }
     final SerializerInstance serInstance = serializer.newInstance();
     final long openStartTime = System.nanoTime();
@@ -155,10 +165,16 @@ public void write(Iterator<Product2<K, V>> records) throws IOException {
       writer.commitAndClose();
     }

-    partitionLengths =
-      writePartitionedFile(shuffleBlockResolver.getDataFile(shuffleId, mapId));
-    shuffleBlockResolver.writeIndexFile(shuffleId, mapId, partitionLengths);
+    File tmpDataFile = blockManager.diskBlockManager().createTempShuffleBlock()._2();
+
+    partitionLengths = writePartitionedFile(tmpDataFile);
```
Inline comments on `partitionLengths = writePartitionedFile(tmpDataFile);`:

> **Contributor (author):** the type of tmp block used here doesn't matter, since …

> **Contributor:** If compression is employed, then each region of the single "partitioned" file is compressed separately. So the concatenation here should be correct.
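To make the reviewer's point concrete, here is a minimal, self-contained sketch (not from the PR) of why concatenating separately-compressed regions works, using JDK gzip as a stand-in for Spark's shuffle compression codec:

```scala
import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.util.zip.{GZIPInputStream, GZIPOutputStream}

object ConcatCompressedRegions {
  // Compress one partition's bytes as an independent stream.
  private def compress(bytes: Array[Byte]): Array[Byte] = {
    val bos = new ByteArrayOutputStream()
    val gz = new GZIPOutputStream(bos)
    gz.write(bytes)
    gz.close()
    bos.toByteArray
  }

  def main(args: Array[String]): Unit = {
    val region0 = compress("records for partition 0".getBytes("UTF-8"))
    val region1 = compress("records for partition 1".getBytes("UTF-8"))
    // Concatenate the separately-compressed regions into one "data file",
    // recording each region's length, as the shuffle index file does.
    val dataFile = region0 ++ region1
    val lengths = Array(region0.length, region1.length)

    // A reader that knows a region's (offset, length) can decompress it
    // without touching any other region.
    val in = new GZIPInputStream(
      new ByteArrayInputStream(dataFile, lengths(0), lengths(1)))
    val buf = new Array[Byte](1024)
    val n = in.read(buf)
    println(new String(buf, 0, n, "UTF-8")) // prints: records for partition 1
  }
}
```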
```diff
+    File tmpIndexFile = shuffleBlockResolver.writeIndexFile(shuffleId, mapId, partitionLengths);
     mapStatus = MapStatus$.MODULE$.apply(blockManager.shuffleServerId(), partitionLengths);
+    return JavaConverters.asScalaBufferConverter(Arrays.asList(
+      new Tuple2<>(tmpIndexFile, indexFile),
+      new Tuple2<>(tmpDataFile, dataFile)
+    )).asScala();
   }

   @VisibleForTesting
```
**`ShuffleOutputCoordinator.scala`** (new file, +73 lines)

```scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.spark.shuffle

import java.io.File

import org.apache.spark.Logging

/**
 * Ensures that on each executor, there are no conflicting writes to the same shuffle files. It
 * implements "first write wins", by atomically moving all shuffle files into their final location,
 * only if the files did not already exist. See SPARK-8029.
 */
object ShuffleOutputCoordinator extends Logging {

  /**
   * If all of the destination files do not exist, then move all of the temporary files to their
   * destinations. If any destination files exist, then simply delete all temporary files.
   *
```

> **Contributor:** nit: stray empty line.

```scala
   * @param tmpToDest pairs of (temporary, destination) file pairs
   * @return
   */
  def commitOutputs(
      shuffleId: Int,
      partitionId: Int,
      tmpToDest: Seq[(File, File)]): Boolean = synchronized {
    logInfo(s"renaming: $tmpToDest")

    // HashShuffleWriter might not write any records to some of its files -- that's OK, we only
    // move the files that do exist
    val toMove = tmpToDest.filter { _._1.exists() }

    val destAlreadyExists = toMove.forall(_._2.exists)
    if (!destAlreadyExists) {
      // if any of the renames fail, delete all the dest files. otherwise, future
      // attempts have no hope of succeeding
      val renamesSucceeded = toMove.map { case (tmp, dest) =>
        if (dest.exists()) {
          dest.delete()
        }
        val r = tmp.renameTo(dest)
        if (!r) {
          logInfo(s"failed to rename $tmp to $dest. ${tmp.exists()}; ${dest.exists()}")
        }
        r
      }.forall(identity)
      if (!renamesSucceeded) {
        toMove.foreach { case (tmp, dest) => if (dest.exists()) dest.delete() }
        false
      } else {
        true
      }
    } else {
      logInfo(s"shuffle output for shuffle $shuffleId, partition $partitionId already exists, " +
        s"not overwriting. Another task must have created this shuffle output.")
      toMove.foreach { case (tmp, _) => tmp.delete() }
      false
    }
  }
}
```
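For context, a hypothetical call site (not part of this diff; the surrounding task plumbing is assumed) showing how the pairs returned by the writer's `write()` would flow into `commitOutputs`:

```scala
import java.io.File
import org.apache.spark.shuffle.ShuffleOutputCoordinator

// Hypothetical glue code (not in this PR): publish a finished task's output.
// tmpToDest is the Seq[(File, File)] returned by the writer's write() method.
def publishShuffleOutput(
    shuffleId: Int,
    partitionId: Int,
    tmpToDest: Seq[(File, File)]): Boolean = {
  // First write wins: true means this attempt's files were atomically moved
  // into place; false means another attempt committed first, in which case
  // this attempt's temporary files have been deleted.
  ShuffleOutputCoordinator.commitOutputs(shuffleId, partitionId, tmpToDest)
}
```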
> **Reviewer:** Could you add a javadoc explaining what the return value is? It's particularly cryptic because it uses tuples; maybe it would be better to create a helper type where the fields have proper names.
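For illustration only, a helper type along the lines the reviewer suggests might look like this (a sketch, not part of the PR; the name `TmpDestPair` is invented):

```scala
import java.io.File

// Hypothetical replacement for the anonymous (File, File) tuple: named fields
// make the direction of the move explicit at call sites.
case class TmpDestPair(tmpFile: File, destFile: File)

// commitOutputs would then take Seq[TmpDestPair], and callers would write
// TmpDestPair(tmpIndexFile, indexFile) instead of a bare tuple.
```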