36 commits
149ea3e  ShuffleWriters write to temp file, then go through (squito, Oct 21, 2015)
cf8118e  assorted cleanup (squito, Oct 22, 2015)
ea1ae07  style (squito, Oct 22, 2015)
9356c67  fix compilation in StoragePerfTester (squito, Oct 22, 2015)
2b42eb5  mima (squito, Oct 22, 2015)
32d4b3b  update UnsafeShuffleWriterSuite (squito, Oct 22, 2015)
550e198  fix imports (squito, Oct 22, 2015)
4ff98bf  should work now, but needs cleanup (squito, Oct 23, 2015)
4a19702  only consider tmp files that exist; only consider the dest pre-existi… (squito, Oct 23, 2015)
89063dd  cleanup (squito, Oct 23, 2015)
4145651  ShuffleOutputCoordinatorSuite (squito, Oct 23, 2015)
2089e12  cleanup (squito, Oct 23, 2015)
2e9bbaa  Merge branch 'master' into SPARK-8029_first_wins (squito, Oct 26, 2015)
4cd423e  write the winning mapStatus to disk, so subsequent tasks can respond … (squito, Oct 26, 2015)
dc4b7f6  fix imports (squito, Oct 26, 2015)
b7a0981  fixes (squito, Oct 26, 2015)
830a097  shuffle writers must always write all tmp files (squito, Oct 27, 2015)
5d11eca  more fixes for zero-sized blocks (squito, Oct 27, 2015)
3f5af9c  dont make ShuffleWriter return mapStatusFile (squito, Oct 27, 2015)
4b7c71a  rather than requiring all tmp files to exist, just write a zero-lengt… (squito, Oct 27, 2015)
eabf978  update test case (squito, Oct 27, 2015)
5bbeec3  minor cleanup (squito, Oct 27, 2015)
e141d82  test that shuffle output files are always the same (squito, Oct 27, 2015)
4df7955  fix compression settings of tmp files; minor cleanup (squito, Oct 27, 2015)
dc076b8  fix tests (squito, Oct 27, 2015)
cfdfd2c  review feedback (squito, Nov 3, 2015)
86f468a  Merge branch 'master' into SPARK-8029_first_wins (squito, Nov 4, 2015)
5c8b247  fix imports (squito, Nov 4, 2015)
4d66df1  fix more imports (squito, Nov 4, 2015)
e59df41  couple more nits ... (squito, Nov 4, 2015)
c206fc5  minor cleanup (squito, Nov 5, 2015)
c0edff1  style (squito, Nov 5, 2015)
da33519  Merge branch 'master' into SPARK-8029_first_wins (squito, Nov 11, 2015)
c0b93a5  create temporary files in same location as destination files (squito, Nov 11, 2015)
9d0d9d9  no more @VisibleForTesting (squito, Nov 11, 2015)
80e037d  unused import (squito, Nov 11, 2015)
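The commit messages above trace a single design: shuffle writers always write their output to temporary files and return (tmpFile, destFile) pairs, and a ShuffleOutputCoordinator moves the first finished attempt's files into place, so later attempts see the existing output and discard their own. Below is a minimal sketch of that first-wins commit step. It is hypothetical, not the PR's actual ShuffleOutputCoordinator, which per the commit messages also persists the winning MapStatus to disk and handles zero-length outputs.

import java.io.File

// A minimal sketch, assuming writers hand back (tempFile, destinationFile)
// pairs as the diffs below show.
object FirstWinsCommitSketch {
  def commitOutputs(tmpToDest: Seq[(File, File)]): Boolean = {
    if (tmpToDest.nonEmpty && tmpToDest.forall { case (_, dest) => dest.exists() }) {
      // Every destination already exists: an earlier attempt committed,
      // so keep its output and discard this attempt's temp files.
      tmpToDest.foreach { case (tmp, _) => tmp.delete() }
      false
    } else {
      // This attempt wins: move each temp file into its final place.
      tmpToDest.foreach { case (tmp, dest) =>
        if (dest.exists()) dest.delete()
        tmp.renameTo(dest)
      }
      true
    }
  }
}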
fix compression settings of tmp files; minor cleanup
squito committed Oct 27, 2015
commit 4df7955db9c46b1549b3c0f4e238f5be7970c337
BypassMergeSortShuffleWriter.java
@@ -127,9 +127,7 @@ public BypassMergeSortShuffleWriter(
 @Override
 public Seq<Tuple2<File, File>> write(Iterator<Product2<K, V>> records) throws IOException {
Contributor: Could you add a javadoc explaining what the return value is? It's particularly cryptic because it uses tuples; maybe it would be better to create a helper type where the fields have proper names.
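For illustration, a named helper of the kind the comment asks for might look like the following. This is a hypothetical sketch, not code from this PR; the name ShuffleFilePair is invented.

import java.io.File

// Hypothetical replacement for Tuple2<File, File>: named fields make the
// temp-to-destination direction explicit for readers of write().
case class ShuffleFilePair(tmpFile: File, destFile: File)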
   assert (partitionWriters == null);
-  final File indexFile = blockManager.diskBlockManager().getFile(new ShuffleIndexBlockId(
-    shuffleId, mapId, IndexShuffleBlockResolver$.MODULE$.NOOP_REDUCE_ID())
-  );
+  final File indexFile = shuffleBlockResolver.getIndexFile(shuffleId, mapId);
   final File dataFile = shuffleBlockResolver.getDataFile(shuffleId, mapId);
   if (!records.hasNext()) {
     partitionLengths = new long[numPartitions];
UnsafeShuffleWriter.java
@@ -57,7 +57,6 @@
 import org.apache.spark.shuffle.ShuffleWriter;
 import org.apache.spark.storage.BlockManager;
 import org.apache.spark.storage.ShuffleIndexBlockId;
-import org.apache.spark.storage.ShuffleMapStatusBlockId;
 import org.apache.spark.storage.TimeTrackingOutputStream;
 import org.apache.spark.unsafe.Platform;

@@ -234,9 +233,7 @@ Seq<Tuple2<File, File>> closeAndWriteOutput() throws IOException {
   final File tmpIndexFile = shuffleBlockResolver.writeIndexFile(shuffleId, mapId, partitionLengths);
   mapStatus = MapStatus$.MODULE$.apply(blockManager.shuffleServerId(), partitionLengths);
   final File dataFile = shuffleBlockResolver.getDataFile(shuffleId, mapId);
-  final File indexFile = blockManager.diskBlockManager().getFile(
-    new ShuffleIndexBlockId(shuffleId, mapId, IndexShuffleBlockResolver$.MODULE$.NOOP_REDUCE_ID())
-  );
+  final File indexFile = shuffleBlockResolver.getIndexFile(shuffleId, mapId);

   return JavaConverters.asScalaBufferConverter(Arrays.asList(
     new Tuple2<>(tmpIndexFile, indexFile),
FileShuffleBlockResolver.scala
@@ -85,7 +85,7 @@ private[spark] class FileShuffleBlockResolver(conf: SparkConf)
   Array.tabulate[(DiskBlockObjectWriter, File)](numReducers) { bucketId =>
     val blockId = ShuffleBlockId(shuffleId, mapId, bucketId)
     val blockFile = blockManager.diskBlockManager.getFile(blockId)
-    val (_, tmpBlockFile) = blockManager.diskBlockManager.createTempLocalBlock()
+    val (_, tmpBlockFile) = blockManager.diskBlockManager.createTempShuffleBlock()
     // Because of previous failures, the shuffle file may already exist on this machine.
     // If so, remove it.
     if (blockFile.exists) {
IndexShuffleBlockResolver.scala
@@ -51,7 +51,7 @@ private[spark] class IndexShuffleBlockResolver(conf: SparkConf) extends ShuffleBlockResolver
     blockManager.diskBlockManager.getFile(ShuffleDataBlockId(shuffleId, mapId, NOOP_REDUCE_ID))
   }

-  private def getIndexFile(shuffleId: Int, mapId: Int): File = {
+  private[shuffle] def getIndexFile(shuffleId: Int, mapId: Int): File = {
     blockManager.diskBlockManager.getFile(ShuffleIndexBlockId(shuffleId, mapId, NOOP_REDUCE_ID))
   }
SortShuffleWriter.scala
@@ -67,13 +67,12 @@ private[spark] class SortShuffleWriter[K, V, C](
   // Don't bother including the time to open the merged output file in the shuffle write time,
   // because it just opens a single file, so is typically too fast to measure accurately
   // (see SPARK-3570).
-  val (_, tmpDataFile) = blockManager.diskBlockManager.createTempShuffleBlock()
+  val (_, tmpDataFile) = blockManager.diskBlockManager.createUncompressedTempShuffleBlock()
   val blockId = ShuffleBlockId(dep.shuffleId, mapId, IndexShuffleBlockResolver.NOOP_REDUCE_ID)
   val partitionLengths = sorter.writePartitionedFile(blockId, tmpDataFile)
   val tmpIndexFile = shuffleBlockResolver.writeIndexFile(dep.shuffleId, mapId, partitionLengths)
   val dataFile = shuffleBlockResolver.getDataFile(dep.shuffleId, mapId)
-  val indexFile = blockManager.diskBlockManager.getFile(
-    ShuffleIndexBlockId(handle.shuffleId, mapId, IndexShuffleBlockResolver.NOOP_REDUCE_ID))
+  val indexFile = shuffleBlockResolver.getIndexFile(dep.shuffleId, mapId)

   mapStatus = MapStatus(blockManager.shuffleServerId, partitionLengths)
   Seq(
core/src/main/scala/org/apache/spark/storage/BlockId.scala (10 additions, 0 deletions)
@@ -105,6 +105,16 @@ private[spark] case class TempShuffleBlockId(id: UUID) extends BlockId {
   override def name: String = "temp_shuffle_" + id
 }

+/**
+ * Id associated with temporary shuffle data managed as blocks, which is not
+ * compressed, regardless of spark.shuffle.compress and spark.shuffle.spill.compress. Used
+ * for the temporary location of data files until they are moved into place by the
+ * [[org.apache.spark.shuffle.ShuffleOutputCoordinator]]. Not serializable.
+ */
+private[spark] case class TempUncompressedShuffleBlockId(id: UUID) extends BlockId {
+  override def name: String = "temp_uncompressed_shuffle_" + id
+}

 // Intended only for testing purposes
 private[spark] case class TestBlockId(id: String) extends BlockId {
   override def name: String = "test_" + id
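The new block id type matters because Spark decides whether to compress a block's streams from its block id. The sketch below shows the kind of check this enables; the exact wiring is not part of this diff and is assumed here, though the two config keys come straight from the doc comment above.

import org.apache.spark.SparkConf

// Sketch: compression keyed off the block id type, so the new
// TempUncompressedShuffleBlockId is always written raw. Assumes this
// lives inside org.apache.spark.storage alongside the BlockId types.
def shouldCompress(blockId: BlockId, conf: SparkConf): Boolean = blockId match {
  case _: ShuffleBlockId => conf.getBoolean("spark.shuffle.compress", defaultValue = true)
  case _: TempShuffleBlockId => conf.getBoolean("spark.shuffle.spill.compress", defaultValue = true)
  case _: TempUncompressedShuffleBlockId => false // the point of the new id type
  case _ => false // other block kinds elided from this sketch
}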
DiskBlockManager.scala
@@ -124,6 +124,20 @@ private[spark] class DiskBlockManager(blockManager: BlockManager, conf: SparkConf)
     (blockId, getFile(blockId))
   }

+  /**
+   * Produces a unique block id and File suitable for storing shuffled data files, which are
+   * uncompressed, before they are moved to their final location by the
+   * [[org.apache.spark.shuffle.ShuffleOutputCoordinator]]
+   */
+  def createUncompressedTempShuffleBlock(): (TempUncompressedShuffleBlockId, File) = {
+    var blockId = new TempUncompressedShuffleBlockId(UUID.randomUUID())
+    while (getFile(blockId).exists()) {
+      blockId = new TempUncompressedShuffleBlockId(UUID.randomUUID())
+    }
+    (blockId, getFile(blockId))
+
Contributor: nit: stray empty line
+  }

   /**
    * Create local directories for storing block data. These directories are
    * located inside configured local directories and won't
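A hypothetical caller's view of the new method, following the SortShuffleWriter change above; names like sorter, dep, and shuffleBlockResolver come from that diff, and the surrounding writer code is assumed rather than shown here.

// Sketch: write partitioned output to a uniquely named, never-compressed
// temp file, then pair it with its final destination for the coordinator.
val (_, tmpDataFile) = blockManager.diskBlockManager.createUncompressedTempShuffleBlock()
val partitionLengths = sorter.writePartitionedFile(blockId, tmpDataFile)
val dataFile = shuffleBlockResolver.getDataFile(dep.shuffleId, mapId)
Seq((tmpDataFile, dataFile)) // (tmp, dest): moved into place only if this attempt wins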