SortShuffleReader code improvement

apache · jerryshao · Sep 5, 2014 · Oct 22, 2014 · Oct 23, 2014 · Oct 23, 2014
commit 98c039b21a03fddc3076d35cddd18d9439e99587
diff --git a/core/src/main/scala/org/apache/spark/shuffle/sort/MixedShuffleReader.scala b/core/src/main/scala/org/apache/spark/shuffle/sort/MixedShuffleReader.scala
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.shuffle.sort
+
+import org.apache.spark.{TaskContext, Logging}
+import org.apache.spark.shuffle.{BaseShuffleHandle, ShuffleReader}
+import org.apache.spark.shuffle.hash.HashShuffleReader
+
+private[spark] class MixedShuffleReader[K, C](
+    handle: BaseShuffleHandle[K, _, C],
+    startPartition: Int,
+    endPartition: Int,
+    context: TaskContext)
+  extends ShuffleReader[K, C] with Logging {
+
+  private val shuffleReader = if (handle.dependency.keyOrdering.isDefined) {
+    new SortShuffleReader[K, C](handle, startPartition, endPartition, context)
+  } else {
+    new HashShuffleReader[K, C](handle, startPartition, endPartition, context)
+  }
+
+  override def read(): Iterator[Product2[K, C]] = shuffleReader.read()
+
+  override def stop(): Unit = shuffleReader.stop()
+}
diff --git a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala
@@ -47,7 +47,7 @@ private[spark] class SortShuffleManager(conf: SparkConf) extends ShuffleManager
       endPartition: Int,
       context: TaskContext): ShuffleReader[K, C] = {
     // We currently use the same block store shuffle fetcher as the hash-based shuffle.
-    new SortShuffleReader(
+    new MixedShuffleReader(
       handle.asInstanceOf[BaseShuffleHandle[K, _, C]], startPartition, endPartition, context)
   }
 

diff --git a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleReader.scala b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleReader.scala
@@ -26,10 +26,9 @@ import org.apache.spark.{Logging, InterruptibleIterator, SparkEnv, TaskContext}
 import org.apache.spark.network.buffer.ManagedBuffer
 import org.apache.spark.serializer.Serializer
 import org.apache.spark.shuffle.{ShuffleReader, BaseShuffleHandle}
-import org.apache.spark.shuffle.hash.BlockStoreShuffleFetcher
 import org.apache.spark.storage._
 import org.apache.spark.util.CompletionIterator
-import org.apache.spark.util.collection.{MergeUtil, TieredDiskMerger}
+import org.apache.spark.util.collection.{TieredDiskMerger, MergeUtil}
 
 /**
  * SortShuffleReader merges and aggregates shuffle data that has already been sorted within each
@@ -50,13 +49,13 @@ private[spark] class SortShuffleReader[K, C](
     context: TaskContext)
   extends ShuffleReader[K, C] with Logging {
 
+  /** Pairs a fetched in-memory shuffle block's id with the buffer holding its data. */
+  case class MemoryShuffleBlock(blockId: BlockId, blockData: ManagedBuffer)
+
   require(endPartition == startPartition + 1,
     "Sort shuffle currently only supports fetching one partition")
 
-  private val fileBufferSize = conf.getInt("spark.shuffle.file.buffer.kb", 32) * 1024
-
-  case class MemoryBlock(blockId: BlockId, blockData: ManagedBuffer)
-
+  /** Shuffle block fetcher iterator */
   private var shuffleRawBlockFetcherItr: ShuffleRawBlockFetcherIterator = _
 
   private val dep = handle.dependency
@@ -65,81 +64,94 @@ private[spark] class SortShuffleReader[K, C](
   private val ser = Serializer.getSerializer(dep.serializer)
   private val shuffleMemoryManager = SparkEnv.get.shuffleMemoryManager
 
-  private val memoryBlocks = new ArrayBuffer[MemoryBlock]()
+  private val fileBufferSize = conf.getInt("spark.shuffle.file.buffer.kb", 32) * 1024
 
-  private val tieredMerger = new TieredDiskMerger(conf, dep, context)
+  /** ArrayBuffer to store in-memory shuffle blocks */
+  private val inMemoryBlocks = new ArrayBuffer[MemoryShuffleBlock]()
 
+  /** Key comparator used for the merge sort; if keyOrdering is not defined,
+    * keys are compared by their hash codes (a null key hashes to 0). */
   private val keyComparator: Comparator[K] = dep.keyOrdering.getOrElse(new Comparator[K] {
     override def compare(a: K, b: K) = {
       val h1 = if (a == null) 0 else a.hashCode()
       val h2 = if (b == null) 0 else b.hashCode()
-      h1 - h2
+      if (h1 < h2) -1 else if (h1 == h2) 0 else 1
     }
   })
 
-  override def read(): Iterator[Product2[K, C]] = {
-    if (!dep.mapSideCombine && dep.aggregator.isDefined) {
-      val iter = BlockStoreShuffleFetcher.fetch(handle.shuffleId, startPartition, context, ser)
-      new InterruptibleIterator(context,
-        dep.aggregator.get.combineValuesByKey(iter, context))
-    } else {
-      sortShuffleRead()
-    }
-  }
+  /** A merge thread to merge on-disk blocks */
+  private val tieredMerger = new TieredDiskMerger(conf, dep, keyComparator, context)
 
-  private def sortShuffleRead(): Iterator[Product2[K, C]] = {
+  override def read(): Iterator[Product2[K, C]] = {
     tieredMerger.start()
 
     for ((blockId, blockData) <- fetchRawBlocks()) {
       if (blockData.isEmpty) {
         throw new IllegalStateException(s"block $blockId is empty for unknown reason")
       }
 
-      memoryBlocks += MemoryBlock(blockId, blockData.get)
+      inMemoryBlocks += MemoryShuffleBlock(blockId, blockData.get)
 
       // Try to fit block in memory. If this fails, merge in-memory blocks to disk.
       val blockSize = blockData.get.size
       val granted = shuffleMemoryManager.tryToAcquire(blockData.get.size)
+      logInfo(s"Granted $granted memory for shuffle block")
+
       if (granted < blockSize) {
-        shuffleMemoryManager.release(granted)
+        logInfo(s"Granted $granted memory is not enough to store shuffle block ($blockSize), " +
+          s"try to consolidate in-memory blocks to release the memory")
 
-        val itrGroup = memoryBlocksToIterators()
-        val partialMergedIter =
-          MergeUtil.mergeSort(itrGroup, keyComparator, dep.keyOrdering, dep.aggregator)
+        shuffleMemoryManager.release(granted)
 
         // Write merged blocks to disk
         val (tmpBlockId, file) = blockManager.diskBlockManager.createTempShuffleBlock()
-        val fos = new BufferedOutputStream(new FileOutputStream(file), fileBufferSize)
-        blockManager.dataSerializeStream(tmpBlockId, fos, partialMergedIter, ser)
+        val fos = new FileOutputStream(file)
+        val bos = new BufferedOutputStream(fos, fileBufferSize)
+
+        if (inMemoryBlocks.size > 1) {
+          val itrGroup = inMemoryBlocksToIterators()
+          val partialMergedItr =
+            MergeUtil.mergeSort(itrGroup, keyComparator, dep.keyOrdering, dep.aggregator)
+          blockManager.dataSerializeStream(tmpBlockId, bos, partialMergedItr, ser)
+        } else {
+          val buffer = inMemoryBlocks.map(_.blockData.nioByteBuffer()).head
+          val channel = fos.getChannel
+          while (buffer.hasRemaining) {
+            channel.write(buffer)
+          }
+          channel.close()
+        }
+
         tieredMerger.registerOnDiskBlock(tmpBlockId, file)
 
-        for (block <- memoryBlocks) {
+        for (block <- inMemoryBlocks) {
           shuffleMemoryManager.release(block.blockData.size)
         }
-        memoryBlocks.clear()
+        inMemoryBlocks.clear()
       }
 
       shuffleRawBlockFetcherItr.currentResult = null
     }
+
     tieredMerger.doneRegisteringOnDiskBlocks()
 
     // Merge on-disk blocks with in-memory blocks to directly feed to the reducer.
-    val finalItrGroup = memoryBlocksToIterators() ++ Seq(tieredMerger.readMerged())
+    val finalItrGroup = inMemoryBlocksToIterators() ++ Seq(tieredMerger.readMerged())
     val mergedItr =
       MergeUtil.mergeSort(finalItrGroup, keyComparator, dep.keyOrdering, dep.aggregator)
 
-    // Release the in-memory block and on-disk file when iteration is completed.
+    // Release the memory held by the in-memory blocks once iteration has completed.
     val completionItr = CompletionIterator[Product2[K, C], Iterator[Product2[K, C]]](
       mergedItr, () => {
-        memoryBlocks.foreach(block => shuffleMemoryManager.release(block.blockData.size))
-        memoryBlocks.clear()
+        inMemoryBlocks.foreach(block => shuffleMemoryManager.release(block.blockData.size))
+        inMemoryBlocks.clear()
       })
 
     new InterruptibleIterator(context, completionItr.map(p => (p._1, p._2)))
   }
 
-  def memoryBlocksToIterators(): Seq[Iterator[Product2[K, C]]] = {
-    memoryBlocks.map{ case MemoryBlock(id, buf) =>
+  private def inMemoryBlocksToIterators(): Seq[Iterator[Product2[K, C]]] = {
+    inMemoryBlocks.map{ case MemoryShuffleBlock(id, buf) =>
       blockManager.dataDeserialize(id, buf.nioByteBuffer(), ser)
         .asInstanceOf[Iterator[Product2[K, C]]]
     }
@@ -149,27 +161,25 @@ private[spark] class SortShuffleReader[K, C](
 
   private def fetchRawBlocks(): Iterator[(BlockId, Option[ManagedBuffer])] = {
     val statuses = SparkEnv.get.mapOutputTracker.getServerStatuses(handle.shuffleId, startPartition)
+
     val splitsByAddress = new HashMap[BlockManagerId, ArrayBuffer[(Int, Long)]]()
     for (((address, size), index) <- statuses.zipWithIndex) {
       splitsByAddress.getOrElseUpdate(address, ArrayBuffer()) += ((index, size))
     }
-    val blocksByAddress: Seq[(BlockManagerId, Seq[(BlockId, Long)])] = splitsByAddress.toSeq.map {
-      case (address, splits) =>
-        (address, splits.map(s => (ShuffleBlockId(handle.shuffleId, s._1, startPartition), s._2)))
-    }
-    var numMapBlocks = 0
-    blocksByAddress.foreach { case (_, blocks) =>
-      blocks.foreach { case (_, len) => if (len > 0) numMapBlocks += 1 }
+
+    val blocksByAddress = splitsByAddress.toSeq.map { case (address, splits) =>
+      val blocks = splits.map { s =>
+        (ShuffleBlockId(handle.shuffleId, s._1, startPartition), s._2)
+      }
+      (address, blocks.toSeq)
     }
-    val threadId = Thread.currentThread.getId
-    logInfo(s"Fetching $numMapBlocks blocks for $threadId")
 
     shuffleRawBlockFetcherItr = new ShuffleRawBlockFetcherIterator(
       context,
       SparkEnv.get.blockTransferService,
       blockManager,
       blocksByAddress,
-      SparkEnv.get.conf.getLong("spark.reducer.maxMbInFlight", 48) * 1024 * 1024)
+      conf.getLong("spark.reducer.maxMbInFlight", 48) * 1024 * 1024)
 
     val completionItr = CompletionIterator[
       (BlockId, Option[ManagedBuffer]),