Address comments

apache · andrewor14 · Aug 7, 2015 · Aug 7, 2015 · Aug 7, 2015 · Aug 7, 2015
commit b4d3633b256de6d981ef7fd2e62afa5490323682
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala
@@ -90,10 +90,10 @@ case class TungstenAggregate(
         // We're not using the underlying map, so we just can free it here
         aggregationIterator.free()
         if (groupingExpressions.isEmpty) {
-          // This is a grouped aggregate and the input iterator is empty,
-          // so return an empty iterator.
           Iterator.single[UnsafeRow](aggregationIterator.outputForEmptyGroupingKeyWithoutInput())
         } else {
+          // This is a grouped aggregate and the input iterator is empty, so there
+          // are no groups to output: return an empty iterator.
           Iterator[UnsafeRow]()
         }
       } else {
@@ -104,10 +104,9 @@ case class TungstenAggregate(
 
     // Note: we need to set up the iterator in each partition before computing the
     // parent partition, so we cannot simply use `mapPartitions` here (SPARK-9747).
-    val parentPartition = child.execute()
     val resultRdd = {
       new MapPartitionsWithPreparationRDD[UnsafeRow, InternalRow, TungstenAggregationIterator](
-        parentPartition, preparePartition, executePartition, preservesPartitioning = true)
+        child.execute(), preparePartition, executePartition, preservesPartitioning = true)
     }
     resultRdd.asInstanceOf[RDD[InternalRow]]
   }

diff --git a/...src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala b/...src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala
@@ -84,7 +84,7 @@ class TungstenAggregationIterator(
   extends Iterator[UnsafeRow] with Logging {
 
   // The parent partition iterator, to be initialized later in `start`
-  private[this] var inputIter: Iterator[InternalRow] = Iterator[InternalRow]()
+  private[this] var inputIter: Iterator[InternalRow] = null
 
   ///////////////////////////////////////////////////////////////////////////
   // Part 1: Initializing aggregate functions.
@@ -334,7 +334,7 @@ class TungstenAggregationIterator(
   // This is the hash map used for hash-based aggregation. It is backed by an
   // UnsafeFixedWidthAggregationMap and it is used to store
   // all groups and their corresponding aggregation buffers for hash-based aggregation.
-  private[aggregate] val hashMap = new UnsafeFixedWidthAggregationMap(
+  private[this] val hashMap = new UnsafeFixedWidthAggregationMap(
     initialAggregationBuffer,
     StructType.fromAttributes(allAggregateFunctions.flatMap(_.bufferAttributes)),
     StructType.fromAttributes(groupingExpressions.map(_.toAttribute)),
@@ -345,11 +345,15 @@ class TungstenAggregationIterator(
     false // disable tracking of performance metrics
   )
 
+  // Exposed for testing
+  private[aggregate] def getHashMap: UnsafeFixedWidthAggregationMap = hashMap
+
   // The function used to read and process input rows. When processing input rows,
   // it first uses hash-based aggregation by putting groups and their buffers in
   // hashMap. If we could not allocate more memory for the map, we switch to
   // sort-based aggregation (by calling switchToSortBasedAggregation).
   private def processInputs(): Unit = {
+    assert(inputIter != null, "attempted to process input when iterator was null")
     while (!sortBased && inputIter.hasNext) {
       val newInput = inputIter.next()
       val groupingKey = groupProjection.apply(newInput)
@@ -368,6 +372,7 @@ class TungstenAggregationIterator(
   // that it switch to sort-based aggregation after `fallbackStartsAt` input rows have
   // been processed.
   private def processInputsWithControlledFallback(fallbackStartsAt: Int): Unit = {
+    assert(inputIter != null, "attempted to process input when iterator was null")
     var i = 0
     while (!sortBased && inputIter.hasNext) {
       val newInput = inputIter.next()
@@ -407,6 +412,7 @@ class TungstenAggregationIterator(
    * Switch to sort-based aggregation when the hash-based approach is unable to acquire memory.
    */
   private def switchToSortBasedAggregation(firstKey: UnsafeRow, firstInput: InternalRow): Unit = {
+    assert(inputIter != null, "attempted to process input when iterator was null")
     logInfo("falling back to sort based aggregation.")
     // Step 1: Get the ExternalSorter containing sorted entries of the map.
     externalSorter = hashMap.destructAndCreateExternalSorter()
@@ -426,8 +432,9 @@ class TungstenAggregationIterator(
       case _ => false
     }
 
-    // Note: we spill the sorter's contents immediately after creating it. Therefore, we must
-    // insert something into the sorter here to ensure that we acquire at least a page of memory.
+    // Note: Since we spill the sorter's contents immediately after creating it, we must insert
+    // something into the sorter here to ensure that we acquire at least a page of memory.
+    // This is done through `externalSorter.insertKV`, which will trigger the page allocation.
     // Otherwise, children operators may steal the window of opportunity and starve our sorter.
 
     if (needsProcess) {
@@ -684,7 +691,7 @@ class TungstenAggregationIterator(
    */
   def outputForEmptyGroupingKeyWithoutInput(): UnsafeRow = {
     assert(groupingExpressions.isEmpty)
-    assert(!inputIter.hasNext)
+    assert(inputIter == null)
     generateOutput(UnsafeRow.createFromByteArray(0, 0), initialAggregationBuffer)
   }
 

diff --git a/...est/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIteratorSuite.scala b/...est/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIteratorSuite.scala
@@ -39,7 +39,7 @@ class TungstenAggregationIteratorSuite extends SparkFunSuite with LocalSparkCont
       }
       iter = new TungstenAggregationIterator(
         Seq.empty, Seq.empty, Seq.empty, 0, Seq.empty, newMutableProjection, Seq.empty, None)
-      val numPages = iter.hashMap.getNumDataPages
+      val numPages = iter.getHashMap.getNumDataPages
       assert(numPages === 1)
     } finally {
       // Clean up