Skip to content
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Support partial aggregation for reduceGroups.
  • Loading branch information
viirya committed Jul 15, 2016
commit 11357737ad58a2a6c1ea2e17026669fc138f556c
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder, Ou
import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, CreateStruct}
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.expressions.Aggregator

/**
* :: Experimental ::
Expand Down Expand Up @@ -177,10 +178,33 @@ class KeyValueGroupedDataset[K, V] private[sql](
* @since 1.6.0
*/
def reduceGroups(f: (V, V) => V): Dataset[(K, V)] = {
  val encoder = encoderFor[V]
  val intEncoder: ExpressionEncoder[Int] = ExpressionEncoder()

  // The reduction is expressed as a typed Aggregator and handed to `agg`
  // rather than being run through `flatMapGroups`.
  //
  // Buffer layout: (seen, value).
  //   seen == 0 -> nothing has been folded into this buffer yet; `value` is a
  //                placeholder and must never be passed to `f`.
  //   seen == 1 -> `value` holds the reduction of every input seen so far.
  val aggregator: TypedColumn[V, V] = new Aggregator[V, (Int, V), V] {
    def bufferEncoder: Encoder[(Int, V)] = ExpressionEncoder.tuple(intEncoder, encoder)
    def outputEncoder: Encoder[V] = encoder

    // There is no neutral element for an arbitrary `f`, so the empty buffer
    // carries a null placeholder guarded by the flag.
    def zero: (Int, V) = (0, null.asInstanceOf[V])

    // Fold one input value into the buffer; the first value is taken as-is.
    def reduce(reducedValue: (Int, V), value: V): (Int, V) =
      if (reducedValue._1 == 0) {
        (1, value)
      } else {
        (1, f(reducedValue._2, value))
      }

    // Combine two buffers. An empty side is skipped so that `f` is only ever
    // applied to real values. Both checks look at the flag (`_1`); the value
    // slot (`_2`) says nothing about emptiness.
    def merge(buf1: (Int, V), buf2: (Int, V)): (Int, V) =
      if (buf1._1 == 0) {
        buf2
      } else if (buf2._1 == 0) {
        buf1
      } else {
        (1, f(buf1._2, buf2._2))
      }

    // Drop the flag and return the reduced value.
    def finish(result: (Int, V)): V = result._2
  }.toColumn

  agg(aggregator)
}

/**
Expand Down