MemorySink statistics for joining

lw-lin · lw-lin · commit 6dc2c9e18072 · 2016-11-05T21:27:26.000+08:00
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala
@@ -27,7 +27,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.encoders.encoderFor
 import org.apache.spark.sql.catalyst.expressions.Attribute
-import org.apache.spark.sql.catalyst.plans.logical.LeafNode
+import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Statistics}
 import org.apache.spark.sql.streaming.OutputMode
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.util.Utils
@@ -212,4 +212,8 @@ class MemorySink(val schema: StructType, outputMode: OutputMode) extends Sink wi
  */
 case class MemoryPlan(sink: MemorySink, output: Seq[Attribute]) extends LeafNode {
   def this(sink: MemorySink) = this(sink, sink.schema.toAttributes)
+
+  private val sizePerRow = sink.schema.toAttributes.map(_.dataType.defaultSize).sum
+  // Size estimate = per-row default size * rows in the sink; BigInt avoids Int overflow.
+  override def statistics: Statistics = Statistics(BigInt(sizePerRow) * sink.allData.size)
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/MemorySinkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/MemorySinkSuite.scala
@@ -187,6 +187,31 @@ class MemorySinkSuite extends StreamTest with BeforeAndAfter {
     query.stop()
   }
 
+  test("MemoryPlan statistics for joining") {
+    val input = MemoryStream[Int]
+    val query = input.toDF().writeStream.format("memory").queryName("memStream").start()
+    val memStream = spark.table("memStream").as[Int]
+
+    // Builds the self cross join anew on every call, so each check below runs a
+    // freshly constructed join over the "memStream" table.
+    def selfCrossJoin = {
+      val renamed = memStream.withColumnRenamed("value", "value2")
+      memStream.crossJoin(renamed).as[(Int, Int)]
+    }
+
+    // After one input value the cross join is expected to hold a single pair.
+    input.addData(1)
+    query.processAllAvailable()
+    checkDatasetUnorderly(selfCrossJoin, (1, 1))
+
+    // After a second input value the cross join is expected to hold all four pairs.
+    input.addData(2)
+    query.processAllAvailable()
+    checkDatasetUnorderly(selfCrossJoin, (1, 1), (1, 2), (2, 1), (2, 2))
+
+    query.stop()
+  }
+
   ignore("stress test") {
     // Ignore the stress test as it takes several minutes to run
     (0 until 1000).foreach { _ =>