Closed
Changes from 1 commit
31 commits
6c9acde
Initial version of changes to Source trait
frreiss Aug 9, 2016
dae72ff
Changes to files that depend on the Source trait
frreiss Aug 11, 2016
f78b4d5
Merge branch 'master' of https://github.com/apache/spark into fred-16963
frreiss Aug 11, 2016
cf426fa
Added method to garbage-collect the metadata log.
frreiss Aug 15, 2016
c028432
Merge branch 'master' of https://github.com/apache/spark into fred-16963
frreiss Aug 15, 2016
f92a9a7
Fixing problems with building from Maven.
frreiss Aug 16, 2016
4cd181d
Various bug fixes.
frreiss Aug 19, 2016
fcc90bd
Merge branch 'master' of https://github.com/apache/spark into fred-16963
frreiss Aug 19, 2016
35cdae9
Merge branch 'master' of https://github.com/apache/spark into fred-16963
frreiss Aug 22, 2016
9096c56
Merge branch 'master' of https://github.com/apache/spark into fred-16963
frreiss Aug 27, 2016
ecaf732
Merge branch 'master' of https://github.com/apache/spark into fred-16963
frreiss Aug 27, 2016
5638281
Merge branch 'master' of https://github.com/apache/spark into fred-16963
frreiss Aug 27, 2016
43ffbf3
Removed a few blank lines.
frreiss Aug 29, 2016
f5c15f8
Additional whitespace cleanup.
frreiss Aug 29, 2016
a79c557
Merge branch 'master' of https://github.com/apache/spark into fred-16963
frreiss Aug 30, 2016
7c6a30d
Narrowing the size of the diff by moving some changes out to future work.
frreiss Aug 31, 2016
5e340c2
Fixed a regression introduced in an earlier merge.
frreiss Sep 8, 2016
128f7fe
Merge branch 'master' of https://github.com/apache/spark into fred-16963
frreiss Sep 28, 2016
6334a4b
Fixed compilation problem from merging someone else's PR.
frreiss Sep 28, 2016
09e4b8e
Merge branch 'master' of https://github.com/apache/spark into fred-16963
frreiss Sep 29, 2016
aaf0307
Removed a safety check that was invalidated by SPARK-17643 and fixed …
frreiss Sep 29, 2016
947b510
Updating regression tests after merge.
frreiss Sep 30, 2016
ed887ca
Merge branch 'master' of https://github.com/apache/spark into fred-16963
frreiss Oct 15, 2016
ec67429
Changes to address review comments.
frreiss Oct 15, 2016
e7ef7ab
Fix compilation problems.
frreiss Oct 15, 2016
7d98c6b
Merge branch 'master' of https://github.com/apache/spark into fred-16963
frreiss Oct 19, 2016
c726549
Changes to address review comments.
frreiss Oct 21, 2016
47eee52
Merge branch 'master' of https://github.com/apache/spark into fred-16963
frreiss Oct 21, 2016
46f6411
Commit before merge.
frreiss Oct 26, 2016
d9eaf5a
Merge branch 'master' of https://github.com/apache/spark into fred-16963
frreiss Oct 26, 2016
0a56e4a
Addressing review comments.
frreiss Oct 26, 2016
Narrowing the size of the diff by moving some changes out to future work.
frreiss committed Aug 31, 2016
commit 7c6a30d2da8c31dcd5db8a4337913d5805264306
@@ -24,7 +24,6 @@ package org.apache.spark.sql.execution.streaming
  * - Allow the user to query the latest batch id.
  * - Allow the user to query the metadata object of a specified batch id.
  * - Allow the user to query metadata objects in a range of batch ids.
- * - Inform the log that it is safe to garbage-collect metadata from a batch
  */
 trait MetadataLog[T] {

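The bullet removed above described a hook for telling the log that old batches can be garbage-collected, which this commit defers to future work. As a rough sketch of where that hook could land, assuming the `purge` name used by the `offsetLog.purge` call removed from StreamExecution further down in this diff, the trait might eventually grow something like:

// Sketch only, not part of this commit: a possible shape for the deferred
// garbage-collection hook on the metadata log. The name `purge` mirrors the
// offsetLog.purge call removed from StreamExecution in this same commit.
trait MetadataLog[T] {
  // ... existing add/get/getLatest members elided ...

  /** Informs the log that metadata for batches earlier than `thresholdBatchId`
   *  is no longer needed and may be garbage-collected. */
  def purge(thresholdBatchId: Long): Unit
}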
@@ -25,7 +25,7 @@ import org.apache.spark.sql.types.StructType
  * monotonically increasing notion of progress that can be represented as an [[Offset]]. Spark
  * will regularly query each [[Source]] to see if any more data is available.
  */
-trait Source {
+trait Source {

   /** Returns the schema of the data from this source */
   def schema: StructType
@@ -59,7 +59,7 @@ trait Source {
   * Informs the source that Spark has completed processing all data for offsets less than or
   * equal to `end` and will only request offsets greater than `end` in the future.
   */
-  def commit(end: Offset)
+  def commit(end: Offset) : Unit = {}

   /** Stop this source and free any resources it has allocated. */
   def stop(): Unit
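Giving commit an empty default body lets existing sources compile unchanged, while a source that buffers data can override it to release everything at or below the committed offset. A minimal sketch of the second case, written against the Source/Offset/LongOffset API visible in this diff; the BufferedSource class and its in-memory buffer are hypothetical, not part of this change:

import org.apache.spark.sql.{DataFrame, Encoders, SQLContext}
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Source}
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

// Hypothetical source that keeps produced records in memory until Spark
// confirms, via commit(), that they have been fully processed.
class BufferedSource(sqlContext: SQLContext) extends Source {
  private var buffer = Vector.empty[(Long, Int)]   // (offset, value) pairs

  override def schema: StructType =
    StructType(StructField("value", IntegerType) :: Nil)

  override def getOffset: Option[Offset] =
    buffer.lastOption.map { case (off, _) => LongOffset(off) }

  override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
    val from = start.map(_.asInstanceOf[LongOffset].offset).getOrElse(-1L)
    val to = end.asInstanceOf[LongOffset].offset
    val rows = buffer.collect { case (off, v) if off > from && off <= to => v }
    sqlContext.createDataset(rows)(Encoders.scalaInt).toDF("value")
  }

  // Spark is done with everything up to `end`; the buffered copies can go.
  override def commit(end: Offset): Unit = {
    val threshold = end.asInstanceOf[LongOffset].offset
    buffer = buffer.filter { case (off, _) => off > threshold }
  }

  override def stop(): Unit = {}
}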
@@ -317,11 +317,6 @@ class StreamExecution(
         case (src, off) => src.commit(off)
       }

-      // The log can also discard old metadata. Trim one batch less than we could, just
-      // in case.
-      if (currentBatchId > 2) {
-        offsetLog.purge(currentBatchId - 2)
-      }
     } else {
       awaitBatchLock.lock()
       try {
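The block removed here is the offset-log retention step that this commit defers. For reference, the same logic restated as a helper, assuming the MetadataLog purge sketch above and that purge(n) drops entries for batches earlier than n (both assumptions, not part of this commit):

// Sketch only: the deferred retention step. `offsetLog` and `currentBatchId`
// stand in for the StreamExecution members of the same names.
def trimOffsetLog(offsetLog: MetadataLog[_], currentBatchId: Long): Unit = {
  // Purge up to currentBatchId - 2 rather than currentBatchId - 1, i.e.
  // "trim one batch less than we could, just in case", as the removed
  // comment put it.
  if (currentBatchId > 2) {
    offsetLog.purge(currentBatchId - 2)
  }
}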
@@ -625,12 +625,7 @@ class FileStreamSourceSuite extends FileStreamSourceTest {

     /** Create a text file with a single data item */
     def createFile(data: Int): File = {
-      // Use 2 character file names padded with zeros so that alphabetical and
-      // numeric order are the same for the generated file names.
-      val file = stringToFile(new File(src, f"$data%02d.txt"), data.toString)
-
-      // File modification times aren't currently used to decide what goes into
-      // the next batch, but they may be used in the future.
+      val file = stringToFile(new File(src, s"$data.txt"), data.toString)
       if (lastFileModTime.nonEmpty) file.setLastModified(lastFileModTime.get + 1000)
       lastFileModTime = Some(file.lastModified)
       file
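The comment deleted here explained why the old test used zero-padded names: with plain names, alphabetical and numeric order diverge once the counter passes 9. A quick standalone illustration, not part of the change:

// Lexicographic vs. numeric ordering of generated file names.
val plain  = (1 to 12).map(i => s"$i.txt").sorted
// Vector(1.txt, 10.txt, 11.txt, 12.txt, 2.txt, 3.txt, ...)

val padded = (1 to 12).map(i => f"$i%02d.txt").sorted
// Vector(01.txt, 02.txt, ..., 09.txt, 10.txt, 11.txt, 12.txt)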
@@ -298,8 +298,6 @@ class FakeDefaultSource extends StreamSourceProvider {

       override def schema: StructType = StructType(StructField("a", IntegerType) :: Nil)

-      override def lastCommittedOffset: Option[Offset] = None
-
       override def getOffset: Option[Offset] = {
         if (offset >= 10) {
           None
@@ -314,8 +312,6 @@ class FakeDefaultSource extends StreamSourceProvider {
         spark.range(startOffset, end.asInstanceOf[LongOffset].offset + 1).toDF("a")
       }

-      override def commit(end: Offset): Unit = {}
-
       override def stop() {}
     }
   }
@@ -77,8 +77,6 @@ class DefaultSource extends StreamSourceProvider with StreamSinkProvider {
     new Source {
       override def schema: StructType = fakeSchema

-      override def lastCommittedOffset: Option[Offset] = None
-
       override def getOffset: Option[Offset] = Some(new LongOffset(0))

       override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
@@ -87,8 +85,6 @@ class DefaultSource extends StreamSourceProvider with StreamSinkProvider {
         Seq[Int]().toDS().toDF()
       }

-      override def commit(end: Offset): Unit = {}
-
       override def stop() {}
     }
   }
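Both test fakes above shrink because commit now has a default body in Source and lastCommittedOffset is gone from the trait. A minimal stub in the same style, relying on the inherited no-op commit; FakeSource and its parameters are illustrative, not the test classes in this diff:

import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Source}
import org.apache.spark.sql.types.StructType

// Minimal test stub: no commit override and no lastCommittedOffset needed.
class FakeSource(fakeSchema: StructType, spark: SparkSession) extends Source {

  override def schema: StructType = fakeSchema

  override def getOffset: Option[Offset] = Some(new LongOffset(0))

  override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
    import spark.implicits._
    Seq.empty[Int].toDS().toDF()
  }

  // commit(end) is inherited as a no-op from Source; nothing to override.

  override def stop(): Unit = {}
}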