marmbrus · marmbrus · Jan 7, 2016 · Dec 10, 2015 · Dec 11, 2015 · Dec 11, 2015
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Batch.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Batch.scala
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.streaming
+
+import org.apache.spark.sql.DataFrame
+
+/**
+ * Pairs a batch of `data` moving through a streaming query execution with `end`, an [[Offset]]
+ * that gives an indication of progress in the stream.
+ */
+class Batch(val end: Offset, val data: DataFrame)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CompositeOffset.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CompositeOffset.scala
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.streaming
+
+/**
+ * An ordered collection of offsets, used to track the progress of processing data from one or more
+ * [[Source]]s that are present in a streaming query. This is similar to a simplified,
+ * single-instance vector clock that must progress linearly forward.
+ *
+ * @param offsets the tracked offsets, compared position by position; `None` orders before `Some`
+ */
+case class CompositeOffset(offsets: Seq[Option[Offset]]) extends Offset {
+  /**
+   * Returns a negative integer, zero, or a positive integer as this object is less than, equal to,
+   * or greater than the specified object. Positions that compare as equal are ignored,
+   * so an offset in which only some positions have advanced still orders after the earlier one.
+   * @throws IllegalArgumentException if `other` is not a [[CompositeOffset]] of the same size, or
+   *         if some positions are ahead of `other` while other positions are behind it
+   */
+  override def compareTo(other: Offset): Int = other match {
+    case otherComposite: CompositeOffset if otherComposite.offsets.size == offsets.size =>
+      val nonZeroSigns = offsets.zip(otherComposite.offsets).map {
+        case (Some(a), Some(b)) => math.signum(a compareTo b)
+        case (None, None) => 0
+        case (None, _) => -1
+        case (_, None) => 1
+      }.filter(_ != 0).distinct
+      nonZeroSigns match {
+        case Seq() => 0 // Nothing differs (this also covers two empty composites).
+        case Seq(direction) => direction // Every differing position agrees on the direction.
+        case _ => // Both -1 and 1 occurred, so the two offsets cannot be ordered.
+          throw new IllegalArgumentException(
+            s"Invalid comparison between non-linear histories: $this <=> $other")
+      }
+    case _ =>
+      throw new IllegalArgumentException(s"Cannot compare $this <=> $other")
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/LongOffset.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/LongOffset.scala
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.streaming
+
+/**
+ * A simple [[Offset]] holding one `Long`, for sources that produce a single linear stream of data.
+ */
+case class LongOffset(offset: Long) extends Offset {
+  /** Orders by the wrapped value; any other kind of [[Offset]] is rejected with an exception. */
+  override def compareTo(other: Offset): Int = other match {
+    case that: LongOffset => java.lang.Long.compare(offset, that.offset)
+    case _ =>
+      throw new IllegalArgumentException(s"Invalid comparison of $getClass with ${other.getClass}")
+  }
+  // Arithmetic helpers: each returns a new instance shifted by the given amount.
+  def +(increment: Long): LongOffset = LongOffset(offset + increment)
+  def -(decrement: Long): LongOffset = LongOffset(offset - decrement)
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Offset.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Offset.scala
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.streaming
+
+/**
+ * An offset is a monotonically increasing metric used to track progress in the computation of a
+ * stream. An [[Offset]] must be comparable.
+ */
+trait Offset extends Serializable {
+  /**
+   * Compares this offset with `other`, yielding a negative integer, zero, or a positive integer
+   * when this offset is respectively less than, equal to, or greater than `other`.
+   */
+  def compareTo(other: Offset): Int
+
+  // Relational operators, each decided by the sign of `compareTo`.
+  def <(other: Offset): Boolean = (this compareTo other) < 0
+  def <=(other: Offset): Boolean = (this compareTo other) <= 0
+  def ==(other: Offset): Boolean = (this compareTo other) == 0 // overloads ==, not equals
+  def >=(other: Offset): Boolean = (this compareTo other) >= 0
+  def >(other: Offset): Boolean = (this compareTo other) > 0
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Sink.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Sink.scala
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.streaming
+
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.{DataFrame, SQLContext}
+
+/**
+ * An interface for systems that can collect the results of a streaming query.
+ *
+ * When new data is produced by a query, a [[Sink]] must be able to transactionally collect the
+ * data and update the stored [[Offset]]. In the case of a failure, the sink will be recreated
+ * and must be able to return the [[Offset]] for all of the data that is made durable.
+ * This contract allows Spark to process data with exactly-once semantics, even in the case
+ * of failures that require the computation to be restarted.
+ */
+trait Sink {
+  /**
+   * Returns the [[Offset]] for all data that is currently present in the sink, if any. This
+   * function will be called by Spark when restarting a stream in order to determine the point
+   * in the streamed input data from which computation should be resumed.
+   */
+  def currentProgress: Option[Offset]
+
+  /**
+   * Accepts a new [[Batch]] of data, whose [[Offset]] denotes how far in the input data
+   * computation has progressed.  When computation restarts after a failure, it is important
+   * that a [[Sink]] returns the same [[Offset]] as the most recent batch of data that
+   * has been persisted durably.  Note that this does not necessarily have to be the
+   * [[Offset]] for the most recent batch of data that was given to the sink.  For example,
+   * it is valid to buffer data before persisting, as long as the [[Offset]] is stored
+   * transactionally as data is eventually persisted.
+   */
+  def addBatch(batch: Batch): Unit
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.streaming
+
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.types.StructType
+
+/**
+ * A source of continually arriving data for a streaming query. A [[Source]] must have a
+ * monotonically increasing notion of progress that can be represented as an [[Offset]]. Spark
+ * will regularly query each [[Source]] to see if any more data is available.
+ */
+trait Source  {
+  /** Returns the schema of the data from this source. */
+  def schema: StructType
+
+  /**
+   * Returns the next [[Batch]] of data that is available after `start`, or `None` when no new
+   * data is available. NOTE(review): `start = None` presumably means "from the start" -- confirm.
+   */
+  def getNextBatch(start: Option[Offset]): Option[Batch]
+}