Merged
@@ -0,0 +1,19 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.storage

private[streaming] case class FileSegment(path: String, offset: Long, length: Int)
@@ -0,0 +1,53 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.storage

private[streaming] class HdfsSequentialReader(val path: String) {
Owner: Let's name it WriteAheadLogReader. And the other one WriteAheadLogWriter. It does not need to be specific to HDFS.

Author: And WriteAheadLogRandomReader?


  val instream = HdfsUtils.getInputStream(path)
  var closed = false

  def hasNext: Boolean = {
    assertOpen()
    synchronized {
      instream.available() != 0
    }
  }

  def readNext(): Array[Byte] = {
    assertOpen()
    // TODO: Possible error case where there are not enough bytes in the stream
Owner: This implementation has the not-so-rare corner case where hasNext returns true, but readNext() fails. It's better to actually read the record in hasNext and then return true. The record will be stored and returned when readNext() is called (or name it next()).

Additionally, you can implement a Scala iterator, which allows a nice iterator.foreach(array => ...) downstream. The hasNext and next() will do what I suggested above. (A sketch of this approach follows the class below.)

Author: I did notice the issue, but was not sure whether to make hasNext an expensive call where we read it (that is how it is usually done in C++ and Java, I think).

The iterator idea sounds good. I will do that.

Author: Done

    // TODO: How to handle that?
    synchronized {
      val length = instream.readInt()
      HdfsUtils.checkState(length <= instream.available(), "Not enough data found in file!")
      val buffer = new Array[Byte](length)
      instream.readFully(buffer)
      buffer
    }
  }

  def close() {
    closed = true
    instream.close()
  }

  def assertOpen() {
Owner: What's the point of checking assertOpen() before doing readNext()? There can very easily be a race condition where this asserts true, and the stream is closed in the middle of instream.readInt() on line 36.

Author: True. The close and assert should be synchronized too.

Author: Fixed.

    HdfsUtils.checkState(!closed, "Stream is closed. Create a new Reader to read from the " +
      "file.")
  }
}
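
A minimal sketch of the read-ahead iterator suggested in the thread above (illustrative only: it assumes the HdfsUtils helper from this patch and the WriteAheadLogReader name settled on earlier, and is not the merged implementation):

// Reads each record eagerly in hasNext, so a true result guarantees that
// next() will succeed; close() shares the same lock as the reads, which
// also addresses the close/read race discussed above.
private[streaming] class WriteAheadLogReader(path: String)
  extends Iterator[Array[Byte]] {

  private val instream = HdfsUtils.getInputStream(path)
  private var closed = false
  private var nextItem: Option[Array[Byte]] = None

  override def hasNext: Boolean = synchronized {
    if (closed) {
      false
    } else if (nextItem.isDefined) {
      true
    } else {
      try {
        // Same record framing as readNext(): length prefix, then payload.
        val length = instream.readInt()
        val buffer = new Array[Byte](length)
        instream.readFully(buffer)
        nextItem = Some(buffer)
        true
      } catch {
        case _: java.io.EOFException =>
          close()
          false
      }
    }
  }

  override def next(): Array[Byte] = synchronized {
    val data = nextItem.getOrElse {
      throw new IllegalStateException("next() called on an exhausted or closed reader")
    }
    nextItem = None
    data
  }

  def close(): Unit = synchronized {
    if (!closed) {
      closed = true
      instream.close()
    }
  }
}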
@@ -0,0 +1,60 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.storage

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FSDataInputStream, FSDataOutputStream, Path}

private[streaming] object HdfsUtils {

  def getOutputStream(path: String): FSDataOutputStream = {
    // Getting a FileSystem via getFileSystem is not thread-safe, so synchronize on this object
    val dfsPath = new Path(path)
    val conf = new Configuration()
    val dfs =
      this.synchronized {
        dfsPath.getFileSystem(new Configuration())
Owner: Why not use the conf defined above?

Author: Fixed.

      }
    // If the file exists and we have append support, append instead of creating a new file
    val stream: FSDataOutputStream = {
      if (conf.getBoolean("hdfs.append.support", false) && dfs.isFile(dfsPath)) {
Owner: Why do you ever have to append? Isn't it fine to create new files? We have to create new files at some not-so-frequent interval (say 1 minute) anyway, because we also need to delete old files containing data that is not needed any more.

Author: The idea is to be as generic as possible here. If the file already exists, the create call will fail. Since the file name is decided a layer above this one, whatever class is writing to HDFS can decide whether to append to a file or to create a new one. If in some case we create new instances of this class with the same file name, we can still write data, just appending to an existing file. I don't see anything wrong with having this supported - if it is used, great; else this code path is not exercised at all.

Owner: But what happens when the file exists and append support isn't present? It will throw an error, right? And that may very well happen on some deployments. So all modules that use this have to be written such that filenames are unique anyway, so that it does not fail on certain systems. Isn't it?

I don't mind the code, since it is a small piece of code. Just that it doesn't really help us in any way and is therefore redundant.

Author: Since Hadoop 0.20 or so, append is supported by default. I have never come across a situation where append has not been enabled since like 2010. So I don't see that as a major concern. If needed, we can resolve it later.

        dfs.append(dfsPath)
      } else {
        dfs.create(dfsPath)
      }
    }
    stream
  }

  def getInputStream(path: String): FSDataInputStream = {
    val dfsPath = new Path(path)
    val conf = new Configuration()
Owner: conf is not used anywhere.

Author: Fixed

    val dfs = this.synchronized {
      dfsPath.getFileSystem(new Configuration())
    }
    val instream = dfs.open(dfsPath)
    instream
  }

  def checkState(state: Boolean, errorMsg: => String) {
Owner: Does this really need to be a separate function?

Author: Just to avoid the String concat cost. This accomplishes the same thing as Preconditions.checkArgument in Guava, but that, being Java, does not have pass-by-name. (A short illustration follows the object below.)

    if (!state) {
      throw new IllegalStateException(errorMsg)
    }
  }

}
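
A quick illustration of the pass-by-name point above (hypothetical snippet, not part of the patch): because errorMsg is declared as => String, the message expression is evaluated only when the check fails.

object CheckStateDemo {
  def main(args: Array[String]): Unit = {
    val length = 8
    val available = 16
    // The check passes, so the by-name message argument is never evaluated
    // and the string concatenation below costs nothing here.
    HdfsUtils.checkState(length <= available,
      "Expected at most " + available + " bytes but record claims " + length)
  }
}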
@@ -0,0 +1,46 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.storage

private[streaming] class HdfsWalRandomReader(val path: String) {

  val instream = HdfsUtils.getInputStream(path)
  var closed = false

  def read(segment: FileSegment): Array[Byte] = {
    assertOpen()
    synchronized {
      instream.seek(segment.offset)
      val nextLength = instream.readInt()
      HdfsUtils.checkState(nextLength == segment.length,
        "Expected message length to be " + segment.length + ", but was " + nextLength)
      val buffer = new Array[Byte](nextLength)
      instream.readFully(buffer)
      buffer
    }
  }

  def close() {
    closed = true
    instream.close()
  }

  def assertOpen() {
    HdfsUtils.checkState(!closed, "Stream is closed. Create a new Reader to read from the file.")
  }
}

@@ -0,0 +1,47 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.storage

private[streaming] class HdfsWalWriter(val path: String) {
  val stream = HdfsUtils.getOutputStream(path)
  var nextOffset = stream.getPos
  var closed = false

  // Data is always written as:
  // - Length - Int
  // - Data - of length = Length
  def write(data: Array[Byte]): FileSegment = {
    assertOpen()
    synchronized {
      val segment = new FileSegment(path, nextOffset, data.length)
      stream.writeInt(data.length)
      stream.write(data)
      stream.hflush()
      nextOffset = stream.getPos
      segment
    }
  }

  def close(): Unit = {
    closed = true
    stream.close()
  }

  def assertOpen() {
Owner: Please make this private. Downstream modules should not be able to see this. Same for the other classes. Even if they are all private to Spark, it's best to be disciplined about method visibility. Who knows, in the future we may want to abstract out the WAL as a pure interface and have different implementations of it (HDFS, HBase, etc.).

Author: Done

    HdfsUtils.checkState(!closed, "Stream is closed. Create a new Writer to write to file.")
  }
}