/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.databricks.examples.redshift.input

import java.io.{BufferedInputStream, IOException}
import java.lang.{Long => JavaLong}
import java.nio.charset.Charset

import scala.collection.mutable.ArrayBuffer

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}

import org.apache.spark.SparkContext._
import org.apache.spark.sql.{Row, SchemaRDD, SQLContext}
import org.apache.spark.sql.catalyst.types._
/**
 * Input format for text records saved with in-record delimiter and newline characters escaped.
 *
 * For example, a record containing two fields, `"a\n"` and `"|b\\"`, saved with delimiter `|`
 * looks like the following:
 * {{{
 * a\\\n|\\|b\\\\\n
 * }}}
 * where the in-record `|`, `\r`, `\n`, and `\\` characters are escaped by `\\`.
 * Users can configure the delimiter via [[RedshiftInputFormat$#KEY_DELIMITER]].
 * Its default value [[RedshiftInputFormat$#DEFAULT_DELIMITER]] is set to match Redshift's UNLOAD
 * with the ESCAPE option:
 * {{{
 * UNLOAD ('select_statement')
 * TO 's3://object_path_prefix'
 * ESCAPE
 * }}}
 *
 * @see org.apache.spark.SparkContext#newAPIHadoopFile
 */
class RedshiftInputFormat extends FileInputFormat[JavaLong, Array[String]] {

  override def createRecordReader(
      split: InputSplit,
      context: TaskAttemptContext): RecordReader[JavaLong, Array[String]] = {
    new RedshiftRecordReader
  }
}
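
// A minimal usage sketch via the raw Hadoop API (the path is hypothetical and
// `sc` is an existing SparkContext):
//
//   val records = sc.newAPIHadoopFile(
//     "s3://bucket/unload-prefix",
//     classOf[RedshiftInputFormat],
//     classOf[java.lang.Long],      // key: byte offset of each record
//     classOf[Array[String]],       // value: the record's unescaped fields
//     sc.hadoopConfiguration)
//   records.values.first()          // e.g. Array("a\n", "|b\\")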

object RedshiftInputFormat {

  /** Configuration key for the record delimiter. */
  val KEY_DELIMITER = "redshift.delimiter"
  /** Default delimiter. */
  val DEFAULT_DELIMITER = '|'

  /** Gets the delimiter char from conf, or the default. */
  private[input] def getDelimiterOrDefault(conf: Configuration): Char = {
    val c = conf.get(KEY_DELIMITER, DEFAULT_DELIMITER.toString)
    if (c.length != 1) {
      throw new IllegalArgumentException(
        s"Expected the delimiter to be a single character but got '$c'.")
    } else {
      c.charAt(0)
    }
  }
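
  // Sketch: to read unloads that used a different delimiter (tab, say), set the
  // key on the Hadoop configuration before reading; multi-character values are
  // rejected by getDelimiterOrDefault above:
  //   sc.hadoopConfiguration.set(RedshiftInputFormat.KEY_DELIMITER, "\t")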

  /**
   * Wrapper of SQLContext that provides the `redshiftFile` method.
   */
  class SQLContextWithRedshiftFile(sqlContext: SQLContext) {

    /**
     * Reads a file unloaded from Redshift into a SchemaRDD.
     * @param path input path
     * @param columns column names, in the order they were unloaded
     * @return a SchemaRDD with all columns typed as non-nullable strings
     */
    def redshiftFile(path: String, columns: Seq[String]): SchemaRDD = {
      val sc = sqlContext.sparkContext
      val rdd = sc.newAPIHadoopFile(path, classOf[RedshiftInputFormat],
        classOf[java.lang.Long], classOf[Array[String]], sc.hadoopConfiguration)
      val schema = StructType(columns.map(c => StructField(c, StringType, nullable = false)))
      sqlContext.applySchema(rdd.values.map(x => Row(x: _*)), schema)
    }
  }

  implicit def fromSQLContext(sqlContext: SQLContext): SQLContextWithRedshiftFile =
    new SQLContextWithRedshiftFile(sqlContext)
}
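
// A minimal end-to-end sketch using the implicit conversion above (the path,
// column names, and table name are hypothetical; assumes a Spark 1.1-era
// SchemaRDD with `registerTempTable`):
//
//   import com.databricks.examples.redshift.input.RedshiftInputFormat._
//
//   val sqlContext = new SQLContext(sc)
//   val unloaded = sqlContext.redshiftFile("s3://bucket/unload-prefix", Seq("id", "name"))
//   unloaded.registerTempTable("unloaded")
//   sqlContext.sql("SELECT COUNT(*) FROM unloaded").collect()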

private[input] class RedshiftRecordReader extends RecordReader[JavaLong, Array[String]] {

  private var reader: BufferedInputStream = _

  // Key: byte offset of the record's start; value: the record's unescaped fields.
  private var key: JavaLong = _
  private var value: Array[String] = _

  // Byte offsets of this split's first record, of the end of its last record,
  // and of the current read position.
  private var start: Long = _
  private var end: Long = _
  private var cur: Long = _

  private var eof: Boolean = false

  private var delimiter: Byte = _
  @inline private[this] final val escapeChar: Byte = '\\'
  @inline private[this] final val lineFeed: Byte = '\n'
  @inline private[this] final val carriageReturn: Byte = '\r'

  @inline private[this] final val defaultBufferSize = 1024 * 1024

  // Reusable buffer for the bytes of the field being parsed.
  private[this] val chars = ArrayBuffer.empty[Byte]

  override def initialize(inputSplit: InputSplit, context: TaskAttemptContext): Unit = {
    val split = inputSplit.asInstanceOf[FileSplit]
    val file = split.getPath
    val conf = context.getConfiguration
    delimiter = RedshiftInputFormat.getDelimiterOrDefault(conf).toByte
    require(delimiter != escapeChar,
      s"The delimiter and the escape character must differ, but both are $delimiter.")
    require(delimiter != lineFeed, "The delimiter cannot be the line feed character.")
    require(delimiter != carriageReturn, "The delimiter cannot be the carriage return character.")
    val compressionCodecs = new CompressionCodecFactory(conf)
    val codec = compressionCodecs.getCodec(file)
    if (codec != null) {
      throw new IOException(s"Compressed files are not supported, but found $file.")
    }
    val fs = file.getFileSystem(conf)
    val size = fs.getFileStatus(file).getLen
    // A split rarely begins exactly at a record boundary, so scan forward to the
    // record boundaries nearest the split's start and end offsets.
    start = findNext(fs, file, size, split.getStart)
    end = findNext(fs, file, size, split.getStart + split.getLength)
    cur = start
    val in = fs.open(file)
    if (cur > 0L) {
      // Position the stream at `cur` by seeking to the previous byte and consuming it.
      in.seek(cur - 1L)
      in.read()
    }
    reader = new BufferedInputStream(in, defaultBufferSize)
  }

  override def getProgress: Float = {
    if (start >= end) {
      1.0f
    } else {
      math.min((cur - start).toFloat / (end - start), 1.0f)
    }
  }

  override def nextKeyValue(): Boolean = {
    if (cur < end && !eof) {
      key = cur
      value = nextValue()
      true
    } else {
      key = null
      value = null
      false
    }
  }

  override def getCurrentValue: Array[String] = value

  override def getCurrentKey: JavaLong = key

  override def close(): Unit = {
    if (reader != null) {
      reader.close()
    }
  }

  /**
   * Finds the start of the next record.
   * Because the byte at `offset` may itself be part of an escape sequence, first
   * scan forward to a byte that is known to be unescaped, then look for the next
   * unescaped line feed.
   *
   * @param fs file system
   * @param file file path
   * @param size file size
   * @param offset start offset
   * @return the start position of the next record
   */
  private def findNext(fs: FileSystem, file: Path, size: Long, offset: Long): Long = {
    if (offset == 0L) {
      return 0L
    } else if (offset >= size) {
      return size
    }
    val in = fs.open(file)
    var pos = offset
    in.seek(pos)
    val bis = new BufferedInputStream(in, defaultBufferSize)
    // Find the first unescaped byte.
    var escaped = true
    var thisEof = false
    while (escaped && !thisEof) {
      val v = bis.read()
      if (v < 0) {
        thisEof = true
      } else {
        pos += 1
        if (v != escapeChar) {
          escaped = false
        }
      }
    }
    // Find the next unescaped line feed.
    var endOfRecord = false
    while ((escaped || !endOfRecord) && !thisEof) {
      val v = bis.read()
      if (v < 0) {
        thisEof = true
      } else {
        pos += 1
        if (v == escapeChar) {
          escaped = true
        } else {
          if (!escaped) {
            endOfRecord = v == lineFeed
          } else {
            escaped = false
          }
        }
      }
    }
    in.close()
    pos
  }
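
  // Worked sketch: for the bytes `x a \ LF b LF c` with the split starting at
  // `a`, the first loop reads `a` and learns the scan is on an unescaped byte;
  // the second loop then sees `\` + LF (an escaped line feed inside the record)
  // and keeps going, stopping only after the bare LF that follows `b`. The
  // returned position is the offset of `c`, the start of the next record.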

  private def nextValue(): Array[String] = {
    val fields = ArrayBuffer.empty[String]
    var escaped = false
    var endOfRecord = false
    while (!endOfRecord && !eof) {
      var endOfField = false
      chars.clear()
      while (!endOfField && !endOfRecord && !eof) {
        val v = reader.read()
        if (v < 0) {
          eof = true
        } else {
          cur += 1L
          val c = v.toByte
          if (escaped) {
            if (c != escapeChar && c != delimiter && c != lineFeed && c != carriageReturn) {
              throw new IllegalStateException(
                s"Found unexpected character '${v.toChar}' (ASCII $v) after the escape character.")
            }
            chars.append(c)
            escaped = false
          } else {
            if (c == escapeChar) {
              escaped = true
            } else if (c == delimiter) {
              endOfField = true
            } else if (c == lineFeed) {
              endOfRecord = true
            } else {
              // Copy everything else, including unescaped carriage returns.
              chars.append(c)
            }
          }
        }
      }
      // TODO: charset?
      fields.append(new String(chars.toArray, Charset.forName("UTF-8")))
    }
    if (escaped) {
      throw new IllegalStateException("Found a hanging escape character.")
    }
    fields.toArray
  }
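
  // Parsing sketch: for the raw bytes `a \ LF | \ | b \ \ LF` (the scaladoc
  // example above, with LF written out), nextValue() returns
  // Array("a\n", "|b\\"): the escaped LF, `|`, and `\` are copied into the
  // current field, the bare `|` ends the first field, and the bare LF ends
  // the record.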
}