Changes from 1 commit
48 commits
aa688fe
Adding conversion of nested Parquet schemas
AndreSchumacher Mar 26, 2014
4d4892a
First commit nested Parquet read converters
AndreSchumacher Mar 27, 2014
6125c75
First working nested Parquet record input
AndreSchumacher Mar 27, 2014
745a42b
Completing testcase for nested data (Addressbook)
AndreSchumacher Apr 1, 2014
ddb40d2
Extending tests for nested Parquet data
AndreSchumacher Apr 1, 2014
1b1b3d6
Fixing one problem with nested arrays
AndreSchumacher Apr 2, 2014
5d80461
fixing one problem with nested structs and breaking up files
AndreSchumacher Apr 2, 2014
98219cf
added struct converter
AndreSchumacher Apr 2, 2014
ee70125
fixing one problem with arrayconverter
AndreSchumacher Apr 3, 2014
b7fcc35
Documenting conversions, bugfix, wrappers of Rows
AndreSchumacher Apr 4, 2014
6dbc9b7
Fixing some problems introduced during rebase
AndreSchumacher Apr 6, 2014
f8f8911
For primitive rows fall back to more efficient converter, code reorg
AndreSchumacher Apr 6, 2014
4e25fcb
Adding resolution of complex ArrayTypes
AndreSchumacher Apr 8, 2014
a594aed
Scalastyle
AndreSchumacher Apr 8, 2014
b539fde
First commit for MapType
AndreSchumacher Apr 11, 2014
824500c
Adding attribute resolution for MapType
AndreSchumacher Apr 11, 2014
f777b4b
Scalastyle
AndreSchumacher Apr 11, 2014
d1911dc
Simplifying ArrayType conversion
AndreSchumacher Apr 12, 2014
1dc5ac9
First version of WriteSupport for nested types
AndreSchumacher Apr 12, 2014
e99cc51
Fixing nested WriteSupport and adding tests
AndreSchumacher Apr 13, 2014
adc1258
Optimizing imports
AndreSchumacher Apr 13, 2014
f466ff0
Added ParquetAvro tests and revised Array conversion
AndreSchumacher Apr 13, 2014
79d81d5
Replacing field names for array and map in WriteSupport
AndreSchumacher Apr 13, 2014
619c397
Completing Map testcase
AndreSchumacher Apr 14, 2014
c52ff2c
Adding native-array converter
AndreSchumacher Apr 19, 2014
431f00f
Fixing problems introduced during rebase
AndreSchumacher Apr 19, 2014
a6b4f05
Cleaning up ArrayConverter, moving classTag to NativeType, adding Nat…
AndreSchumacher Apr 20, 2014
0ae9376
Doc strings and simplifying ParquetConverter.scala
AndreSchumacher May 10, 2014
32229c7
Removing Row nested values and placing by generic types
AndreSchumacher May 11, 2014
cbb5793
Code review feedback
AndreSchumacher May 11, 2014
191bc0d
Changing to Seq for ArrayType, refactoring SQLParser for nested field…
AndreSchumacher May 24, 2014
2f5a805
Removing stripMargin from test schemas
AndreSchumacher May 24, 2014
de02538
Cleaning up ParquetTestData
AndreSchumacher May 24, 2014
31465d6
Scalastyle: fixing commented out bottom
AndreSchumacher May 24, 2014
3c6b25f
Trying to reduce no-op changes wrt master
AndreSchumacher Jun 1, 2014
3104886
Nested Rows should be Rows, not Seqs.
marmbrus Jun 3, 2014
f7aeba3
[SPARK-1982] Support for ByteType and ShortType.
marmbrus Jun 3, 2014
3e1456c
WIP: Directly serialize catalyst attributes.
marmbrus Jun 4, 2014
14c3fd8
Attempting to fix Spark-Parquet schema conversion
AndreSchumacher Jun 4, 2014
37e0a0a
Cleaning up
AndreSchumacher Jun 4, 2014
88e6bdb
Attempting to fix loss of schema
AndreSchumacher Jun 4, 2014
63d1b57
Cleaning up and Scalastyle
AndreSchumacher Jun 8, 2014
b8a8b9a
More fixes to short and byte conversion
AndreSchumacher Jun 8, 2014
403061f
Fixing some issues with tests and schema metadata
AndreSchumacher Jun 8, 2014
94eea3a
Scalastyle
AndreSchumacher Jun 8, 2014
7eceb67
Review feedback
AndreSchumacher Jun 19, 2014
95c1367
Changes to ParquetRelation and its metadata
AndreSchumacher Jun 19, 2014
30708c8
Taking out AvroParquet test for now to remove Avro dependency
AndreSchumacher Jun 20, 2014
Fixing some problems introduced during rebase
AndreSchumacher committed Jun 19, 2014
commit 6dbc9b73d03566bed365cd1416ca240799df7747
@@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.types._
import org.apache.spark.sql.catalyst.expressions.{GenericRow, Row, Attribute}
import org.apache.spark.sql.parquet.CatalystConverter.FieldType

object CatalystConverter {
private[parquet] object CatalystConverter {
// The type internally used for fields
type FieldType = StructField

@@ -55,11 +55,14 @@ object CatalystConverter {
}
case ctype: NativeType => {
// note: for some reason matching for StringType fails so use this ugly if instead
if (ctype == StringType) new CatalystPrimitiveStringConverter(parent, fieldIndex)
else new CatalystPrimitiveConverter(parent, fieldIndex)
if (ctype == StringType) {
new CatalystPrimitiveStringConverter(parent, fieldIndex)
} else {
new CatalystPrimitiveConverter(parent, fieldIndex)
}
}
case _ => throw new RuntimeException(
s"unable to convert datatype ${field.dataType.toString} in CatalystGroupConverter")
s"unable to convert datatype ${field.dataType.toString} in CatalystConverter")
}
}
}
@@ -142,9 +145,7 @@ class CatalystGroupConverter(
def getCurrentRecord: Row = {
assert(isRootConverter, "getCurrentRecord should only be called in root group converter!")
// TODO: use iterators if possible
new GenericRow {
override val values: Array[Any] = current.toArray
}
new GenericRow(current.toArray)
}

override def getConverter(fieldIndex: Int): Converter = converters(fieldIndex)
@@ -174,13 +175,9 @@ class CatalystGroupConverter(
override def end(): Unit = {
if (!isRootConverter) {
assert(current!=null) // there should be no empty groups
buffer.append(new GenericRow {
override val values: Array[Any] = current.toArray
})
buffer.append(new GenericRow(current.toArray))
// TODO: use iterators if possible, avoid Row wrapping
parent.updateField(index, new GenericRow {
override val values: Array[Any] = buffer.toArray
})
parent.updateField(index, new GenericRow(buffer.toArray.asInstanceOf[Array[Any]]))
}
}
}
@@ -283,9 +280,7 @@ class CatalystArrayConverter(
override def end(): Unit = {
assert(parent != null)
// TODO: use iterators if possible, avoid Row wrapping
parent.updateField(index, new GenericRow {
override val values: Array[Any] = buffer.toArray
})
parent.updateField(index, new GenericRow(buffer.toArray))
clearBuffer()
}
}
@@ -304,9 +299,7 @@ class CatalystStructConverter(
override def end(): Unit = {
assert(!isRootConverter)
// TODO: use iterators if possible, avoid Row wrapping!
parent.updateField(index, new GenericRow {
override val values: Array[Any] = current.toArray
})
parent.updateField(index, new GenericRow(current.toArray))
}
}

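The recurring change in the diff above replaces anonymous GenericRow subclasses that override a values member with a direct constructor call. Below is a minimal, self-contained Scala sketch of the two styles; the GenericRow class here is a hypothetical stand-in for Catalyst's, kept only so the snippet runs on its own.

import scala.collection.mutable.ArrayBuffer

// Hypothetical stand-in for Catalyst's GenericRow: a row backed by an Array[Any].
class GenericRow(val values: Array[Any]) {
  def apply(i: Int): Any = values(i)
}

object GenericRowStyles extends App {
  val current = ArrayBuffer[Any](1, "val_1")

  // Old style (left-hand side of the diff): allocate an anonymous subclass and
  // override the values member:
  //   new GenericRow { override val values: Array[Any] = current.toArray }

  // New style (right-hand side of the diff): hand the backing array straight
  // to the constructor.
  val row = new GenericRow(current.toArray)
  println(row(1)) // prints "val_1"
}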
@@ -17,7 +17,7 @@

package org.apache.spark.sql.parquet

import java.io.IOException,
import java.io.IOException

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
@@ -148,226 +148,3 @@ private[parquet] object RowWriteSupport {
val PARQUET_ROW_SCHEMA: String = "org.apache.spark.sql.parquet.row.schema"
}

private[parquet] object CatalystConverter {
type FieldType = StructField

protected[parquet] def createConverter(field: FieldType, fieldIndex: Int, parent: CatalystConverter): Converter = {
val fieldType: DataType = field.dataType
fieldType match {
case ArrayType(elementType: DataType) => {
elementType match {
case StructType(fields) =>
if (fields.size > 1) new CatalystGroupConverter(fields, fieldIndex, parent)
else new CatalystArrayConverter(fields(0).dataType, fieldIndex, parent)
case _ => new CatalystArrayConverter(elementType, fieldIndex, parent)
}
}
case StructType(fields: Seq[StructField]) =>
new CatalystGroupConverter(fields, fieldIndex, parent)
case ctype: NativeType =>
// note: for some reason matching for StringType fails so use this ugly if instead
if (ctype == StringType) {
new CatalystPrimitiveStringConverter(parent, fieldIndex)
} else {
new CatalystPrimitiveConverter(parent, fieldIndex)
}
case _ => throw new RuntimeException(
s"unable to convert datatype ${field.dataType.toString} in CatalystGroupConverter")
}
}
}

trait CatalystConverter {

// the number of fields this group has
protected[parquet] val size: Int

// the index of this converter in the parent
protected[parquet] val index: Int

// the parent converter
protected[parquet] val parent: CatalystConverter

// for child converters to update upstream values
protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit

// TODO: in the future consider using specific methods to avoid autoboxing
protected[parquet] def updateBoolean(fieldIndex: Int, value: Boolean): Unit =
updateField(fieldIndex, value)

protected[parquet] def updateInt(fieldIndex: Int, value: Int): Unit =
updateField(fieldIndex, value)

protected[parquet] def updateLong(fieldIndex: Int, value: Long): Unit =
updateField(fieldIndex, value)

protected[parquet] def updateDouble(fieldIndex: Int, value: Double): Unit =
updateField(fieldIndex, value)

protected[parquet] def updateFloat(fieldIndex: Int, value: Float): Unit =
updateField(fieldIndex, value)

protected[parquet] def updateBinary(fieldIndex: Int, value: Binary): Unit =
updateField(fieldIndex, value.getBytes)

protected[parquet] def updateString(fieldIndex: Int, value: Binary): Unit =
updateField(fieldIndex, value.toStringUsingUTF8)

protected[parquet] def isRootConverter: Boolean = parent == null

protected[parquet] def clearBuffer(): Unit
}

/**
* A `parquet.io.api.GroupConverter` that is able to convert a Parquet record
* to a [[org.apache.spark.sql.catalyst.expressions.Row]] object.
*
* @param schema The corresponding Catalyst schema in the form of a list of attributes.
*/
class CatalystGroupConverter(
private[parquet] val schema: Seq[FieldType],
protected[parquet] val index: Int,
protected[parquet] val parent: CatalystConverter,
protected[parquet] var current: ArrayBuffer[Any],
protected[parquet] var buffer: ArrayBuffer[ArrayBuffer[Any]]) extends GroupConverter with CatalystConverter {

def this(schema: Seq[FieldType], index: Int, parent: CatalystConverter) =
this(schema, index, parent, current=null, buffer=new ArrayBuffer[ArrayBuffer[Any]](CatalystArrayConverter.INITIAL_ARRAY_SIZE))

// This constructor is used for the root converter only
def this(attributes: Seq[Attribute]) =
this(attributes.map(a => new FieldType(a.name, a.dataType, a.nullable)), 0, null)

protected [parquet] val converters: Array[Converter] =
schema.map(field => CatalystConverter.createConverter(field, schema.indexOf(field), this)).toArray

override val size = schema.size

// Should be only called in root group converter!
def getCurrentRecord: Row = {
assert(isRootConverter, "getCurrentRecord should only be called in root group converter!")
new GenericRow {
override val values: Array[Any] = current.toArray
}
}

override def getConverter(fieldIndex: Int): Converter = converters(fieldIndex)

// for child converters to update upstream values
override protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit =
current.update(fieldIndex, value)

override protected[parquet] def clearBuffer(): Unit = {
// TODO: reuse buffer?
buffer = new ArrayBuffer[ArrayBuffer[Any]](CatalystArrayConverter.INITIAL_ARRAY_SIZE)
}

override def start(): Unit = {
// TODO: reuse buffer?
// Allocate new array in the root converter (others will be called clearBuffer() on)
current = ArrayBuffer.fill(schema.length)(null)
converters.foreach {
converter => if (!converter.isPrimitive) {
converter.asInstanceOf[CatalystConverter].clearBuffer
}
}
}

// TODO: think about reusing the buffer
override def end(): Unit = {
if (!isRootConverter) {
assert(current!=null) // there should be no empty groups
buffer.append(current)
parent.updateField(index, buffer)
}
}
}

/**
* A `parquet.io.api.PrimitiveConverter` that converts Parquet types to Catalyst types.
*
* @param parent The parent group converter.
* @param fieldIndex The index inside the record.
*/
private[parquet] class CatalystPrimitiveConverter(
parent: CatalystConverter,
fieldIndex: Int) extends PrimitiveConverter {
// TODO: consider refactoring these together with ParquetTypesConverter
override def addBinary(value: Binary): Unit =
parent.updateBinary(fieldIndex, value)

override def addBoolean(value: Boolean): Unit =
parent.updateBoolean(fieldIndex, value)

override def addDouble(value: Double): Unit =
parent.updateDouble(fieldIndex, value)

override def addFloat(value: Float): Unit =
parent.updateFloat(fieldIndex, value)

override def addInt(value: Int): Unit =
parent.updateInt(fieldIndex, value)

override def addLong(value: Long): Unit =
parent.updateLong(fieldIndex, value)
}

/**
* A `parquet.io.api.PrimitiveConverter` that converts Parquet strings (fixed-length byte arrays)
* into Catalyst Strings.
*
* @param parent The parent group converter.
* @param fieldIndex The index inside the record.
*/
private[parquet] class CatalystPrimitiveStringConverter(
parent: CatalystConverter,
fieldIndex: Int) extends CatalystPrimitiveConverter(parent, fieldIndex) {
override def addBinary(value: Binary): Unit =
parent.updateString(fieldIndex, value)
}

object CatalystArrayConverter {
val INITIAL_ARRAY_SIZE = 20
}

// this is for single-element groups of primitive or complex types
// Note: AvroParquet only uses arrays for primitive types (?)
class CatalystArrayConverter(
val elementType: DataType,
val index: Int,
protected[parquet] val parent: CatalystConverter,
protected[parquet] var buffer: Buffer[Any])
extends GroupConverter with CatalystConverter {
// TODO: In the future consider using native arrays instead of buffer for primitive types for
// performance reasons (autoboxing)

def this(elementType: DataType, index: Int, parent: CatalystConverter) =
this(elementType, index, parent, new ArrayBuffer[Any](CatalystArrayConverter.INITIAL_ARRAY_SIZE))

protected[parquet] val converter: Converter = CatalystConverter.createConverter(
new CatalystConverter.FieldType("values", elementType, false), fieldIndex=0, parent=this)

override def getConverter(fieldIndex: Int): Converter = converter

override val size = 1 // arrays have only one (repeated) field, which is its elements

override protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit = buffer += value

override protected[parquet] def clearBuffer(): Unit = {
// TODO: reuse buffer?
buffer = new ArrayBuffer[Any](CatalystArrayConverter.INITIAL_ARRAY_SIZE)
}

override def start(): Unit = {
if (!converter.isPrimitive) {
converter.asInstanceOf[CatalystConverter].clearBuffer
}
}

// TODO: think about reusing the buffer
override def end(): Unit = {
if (parent != null) parent.updateField(index, buffer)
}
}

// TODO: add MapConverter
@@ -205,8 +205,26 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA
Utils.deleteRecursively(file)
}

test("insert (appending) to same table via Scala API") {
sql("INSERT INTO testsource SELECT * FROM testsource")
test("Insert (overwrite) via Scala API") {
val dirname = Utils.createTempDir()
val source_rdd = TestSQLContext.sparkContext.parallelize((1 to 100))
.map(i => TestRDDEntry(i, s"val_$i"))
source_rdd.registerAsTable("source")
val dest_rdd = createParquetFile(dirname.toString, ("key", IntegerType), ("value", StringType))
dest_rdd.registerAsTable("dest")
sql("INSERT OVERWRITE INTO dest SELECT * FROM source").collect()
val rdd_copy1 = sql("SELECT * FROM dest").collect()
assert(rdd_copy1.size === 100)
assert(rdd_copy1(0).apply(0) === 1)
assert(rdd_copy1(0).apply(1) === "val_1")
sql("INSERT INTO dest SELECT * FROM source").collect()
val rdd_copy2 = sql("SELECT * FROM dest").collect()
assert(rdd_copy2.size === 200)
Utils.deleteRecursively(dirname)
}

test("Insert (appending) to same table via Scala API") {
sql("INSERT INTO testsource SELECT * FROM testsource").collect()
val double_rdd = sql("SELECT * FROM testsource").collect()
assert(double_rdd != null)
assert(double_rdd.size === 30)
@@ -372,9 +390,12 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA
test("Importing nested Parquet file (Addressbook)") {
implicit def anyToRow(value: Any): Row = value.asInstanceOf[Row]
ParquetTestData.readNestedFile(
ParquetTestData.testNestedFile1,
ParquetTestData.testNestedDir1,
ParquetTestData.testNestedSchema1)
val result = getRDD(ParquetTestData.testNestedData1).collect()
val result = TestSQLContext
.parquetFile(ParquetTestData.testNestedDir1.toString)
.toSchemaRDD
.collect()
assert(result != null)
assert(result.size === 2)
val first_record = result(0)
@@ -397,9 +418,12 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA
test("Importing nested Parquet file (nested numbers)") {
implicit def anyToRow(value: Any): Row = value.asInstanceOf[Row]
ParquetTestData.readNestedFile(
ParquetTestData.testNestedFile2,
ParquetTestData.testNestedDir2,
ParquetTestData.testNestedSchema2)
val result = getRDD(ParquetTestData.testNestedData2).collect()
val result = TestSQLContext
.parquetFile(ParquetTestData.testNestedDir2.toString)
.toSchemaRDD
.collect()
assert(result.size === 1, "number of top-level rows incorrect")
assert(result(0).size === 5, "number of fields in row incorrect")
assert(result(0)(0) === 1)
Expand All @@ -420,15 +444,19 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA
}

test("Simple query on addressbook") {
val data = TestSQLContext.parquetFile(ParquetTestData.testNestedFile1.toString).toSchemaRDD
val data = TestSQLContext
.parquetFile(ParquetTestData.testNestedDir1.toString)
.toSchemaRDD
val tmp = data.where('owner === "Julien Le Dem").select('owner as 'a, 'contacts as 'c).collect()
assert(tmp.size === 1)
assert(tmp(0)(0) === "Julien Le Dem")
}

test("Simple query on nested int data") {
implicit def anyToRow(value: Any): Row = value.asInstanceOf[Row]
val data = TestSQLContext.parquetFile(ParquetTestData.testNestedFile2.toString).toSchemaRDD
val data = TestSQLContext
.parquetFile(ParquetTestData.testNestedDir2.toString)
.toSchemaRDD
data.registerAsTable("data")
val tmp = sql("SELECT booleanNumberPairs.value, booleanNumberPairs.truth FROM data").collect()
assert(tmp(0)(0) === 2.5)
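For reference, a condensed sketch of the read path the updated tests exercise: load a nested Parquet directory with parquetFile, turn it into a SchemaRDD, register it as a table, and address nested fields with dotted paths. Identifiers (TestSQLContext, ParquetTestData, the "data" table name) mirror the test code above; the import path is assumed, and the sketch presumes the nested test data has already been materialized, as the tests arrange.

import org.apache.spark.sql.test.TestSQLContext
import TestSQLContext._ // brings sql(...) into scope

// Load the nested test directory and expose it to SQL, as the tests above do.
val data = TestSQLContext
  .parquetFile(ParquetTestData.testNestedDir2.toString)
  .toSchemaRDD
data.registerAsTable("data")

// Fields of the nested group booleanNumberPairs are addressed with dotted paths.
val pairs = sql("SELECT booleanNumberPairs.value, booleanNumberPairs.truth FROM data").collect()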