More fixes to short and byte conversion

apache · AndreSchumacher · Mar 26, 2014 · Mar 27, 2014 · Mar 27, 2014 · Apr 1, 2014
commit b8a8b9a28baf2780de7841d269a17f01db41e10b
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -154,7 +154,8 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
       case logical.WriteToFile(path, child) =>
         val relation =
           ParquetRelation.create(path, child, sparkContext.hadoopConfiguration)
-        InsertIntoParquetTable(relation, planLater(child), overwrite=true)(sparkContext) :: Nil
+        // Note: overwrite=false because otherwise the metadata we just created will be deleted
+        InsertIntoParquetTable(relation, planLater(child), overwrite=false)(sparkContext) :: Nil
       case logical.InsertIntoTable(table: ParquetRelation, partition, child, overwrite) =>
         InsertIntoParquetTable(table, planLater(child), overwrite)(sparkContext) :: Nil
       case PhysicalOperation(projectList, filters: Seq[Expression], relation: ParquetRelation) =>

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala
@@ -93,14 +93,30 @@ private[sql] object CatalystConverter {
             fieldIndex,
             parent)
       }
-      case ctype: NativeType => {
-        // note: for some reason matching for StringType fails so use this ugly if instead
-        if (ctype == StringType) {
-          new CatalystPrimitiveStringConverter(parent, fieldIndex)
-        } else {
-          new CatalystPrimitiveConverter(parent, fieldIndex)
+      // Strings, Shorts and Bytes have no directly corresponding Parquet type (they arrive
+      // as Binary and Int values, respectively), so we need to treat them separately.
+      case StringType => {
+        new CatalystPrimitiveConverter(parent, fieldIndex) {
+          override def addBinary(value: Binary): Unit =
+            parent.updateString(fieldIndex, value)
         }
       }
+      case ShortType => {
+        new CatalystPrimitiveConverter(parent, fieldIndex) {
+          override def addInt(value: Int): Unit =
+            parent.updateShort(fieldIndex, value.asInstanceOf[ShortType.JvmType])
+        }
+      }
+      case ByteType => {
+        new CatalystPrimitiveConverter(parent, fieldIndex) {
+          override def addInt(value: Int): Unit =
+            parent.updateByte(fieldIndex, value.asInstanceOf[ByteType.JvmType])
+        }
+      }
+      // All other primitive types use the default converter
+      case ctype: NativeType => { // note: need the type tag here!
+        new CatalystPrimitiveConverter(parent, fieldIndex)
+      }
       case _ => throw new RuntimeException(
         s"unable to convert datatype ${field.dataType.toString} in CatalystConverter")
     }
@@ -153,6 +169,12 @@ private[parquet] trait CatalystConverter {
   protected[parquet] def updateLong(fieldIndex: Int, value: Long): Unit =
     updateField(fieldIndex, value)
 
+  protected[parquet] def updateShort(fieldIndex: Int, value: Short): Unit =
+    updateField(fieldIndex, value)
+
+  protected[parquet] def updateByte(fieldIndex: Int, value: Byte): Unit =
+    updateField(fieldIndex, value)
+
   protected[parquet] def updateDouble(fieldIndex: Int, value: Double): Unit =
     updateField(fieldIndex, value)
 
@@ -309,6 +331,12 @@ private[parquet] class CatalystPrimitiveRowConverter(
   override protected[parquet] def updateLong(fieldIndex: Int, value: Long): Unit =
     current.setLong(fieldIndex, value)
 
+  override protected[parquet] def updateShort(fieldIndex: Int, value: Short): Unit =
+    current.setShort(fieldIndex, value)
+
+  override protected[parquet] def updateByte(fieldIndex: Int, value: Byte): Unit =
+    current.setByte(fieldIndex, value)
+
   override protected[parquet] def updateDouble(fieldIndex: Int, value: Double): Unit =
     current.setDouble(fieldIndex, value)
 
@@ -350,21 +378,6 @@ private[parquet] class CatalystPrimitiveConverter(
     parent.updateLong(fieldIndex, value)
 }
 
-/**
- * A `parquet.io.api.PrimitiveConverter` that converts Parquet strings (fixed-length byte arrays)
- * into Catalyst Strings.
- *
- * @param parent The parent group converter.
- * @param fieldIndex The index inside the record.
- */
-private[parquet] class CatalystPrimitiveStringConverter(
-    parent: CatalystConverter,
-    fieldIndex: Int)
-  extends CatalystPrimitiveConverter(parent, fieldIndex) {
-  override def addBinary(value: Binary): Unit =
-    parent.updateString(fieldIndex, value)
-}
-
 object CatalystArrayConverter {
   val INITIAL_ARRAY_SIZE = 20
 }
@@ -486,6 +499,18 @@ private[parquet] class CatalystNativeArrayConverter(
     elements += 1
   }
 
+  override protected[parquet] def updateShort(fieldIndex: Int, value: Short): Unit = {
+    checkGrowBuffer()
+    buffer(elements) = value.asInstanceOf[NativeType]
+    elements += 1
+  }
+
+  override protected[parquet] def updateByte(fieldIndex: Int, value: Byte): Unit = {
+    checkGrowBuffer()
+    buffer(elements) = value.asInstanceOf[NativeType]
+    elements += 1
+  }
+
   override protected[parquet] def updateLong(fieldIndex: Int, value: Long): Unit = {
     checkGrowBuffer()
     buffer(elements) = value.asInstanceOf[NativeType]

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala
@@ -55,11 +55,7 @@ private[sql] case class ParquetRelation(val path: String)
       .getSchema
 
   /** Attributes */
-  // TODO: THIS POTENTIALLY LOOSES TYPE INFORMATION!!!!
-  // e.g. short <-> INT32 and byte <-> INT32
-  override val output =
-    ParquetTypesConverter
-      .convertToAttributes(parquetSchema)
+  override val output = ParquetTypesConverter.readSchemaFromFile(new Path(path))
 
   override def newInstance = ParquetRelation(path).asInstanceOf[this.type]
 

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala
@@ -65,10 +65,13 @@ case class ParquetTableScan(
       NewFileInputFormat.addInputPath(job, path)
     }
 
-    // Store Parquet schema in `Configuration`
+    // Store both requested and original schema in `Configuration`
     conf.set(
       RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA,
       ParquetTypesConverter.convertToString(output))
+    conf.set(
+      RowWriteSupport.SPARK_ROW_SCHEMA,
+      ParquetTypesConverter.convertToString(relation.output))
 
     // Store record filtering predicate in `Configuration`
     // Note 1: the input format ignores all predicates that cannot be expressed

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala
@@ -26,20 +26,19 @@ import org.apache.hadoop.mapreduce.Job
 import parquet.hadoop.{ParquetFileReader, Footer, ParquetFileWriter}
 import parquet.hadoop.metadata.{ParquetMetadata, FileMetaData}
 import parquet.hadoop.util.ContextUtil
-import parquet.schema.{Type => ParquetType, PrimitiveType => ParquetPrimitiveType, MessageType, MessageTypeParser}
+import parquet.schema.{Type => ParquetType, PrimitiveType => ParquetPrimitiveType, MessageType}
 import parquet.schema.{GroupType => ParquetGroupType, OriginalType => ParquetOriginalType, ConversionPatterns}
 import parquet.schema.PrimitiveType.{PrimitiveTypeName => ParquetPrimitiveTypeName}
 import parquet.schema.Type.Repetition
 
+import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Attribute}
 import org.apache.spark.sql.catalyst.types._
-import com.google.common.io.BaseEncoding
-import org.apache.spark.sql.execution.SparkSqlSerializer
 
 // Implicits
 import scala.collection.JavaConversions._
 
-private[parquet] object ParquetTypesConverter {
+private[parquet] object ParquetTypesConverter extends Logging {
   def isPrimitiveType(ctype: DataType): Boolean =
     classOf[PrimitiveType] isAssignableFrom ctype.getClass
 
@@ -62,7 +61,7 @@ private[parquet] object ParquetTypesConverter {
    * Converts a given Parquet `Type` into the corresponding
    * [[org.apache.spark.sql.catalyst.types.DataType]].
    *
-   * Note that we apply the following conversion rules:
+   * We apply the following conversion rules:
    * <ul>
    *   <li> Primitive types are converter to the corresponding primitive type.</li>
    *   <li> Group types that have a single field that is itself a group, which has repetition
@@ -97,6 +96,7 @@ private[parquet] object ParquetTypesConverter {
           keyValueGroup.getFields.apply(1).getName == CatalystConverter.MAP_VALUE_SCHEMA_NAME
       }
     }
+
     def correspondsToArray(groupType: ParquetGroupType): Boolean = {
       groupType.getFieldCount == 1 &&
         groupType.getFieldName(0) == CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME &&
@@ -188,7 +188,7 @@ private[parquet] object ParquetTypesConverter {
    *   <li> Primitive types are converted into Parquet's primitive types.</li>
    *   <li> [[org.apache.spark.sql.catalyst.types.StructType]]s are converted
    *        into Parquet's `GroupType` with the corresponding field types.</li>
-   *   <li> [[org.apache.spark.sql.catalyst.types.ArrayType]]s are converterd
+   *   <li> [[org.apache.spark.sql.catalyst.types.ArrayType]]s are converted
    *        into a 2-level nested group, where the outer group has the inner
    *        group as sole field. The inner group has name `values` and
    *        repetition level `REPEATED` and has the element type of
@@ -269,9 +269,6 @@ private[parquet] object ParquetTypesConverter {
     }
   }
 
-  def getSchema(schemaString: String) : MessageType =
-    MessageTypeParser.parseMessageType(schemaString)
-
   def convertToAttributes(parquetSchema: ParquetType): Seq[Attribute] = {
     parquetSchema
       .asGroupType()
@@ -302,7 +299,7 @@ private[parquet] object ParquetTypesConverter {
     StructType.fromAttributes(schema).toString
   }
 
-  def writeMetaData(attributes: Seq[Attribute], origPath: Path, conf: Configuration) {
+  def writeMetaData(attributes: Seq[Attribute], origPath: Path, conf: Configuration): Unit = {
     if (origPath == null) {
       throw new IllegalArgumentException("Unable to write Parquet metadata: path is null")
     }
@@ -385,4 +382,28 @@ private[parquet] object ParquetTypesConverter {
       footers(0).getParquetMetadata
     }
   }
+
+  /**
+   * Reads in Parquet Metadata from the given path and tries to extract the schema
+   * (Catalyst attributes) from the application-specific key-value map. If the map has
+   * no entry for `RowReadSupport.SPARK_METADATA_KEY`, it falls back to converting from
+   * the Parquet file schema, which may lead to an upcast of types (e.g., {byte, short} to int).
+   *
+   * @param origPath The path at which we expect one (or more) Parquet files.
+   * @return A list of attributes that make up the schema.
+   */
+  def readSchemaFromFile(origPath: Path): Seq[Attribute] = {
+    val keyValueMetadata: java.util.Map[String, String] =
+      readMetaData(origPath)
+        .getFileMetaData
+        .getKeyValueMetaData
+    if (keyValueMetadata.get(RowReadSupport.SPARK_METADATA_KEY) != null) {
+      convertFromString(keyValueMetadata.get(RowReadSupport.SPARK_METADATA_KEY))
+    } else {
+      val attributes = convertToAttributes(
+        readMetaData(origPath).getFileMetaData.getSchema)
+      log.warn(s"Falling back to schema conversion from Parquet types; result: $attributes")
+      attributes
+    }
+  }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala
@@ -111,10 +111,23 @@ class ParquetQuerySuite extends QueryTest with FunSuite with BeforeAndAfterAll {
   }
 
   test("Read/Write All Types") {
-    val data = AllDataTypes("a", 1, 1L, 1.toFloat, 1.toDouble, 1.toShort, 1.toByte, true)
     val tempDir = getTempFilePath("parquetTest").getCanonicalPath
-    sparkContext.parallelize(data :: Nil).saveAsParquetFile(tempDir)
-    assert(parquetFile(tempDir).collect().head === data)
+    val range = (0 to 255)
+    TestSQLContext.sparkContext.parallelize(range)
+      .map(x => AllDataTypes(s"$x", x, x.toLong, x.toFloat, x.toDouble, x.toShort, x.toByte, x % 2 == 0))
+      .saveAsParquetFile(tempDir)
+    val result = parquetFile(tempDir).collect()
+    range.foreach {
+      i =>
+        assert(result(i).getString(0) == s"$i", s"row $i String field did not match, got ${result(i).getString(0)}")
+        assert(result(i).getInt(1) === i)
+        assert(result(i).getLong(2) === i.toLong)
+        assert(result(i).getFloat(3) === i.toFloat)
+        assert(result(i).getDouble(4) === i.toDouble)
+        assert(result(i).getShort(5) === i.toShort)
+        assert(result(i).getByte(6) === i.toByte)
+        assert(result(i).getBoolean(7) === (i % 2 == 0))
+    }
   }
 
   test("self-join parquet files") {