Addresses comments

apache · liancheng · Oct 5, 2015 · Oct 6, 2015 · Oct 6, 2015 · Oct 6, 2015
commit 2bc5ebcf8c473817af68b056f772eb77a48acb07
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -292,10 +292,9 @@ private[spark] object SQLConf {
 
   val PARQUET_WRITE_LEGACY_FORMAT = booleanConf(
     key = "spark.sql.parquet.writeLegacyFormat",
-    defaultValue = Some(true),
+    defaultValue = Some(false),
     doc = "Whether to follow Parquet's format specification when converting Parquet schema to " +
-      "Spark SQL schema and vice versa.",
-    isPublic = false)
+      "Spark SQL schema and vice versa.")
 
   val PARQUET_OUTPUT_COMMITTER_CLASS = stringConf(
     key = "spark.sql.parquet.output.committer.class",

diff --git a/...in/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala b/...in/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala
@@ -47,7 +47,7 @@ import org.apache.spark.sql.{AnalysisException, SQLConf}
  *        [[StructType]].  Note that Spark SQL [[TimestampType]] is similar to Hive timestamp, which
  *        has optional nanosecond precision, but different from `TIME_MILLIS` and `TIMESTAMP_MILLIS`
  *        described in Parquet format spec.  This argument only affects Parquet read path.
- * @param writeLegacyParquetFormat Whether to use legacy Parquet format compatible with Spark 1.5
+ * @param writeLegacyParquetFormat Whether to use legacy Parquet format compatible with Spark 1.4
  *        and prior versions when converting a Catalyst [[StructType]] to a Parquet [[MessageType]].
  *        When set to false, use standard format defined in parquet-format spec.  This argument only
  *        affects Parquet write path.
@@ -356,7 +356,7 @@ private[parquet] class CatalystSchemaConverter(
       // `TIMESTAMP_MICROS` which are both logical types annotating `INT64`.
       //
       // Originally, Spark SQL uses the same nanosecond timestamp type as Impala and Hive.  Starting
-      // from Spark 1.5.0, we resort to a timestamp type with 100 ns precision so that we can store
+      // from Spark 1.4.0, we resort to a timestamp type with 100 ns precision so that we can store
       // a timestamp into a `Long`.  This design decision is subject to change though, for example,
       // we may resort to microsecond precision in the future.
       //
@@ -375,7 +375,7 @@ private[parquet] class CatalystSchemaConverter(
       // Decimals (legacy mode)
       // ======================
 
-      // Spark 1.5.x and prior versions only support decimals with a maximum precision of 18 and
+      // Spark 1.4.x and prior versions only support decimals with a maximum precision of 18 and
       // always store decimals in fixed-length byte arrays.  To keep compatibility with these older
       // versions, here we convert decimals with all precisions to `FIXED_LEN_BYTE_ARRAY` annotated
       // by `DECIMAL`.
@@ -426,7 +426,7 @@ private[parquet] class CatalystSchemaConverter(
       // ArrayType and MapType (legacy mode)
       // ===================================
 
-      // Spark 1.5.x and prior versions convert `ArrayType` with nullable elements into a 3-level
+      // Spark 1.4.x and prior versions convert `ArrayType` with nullable elements into a 3-level
       // `LIST` structure.  This behavior is somewhat a hybrid of parquet-hive and parquet-avro
       // (1.6.0rc3): the 3-level structure is similar to parquet-hive while the 3rd level element
       // field name "array" is borrowed from parquet-avro.
@@ -445,7 +445,7 @@ private[parquet] class CatalystSchemaConverter(
             .addField(convertField(StructField("array", elementType, nullable)))
             .named("bag"))
 
-      // Spark 1.5.x and prior versions convert ArrayType with non-nullable elements into a 2-level
+      // Spark 1.4.x and prior versions convert ArrayType with non-nullable elements into a 2-level
       // LIST structure.  This behavior mimics parquet-avro (1.6.0rc3).  Note that this case is
       // covered by the backwards-compatibility rules implemented in `isElementType()`.
       case ArrayType(elementType, nullable @ false) if writeLegacyParquetFormat =>
@@ -458,7 +458,7 @@ private[parquet] class CatalystSchemaConverter(
           // "array" is the name chosen by parquet-avro (1.7.0 and prior versions)
           convertField(StructField("array", elementType, nullable), REPEATED))
 
-      // Spark 1.5.x and prior versions convert MapType into a 3-level group annotated by
+      // Spark 1.4.x and prior versions convert MapType into a 3-level group annotated by
       // MAP_KEY_VALUE.  This is covered by `convertGroupField(field: GroupType): DataType`.
       case MapType(keyType, valueType, valueContainsNull) if writeLegacyParquetFormat =>
         // <map-repetition> group <name> (MAP) {

diff --git a/.../main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystWriteSupport.scala b/.../main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystWriteSupport.scala
@@ -42,7 +42,7 @@ import org.apache.spark.sql.types._
  * messages.  This class can write Parquet data in two modes:
  *
  *  - Standard mode: Parquet data are written in standard format defined in parquet-format spec.
- *  - Legacy mode: Parquet data are written in legacy format compatible with Spark 1.5 and prior.
+ *  - Legacy mode: Parquet data are written in legacy format compatible with Spark 1.4 and prior.
  *
  * This behavior can be controlled by SQL option `spark.sql.parquet.writeLegacyFormat`.  The
  * value of the option is propagated to this class by the `init()` method and its Hadoop
@@ -63,7 +63,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi
   // The Parquet `RecordConsumer` to which all `InternalRow`s are written
   private var recordConsumer: RecordConsumer = _
 
-  // Whether to write data in legacy Parquet format compatible with Spark 1.5 and prior versions
+  // Whether to write data in legacy Parquet format compatible with Spark 1.4 and prior versions
   private var writeLegacyParquetFormat: Boolean = _
 
   // Reusable byte array used to write timestamps as Parquet INT96 values

diff --git a/...rc/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala b/...rc/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala
@@ -665,7 +665,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
     writeLegacyParquetFormat = false)
 
   testCatalystToParquet(
-    "Backwards-compatibility: LIST with nullable element type - 2 - prior to 1.5.x",
+    "Backwards-compatibility: LIST with nullable element type - 2 - prior to 1.4.x",
     StructType(Seq(
       StructField(
         "f1",
@@ -703,7 +703,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
     writeLegacyParquetFormat = false)
 
   testCatalystToParquet(
-    "Backwards-compatibility: LIST with non-nullable element type - 2 - prior to 1.5.x",
+    "Backwards-compatibility: LIST with non-nullable element type - 2 - prior to 1.4.x",
     StructType(Seq(
       StructField(
         "f1",
@@ -764,7 +764,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
     writeLegacyParquetFormat = true)
 
   testParquetToCatalyst(
-    "Backwards-compatibility: MAP with non-nullable value type - 3 - prior to 1.5.x",
+    "Backwards-compatibility: MAP with non-nullable value type - 3 - prior to 1.4.x",
     StructType(Seq(
       StructField(
         "f1",
@@ -868,7 +868,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
     writeLegacyParquetFormat = false)
 
   testCatalystToParquet(
-    "Backwards-compatibility: MAP with non-nullable value type - 2 - prior to 1.5.x",
+    "Backwards-compatibility: MAP with non-nullable value type - 2 - prior to 1.4.x",
     StructType(Seq(
       StructField(
         "f1",
@@ -908,7 +908,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
     writeLegacyParquetFormat = false)
 
   testCatalystToParquet(
-    "Backwards-compatibility: MAP with nullable value type - 3 - prior to 1.5.x",
+    "Backwards-compatibility: MAP with nullable value type - 3 - prior to 1.4.x",
     StructType(Seq(
       StructField(
         "f1",
@@ -987,7 +987,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
     writeLegacyParquetFormat = false)
 
   testSchema(
-    "DECIMAL(1, 0) - prior to 1.5.x",
+    "DECIMAL(1, 0) - prior to 1.4.x",
     StructType(Seq(StructField("f1", DecimalType(1, 0)))),
     """message root {
       |  optional fixed_len_byte_array(1) f1 (DECIMAL(1, 0));
@@ -998,7 +998,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
     writeLegacyParquetFormat = true)
 
   testSchema(
-    "DECIMAL(8, 3) - prior to 1.5.x",
+    "DECIMAL(8, 3) - prior to 1.4.x",
     StructType(Seq(StructField("f1", DecimalType(8, 3)))),
     """message root {
       |  optional fixed_len_byte_array(4) f1 (DECIMAL(8, 3));
@@ -1009,7 +1009,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
     writeLegacyParquetFormat = true)
 
   testSchema(
-    "DECIMAL(9, 3) - prior to 1.5.x",
+    "DECIMAL(9, 3) - prior to 1.4.x",
     StructType(Seq(StructField("f1", DecimalType(9, 3)))),
     """message root {
       |  optional fixed_len_byte_array(5) f1 (DECIMAL(9, 3));
@@ -1020,7 +1020,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
     writeLegacyParquetFormat = true)
 
   testSchema(
-    "DECIMAL(18, 3) - prior to 1.5.x",
+    "DECIMAL(18, 3) - prior to 1.4.x",
     StructType(Seq(StructField("f1", DecimalType(18, 3)))),
     """message root {
       |  optional fixed_len_byte_array(8) f1 (DECIMAL(18, 3));