Skip to content
Closed
Prev Previous commit
Next Next commit
Addresses comments
  • Loading branch information
liancheng committed Oct 8, 2015
commit 2bc5ebcf8c473817af68b056f772eb77a48acb07
5 changes: 2 additions & 3 deletions sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
Original file line number Diff line number Diff line change
Expand Up @@ -292,10 +292,9 @@ private[spark] object SQLConf {

val PARQUET_WRITE_LEGACY_FORMAT = booleanConf(
key = "spark.sql.parquet.writeLegacyFormat",
defaultValue = Some(true),
defaultValue = Some(false),
doc = "Whether to follow Parquet's format specification when converting Parquet schema to " +
"Spark SQL schema and vice versa.",
isPublic = false)
"Spark SQL schema and vice versa.")

val PARQUET_OUTPUT_COMMITTER_CLASS = stringConf(
key = "spark.sql.parquet.output.committer.class",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ import org.apache.spark.sql.{AnalysisException, SQLConf}
* [[StructType]]. Note that Spark SQL [[TimestampType]] is similar to Hive timestamp, which
* has optional nanosecond precision, but different from `TIME_MILLIS` and `TIMESTAMP_MILLIS`
* described in Parquet format spec. This argument only affects Parquet read path.
* @param writeLegacyParquetFormat Whether to use legacy Parquet format compatible with Spark 1.5
* @param writeLegacyParquetFormat Whether to use legacy Parquet format compatible with Spark 1.4
* and prior versions when converting a Catalyst [[StructType]] to a Parquet [[MessageType]].
* When set to false, use standard format defined in parquet-format spec. This argument only
* affects Parquet write path.
Expand Down Expand Up @@ -356,7 +356,7 @@ private[parquet] class CatalystSchemaConverter(
// `TIMESTAMP_MICROS` which are both logical types annotating `INT64`.
//
// Originally, Spark SQL uses the same nanosecond timestamp type as Impala and Hive. Starting
// from Spark 1.5.0, we resort to a timestamp type with 100 ns precision so that we can store
// from Spark 1.5.0, we resort to a timestamp type with 100 ns precision so that we can store
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be 1.5

// a timestamp into a `Long`. This design decision is subject to change though, for example,
// we may resort to microsecond precision in the future.
//
Expand All @@ -375,7 +375,7 @@ private[parquet] class CatalystSchemaConverter(
// Decimals (legacy mode)
// ======================

// Spark 1.5.x and prior versions only support decimals with a maximum precision of 18 and
// Spark 1.4.x and prior versions only support decimals with a maximum precision of 18 and
// always store decimals in fixed-length byte arrays. To keep compatibility with these older
// versions, here we convert decimals with all precisions to `FIXED_LEN_BYTE_ARRAY` annotated
// by `DECIMAL`.
Expand Down Expand Up @@ -426,7 +426,7 @@ private[parquet] class CatalystSchemaConverter(
// ArrayType and MapType (legacy mode)
// ===================================

// Spark 1.5.x and prior versions convert `ArrayType` with nullable elements into a 3-level
// Spark 1.4.x and prior versions convert `ArrayType` with nullable elements into a 3-level
// `LIST` structure. This behavior is somewhat a hybrid of parquet-hive and parquet-avro
// (1.6.0rc3): the 3-level structure is similar to parquet-hive while the 3rd level element
// field name "array" is borrowed from parquet-avro.
Expand All @@ -445,7 +445,7 @@ private[parquet] class CatalystSchemaConverter(
.addField(convertField(StructField("array", elementType, nullable)))
.named("bag"))

// Spark 1.5.x and prior versions convert ArrayType with non-nullable elements into a 2-level
// Spark 1.4.x and prior versions convert ArrayType with non-nullable elements into a 2-level
// LIST structure. This behavior mimics parquet-avro (1.6.0rc3). Note that this case is
// covered by the backwards-compatibility rules implemented in `isElementType()`.
case ArrayType(elementType, nullable @ false) if writeLegacyParquetFormat =>
Expand All @@ -458,7 +458,7 @@ private[parquet] class CatalystSchemaConverter(
// "array" is the name chosen by parquet-avro (1.7.0 and prior versions)
convertField(StructField("array", elementType, nullable), REPEATED))

// Spark 1.5.x and prior versions convert MapType into a 3-level group annotated by
// Spark 1.4.x and prior versions convert MapType into a 3-level group annotated by
// MAP_KEY_VALUE. This is covered by `convertGroupField(field: GroupType): DataType`.
case MapType(keyType, valueType, valueContainsNull) if writeLegacyParquetFormat =>
// <map-repetition> group <name> (MAP) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ import org.apache.spark.sql.types._
* messages. This class can write Parquet data in two modes:
*
* - Standard mode: Parquet data are written in standard format defined in parquet-format spec.
* - Legacy mode: Parquet data are written in legacy format compatible with Spark 1.5 and prior.
* - Legacy mode: Parquet data are written in legacy format compatible with Spark 1.4 and prior.
*
* This behavior can be controlled by SQL option `spark.sql.parquet.writeLegacyFormat`. The
* value of the option is propagated to this class by the `init()` method and its Hadoop
Expand All @@ -63,7 +63,7 @@ private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] wi
// The Parquet `RecordConsumer` to which all `InternalRow`s are written
private var recordConsumer: RecordConsumer = _

// Whether to write data in legacy Parquet format compatible with Spark 1.5 and prior versions
// Whether to write data in legacy Parquet format compatible with Spark 1.4 and prior versions
private var writeLegacyParquetFormat: Boolean = _

// Reusable byte array used to write timestamps as Parquet INT96 values
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -665,7 +665,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
writeLegacyParquetFormat = false)

testCatalystToParquet(
"Backwards-compatibility: LIST with nullable element type - 2 - prior to 1.5.x",
"Backwards-compatibility: LIST with nullable element type - 2 - prior to 1.4.x",
StructType(Seq(
StructField(
"f1",
Expand Down Expand Up @@ -703,7 +703,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
writeLegacyParquetFormat = false)

testCatalystToParquet(
"Backwards-compatibility: LIST with non-nullable element type - 2 - prior to 1.5.x",
"Backwards-compatibility: LIST with non-nullable element type - 2 - prior to 1.4.x",
StructType(Seq(
StructField(
"f1",
Expand Down Expand Up @@ -764,7 +764,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
writeLegacyParquetFormat = true)

testParquetToCatalyst(
"Backwards-compatibility: MAP with non-nullable value type - 3 - prior to 1.5.x",
"Backwards-compatibility: MAP with non-nullable value type - 3 - prior to 1.4.x",
StructType(Seq(
StructField(
"f1",
Expand Down Expand Up @@ -868,7 +868,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
writeLegacyParquetFormat = false)

testCatalystToParquet(
"Backwards-compatibility: MAP with non-nullable value type - 2 - prior to 1.5.x",
"Backwards-compatibility: MAP with non-nullable value type - 2 - prior to 1.4.x",
StructType(Seq(
StructField(
"f1",
Expand Down Expand Up @@ -908,7 +908,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
writeLegacyParquetFormat = false)

testCatalystToParquet(
"Backwards-compatibility: MAP with nullable value type - 3 - prior to 1.5.x",
"Backwards-compatibility: MAP with nullable value type - 3 - prior to 1.4.x",
StructType(Seq(
StructField(
"f1",
Expand Down Expand Up @@ -987,7 +987,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
writeLegacyParquetFormat = false)

testSchema(
"DECIMAL(1, 0) - prior to 1.5.x",
"DECIMAL(1, 0) - prior to 1.4.x",
StructType(Seq(StructField("f1", DecimalType(1, 0)))),
"""message root {
| optional fixed_len_byte_array(1) f1 (DECIMAL(1, 0));
Expand All @@ -998,7 +998,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
writeLegacyParquetFormat = true)

testSchema(
"DECIMAL(8, 3) - prior to 1.5.x",
"DECIMAL(8, 3) - prior to 1.4.x",
StructType(Seq(StructField("f1", DecimalType(8, 3)))),
"""message root {
| optional fixed_len_byte_array(4) f1 (DECIMAL(8, 3));
Expand All @@ -1009,7 +1009,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
writeLegacyParquetFormat = true)

testSchema(
"DECIMAL(9, 3) - prior to 1.5.x",
"DECIMAL(9, 3) - prior to 1.4.x",
StructType(Seq(StructField("f1", DecimalType(9, 3)))),
"""message root {
| optional fixed_len_byte_array(5) f1 (DECIMAL(9, 3));
Expand All @@ -1020,7 +1020,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
writeLegacyParquetFormat = true)

testSchema(
"DECIMAL(18, 3) - prior to 1.5.x",
"DECIMAL(18, 3) - prior to 1.4.x",
StructType(Seq(StructField("f1", DecimalType(18, 3)))),
"""message root {
| optional fixed_len_byte_array(8) f1 (DECIMAL(18, 3));
Expand Down