122 changes: 122 additions & 0 deletions docs/sql-data-sources-parquet.md
@@ -454,6 +454,28 @@ Configuration of Parquet can be done using the `setConf` method on `SparkSession
</td>
<td>1.3.0</td>
</tr>
<tr>
<td><code>spark.sql.parquet.int96TimestampConversion</code></td>

val PARQUET_INT96_TIMESTAMP_CONVERSION = buildConf("spark.sql.parquet.int96TimestampConversion")
  .doc("This controls whether timestamp adjustments should be applied to INT96 data when " +
    "converting to timestamps, for data written by Impala. This is necessary because Impala " +
    "stores INT96 data with a different timezone offset than Hive & Spark.")
  .version("2.3.0")
  .booleanConf
  .createWithDefault(false)

<td>false</td>
<td>
This controls whether timestamp adjustments should be applied to INT96 data when
converting to timestamps, for data written by Impala. This is necessary because Impala
stores INT96 data with a different timezone offset than Hive & Spark.
</td>
<td>2.3.0</td>
</tr>
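A minimal usage sketch for the option above; the path and column name are made up, and `spark` is assumed to be an existing `SparkSession`:

```scala
// Hypothetical path and column; assumes an existing SparkSession named `spark`.
spark.conf.set("spark.sql.parquet.int96TimestampConversion", "true")

// INT96 timestamps written by Impala are now adjusted while being read back.
val events = spark.read.parquet("/warehouse/impala_events")
events.select("event_time").show(5, truncate = false)
```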
<tr>
<td><code>spark.sql.parquet.outputTimestampType</code></td>

val PARQUET_OUTPUT_TIMESTAMP_TYPE = buildConf("spark.sql.parquet.outputTimestampType")
  .doc("Sets which Parquet timestamp type to use when Spark writes data to Parquet files. " +
    "INT96 is a non-standard but commonly used timestamp type in Parquet. TIMESTAMP_MICROS " +
    "is a standard timestamp type in Parquet, which stores number of microseconds from the " +
    "Unix epoch. TIMESTAMP_MILLIS is also standard, but with millisecond precision, which " +
    "means Spark has to truncate the microsecond portion of its timestamp value.")
  .version("2.3.0")
  .stringConf
  .transform(_.toUpperCase(Locale.ROOT))
  .checkValues(ParquetOutputTimestampType.values.map(_.toString))
  .createWithDefault(ParquetOutputTimestampType.INT96.toString)

<td>INT96</td>
<td>
Sets which Parquet timestamp type to use when Spark writes data to Parquet files.
INT96 is a non-standard but commonly used timestamp type in Parquet. TIMESTAMP_MICROS
is a standard timestamp type in Parquet, which stores the number of microseconds from the
Unix epoch. TIMESTAMP_MILLIS is also standard, but with millisecond precision, which
means Spark has to truncate the microsecond portion of its timestamp value.
</td>
<td>2.3.0</td>
</tr>
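A sketch of switching the write-side timestamp encoding, assuming an existing `SparkSession` named `spark` and a throwaway output path:

```scala
// Assumes an existing SparkSession named `spark`; the output path is illustrative.
spark.conf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS")

// Files written after this point store timestamps as standard int64 microseconds
// rather than the default INT96.
spark.sql("SELECT current_timestamp() AS ts")
  .write.mode("overwrite").parquet("/tmp/ts_micros")
```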
<tr>
<td><code>spark.sql.parquet.compression.codec</code></td>
<td>snappy</td>
@@ -473,6 +495,17 @@ Configuration of Parquet can be done using the `setConf` method on `SparkSession
<td>Enables Parquet filter push-down optimization when set to true.</td>
<td>1.2.0</td>
</tr>
<tr>
<td><code>spark.sql.parquet.aggregatePushdown</code></td>

val PARQUET_AGGREGATE_PUSHDOWN_ENABLED = buildConf("spark.sql.parquet.aggregatePushdown")
  .doc("If true, aggregates will be pushed down to Parquet for optimization. Support MIN, MAX " +
    "and COUNT as aggregate expression. For MIN/MAX, support boolean, integer, float and date " +
    "type. For COUNT, support all data types. If statistics is missing from any Parquet file " +
    "footer, exception would be thrown.")
  .version("3.3.0")
  .booleanConf
  .createWithDefault(false)

<td>false</td>
<td>
If true, aggregates will be pushed down to Parquet for optimization. MIN, MAX and COUNT
are supported as aggregate expressions. For MIN/MAX, boolean, integer, float and date
types are supported. For COUNT, all data types are supported. If statistics are missing
from any Parquet file footer, an exception is thrown.
</td>
<td>3.3.0</td>
</tr>
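A hedged sketch of the kind of query this is meant to speed up; the path and column are made up, `spark` is an assumed `SparkSession`, and whether the aggregate is actually pushed down also depends on how the scan is planned in a given release:

```scala
import org.apache.spark.sql.functions.{count, max, min}

// Assumes an existing SparkSession named `spark`; path and column are illustrative.
spark.conf.set("spark.sql.parquet.aggregatePushdown", "true")

// MIN/MAX/COUNT over supported types can be answered from Parquet footer
// statistics instead of scanning every row.
spark.read.parquet("/data/sales")
  .agg(min("amount"), max("amount"), count("*"))
  .show()
```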
<tr>
<td><code>spark.sql.hive.convertMetastoreParquet</code></td>
<td>true</td>
@@ -493,6 +526,17 @@ Configuration of Parquet can be done using the `setConf` method on `SparkSession
</td>
<td>1.5.0</td>
</tr>
<tr>
<td><code>spark.sql.parquet.respectSummaryFiles</code></td>

val PARQUET_SCHEMA_RESPECT_SUMMARIES = buildConf("spark.sql.parquet.respectSummaryFiles")
  .doc("When true, we make assumption that all part-files of Parquet are consistent with " +
    "summary files and we will ignore them when merging schema. Otherwise, if this is " +
    "false, which is the default, we will merge all part-files. This should be considered " +
    "as expert-only option, and shouldn't be enabled before knowing what it means exactly.")
  .version("1.5.0")
  .booleanConf
  .createWithDefault(false)

<td>false</td>
<td>
When true, we assume that all Parquet part-files are consistent with the summary
files and ignore the part-files when merging the schema. Otherwise, if this is
false, which is the default, we merge all part-files. This should be considered
an expert-only option and shouldn't be enabled without knowing exactly what it means.
</td>
<td>1.5.0</td>
</tr>
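For context, a small sketch of where this expert-only flag matters, namely schema merging across many part-files; the path is hypothetical and `spark` is an assumed `SparkSession`:

```scala
// Assumes an existing SparkSession named `spark`; the path is hypothetical.
// Trust the _metadata / _common_metadata summary files and skip part-file
// footers while merging the schema.
spark.conf.set("spark.sql.parquet.respectSummaryFiles", "true")

spark.read
  .option("mergeSchema", "true")
  .parquet("/data/events_partitioned")
  .printSchema()
```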
<tr>
<td><code>spark.sql.parquet.writeLegacyFormat</code></td>
<td>false</td>
@@ -505,6 +549,84 @@ Configuration of Parquet can be done using the `setConf` method on `SparkSession
</td>
<td>1.6.0</td>
</tr>
<tr>
<td><code>spark.sql.parquet.enableVectorizedReader</code></td>

val PARQUET_VECTORIZED_READER_ENABLED =
  buildConf("spark.sql.parquet.enableVectorizedReader")
    .doc("Enables vectorized parquet decoding.")
    .version("2.0.0")
    .booleanConf
    .createWithDefault(true)

<td>true</td>
<td>
Enables vectorized Parquet decoding.
</td>
<td>2.0.0</td>
</tr>
<tr>
<td><code>spark.sql.parquet.enableNestedColumnVectorizedReader</code></td>

val PARQUET_VECTORIZED_READER_NESTED_COLUMN_ENABLED =
  buildConf("spark.sql.parquet.enableNestedColumnVectorizedReader")
    .doc("Enables vectorized Parquet decoding for nested columns (e.g., struct, list, map). " +
      s"Requires ${PARQUET_VECTORIZED_READER_ENABLED.key} to be enabled.")
    .version("3.3.0")
    .booleanConf
    .createWithDefault(true)

<td>true</td>
<td>
Enables vectorized Parquet decoding for nested columns (e.g., struct, list, map).
Requires <code>spark.sql.parquet.enableVectorizedReader</code> to be enabled.
</td>
<td>3.3.0</td>
</tr>
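A minimal sketch, assuming an existing `SparkSession` named `spark` and a hypothetical dataset with a struct column named `payload`:

```scala
// Assumes an existing SparkSession named `spark`; the path and `payload` struct column are made up.
// Both flags must be enabled for nested columns to take the vectorized path.
spark.conf.set("spark.sql.parquet.enableVectorizedReader", "true")
spark.conf.set("spark.sql.parquet.enableNestedColumnVectorizedReader", "true")

spark.read.parquet("/data/nested_events")
  .select("payload.*")
  .show(5)
```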
<tr>
<td><code>spark.sql.parquet.recordLevelFilter.enabled</code></td>

val PARQUET_RECORD_FILTER_ENABLED = buildConf("spark.sql.parquet.recordLevelFilter.enabled")
  .doc("If true, enables Parquet's native record-level filtering using the pushed down " +
    "filters. " +
    s"This configuration only has an effect when '${PARQUET_FILTER_PUSHDOWN_ENABLED.key}' " +
    "is enabled and the vectorized reader is not used. You can ensure the vectorized reader " +
    s"is not used by setting '${PARQUET_VECTORIZED_READER_ENABLED.key}' to false.")
  .version("2.3.0")
  .booleanConf
  .createWithDefault(false)

<td>false</td>
<td>
If true, enables Parquet's native record-level filtering using the pushed down filters.
This configuration only has an effect when <code>spark.sql.parquet.filterPushdown</code>
is enabled and the vectorized reader is not used. You can ensure the vectorized reader
is not used by setting <code>spark.sql.parquet.enableVectorizedReader</code> to false.
</td>
<td>2.3.0</td>
</tr>
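Since this option only kicks in alongside two other settings, here is a sketch that sets the three together; the path and predicate are made up and `spark` is an assumed `SparkSession`:

```scala
// Assumes an existing SparkSession named `spark`; path and predicate are illustrative.
// Record-level filtering applies only when pushdown is on and the vectorized reader is off.
spark.conf.set("spark.sql.parquet.filterPushdown", "true")
spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")
spark.conf.set("spark.sql.parquet.recordLevelFilter.enabled", "true")

spark.read.parquet("/data/logs").where("status = 500").count()
```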
<tr>
<td><code>spark.sql.parquet.columnarReaderBatchSize</code></td>

val PARQUET_VECTORIZED_READER_BATCH_SIZE = buildConf("spark.sql.parquet.columnarReaderBatchSize")
  .doc("The number of rows to include in a parquet vectorized reader batch. The number should " +
    "be carefully chosen to minimize overhead and avoid OOMs in reading data.")
  .version("2.4.0")
  .intConf
  .createWithDefault(4096)

<td>4096</td>
<td>
The number of rows to include in a parquet vectorized reader batch. The number should
be carefully chosen to minimize overhead and avoid OOMs when reading data.
</td>
<td>2.4.0</td>
</tr>
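A one-line tuning sketch, assuming an existing `SparkSession` named `spark`; 8192 is just an example value:

```scala
// Assumes an existing SparkSession named `spark`; 8192 is an arbitrary example value.
// Larger batches amortize per-batch overhead; smaller batches lower peak memory for wide rows.
spark.conf.set("spark.sql.parquet.columnarReaderBatchSize", "8192")
```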
<tr>
<td><code>spark.sql.parquet.fieldId.write.enabled</code></td>

val PARQUET_FIELD_ID_WRITE_ENABLED =
  buildConf("spark.sql.parquet.fieldId.write.enabled")
    .doc("Field ID is a native field of the Parquet schema spec. When enabled, " +
      "Parquet writers will populate the field Id " +
      "metadata (if present) in the Spark schema to the Parquet schema.")
    .version("3.3.0")
    .booleanConf
    .createWithDefault(true)

<td>true</td>
<td>
Field ID is a native field of the Parquet schema spec. When enabled,
Parquet writers will propagate the field ID metadata (if present) in the Spark schema to the Parquet schema.
</td>
<td>3.3.0</td>
</tr>
<tr>
<td><code>spark.sql.parquet.fieldId.read.enabled</code></td>

val PARQUET_FIELD_ID_READ_ENABLED =
  buildConf("spark.sql.parquet.fieldId.read.enabled")
    .doc("Field ID is a native field of the Parquet schema spec. When enabled, Parquet readers " +
      "will use field IDs (if present) in the requested Spark schema to look up Parquet " +
      "fields instead of using column names")
    .version("3.3.0")
    .booleanConf
    .createWithDefault(false)

<td>false</td>
<td>
Field ID is a native field of the Parquet schema spec. When enabled, Parquet readers
will use field IDs (if present) in the requested Spark schema to look up Parquet
fields instead of using column names.
</td>
<td>3.3.0</td>
</tr>
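A sketch that exercises both the write flag above and this read flag; `spark` is an assumed `SparkSession`, the path, column names and ID values are made up, and the field IDs are attached through `StructField` metadata under the `parquet.field.id` key:

```scala
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

// Assumes an existing SparkSession named `spark`; path, columns and IDs are illustrative.
// Field IDs travel in StructField metadata under the "parquet.field.id" key.
val schema = StructType(Seq(
  StructField("id", LongType, nullable = false,
    new MetadataBuilder().putLong("parquet.field.id", 1L).build()),
  StructField("name", StringType, nullable = true,
    new MetadataBuilder().putLong("parquet.field.id", 2L).build())))
val rows = java.util.Arrays.asList(Row(1L, "alice"), Row(2L, "bob"))

// Write the field IDs into the Parquet schema.
spark.conf.set("spark.sql.parquet.fieldId.write.enabled", "true")
spark.createDataFrame(rows, schema).write.mode("overwrite").parquet("/tmp/with_field_ids")

// Read back, matching columns by field ID from the requested schema rather than by name.
spark.conf.set("spark.sql.parquet.fieldId.read.enabled", "true")
spark.read.schema(schema).parquet("/tmp/with_field_ids").show()
```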
<tr>
<td><code>spark.sql.parquet.fieldId.read.ignoreMissing</code></td>

val IGNORE_MISSING_PARQUET_FIELD_ID =
  buildConf("spark.sql.parquet.fieldId.read.ignoreMissing")
    .doc("When the Parquet file doesn't have any field IDs but the " +
      "Spark read schema is using field IDs to read, we will silently return nulls " +
      "when this flag is enabled, or error otherwise.")
    .version("3.3.0")
    .booleanConf
    .createWithDefault(false)

<td>false</td>
<td>
When the Parquet file doesn't have any field IDs but the
Spark read schema uses field IDs to read, Spark will silently return nulls
when this flag is enabled, or throw an error otherwise.
</td>
<td>3.3.0</td>
</tr>
<tr>
<td><code>spark.sql.parquet.timestampNTZ.enabled</code></td>

val PARQUET_TIMESTAMP_NTZ_ENABLED =
  buildConf("spark.sql.parquet.timestampNTZ.enabled")
    .doc(s"Enables ${TimestampTypes.TIMESTAMP_NTZ} support for Parquet reads and writes. " +
      s"When enabled, ${TimestampTypes.TIMESTAMP_NTZ} values are written as Parquet timestamp " +
      "columns with annotation isAdjustedToUTC = false and are inferred in a similar way. " +
      s"When disabled, such values are read as ${TimestampTypes.TIMESTAMP_LTZ} and have to be " +
      s"converted to ${TimestampTypes.TIMESTAMP_LTZ} for writes.")
    .version("3.4.0")
    .booleanConf
    .createWithDefault(true)

<td>true</td>
<td>
Enables <code>TIMESTAMP_NTZ</code> support for Parquet reads and writes.
When enabled, <code>TIMESTAMP_NTZ</code> values are written as Parquet timestamp
columns with annotation isAdjustedToUTC = false and are inferred in a similar way.
When disabled, such values are read as <code>TIMESTAMP_LTZ</code> and have to be
converted to <code>TIMESTAMP_LTZ</code> for writes.
</td>
<td>3.4.0</td>
</tr>
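A round-trip sketch, assuming Spark 3.4+, an existing `SparkSession` named `spark`, and a throwaway path:

```scala
// Assumes an existing SparkSession named `spark` (Spark 3.4+); the path is illustrative.
spark.conf.set("spark.sql.parquet.timestampNTZ.enabled", "true")

// TIMESTAMP_NTZ round-trips as a Parquet timestamp column with isAdjustedToUTC = false.
spark.sql("SELECT TIMESTAMP_NTZ '2024-01-01 12:00:00' AS ts_ntz")
  .write.mode("overwrite").parquet("/tmp/ts_ntz")
spark.read.parquet("/tmp/ts_ntz").printSchema()  // ts_ntz: timestamp_ntz
```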
<tr>
<td>spark.sql.parquet.datetimeRebaseModeInRead</td>
<td><code>EXCEPTION</code></td>