Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
741bbf6
bump `tonic` to 0.12 and `prost` to 0.13 for `arrow-flight` (#6041)
BugenZhao Jul 16, 2024
8f76248
Remove `impl<T: AsRef<[u8]>> From<T> for Buffer` that easily acciden…
XiangpengHao Jul 16, 2024
bb5f12b
Make display of interval types more pretty (#6006)
Rachelint Jul 16, 2024
756b1fb
Update snafu (#5930)
Jesse-Bakker Jul 16, 2024
fe04e09
Update Parquet thrift generated structures (#6045)
etseidl Jul 16, 2024
2e7f7ef
Revert "Revert "Write Bloom filters between row groups instead of the…
alamb Jul 16, 2024
effccc1
Revert "Update snafu (#5930)" (#6069)
alamb Jul 16, 2024
649d09d
Update pyo3 requirement from 0.21.1 to 0.22.1 (fixed) (#6075)
crepererum Jul 17, 2024
05e681d
remove repeated codes to make the codes more concise. (#6080)
Rachelint Jul 18, 2024
e40b311
Add `unencoded_byte_array_data_bytes` to `ParquetMetaData` (#6068)
etseidl Jul 19, 2024
81c34ac
Update pyo3 requirement from 0.21.1 to 0.22.2 (#6085)
dependabot[bot] Jul 23, 2024
3bc9987
Deprecate read_page_locations() and simplify offset index in `Parquet…
etseidl Jul 23, 2024
20e11ec
no longer write inline column metadata
etseidl Jul 25, 2024
095130f
Merge remote-tracking branch 'apache/master' into 53.0.0-dev
alamb Jul 25, 2024
a6353d1
Update parquet/src/column/writer/mod.rs
alamb Jul 25, 2024
d122b1f
Merge remote-tracking branch 'origin/53.0.0-dev' into no_column_meta
etseidl Jul 25, 2024
957499d
suggestion from review
etseidl Jul 26, 2024
a033e43
add some more documentation
etseidl Jul 26, 2024
571ce65
Merge remote-tracking branch 'origin/master' into no_column_meta
etseidl Jul 26, 2024
07f9a1d
Merge remote-tracking branch 'origin/master' into no_column_meta
etseidl Jul 26, 2024
444b14f
remove write_metadata from PageWriter
etseidl Jul 28, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion parquet/src/column/writer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
num_page_nulls: 0,
},
column_metrics: ColumnMetrics::<E::T>::new(),
column_index_builder: ColumnIndexBuilder::new(),
column_index_builder,
offset_index_builder: OffsetIndexBuilder::new(),
encodings,
data_page_boundary_ascending: true,
Expand Down
33 changes: 28 additions & 5 deletions parquet/src/file/metadata/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,14 +72,16 @@ use crate::schema::types::{
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetColumnIndex = Vec<Vec<Index>>;

/// [`OffsetIndexMetaData`] for each row group of each column.
/// [`OffsetIndexMetaData`] for each data page of each row group of each column
///
/// This structure is the parsed representation of the [`OffsetIndex`] from the
/// Parquet file footer, as described in the Parquet [PageIndex documentation].
///
/// `offset_index[row_group_number][column_number]` holds
/// the [`OffsetIndexMetaData`] corresponding to column
/// `column_number` of row group `row_group_number`.
///
/// For example `offset_index[2][3]` holds the [`OffsetIndexMetaData`] for
/// the fourth column in the third row group of the parquet file.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;

/// Parsed metadata for a single Parquet file
Expand Down Expand Up @@ -1374,8 +1376,29 @@ mod tests {
schema_descr.clone(),
column_orders,
);
let parquet_meta = ParquetMetaData::new(file_metadata.clone(), row_group_meta.clone());
let base_expected_size = 1352;

// Now, add in Exact Statistics
let columns_with_stats = schema_descr
.columns()
.iter()
.map(|column_descr| {
ColumnChunkMetaData::builder(column_descr.clone())
.set_statistics(Statistics::new::<i32>(Some(0), Some(100), None, 0, false))
.build()
})
.collect::<Result<Vec<_>>>()
.unwrap();

let row_group_meta_with_stats = RowGroupMetaData::builder(schema_descr)
.set_num_rows(1000)
.set_column_metadata(columns_with_stats)
.build()
.unwrap();
let row_group_meta_with_stats = vec![row_group_meta_with_stats];

let parquet_meta = ParquetMetaData::new(file_metadata.clone(), row_group_meta_with_stats);
let base_expected_size = 2088;

assert_eq!(parquet_meta.memory_size(), base_expected_size);

let mut column_index = ColumnIndexBuilder::new();
Expand Down