Closed. Changes from 1 commit (54 commits in this pull request).
a7e41c3
regen thrift with size statistics added
etseidl Feb 7, 2024
788eef3
first cut at adding page size statistics
etseidl Feb 9, 2024
6296ada
add new stats to chunk metadata test
etseidl Feb 16, 2024
84f3d7a
Merge branch 'apache:master' into size_stats
etseidl Mar 8, 2024
0da05a8
fix escapes
etseidl Mar 12, 2024
7301aeb
Merge remote-tracking branch 'origin/master' into size_stats
etseidl Mar 12, 2024
6e5fece
format
etseidl Mar 12, 2024
457eb4a
formatting
etseidl Mar 12, 2024
18a5732
add escapes
etseidl Mar 12, 2024
658512e
Merge remote-tracking branch 'origin/master' into size_stats
etseidl Mar 12, 2024
81c2b2e
Merge remote-tracking branch 'origin/master' into size_stats
etseidl Apr 29, 2024
29dde50
Merge branch 'size_stats' of github.com:etseidl/arrow-rs into size_stats
etseidl Jun 27, 2024
84f8512
Merge remote-tracking branch 'origin/master' into size_stats
etseidl Jun 27, 2024
9635e5e
add test of SizeStatistics.unencoded_byte_array_data_bytes
etseidl Jun 27, 2024
c5c07b6
test def histogram as well, rename test
etseidl Jun 27, 2024
6dd160f
add an assert
etseidl Jun 27, 2024
917b412
refactor and add test of def histogram with nulls
etseidl Jun 27, 2024
f8961a3
add test of repetition level histogram
etseidl Jun 28, 2024
73fa099
revert changes to test_roundtrip
etseidl Jun 28, 2024
00ca596
suggestion from review
etseidl Jul 1, 2024
6acc500
add to documentation as suggested in review
etseidl Jul 1, 2024
787e3e8
make histograms optional
etseidl Jul 2, 2024
46851f4
add histograms to PageIndex
etseidl Jul 2, 2024
4f8487b
use Vec::push()
etseidl Jul 2, 2024
903b06b
formatting
etseidl Jul 2, 2024
fa89836
check size stats in read metadata
etseidl Jul 2, 2024
2800cc7
check unencoded_byte_array_data_bytes is not set for int cols
etseidl Jul 2, 2024
95a0535
rewrite test_byte_array_size_statistics() to not use test_roundtrip()
etseidl Jul 2, 2024
fc66a59
add unencoded_byte_array_data_bytes support in page index
etseidl Jul 2, 2024
542570f
Merge remote-tracking branch 'origin/master' into size_stats
etseidl Jul 2, 2024
7be97e5
update expected sizes to account for new stats
etseidl Jul 2, 2024
f5ab47b
only write SizeStatistics in ColumnMetaData if statistics are enabled
etseidl Jul 3, 2024
a008e9e
add a little documentation
etseidl Jul 5, 2024
87ccec2
add ParquetOffsetIndex to avoid double read of OffsetIndex
etseidl Jul 5, 2024
3eead30
cleanup
etseidl Jul 5, 2024
ddf40c3
use less verbose update of variable_length_bytes
etseidl Jul 5, 2024
0ebb72f
add some documentation
etseidl Jul 6, 2024
393aea1
update to latest thrift (as of 11 Jul 2024) from parquet-format
etseidl Jul 11, 2024
1c12fb8
pass None for optional size statistics
etseidl Jul 11, 2024
53cd5fa
escape HTML tags
etseidl Jul 11, 2024
45f25a8
Merge remote-tracking branch 'origin/master' into size_stats
etseidl Jul 11, 2024
98025cc
don't need to escape brackets in arrays
etseidl Jul 11, 2024
7b59246
Merge remote-tracking branch 'github/update_parquet_thrift' into size…
etseidl Jul 11, 2024
65096dd
use consistent naming
etseidl Jul 11, 2024
08065ad
suggested doc changes
etseidl Jul 11, 2024
1cbd4b7
more suggested doc changes
etseidl Jul 11, 2024
dce3513
use more asserts in tests
etseidl Jul 11, 2024
f661839
move histogram logic into PageMetrics and ColumnMetrics
etseidl Jul 12, 2024
818a614
refactor some to reduce code duplication, finish docs
etseidl Jul 12, 2024
c391dec
account for new size statistics in heap size calculations
etseidl Jul 12, 2024
4816a95
add histogram examples to docs
etseidl Jul 12, 2024
e2faf2d
Merge remote-tracking branch 'origin/master' into size_stats
etseidl Jul 12, 2024
d92ae20
add some fixmes
etseidl Jul 14, 2024
69dd652
leave not to self
etseidl Jul 15, 2024
refactor and add test of def histogram with nulls
etseidl committed Jun 27, 2024
commit 917b412a704910f79306663de547ab7f973957c9
117 changes: 71 additions & 46 deletions parquet/src/file/writer.rs
@@ -1398,7 +1398,7 @@ mod tests {
         file: W,
         data: Vec<Vec<i32>>,
         compression: Compression,
-        repetition: Repetition,
+        def_levels: Option<&[i16]>,
     ) -> crate::format::FileMetaData
     where
         W: Write + Send,
@@ -1409,7 +1409,7 @@
             data,
             |r| r.get_int(0).unwrap(),
             compression,
-            repetition,
+            def_levels,
         )
     }

@@ -1420,14 +1420,18 @@
         data: Vec<Vec<D::T>>,
         value: F,
         compression: Compression,
-        repetition: Repetition,
+        def_levels: Option<&[i16]>,
     ) -> crate::format::FileMetaData
     where
         W: Write + Send,
         R: ChunkReader + From<W> + 'static,
         D: DataType,
         F: Fn(Row) -> D::T,
     {
+        let repetition = match def_levels {
+            Some(_) => Repetition::OPTIONAL,
+            _ => Repetition::REQUIRED,
+        };
         let schema = Arc::new(
             types::Type::group_type_builder("schema")
                 .with_fields(vec![Arc::new(
@@ -1451,15 +1455,9 @@
         let row_group_file_offset = file_writer.buf.bytes_written();
         let mut row_group_writer = file_writer.next_row_group().unwrap();
         if let Some(mut writer) = row_group_writer.next_column().unwrap() {
-            let def_vec = vec![1; subset.len()];
-            let def_lvls = if repetition != Repetition::REQUIRED {
-                Some(def_vec.as_slice())
-            } else {
-                None
-            };
             rows += writer
                 .typed::<D>()
-                .write_batch(&subset[..], def_lvls, None)
+                .write_batch(&subset[..], def_levels, None)
                 .unwrap() as i64;
             writer.close().unwrap();
         }
@@ -1474,37 +1472,36 @@

         let reader = SerializedFileReader::new(R::from(file)).unwrap();
         assert_eq!(reader.num_row_groups(), data.len());
-        assert_eq!(
-            reader.metadata().file_metadata().num_rows(),
-            rows,
-            "row count in metadata not equal to number of rows written"
-        );
-        for (i, item) in data.iter().enumerate().take(reader.num_row_groups()) {
-            let row_group_reader = reader.get_row_group(i).unwrap();
-            let iter = row_group_reader.get_row_iter(None).unwrap();
-            let res: Vec<_> = iter.map(|row| row.unwrap()).map(&value).collect();
-            let row_group_size = row_group_reader.metadata().total_byte_size();
-            let uncompressed_size: i64 = row_group_reader
-                .metadata()
-                .columns()
-                .iter()
-                .map(|v| v.uncompressed_size())
-                .sum();
-            assert_eq!(row_group_size, uncompressed_size);
-            assert_eq!(res, *item);
+        // Row based API does not like nulls, so skip these validation steps if nulls might
+        // be present.
+        if repetition == Repetition::REQUIRED {
+            assert_eq!(
+                reader.metadata().file_metadata().num_rows(),
+                rows,
+                "row count in metadata not equal to number of rows written"
+            );
+            for (i, item) in data.iter().enumerate().take(reader.num_row_groups()) {
+                let row_group_reader = reader.get_row_group(i).unwrap();
+                let iter = row_group_reader.get_row_iter(None).unwrap();
+                let res: Vec<_> = iter.map(|row| row.unwrap()).map(&value).collect();
+                let row_group_size = row_group_reader.metadata().total_byte_size();
+                let uncompressed_size: i64 = row_group_reader
+                    .metadata()
+                    .columns()
+                    .iter()
+                    .map(|v| v.uncompressed_size())
+                    .sum();
+                assert_eq!(row_group_size, uncompressed_size);
+                assert_eq!(res, *item);
+            }
         }
         file_metadata
     }

     /// File write-read roundtrip.
     /// `data` consists of arrays of values for each row group.
     fn test_file_roundtrip(file: File, data: Vec<Vec<i32>>) -> crate::format::FileMetaData {
-        test_roundtrip_i32::<File, File>(
-            file,
-            data,
-            Compression::UNCOMPRESSED,
-            Repetition::REQUIRED,
-        )
+        test_roundtrip_i32::<File, File>(file, data, Compression::UNCOMPRESSED, None)
     }

     #[test]
@@ -1549,12 +1546,7 @@ mod tests {
     }

     fn test_bytes_roundtrip(data: Vec<Vec<i32>>, compression: Compression) {
-        test_roundtrip_i32::<Vec<u8>, Bytes>(
-            Vec::with_capacity(1024),
-            data,
-            compression,
-            Repetition::REQUIRED,
-        );
+        test_roundtrip_i32::<Vec<u8>, Bytes>(Vec::with_capacity(1024), data, compression, None);
     }

     #[test]
@@ -1565,7 +1557,7 @@
             vec![my_bool_values],
             |r| r.get_bool(0).unwrap(),
             Compression::UNCOMPRESSED,
-            Repetition::REQUIRED,
+            None,
         );
     }

@@ -1577,7 +1569,7 @@
             vec![my_bool_values],
             |r| r.get_bool(0).unwrap(),
             Compression::SNAPPY,
-            Repetition::REQUIRED,
+            None,
         );
     }

@@ -1865,16 +1857,19 @@

     #[test]
     fn test_size_statistics() {
-        let num_rows: i64 = 5;
-        let data = vec![ByteArrayType::gen_vec(32, num_rows as usize)];
+        let num_rows: usize = 5;
+        let data = vec![ByteArrayType::gen_vec(32, num_rows)];
         let unenc_size: i64 = data[0].iter().map(|x| x.len() as i64).sum();
         let file: File = tempfile::tempfile().unwrap();
+        let def_vec = vec![1; num_rows];
+        let def_levels = Some(def_vec.as_slice());

         let file_metadata = test_roundtrip::<File, File, ByteArrayType, _>(

Contributor:
Could we please add a test that verifies that the metadata read from the file correctly interprets the new statistics?

It seems to me like test_roundtrip only verifies a few fields of the metadata after it is read back in:

        let reader = SerializedFileReader::new(R::from(file)).unwrap();
        assert_eq!(reader.num_row_groups(), data.len());
        assert_eq!(
            reader.metadata().file_metadata().num_rows(),
            rows,
            "row count in metadata not equal to number of rows written"
        );
        for (i, item) in data.iter().enumerate().take(reader.num_row_groups()) {
            let row_group_reader = reader.get_row_group(i).unwrap();
            let iter = row_group_reader.get_row_iter(None).unwrap();
            let res: Vec<_> = iter.map(|row| row.unwrap()).map(&value).collect();
            let row_group_size = row_group_reader.metadata().total_byte_size();
            let uncompressed_size: i64 = row_group_reader
                .metadata()
                .columns()
                .iter()
                .map(|v| v.uncompressed_size())
                .sum();
            assert_eq!(row_group_size, uncompressed_size);
            assert_eq!(res, *item);
        }
        file_metadata

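A minimal sketch of the read-back check being requested here, assuming a hypothetical unencoded_byte_array_data_bytes accessor on ColumnChunkMetaData (the eventual API may differ):

    // Sketch only: re-open the file and assert against the parsed metadata,
    // not the thrift structs the writer produced. Accessor name is an assumption.
    let reader = SerializedFileReader::new(file).unwrap();
    let column = reader.metadata().row_group(0).column(0);
    // unenc_size is the sum of variable-length bytes written in the test above
    assert_eq!(column.unencoded_byte_array_data_bytes(), Some(unenc_size));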

Contributor Author:
Yes, file metadata returned from test_roundtrip is created by the writer, not the reader. So I'm currently just testing that the thrift objects are properly formed before writing. The issue I have is that the reader doesn't currently save the new structures, and I don't want to add them on this pass since they won't be used at all. I can instead replicate some of the code in index_reader.rs to create thrift objects from the file data and test that those match the objects created by the writer.
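A sketch of that replication, assuming the slice-based thrift reader that index_reader.rs uses internally:

    // Sketch: decode a raw OffsetIndex from bytes read out of the written file
    // and compare it with the thrift structs the writer produced. Assumes the
    // column chunk's offset_index_offset/offset_index_length locate `bytes`.
    fn decode_offset_index(bytes: &[u8]) -> crate::format::OffsetIndex {
        let mut prot = crate::thrift::TCompactSliceInputProtocol::new(bytes);
        crate::format::OffsetIndex::read_from_in_protocol(&mut prot).unwrap()
    }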

Contributor:
🤔 It seems like it would be good to have the ability to read the structures as well (otherwise pure Rust implementations can't take advantage of these fields).

Contributor Author:
I agree (and in fact I know people using DataFusion who want these stats... but are OK reading them directly from the files as a preprocessing step). I think adding the histograms to Index is easy enough, but currently the OffsetIndex is returned as a vector of PageLocations. Are you OK with me adding an object that mirrors the thrift OffsetIndex?
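Something along these lines, with hypothetical names, is what's being floated:

    // Hypothetical mirror of the thrift OffsetIndex; the name and fields are
    // illustrative, not a merged API. PageLocation is the existing type from
    // crate::format.
    pub struct OffsetIndexMetaData {
        /// Locations of each data page, as returned today.
        pub page_locations: Vec<PageLocation>,
        /// Per-page unencoded byte array sizes, when present in the file.
        pub unencoded_byte_array_data_bytes: Option<Vec<i64>>,
    }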

Contributor:
Yes, I think we should add our own SizeStatistics struct in src/file/size_statistics.rs that mirrors the Thrift TSizeStatistics.
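A sketch of such a mirror, with fields following the Thrift definition (illustrative only, not the final API):

    // Sketch of a native SizeStatistics mirroring the format's thrift struct.
    pub struct SizeStatistics {
        /// Total bytes of variable-length values, before encoding.
        pub unencoded_byte_array_data_bytes: Option<i64>,
        /// Count of values at each repetition level.
        pub repetition_level_histogram: Option<Vec<i64>>,
        /// Count of values at each definition level.
        pub definition_level_histogram: Option<Vec<i64>>,
    }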

Contributor:
The thing we need to be careful about is not introducing any breaking API changes; otherwise we can't merge this PR until August 2024, per https://github.com/apache/arrow-rs?tab=readme-ov-file#release-versioning-and-schedule

Contributor Author:
Hmm. Adding histograms to Index won't be a breaking change as they'll be optional. I could add another method to index_reader.rs to return a vector of unencoded byte array sizes. It's less efficient since we'd have to parse the offset index twice.

ColumnChunkMetaData contains the chunk level histograms and byte array sizes already, just not as a separate SizeStatistics object.
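For example (hypothetical accessors, for illustration only):

    // Sketch: chunk-level stats read straight off ColumnChunkMetaData,
    // with no separate SizeStatistics object.
    let column = metadata.row_group(0).column(0);
    if let Some(hist) = column.definition_level_histogram() {
        // for a column with max definition level 1, bucket 0 counts nulls
        println!("nulls in chunk: {}", hist[0]);
    }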

             file,
             data,
             |r| r.get_bytes(0).unwrap().clone(),
             Compression::UNCOMPRESSED,
-            Repetition::OPTIONAL,
+            def_levels,
         );

         assert_eq!(file_metadata.row_groups.len(), 1);
@@ -1893,7 +1888,37 @@
             if let Some(ref def_hist) = size_stats.definition_level_histogram {
                 assert_eq!(def_hist.len(), 2);
                 assert_eq!(def_hist[0], 0);
-                assert_eq!(def_hist[1], num_rows);
+                assert_eq!(def_hist[1], num_rows as i64);
             }
         }
     }
 }
+
+    #[test]
+    fn test_size_statistics_with_nulls() {
+        let def_levels = [1, 1, 0, 1, 0];
+        let data = vec![vec![1, 2, 3, 4, 5]];
+        let file: File = tempfile::tempfile().unwrap();
+        let file_metadata = test_roundtrip_i32::<File, File>(
+            file,
+            data,
+            Compression::UNCOMPRESSED,
+            Some(&def_levels),
+        );
+
+        assert_eq!(file_metadata.row_groups.len(), 1);
+        assert_eq!(file_metadata.row_groups[0].columns.len(), 1);
+        assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some());
+
+        if let Some(ref meta_data) = file_metadata.row_groups[0].columns[0].meta_data {
+            assert!(meta_data.size_statistics.is_some());
+            if let Some(ref size_stats) = meta_data.size_statistics {
+                assert!(size_stats.repetition_level_histogram.is_none());
+                assert!(size_stats.definition_level_histogram.is_some());
+                if let Some(ref def_hist) = size_stats.definition_level_histogram {
+                    assert_eq!(def_hist.len(), 2);
+                    assert_eq!(def_hist[0], 2); // two nulls
+                    assert_eq!(def_hist[1], 3); // three non-null
+                }
+            }
+        }
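As a cross-check of the expected values in the new test, a definition-level histogram is just a tally of values at each level; a minimal sketch:

    // Sketch: bucket i counts the values whose definition level is i.
    fn def_level_histogram(def_levels: &[i16], max_level: i16) -> Vec<i64> {
        let mut hist = vec![0i64; max_level as usize + 1];
        for &level in def_levels {
            hist[level as usize] += 1;
        }
        hist
    }

    // For the levels used above: two nulls (level 0), three values (level 1).
    assert_eq!(def_level_histogram(&[1, 1, 0, 1, 0], 1), vec![2, 3]);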