Skip to content
Closed
Changes from 1 commit
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
a7e41c3
regen thrift with size statistics added
etseidl Feb 7, 2024
788eef3
first cut at adding page size statistics
etseidl Feb 9, 2024
6296ada
add new stats to chunk metadata test
etseidl Feb 16, 2024
84f3d7a
Merge branch 'apache:master' into size_stats
etseidl Mar 8, 2024
0da05a8
fix escapes
etseidl Mar 12, 2024
7301aeb
Merge remote-tracking branch 'origin/master' into size_stats
etseidl Mar 12, 2024
6e5fece
format
etseidl Mar 12, 2024
457eb4a
formatting
etseidl Mar 12, 2024
18a5732
add escapes
etseidl Mar 12, 2024
658512e
Merge remote-tracking branch 'origin/master' into size_stats
etseidl Mar 12, 2024
81c2b2e
Merge remote-tracking branch 'origin/master' into size_stats
etseidl Apr 29, 2024
29dde50
Merge branch 'size_stats' of github.com:etseidl/arrow-rs into size_stats
etseidl Jun 27, 2024
84f8512
Merge remote-tracking branch 'origin/master' into size_stats
etseidl Jun 27, 2024
9635e5e
add test of SizeStatistics.unencoded_byte_array_data_bytes
etseidl Jun 27, 2024
c5c07b6
test def histogram as well, rename test
etseidl Jun 27, 2024
6dd160f
add an assert
etseidl Jun 27, 2024
917b412
refactor and add test of def histogram with nulls
etseidl Jun 27, 2024
f8961a3
add test of repetition level histogram
etseidl Jun 28, 2024
73fa099
revert changes to test_roundtrip
etseidl Jun 28, 2024
00ca596
suggestion from review
etseidl Jul 1, 2024
6acc500
add to documentation as suggested in review
etseidl Jul 1, 2024
787e3e8
make histograms optional
etseidl Jul 2, 2024
46851f4
add histograms to PageIndex
etseidl Jul 2, 2024
4f8487b
use Vec::push()
etseidl Jul 2, 2024
903b06b
formatting
etseidl Jul 2, 2024
fa89836
check size stats in read metadata
etseidl Jul 2, 2024
2800cc7
check unencoded_byte_array_data_bytes is not set for int cols
etseidl Jul 2, 2024
95a0535
rewrite test_byte_array_size_statistics() to not use test_roundtrip()
etseidl Jul 2, 2024
fc66a59
add unencoded_byte_array_data_bytes support in page index
etseidl Jul 2, 2024
542570f
Merge remote-tracking branch 'origin/master' into size_stats
etseidl Jul 2, 2024
7be97e5
update expected sizes to account for new stats
etseidl Jul 2, 2024
f5ab47b
only write SizeStatistics in ColumnMetaData if statistics are enabled
etseidl Jul 3, 2024
a008e9e
add a little documentation
etseidl Jul 5, 2024
87ccec2
add ParquetOffsetIndex to avoid double read of OffsetIndex
etseidl Jul 5, 2024
3eead30
cleanup
etseidl Jul 5, 2024
ddf40c3
use less verbose update of variable_length_bytes
etseidl Jul 5, 2024
0ebb72f
add some documentation
etseidl Jul 6, 2024
393aea1
update to latest thrift (as of 11 Jul 2024) from parquet-format
etseidl Jul 11, 2024
1c12fb8
pass None for optional size statistics
etseidl Jul 11, 2024
53cd5fa
escape HTML tags
etseidl Jul 11, 2024
45f25a8
Merge remote-tracking branch 'origin/master' into size_stats
etseidl Jul 11, 2024
98025cc
don't need to escape brackets in arrays
etseidl Jul 11, 2024
7b59246
Merge remote-tracking branch 'github/update_parquet_thrift' into size…
etseidl Jul 11, 2024
65096dd
use consistent naming
etseidl Jul 11, 2024
08065ad
suggested doc changes
etseidl Jul 11, 2024
1cbd4b7
more suggested doc changes
etseidl Jul 11, 2024
dce3513
use more asserts in tests
etseidl Jul 11, 2024
f661839
move histogram logic into PageMetrics and ColumnMetrics
etseidl Jul 12, 2024
818a614
refactor some to reduce code duplication, finish docs
etseidl Jul 12, 2024
c391dec
account for new size statistics in heap size calculations
etseidl Jul 12, 2024
4816a95
add histogram examples to docs
etseidl Jul 12, 2024
e2faf2d
Merge remote-tracking branch 'origin/master' into size_stats
etseidl Jul 12, 2024
d92ae20
add some fixmes
etseidl Jul 14, 2024
69dd652
leave not to self
etseidl Jul 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
refactor some to reduce code duplication, finish docs
  • Loading branch information
etseidl committed Jul 12, 2024
commit 818a61456d226a8f0c4131bcc5dcc9aa695daf30
92 changes: 48 additions & 44 deletions parquet/src/column/writer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ pub struct ColumnCloseResult {
pub offset_index: Option<OffsetIndex>,
}

/// Creates vector to hold level histogram data. Length will be `max_level + 1`.
/// Creates a vector to hold level histogram data. Length will be `max_level + 1`.
/// Because histograms are not necessary when `max_level == 0`, this will return
/// `None` in that case.
fn new_histogram(max_level: i16) -> Option<Vec<i64>> {
Expand All @@ -194,6 +194,17 @@ fn new_histogram(max_level: i16) -> Option<Vec<i64>> {
}
}

/// Sum `page_histogram` into `chunk_histogram`, bucket by bucket.
///
/// Does nothing unless both histograms are present. Buckets of `chunk_histogram`
/// beyond the length of `page_histogram` are left untouched.
///
/// # Panics
///
/// Panics if `page_histogram` has more buckets than `chunk_histogram`.
fn update_histogram(chunk_histogram: &mut Option<Vec<i64>>, page_histogram: &Option<Vec<i64>>) {
    if let (Some(chunk_hist), Some(page_hist)) =
        (chunk_histogram.as_mut(), page_histogram.as_ref())
    {
        // Check the length invariant once up front: `zip` would otherwise silently
        // drop the extra page buckets instead of flagging the mismatch.
        assert!(
            page_hist.len() <= chunk_hist.len(),
            "page histogram has {} buckets but chunk histogram only has {}",
            page_hist.len(),
            chunk_hist.len()
        );
        for (chunk_count, page_count) in chunk_hist.iter_mut().zip(page_hist) {
            *chunk_count += *page_count;
        }
    }
}

// Metrics per page
struct PageMetrics {
num_buffered_values: u32,
Expand All @@ -204,7 +215,7 @@ struct PageMetrics {
}

impl PageMetrics {
pub fn new() -> Self {
fn new() -> Self {
PageMetrics {
num_buffered_values: 0,
num_buffered_rows: 0,
Expand All @@ -215,46 +226,47 @@ impl PageMetrics {
}

/// Initialize the repetition level histogram
pub fn with_repetition_level_histogram(mut self, max_level: i16) -> Self {
fn with_repetition_level_histogram(mut self, max_level: i16) -> Self {
self.repetition_level_histogram = new_histogram(max_level);
self
}

/// Initialize the definition level histogram
pub fn with_definition_level_histogram(mut self, max_level: i16) -> Self {
fn with_definition_level_histogram(mut self, max_level: i16) -> Self {
self.definition_level_histogram = new_histogram(max_level);
self
}

/// Resets the state of this `PageMetrics` to the initial state
///
/// If histograms have are defined their contents will be reset to zero.
pub fn new_page(&mut self) {
self.num_buffered_values = 0;
self.num_buffered_rows = 0;
self.num_page_nulls = 0;
if let Some(ref mut hist) = self.repetition_level_histogram {
for v in hist {
*v = 0
}
}
if let Some(ref mut hist) = self.definition_level_histogram {
/// Zeroes every bucket of `histogram` in place, leaving its length unchanged.
/// Does nothing when `histogram` is `None`.
fn reset_histogram(histogram: &mut Option<Vec<i64>>) {
    if let Some(buckets) = histogram.as_mut() {
        buckets.fill(0);
    }
}

/// FIXME docs!
pub fn update_repetition_level_histogram(&mut self, levels: &[i16]) {
/// Resets the state of this `PageMetrics` to the initial state.
/// If histograms have been initialized their contents will be reset to zero.
fn new_page(&mut self) {
self.num_buffered_values = 0;
self.num_buffered_rows = 0;
self.num_page_nulls = 0;
PageMetrics::reset_histogram(&mut self.repetition_level_histogram);
PageMetrics::reset_histogram(&mut self.definition_level_histogram);
}

/// Increments the repetition level histogram bucket for each entry of `levels`.
///
/// Does nothing if the repetition level histogram has not been initialized.
/// Each level is used directly as a bucket index, so a level outside the
/// histogram's range causes an out-of-bounds panic.
fn update_repetition_level_histogram(&mut self, levels: &[i16]) {
    if let Some(hist) = self.repetition_level_histogram.as_mut() {
        levels.iter().for_each(|&level| hist[level as usize] += 1);
    }
}

pub fn update_definition_level_histogram(&mut self, levels: &[i16]) {
/// Updates histogram values using provided definition levels
fn update_definition_level_histogram(&mut self, levels: &[i16]) {
if let Some(ref mut def_hist) = self.definition_level_histogram {
for &level in levels {
def_hist[level as usize] += 1;
Expand Down Expand Up @@ -282,7 +294,7 @@ struct ColumnMetrics<T> {
}

impl<T> ColumnMetrics<T> {
pub fn new() -> Self {
fn new() -> Self {
ColumnMetrics {
total_bytes_written: 0,
total_rows_written: 0,
Expand All @@ -302,40 +314,32 @@ impl<T> ColumnMetrics<T> {
}

/// Initialize the repetition level histogram
pub fn with_repetition_level_histogram(mut self, max_level: i16) -> Self {
fn with_repetition_level_histogram(mut self, max_level: i16) -> Self {
self.repetition_level_histogram = new_histogram(max_level);
self
}

/// Initialize the definition level histogram
pub fn with_definition_level_histogram(mut self, max_level: i16) -> Self {
fn with_definition_level_histogram(mut self, max_level: i16) -> Self {
self.definition_level_histogram = new_histogram(max_level);
self
}

/// FIXME docs
pub fn update_from_page_metrics(&mut self, page_metrics: &PageMetrics) {
if page_metrics.definition_level_histogram.is_some()
&& self.definition_level_histogram.is_some()
{
let chunk_hist = self.definition_level_histogram.as_mut().unwrap();
let page_hist = page_metrics.definition_level_histogram.as_ref().unwrap();
for i in 0..page_hist.len() {
chunk_hist[i] += page_hist[i]
}
}
if page_metrics.repetition_level_histogram.is_some()
&& self.repetition_level_histogram.is_some()
{
let chunk_hist = self.repetition_level_histogram.as_mut().unwrap();
let page_hist = page_metrics.repetition_level_histogram.as_ref().unwrap();
for i in 0..page_hist.len() {
chunk_hist[i] += page_hist[i]
}
}
/// Sum the provided PageMetrics histograms into the chunk histograms. Does nothing if
/// page histograms are not initialized.
fn update_from_page_metrics(&mut self, page_metrics: &PageMetrics) {
update_histogram(
&mut self.definition_level_histogram,
&page_metrics.definition_level_histogram,
);
update_histogram(
&mut self.repetition_level_histogram,
&page_metrics.repetition_level_histogram,
);
}

pub fn update_variable_length_bytes(&mut self, variable_length_bytes: &Option<i64>) {
/// Sum the provided page variable_length_bytes into the chunk variable_length_bytes
fn update_variable_length_bytes(&mut self, variable_length_bytes: &Option<i64>) {
if let Some(var_bytes) = variable_length_bytes {
*self.variable_length_bytes.get_or_insert(0) += var_bytes;
}
Expand Down