Skip to content

Commit 80ed712

Browse files
authored
Use LevelHistogram in PageIndex (apache#6135)
* use LevelHistogram in PageIndex and ColumnIndexBuilder
* revert changes to OffsetIndexBuilder
1 parent 5f5a82c commit 80ed712

File tree

3 files changed

+20
-16
lines changed

3 files changed

+20
-16
lines changed

parquet/src/file/metadata/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -569,7 +569,7 @@ pub struct ColumnChunkMetaData {
569569
/// For example, `vec[0]` is the number of rows with level 0, `vec[1]` is the
570570
/// number of rows with level 1, and so on.
571571
///
572-
#[derive(Debug, Clone, PartialEq)]
572+
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
573573
pub struct LevelHistogram {
574574
inner: Vec<i64>,
575575
}

parquet/src/file/page_index/index.rs

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use crate::basic::Type;
2121
use crate::data_type::private::ParquetValueType;
2222
use crate::data_type::{AsBytes, ByteArray, FixedLenByteArray, Int96};
2323
use crate::errors::ParquetError;
24+
use crate::file::metadata::LevelHistogram;
2425
use crate::format::{BoundaryOrder, ColumnIndex};
2526
use crate::util::bit_util::from_le_slice;
2627
use std::fmt::Debug;
@@ -40,13 +41,13 @@ pub struct PageIndex<T> {
4041
///
4142
/// `repetition_level_histogram[i]` is a count of how many values are at repetition level `i`.
4243
/// For example, `repetition_level_histogram[0]` indicates how many rows the page contains.
43-
pub repetition_level_histogram: Option<Vec<i64>>,
44+
pub repetition_level_histogram: Option<LevelHistogram>,
4445
/// Definition level histogram for the page
4546
///
4647
/// `definition_level_histogram[i]` is a count of how many values are at definition level `i`.
4748
/// For example, `definition_level_histogram[max_definition_level]` indicates how many
4849
/// non-null values are present in the page.
49-
pub definition_level_histogram: Option<Vec<i64>>,
50+
pub definition_level_histogram: Option<LevelHistogram>,
5051
}
5152

5253
impl<T> PageIndex<T> {
@@ -59,10 +60,10 @@ impl<T> PageIndex<T> {
5960
pub fn null_count(&self) -> Option<i64> {
6061
self.null_count
6162
}
62-
pub fn repetition_level_histogram(&self) -> Option<&Vec<i64>> {
63+
pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
6364
self.repetition_level_histogram.as_ref()
6465
}
65-
pub fn definition_level_histogram(&self) -> Option<&Vec<i64>> {
66+
pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
6667
self.definition_level_histogram.as_ref()
6768
}
6869
}
@@ -175,17 +176,17 @@ impl<T: ParquetValueType> NativeIndex<T> {
175176
for i in 0..len {
176177
let page_idx = i * num_levels;
177178
let page_hist = hist[page_idx..page_idx + num_levels].to_vec();
178-
res.push(Some(page_hist));
179+
res.push(Some(LevelHistogram::from(page_hist)));
179180
}
180181
res
181182
} else {
182183
vec![None; len]
183184
}
184185
};
185186

186-
let rep_hists: Vec<Option<Vec<i64>>> =
187+
let rep_hists: Vec<Option<LevelHistogram>> =
187188
to_page_histograms(index.repetition_level_histograms);
188-
let def_hists: Vec<Option<Vec<i64>>> =
189+
let def_hists: Vec<Option<LevelHistogram>> =
189190
to_page_histograms(index.definition_level_histograms);
190191

191192
let indexes = index
@@ -236,19 +237,22 @@ mod tests {
236237
min: Some(-123),
237238
max: Some(234),
238239
null_count: Some(0),
239-
repetition_level_histogram: Some(vec![1, 2]),
240-
definition_level_histogram: Some(vec![1, 2, 3]),
240+
repetition_level_histogram: Some(LevelHistogram::from(vec![1, 2])),
241+
definition_level_histogram: Some(LevelHistogram::from(vec![1, 2, 3])),
241242
};
242243

243244
assert_eq!(page_index.min().unwrap(), &-123);
244245
assert_eq!(page_index.max().unwrap(), &234);
245246
assert_eq!(page_index.min_bytes().unwrap(), (-123).as_bytes());
246247
assert_eq!(page_index.max_bytes().unwrap(), 234.as_bytes());
247248
assert_eq!(page_index.null_count().unwrap(), 0);
248-
assert_eq!(page_index.repetition_level_histogram(), Some(&vec![1, 2]));
249249
assert_eq!(
250-
page_index.definition_level_histogram(),
251-
Some(&vec![1, 2, 3])
250+
page_index.repetition_level_histogram().unwrap().values(),
251+
&vec![1, 2]
252+
);
253+
assert_eq!(
254+
page_index.definition_level_histogram().unwrap().values(),
255+
&vec![1, 2, 3]
252256
);
253257
}
254258

parquet/src/file/writer.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1951,7 +1951,7 @@ mod tests {
19511951

19521952
assert!(col_idx.repetition_level_histogram().is_none());
19531953
assert!(col_idx.definition_level_histogram().is_some());
1954-
check_def_hist(col_idx.definition_level_histogram().unwrap());
1954+
check_def_hist(col_idx.definition_level_histogram().unwrap().values());
19551955

19561956
assert!(reader.metadata().offset_index().is_some());
19571957
let offset_index = reader.metadata().offset_index().unwrap();
@@ -2066,8 +2066,8 @@ mod tests {
20662066
unreachable!()
20672067
};
20682068

2069-
check_def_hist(col_idx.definition_level_histogram().unwrap());
2070-
check_rep_hist(col_idx.repetition_level_histogram().unwrap());
2069+
check_def_hist(col_idx.definition_level_histogram().unwrap().values());
2070+
check_rep_hist(col_idx.repetition_level_histogram().unwrap().values());
20712071

20722072
assert!(reader.metadata().offset_index().is_some());
20732073
let offset_index = reader.metadata().offset_index().unwrap();

0 commit comments

Comments
 (0)