-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Upgrade to arrow/parquet 55, and object_store to 0.12.0 and pyo3 to 0.24.0
#15466
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 14 commits
4626f9b
98d62e0
fc1f7a5
9299b4b
7b8320e
cdc55e5
1952406
53ec353
955d37f
c103a03
586851b
cabfb58
d980c00
2dd1827
dff9490
a8b3c4d
b1bae93
84725ff
9bfc8a3
3d646d7
86aab05
984f106
2a30ca3
1f0711e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -66,7 +66,7 @@ pub fn local_unpartitioned_file(path: impl AsRef<std::path::Path>) -> ObjectMeta | |
| ObjectMeta { | ||
| location, | ||
| last_modified: metadata.modified().map(chrono::DateTime::from).unwrap(), | ||
| size: metadata.len() as usize, | ||
| size: metadata.len(), | ||
| e_tag: None, | ||
| version: None, | ||
| } | ||
|
|
@@ -166,7 +166,7 @@ impl ObjectStore for BlockingObjectStore { | |
| fn list( | ||
| &self, | ||
| prefix: Option<&Path>, | ||
| ) -> BoxStream<'_, object_store::Result<ObjectMeta>> { | ||
| ) -> BoxStream<'static, object_store::Result<ObjectMeta>> { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| self.inner.list(prefix) | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,19 +18,19 @@ | |
| //! [`ParquetFileReaderFactory`] and [`DefaultParquetFileReaderFactory`] for | ||
| //! low level control of parquet file readers | ||
|
|
||
| use crate::ParquetFileMetrics; | ||
| use bytes::Bytes; | ||
| use datafusion_datasource::file_meta::FileMeta; | ||
| use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; | ||
| use futures::future::BoxFuture; | ||
| use object_store::ObjectStore; | ||
| use parquet::arrow::arrow_reader::ArrowReaderOptions; | ||
| use parquet::arrow::async_reader::{AsyncFileReader, ParquetObjectReader}; | ||
| use parquet::file::metadata::ParquetMetaData; | ||
| use std::fmt::Debug; | ||
| use std::ops::Range; | ||
| use std::sync::Arc; | ||
|
|
||
| use crate::ParquetFileMetrics; | ||
|
|
||
| /// Interface for reading parquet files. | ||
| /// | ||
| /// The combined implementations of [`ParquetFileReaderFactory`] and | ||
|
|
@@ -114,10 +114,11 @@ impl AsyncFileReader for ParquetFileReader { | |
| self.inner.get_byte_ranges(ranges) | ||
| } | ||
|
|
||
| fn get_metadata( | ||
| &mut self, | ||
| ) -> BoxFuture<'_, parquet::errors::Result<Arc<ParquetMetaData>>> { | ||
| self.inner.get_metadata() | ||
| fn get_metadata<'a>( | ||
| &'a mut self, | ||
| options: Option<&'a ArrowReaderOptions>, | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| ) -> BoxFuture<'a, parquet::errors::Result<Arc<ParquetMetaData>>> { | ||
| self.inner.get_metadata(options) | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -135,7 +136,8 @@ impl ParquetFileReaderFactory for DefaultParquetFileReaderFactory { | |
| metrics, | ||
| ); | ||
| let store = Arc::clone(&self.store); | ||
| let mut inner = ParquetObjectReader::new(store, file_meta.object_meta); | ||
| let mut inner = ParquetObjectReader::new(store, file_meta.object_meta.location) | ||
| .with_file_size(file_meta.object_meta.size as usize); | ||
|
|
||
| if let Some(hint) = metadata_size_hint { | ||
| inner = inner.with_footer_size_hint(hint) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -44,7 +44,7 @@ datafusion-common = { workspace = true, default-features = true } | |
| datafusion-expr = { workspace = true } | ||
| futures = { workspace = true } | ||
| log = { workspace = true } | ||
| object_store = { workspace = true } | ||
| object_store = { workspace = true, features = ["fs"] } | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| parking_lot = { workspace = true } | ||
| rand = { workspace = true } | ||
| tempfile = { workspace = true } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,17 +19,23 @@ use std::sync::Arc; | |
|
|
||
| use arrow::array::{ | ||
| Array, ArrayRef, ArrowPrimitiveType, AsArray, ListArray, NullBufferBuilder, | ||
| PrimitiveArray, | ||
| }; | ||
| use arrow::datatypes::{Field, Int64Type}; | ||
| use arrow::util::bench_util::create_primitive_array; | ||
| use criterion::{black_box, criterion_group, criterion_main, Criterion}; | ||
| use datafusion_expr::Accumulator; | ||
| use datafusion_functions_aggregate::array_agg::ArrayAggAccumulator; | ||
|
|
||
| use arrow::buffer::OffsetBuffer; | ||
| use arrow::util::test_util::seedable_rng; | ||
| use rand::distributions::{Distribution, Standard}; | ||
| use rand::prelude::StdRng; | ||
| use rand::Rng; | ||
| use rand::SeedableRng; | ||
|
|
||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I inlined the small amount of code from bench_util so that this benchmark is standalone and it is easier to understand what is being tested. |
||
| /// Returns fixed seedable RNG | ||
| pub fn seedable_rng() -> StdRng { | ||
| StdRng::seed_from_u64(42) | ||
| } | ||
|
|
||
| fn merge_batch_bench(c: &mut Criterion, name: &str, values: ArrayRef) { | ||
| let list_item_data_type = values.as_list::<i32>().values().data_type().clone(); | ||
|
|
@@ -46,6 +52,24 @@ fn merge_batch_bench(c: &mut Criterion, name: &str, values: ArrayRef) { | |
| }); | ||
| } | ||
|
|
||
| pub fn create_primitive_array<T>(size: usize, null_density: f32) -> PrimitiveArray<T> | ||
| where | ||
| T: ArrowPrimitiveType, | ||
| Standard: Distribution<T::Native>, | ||
| { | ||
| let mut rng = seedable_rng(); | ||
|
|
||
| (0..size) | ||
| .map(|_| { | ||
| if rng.gen::<f32>() < null_density { | ||
| None | ||
| } else { | ||
| Some(rng.gen()) | ||
| } | ||
| }) | ||
| .collect() | ||
| } | ||
|
|
||
| /// Create List array with the given item data type, null density, null locations and zero length lists density | ||
| /// Creates a random (but fixed-seeded) array of a given size and null density | ||
| pub fn create_list_array<T>( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The changes to usize/u64 are for better wasm support; see
"`u64` range instead of `usize`, for better wasm32 support" (arrow-rs#6961).