1717
1818//! Contains reader which reads parquet data into arrow [`RecordBatch`]
1919
20- use std:: collections:: VecDeque ;
21- use std:: sync:: Arc ;
22-
20+ use arrow_array:: builder:: UInt64Builder ;
2321use arrow_array:: cast:: AsArray ;
24- use arrow_array:: Array ;
22+ use arrow_array:: { Array , ArrayRef } ;
2523use arrow_array:: { RecordBatch , RecordBatchReader } ;
26- use arrow_schema:: { ArrowError , DataType as ArrowType , Schema , SchemaRef } ;
24+ use arrow_schema:: { ArrowError , DataType as ArrowType , Field , FieldRef , Schema , SchemaRef } ;
2725use arrow_select:: filter:: prep_null_mask_filter;
2826pub use filter:: { ArrowPredicate , ArrowPredicateFn , RowFilter } ;
2927pub use selection:: { RowSelection , RowSelector } ;
28+ use std:: collections:: VecDeque ;
29+ use std:: sync:: Arc ;
3030
3131pub use crate :: arrow:: array_reader:: RowGroups ;
3232use crate :: arrow:: array_reader:: { build_array_reader, ArrayReader } ;
@@ -72,6 +72,10 @@ pub struct ArrowReaderBuilder<T> {
7272 pub ( crate ) limit : Option < usize > ,
7373
7474 pub ( crate ) offset : Option < usize > ,
75+
76+ pub ( crate ) row_id : Option < FieldRef > ,
77+
78+ pub ( crate ) prefetch : Option < ProjectionMask > ,
7579}
7680
7781impl < T > ArrowReaderBuilder < T > {
@@ -88,6 +92,8 @@ impl<T> ArrowReaderBuilder<T> {
8892 selection : None ,
8993 limit : None ,
9094 offset : None ,
95+ row_id : None ,
96+ prefetch : None ,
9197 }
9298 }
9399
@@ -114,6 +120,15 @@ impl<T> ArrowReaderBuilder<T> {
114120 Self { batch_size, ..self }
115121 }
116122
123+ /// Project a column into the result with name `field_name` that will contain the row ID
124+ /// for each row. The row ID will be the row offset of the row in the underlying file
125+ pub fn with_row_id ( self , field_name : impl Into < String > ) -> Self {
126+ Self {
127+ row_id : Some ( RowId :: field_ref ( field_name) ) ,
128+ ..self
129+ }
130+ }
131+
117132 /// Only read data from the provided row group indexes
118133 ///
119134 /// This is also called row group filtering
@@ -132,6 +147,15 @@ impl<T> ArrowReaderBuilder<T> {
132147 }
133148 }
134149
150+ /// If evaluating a `RowFilter` also prefetch the columns in `mask`
151+ /// while fetching row filter columns
152+ pub fn with_prefetch ( self , mask : Option < ProjectionMask > ) -> Self {
153+ Self {
154+ prefetch : mask,
155+ ..self
156+ }
157+ }
158+
135159 /// Provide a [`RowSelection`] to filter out rows, and avoid fetching their
136160 /// data into memory.
137161 ///
@@ -623,6 +647,8 @@ impl<T: ChunkReader + 'static> ParquetRecordBatchReaderBuilder<T> {
623647 batch_size,
624648 array_reader,
625649 apply_range ( selection, reader. num_rows ( ) , self . offset , self . limit ) ,
650+ // TODO what do we do here?
651+ None ,
626652 ) )
627653 }
628654}
@@ -684,13 +710,55 @@ impl<T: ChunkReader + 'static> Iterator for ReaderPageIterator<T> {
684710
685711impl < T : ChunkReader + ' static > PageIterator for ReaderPageIterator < T > { }
686712
713+ pub ( crate ) struct RowId {
714+ offset : u64 ,
715+ field : FieldRef ,
716+ buffer : UInt64Builder ,
717+ }
718+
719+ impl RowId {
720+ pub fn new ( offset : u64 , field : FieldRef , batch_size : usize ) -> Self {
721+ Self {
722+ offset,
723+ field,
724+ buffer : UInt64Builder :: with_capacity ( batch_size) ,
725+ }
726+ }
727+
728+ pub fn field_ref ( name : impl Into < String > ) -> FieldRef {
729+ Arc :: new ( Field :: new ( name, ArrowType :: UInt64 , false ) )
730+ }
731+
732+ pub fn skip ( & mut self , n : usize ) {
733+ self . offset += n as u64 ;
734+ }
735+
736+ pub fn field ( & self ) -> FieldRef {
737+ self . field . clone ( )
738+ }
739+
740+ fn read ( & mut self , n : usize ) {
741+ // SAFETY: We are appending a `Range<u64>` which has a trusted length
742+ unsafe {
743+ self . buffer
744+ . append_trusted_len_iter ( self . offset ..self . offset + n as u64 )
745+ }
746+ self . offset += n as u64 ;
747+ }
748+
749+ fn consume ( & mut self ) -> ArrayRef {
750+ Arc :: new ( self . buffer . finish ( ) )
751+ }
752+ }
753+
/// An `Iterator<Item = ArrowResult<RecordBatch>>` that yields [`RecordBatch`]
/// read from a parquet data source
pub struct ParquetRecordBatchReader {
    // Maximum number of rows per emitted batch
    batch_size: usize,
    // Produces the projected columns as a struct array
    array_reader: Box<dyn ArrayReader>,
    // Schema of the emitted batches; includes the row-ID field first when
    // `row_id` is set
    schema: SchemaRef,
    // Remaining row selectors to apply, if a `RowSelection` was provided
    selection: Option<VecDeque<RowSelector>>,
    // When present, generates a row-ID column prepended to each batch
    row_id: Option<RowId>,
}
695763
696764impl Iterator for ParquetRecordBatchReader {
@@ -708,6 +776,10 @@ impl Iterator for ParquetRecordBatchReader {
708776 Err ( e) => return Some ( Err ( e. into ( ) ) ) ,
709777 } ;
710778
779+ if let Some ( row_id) = self . row_id . as_mut ( ) {
780+ row_id. skip ( skipped) ;
781+ }
782+
711783 if skipped != front. row_count {
712784 return Some ( Err ( general_err ! (
713785 "failed to skip rows, expected {}, got {}" ,
@@ -738,16 +810,24 @@ impl Iterator for ParquetRecordBatchReader {
738810 } ;
739811 match self . array_reader . read_records ( to_read) {
740812 Ok ( 0 ) => break ,
741- Ok ( rec) => read_records += rec,
813+ Ok ( rec) => {
814+ if let Some ( rowid) = self . row_id . as_mut ( ) {
815+ rowid. read ( rec) ;
816+ }
817+ read_records += rec
818+ }
742819 Err ( error) => return Some ( Err ( error. into ( ) ) ) ,
743820 }
744821 }
745822 }
746- None => {
747- if let Err ( error) = self . array_reader . read_records ( self . batch_size ) {
748- return Some ( Err ( error. into ( ) ) ) ;
823+ None => match self . array_reader . read_records ( self . batch_size ) {
824+ Ok ( n) => {
825+ if let Some ( rowid) = self . row_id . as_mut ( ) {
826+ rowid. read ( n) ;
827+ }
749828 }
750- }
829+ Err ( error) => return Some ( Err ( error. into ( ) ) ) ,
830+ } ,
751831 } ;
752832
753833 match self . array_reader . consume_batch ( ) {
@@ -761,7 +841,23 @@ impl Iterator for ParquetRecordBatchReader {
761841
762842 match struct_array {
763843 Err ( err) => Some ( Err ( err) ) ,
764- Ok ( e) => ( e. len ( ) > 0 ) . then ( || Ok ( RecordBatch :: from ( e) ) ) ,
844+ Ok ( e) => {
845+ if e. len ( ) > 0 {
846+ Some ( Ok ( match self . row_id . as_mut ( ) {
847+ Some ( rowid) => {
848+ let columns = std:: iter:: once ( rowid. consume ( ) )
849+ . chain ( e. columns ( ) . iter ( ) . cloned ( ) )
850+ . collect ( ) ;
851+
852+ RecordBatch :: try_new ( self . schema . clone ( ) , columns)
853+ . expect ( "invalid schema" )
854+ }
855+ None => RecordBatch :: from ( e) ,
856+ } ) )
857+ } else {
858+ None
859+ }
860+ }
765861 }
766862 }
767863 }
@@ -806,6 +902,7 @@ impl ParquetRecordBatchReader {
806902 array_reader,
807903 schema : Arc :: new ( Schema :: new ( levels. fields . clone ( ) ) ) ,
808904 selection : selection. map ( |s| s. trim ( ) . into ( ) ) ,
905+ row_id : None ,
809906 } )
810907 }
811908
@@ -816,17 +913,29 @@ impl ParquetRecordBatchReader {
816913 batch_size : usize ,
817914 array_reader : Box < dyn ArrayReader > ,
818915 selection : Option < RowSelection > ,
916+ rowid : Option < RowId > ,
819917 ) -> Self {
820- let schema = match array_reader. get_data_type ( ) {
821- ArrowType :: Struct ( ref fields) => Schema :: new ( fields. clone ( ) ) ,
918+ let struct_fields = match array_reader. get_data_type ( ) {
919+ ArrowType :: Struct ( ref fields) => fields. clone ( ) ,
822920 _ => unreachable ! ( "Struct array reader's data type is not struct!" ) ,
823921 } ;
824922
923+ let schema = match rowid. as_ref ( ) {
924+ Some ( rowid) => {
925+ let fields: Vec < _ > = std:: iter:: once ( rowid. field ( ) )
926+ . chain ( struct_fields. iter ( ) . cloned ( ) )
927+ . collect ( ) ;
928+ Schema :: new ( fields)
929+ }
930+ None => Schema :: new ( struct_fields) ,
931+ } ;
932+
825933 Self {
826934 batch_size,
827935 array_reader,
828936 schema : Arc :: new ( schema) ,
829937 selection : selection. map ( |s| s. trim ( ) . into ( ) ) ,
938+ row_id : rowid,
830939 }
831940 }
832941}
@@ -887,7 +996,8 @@ pub(crate) fn evaluate_predicate(
887996 input_selection : Option < RowSelection > ,
888997 predicate : & mut dyn ArrowPredicate ,
889998) -> Result < RowSelection > {
890- let reader = ParquetRecordBatchReader :: new ( batch_size, array_reader, input_selection. clone ( ) ) ;
999+ let reader =
1000+ ParquetRecordBatchReader :: new ( batch_size, array_reader, input_selection. clone ( ) , None ) ;
8911001 let mut filters = vec ! [ ] ;
8921002 for maybe_batch in reader {
8931003 let maybe_batch = maybe_batch?;
@@ -935,7 +1045,8 @@ pub(crate) async fn evaluate_predicate_coop(
9351045) -> Result < RowSelection > {
9361046 let mut budget = DECODE_BUDGET ;
9371047
938- let reader = ParquetRecordBatchReader :: new ( batch_size, array_reader, input_selection. clone ( ) ) ;
1048+ let reader =
1049+ ParquetRecordBatchReader :: new ( batch_size, array_reader, input_selection. clone ( ) , None ) ;
9391050 let mut filters = vec ! [ ] ;
9401051 for maybe_batch in reader {
9411052 let maybe_batch = maybe_batch?;
0 commit comments