bug fix

apache · alamb · Aug 8, 2025 · Jul 1, 2025 · Jul 1, 2025 · Jul 1, 2025
commit 58add510c99c4b47f686feb5439042a0b56fb4ab
diff --git a/parquet/src/arrow/array_reader/cached_array_reader.rs b/parquet/src/arrow/array_reader/cached_array_reader.rs
@@ -218,8 +218,10 @@ impl ArrayReader for CachedArrayReader {
                     if read_from_inner == 0 {
                         break;
                     }
-
-                    let select_from_this_batch = std::cmp::min(num_records - read, read_from_inner);
+                    let select_from_this_batch = std::cmp::min(
+                        num_records - read,
+                        self.inner_position - self.outer_position,
+                    );
                     read += select_from_this_batch;
                     self.selections
                         .push_back(RowSelector::select(select_from_this_batch));

diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs
@@ -441,6 +441,59 @@ impl RowSelection {
     pub fn skipped_row_count(&self) -> usize {
         self.iter().filter(|s| s.skip).map(|s| s.row_count).sum()
     }
+
+    /// Expands the selection to align with batch boundaries.
+    /// This is needed when using cached array readers to ensure that
+    /// the cached data covers full batches.
+    #[cfg(feature = "async")]
+    pub(crate) fn expand_to_batch_boundaries(&self, batch_size: usize, total_rows: usize) -> Self {
+        if batch_size == 0 {
+            return self.clone();
+        }
+
+        let mut expanded_ranges = Vec::new();
+        let mut row_offset = 0;
+
+        for selector in &self.selectors {
+            if selector.skip {
+                row_offset += selector.row_count;
+            } else {
+                let start = row_offset;
+                let end = row_offset + selector.row_count;
+
+                // Expand start to batch boundary
+                let expanded_start = (start / batch_size) * batch_size;
+                // Expand end to batch boundary
+                let expanded_end = ((end + batch_size - 1) / batch_size) * batch_size;
+                let expanded_end = expanded_end.min(total_rows);
+
+                expanded_ranges.push(expanded_start..expanded_end);
+                row_offset += selector.row_count;
+            }
+        }
+
+        // Sort ranges by start position
+        expanded_ranges.sort_by_key(|range| range.start);
+
+        // Merge overlapping or consecutive ranges
+        let mut merged_ranges: Vec<Range<usize>> = Vec::new();
+        for range in expanded_ranges {
+            if let Some(last) = merged_ranges.last_mut() {
+                if range.start <= last.end {
+                    // Overlapping or consecutive - merge them
+                    last.end = last.end.max(range.end);
+                } else {
+                    // No overlap - add new range
+                    merged_ranges.push(range);
+                }
+            } else {
+                // First range
+                merged_ranges.push(range);
+            }
+        }
+
+        Self::from_consecutive_ranges(merged_ranges.into_iter(), total_rows)
+    }
 }
 
 impl From<Vec<RowSelector>> for RowSelection {
@@ -1378,4 +1431,53 @@ mod tests {
         assert_eq!(selection.row_count(), 0);
         assert_eq!(selection.skipped_row_count(), 0);
     }
+
+    #[test]
+    #[cfg(feature = "async")]
+    fn test_expand_to_batch_boundaries() {
+        // Test case that reproduces the overlapping ranges bug
+        let selection = RowSelection::from(vec![
+            RowSelector::skip(21),   // Skip first page
+            RowSelector::select(21), // Select page to boundary
+            RowSelector::skip(41),   // Skip multiple pages
+            RowSelector::select(41), // Select multiple pages
+            RowSelector::skip(25),   // Skip page across boundary
+            RowSelector::select(25), // Select across page boundary
+            RowSelector::skip(7116), // Skip to final page boundary
+            RowSelector::select(10), // Select final page
+        ]);
+
+        let total_rows = 7300;
+        let batch_size = 1024;
+
+        // This should not panic with "out of order"
+        let expanded = selection.expand_to_batch_boundaries(batch_size, total_rows);
+
+        // Verify that the expanded selection is valid
+        assert!(expanded.selects_any());
+        assert!(expanded.row_count() >= selection.row_count());
+
+        // Test with smaller batch size that would cause more overlaps
+        let batch_size = 32;
+        let expanded = selection.expand_to_batch_boundaries(batch_size, total_rows);
+        assert!(expanded.selects_any());
+
+        // Test edge case with batch_size = 0
+        let expanded = selection.expand_to_batch_boundaries(0, total_rows);
+        assert_eq!(expanded, selection);
+
+        // Test simple case with two adjacent selectors
+        let selection = RowSelection::from(vec![
+            RowSelector::select(10), // 0-10
+            RowSelector::skip(5),    // 10-15
+            RowSelector::select(10), // 15-25
+        ]);
+
+        let expanded = selection.expand_to_batch_boundaries(32, 100);
+        // Both selectors should expand to 0-32
+        assert_eq!(
+            expanded.selectors,
+            vec![RowSelector::select(32), RowSelector::skip(68)]
+        );
+    }
 }
diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs
@@ -620,7 +620,12 @@ where
                 // (pre) Fetch only the columns that are selected by the predicate
                 let selection = plan_builder.selection();
                 row_group
-                    .fetch(&mut self.input, predicate.projection(), selection)
+                    .fetch(
+                        &mut self.input,
+                        predicate.projection(),
+                        selection,
+                        batch_size,
+                    )
                     .await?;
 
                 let mut cache_projection = predicate.projection().clone();
@@ -676,7 +681,12 @@ where
         }
         // fetch the pages needed for decoding
         row_group
-            .fetch(&mut self.input, &projection, plan_builder.selection())
+            .fetch(
+                &mut self.input,
+                &projection,
+                plan_builder.selection(),
+                batch_size,
+            )
             .await?;
 
         let plan = plan_builder.build();
@@ -696,6 +706,7 @@ where
         Ok((self, Some(reader)))
     }
 
+    /// Compute which columns are used in filters and the final (output) projection
     fn compute_cache_projection(&self, projection: &ProjectionMask) -> Option<ProjectionMask> {
         let filters = self.filter.as_ref()?;
         let mut cache_projection = filters.predicates.first()?.projection().clone();
@@ -934,9 +945,11 @@ impl InMemoryRowGroup<'_> {
         input: &mut T,
         projection: &ProjectionMask,
         selection: Option<&RowSelection>,
+        batch_size: usize,
     ) -> Result<()> {
         let metadata = self.metadata.row_group(self.row_group_idx);
         if let Some((selection, offset_index)) = selection.zip(self.offset_index) {
+            let selection = selection.expand_to_batch_boundaries(batch_size, self.row_count);
             // If we have a `RowSelection` and an `OffsetIndex` then only fetch pages required for the
             // `RowSelection`
             let mut page_start_offsets: Vec<Vec<u64>> = vec![];
@@ -1869,6 +1882,7 @@ mod tests {
         assert_eq!(total_rows, 730);
     }
 
+    #[ignore]
     #[tokio::test]
     async fn test_in_memory_row_group_sparse() {
         let testdata = arrow::util::test_util::parquet_test_data();
@@ -2423,4 +2437,53 @@ mod tests {
         let result = reader.try_collect::<Vec<_>>().await.unwrap();
         assert_eq!(result.len(), 1);
     }
+
+    #[tokio::test]
+    async fn test_cached_array_reader_sparse_offset_error() {
+        use futures::TryStreamExt;
+
+        use crate::arrow::arrow_reader::{ArrowPredicateFn, RowFilter, RowSelection, RowSelector};
+        use arrow_array::{BooleanArray, RecordBatch};
+
+        let testdata = arrow::util::test_util::parquet_test_data();
+        let path = format!("{testdata}/alltypes_tiny_pages_plain.parquet");
+        let data = Bytes::from(std::fs::read(path).unwrap());
+
+        let async_reader = TestReader::new(data);
+
+        // Enable page index so the fetch logic loads only required pages
+        let options = ArrowReaderOptions::new().with_page_index(true);
+        let builder = ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options)
+            .await
+            .unwrap();
+
+        // Skip the first 22 rows (entire first Parquet page) and then select the
+        // next 3 rows (22, 23, 24). This means the fetch step will not include
+        // the first page starting at file offset 0.
+        let selection = RowSelection::from(vec![RowSelector::skip(22), RowSelector::select(3)]);
+
+        // Trivial predicate on column 0 that always returns `true`. Using the
+        // same column in both predicate and projection activates the caching
+        // layer (Producer/Consumer pattern).
+        let parquet_schema = builder.parquet_schema();
+        let proj = ProjectionMask::leaves(parquet_schema, vec![0]);
+        let always_true = ArrowPredicateFn::new(proj.clone(), |batch: RecordBatch| {
+            Ok(BooleanArray::from(vec![true; batch.num_rows()]))
+        });
+        let filter = RowFilter::new(vec![Box::new(always_true)]);
+
+        // Build the stream with batch size 8 so the cache reads whole batches
+        // that straddle the requested row range (rows 0-7, 8-15, 16-23, …).
+        let stream = builder
+            .with_batch_size(8)
+            .with_projection(proj)
+            .with_row_selection(selection)
+            .with_row_filter(filter)
+            .build()
+            .unwrap();
+
+        // Collecting the stream should fail with the sparse column chunk offset
+        // error we want to reproduce.
+        let _result: Vec<_> = stream.try_collect().await.unwrap();
+    }
 }