Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ tokio = { version = "1.28", optional = true, features = [
# cli
anyhow = { version = "1.0", optional = true }
clap = { version = "4.5.4", features = ["derive"], optional = true }
serde = { version = "1.0", features = ["derive"], optional = true }
serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true }

# opendal
opendal = { version = "0.53", optional = true, default-features = false }
Expand All @@ -74,13 +76,12 @@ criterion = { version = "0.5", default-features = false, features = ["async_toki
opendal = { version = "0.53", default-features = false, features = ["services-memory"] }
pretty_assertions = "1.3.0"
proptest = "1.0.0"
serde_json = { version = "1.0", default-features = false, features = ["std"] }

[features]
default = ["async"]

async = ["async-trait", "futures", "futures-util", "tokio"]
cli = ["anyhow", "clap"]
cli = ["anyhow", "clap", "serde", "serde_json"]
# Enable opendal support.
opendal = ["dep:opendal"]

Expand All @@ -105,3 +106,23 @@ required-features = ["cli"]
[[bin]]
name = "orc-stats"
required-features = ["cli"]

[[bin]]
name = "orc-read"
required-features = ["cli"]

[[bin]]
name = "orc-schema"
required-features = ["cli"]

[[bin]]
name = "orc-rowcount"
required-features = ["cli"]

[[bin]]
name = "orc-index"
required-features = ["cli"]

[[bin]]
name = "orc-layout"
required-features = ["cli"]
166 changes: 166 additions & 0 deletions src/bin/orc-index.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Inspect row indexes for a specific ORC column.
//!
//! Row indexes carry per-row-group statistics and positions; this tool surfaces
//! them for debugging predicate pushdown and verifying writer-produced indexes.

use std::{fs::File, path::PathBuf};

use anyhow::{anyhow, Context, Result};
use clap::Parser;
use orc_rust::reader::metadata::read_metadata;
use orc_rust::schema::{DataType, RootDataType};
use orc_rust::statistics::{ColumnStatistics, TypeStatistics};
use orc_rust::stripe::Stripe;

#[derive(Debug, Parser)]
#[command(
author,
version,
about = "Print row group index information for an ORC column"
)]
struct Args {
/// Path to the ORC file
file: PathBuf,
/// Column name to inspect (top-level columns only)
column: String,
}

fn find_column<'a>(root: &'a RootDataType, name: &str) -> Option<(usize, &'a DataType, &'a str)> {
root.children()
.iter()
.find(|c| c.name() == name)
.map(|col| (col.data_type().column_index(), col.data_type(), col.name()))
}

fn fmt_stats(stats: &ColumnStatistics) -> String {
let mut parts = vec![format!("values={}", stats.number_of_values())];
if stats.has_null() {
parts.push("has_nulls=true".to_string());
}
if let Some(ts) = stats.type_statistics() {
match ts {
TypeStatistics::Integer { min, max, .. } => {
parts.push(format!("min={min}"));
parts.push(format!("max={max}"));
}
TypeStatistics::Double { min, max, .. } => {
parts.push(format!("min={min}"));
parts.push(format!("max={max}"));
}
TypeStatistics::String { min, max, .. } => {
parts.push(format!("min={min}"));
parts.push(format!("max={max}"));
}
TypeStatistics::Bucket { true_count } => {
parts.push(format!("true_count={true_count}"));
}
TypeStatistics::Decimal { min, max, .. } => {
parts.push(format!("min={min}"));
parts.push(format!("max={max}"));
}
TypeStatistics::Date { min, max } => {
parts.push(format!("min={min}"));
parts.push(format!("max={max}"));
}
TypeStatistics::Binary { sum } => {
parts.push(format!("total_bytes={sum}"));
}
TypeStatistics::Timestamp { min, max, .. } => {
parts.push(format!("min={min}"));
parts.push(format!("max={max}"));
}
TypeStatistics::Collection {
min_children,
max_children,
total_children,
} => {
parts.push(format!("min_children={min_children}"));
parts.push(format!("max_children={max_children}"));
parts.push(format!("total_children={total_children}"));
}
}
}
parts.join(", ")
}

fn main() -> Result<()> {
let args = Args::parse();
let mut file = File::open(&args.file)
.with_context(|| format!("failed to open {:?}", args.file.display()))?;
let metadata = read_metadata(&mut file)?;

let Some((column_index, data_type, name)) =
find_column(metadata.root_data_type(), &args.column)
else {
let available = metadata
.root_data_type()
.children()
.iter()
.map(|c| c.name().to_string())
.collect::<Vec<_>>()
.join(", ");
return Err(anyhow!(
"column '{}' not found. Available columns: {available}",
args.column
));
};

println!(
"File: {} | Column: {} (index {})",
args.file.display(),
name,
column_index
);
println!("Type: {data_type}");
println!("Stripes: {}", metadata.stripe_metadatas().len());

for (stripe_idx, stripe_meta) in metadata.stripe_metadatas().iter().enumerate() {
let stripe = Stripe::new(&mut file, &metadata, metadata.root_data_type(), stripe_meta)?;
let row_index = stripe.read_row_indexes(&metadata)?;

let Some(col_index) = row_index.column(column_index) else {
println!("Stripe {stripe_idx}: no row index for column");
continue;
};

if col_index.num_row_groups() == 0 {
println!("Stripe {stripe_idx}: no row groups recorded");
continue;
}

println!(
"Stripe {stripe_idx}: rows_per_group={} total_rows={}",
col_index.rows_per_group(),
row_index.total_rows()
);
for (row_group_idx, entry) in col_index.entries().enumerate() {
let start = row_group_idx * col_index.rows_per_group();
let end = (start + col_index.rows_per_group()).min(row_index.total_rows());
print!(" Row group {row_group_idx} rows [{start},{end})");
if let Some(stats) = &entry.statistics {
println!(" -> {}", fmt_stats(stats));
} else {
println!(" -> no statistics");
}
}
}

Ok(())
}
Loading