diff --git a/Cargo.toml b/Cargo.toml index 3c2683a..23d4477 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -63,6 +63,8 @@ tokio = { version = "1.28", optional = true, features = [ # cli anyhow = { version = "1.0", optional = true } clap = { version = "4.5.4", features = ["derive"], optional = true } +serde = { version = "1.0", features = ["derive"], optional = true } +serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } # opendal opendal = { version = "0.53", optional = true, default-features = false } @@ -74,13 +76,12 @@ criterion = { version = "0.5", default-features = false, features = ["async_toki opendal = { version = "0.53", default-features = false, features = ["services-memory"] } pretty_assertions = "1.3.0" proptest = "1.0.0" -serde_json = { version = "1.0", default-features = false, features = ["std"] } [features] default = ["async"] async = ["async-trait", "futures", "futures-util", "tokio"] -cli = ["anyhow", "clap"] +cli = ["anyhow", "clap", "serde", "serde_json"] # Enable opendal support. opendal = ["dep:opendal"] @@ -105,3 +106,23 @@ required-features = ["cli"] [[bin]] name = "orc-stats" required-features = ["cli"] + +[[bin]] +name = "orc-read" +required-features = ["cli"] + +[[bin]] +name = "orc-schema" +required-features = ["cli"] + +[[bin]] +name = "orc-rowcount" +required-features = ["cli"] + +[[bin]] +name = "orc-index" +required-features = ["cli"] + +[[bin]] +name = "orc-layout" +required-features = ["cli"] diff --git a/src/bin/orc-index.rs b/src/bin/orc-index.rs new file mode 100644 index 0000000..3823687 --- /dev/null +++ b/src/bin/orc-index.rs @@ -0,0 +1,166 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Inspect row indexes for a specific ORC column.
+//!
+//! Row indexes carry per-row-group statistics and positions; this tool surfaces
+//! them for debugging predicate pushdown and verifying writer-produced indexes.
+
+use std::{fs::File, path::PathBuf};
+
+use anyhow::{anyhow, Context, Result};
+use clap::Parser;
+use orc_rust::reader::metadata::read_metadata;
+use orc_rust::schema::{DataType, RootDataType};
+use orc_rust::statistics::{ColumnStatistics, TypeStatistics};
+use orc_rust::stripe::Stripe;
+
+#[derive(Debug, Parser)]
+#[command(
+    author,
+    version,
+    about = "Print row group index information for an ORC column"
+)]
+struct Args {
+    /// Path to the ORC file
+    file: PathBuf,
+    /// Column name to inspect (top-level columns only)
+    column: String,
+}
+
+/// Looks up a direct child of `root` whose name equals `name`.
+///
+/// Returns the column index reported by the child's data type, the data type
+/// itself and the column name, or `None` when no top-level column matches.
+fn find_column<'a>(root: &'a RootDataType, name: &str) -> Option<(usize, &'a DataType, &'a str)> {
+    root.children()
+        .iter()
+        .find(|c| c.name() == name)
+        .map(|col| (col.data_type().column_index(), col.data_type(), col.name()))
+}
+
+/// Appends a `min=…` and a `max=…` entry to `parts`.
+fn push_min_max(
+    parts: &mut Vec<String>,
+    min: impl std::fmt::Display,
+    max: impl std::fmt::Display,
+) {
+    parts.push(format!("min={min}"));
+    parts.push(format!("max={max}"));
+}
+
+/// Renders one set of column statistics as a comma-separated `key=value` list.
+///
+/// The value count always comes first; `has_nulls=true` and the type-specific
+/// entries are appended only when the statistics carry them.
+fn fmt_stats(stats: &ColumnStatistics) -> String {
+    let mut parts = vec![format!("values={}", stats.number_of_values())];
+    if stats.has_null() {
+        parts.push("has_nulls=true".to_string());
+    }
+    if let Some(ts) = stats.type_statistics() {
+        match ts {
+            TypeStatistics::Integer { min, max, .. } => push_min_max(&mut parts, min, max),
+            TypeStatistics::Double { min, max, .. } => push_min_max(&mut parts, min, max),
+            TypeStatistics::String { min, max, .. } => push_min_max(&mut parts, min, max),
+            TypeStatistics::Bucket { true_count } => {
+                parts.push(format!("true_count={true_count}"));
+            }
+            TypeStatistics::Decimal { min, max, .. } => push_min_max(&mut parts, min, max),
+            TypeStatistics::Date { min, max } => push_min_max(&mut parts, min, max),
+            TypeStatistics::Binary { sum } => {
+                parts.push(format!("total_bytes={sum}"));
+            }
+            TypeStatistics::Timestamp { min, max, .. } => push_min_max(&mut parts, min, max),
+            TypeStatistics::Collection {
+                min_children,
+                max_children,
+                total_children,
+            } => {
+                parts.push(format!("min_children={min_children}"));
+                parts.push(format!("max_children={max_children}"));
+                parts.push(format!("total_children={total_children}"));
+            }
+        }
+    }
+    parts.join(", ")
+}
+
+/// Prints, for every stripe, the row groups recorded for the requested column
+/// together with their statistics.
+///
+/// Fails when the file cannot be opened or parsed, when the column name is not
+/// a top-level column (the error lists the available names), or when a stripe
+/// or its row indexes cannot be read.
+fn main() -> Result<()> {
+    let args = Args::parse();
+    let mut file = File::open(&args.file)
+        .with_context(|| format!("failed to open {:?}", args.file.display()))?;
+    let metadata = read_metadata(&mut file)?;
+
+    let Some((column_index, data_type, name)) =
+        find_column(metadata.root_data_type(), &args.column)
+    else {
+        let available = metadata
+            .root_data_type()
+            .children()
+            .iter()
+            .map(|c| c.name().to_string())
+            .collect::<Vec<_>>()
+            .join(", ");
+        return Err(anyhow!(
+            "column '{}' not found. Available columns: {available}",
+            args.column
+        ));
+    };
+
+    println!(
+        "File: {} | Column: {} (index {})",
+        args.file.display(),
+        name,
+        column_index
+    );
+    println!("Type: {data_type}");
+    println!("Stripes: {}", metadata.stripe_metadatas().len());
+
+    for (stripe_idx, stripe_meta) in metadata.stripe_metadatas().iter().enumerate() {
+        let stripe = Stripe::new(&mut file, &metadata, metadata.root_data_type(), stripe_meta)?;
+        let row_index = stripe.read_row_indexes(&metadata)?;
+
+        let Some(col_index) = row_index.column(column_index) else {
+            println!("Stripe {stripe_idx}: no row index for column");
+            continue;
+        };
+
+        if col_index.num_row_groups() == 0 {
+            println!("Stripe {stripe_idx}: no row groups recorded");
+            continue;
+        }
+
+        // Both values are constant for the stripe, so read them once.
+        let rows_per_group = col_index.rows_per_group();
+        let total_rows = row_index.total_rows();
+
+        println!("Stripe {stripe_idx}: rows_per_group={rows_per_group} total_rows={total_rows}");
+        for (row_group_idx, entry) in col_index.entries().enumerate() {
+            let start = row_group_idx * rows_per_group;
+            // The last row group may be shorter than the others.
+            let end = (start + rows_per_group).min(total_rows);
+            print!(" Row group {row_group_idx} rows [{start},{end})");
+            if let Some(stats) = &entry.statistics {
+                println!(" -> {}", fmt_stats(stats));
+            } else {
+                println!(" -> no statistics");
+            }
+        }
+    }
+
+    Ok(())
+}
diff --git a/src/bin/orc-layout.rs b/src/bin/orc-layout.rs
new file mode 100644
index 0000000..d67dbc3
--- /dev/null
+++ b/src/bin/orc-layout.rs
@@ -0,0 +1,192 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Emit a JSON description of the physical layout of an ORC file.
+//!
+//! Useful for inspecting stripe offsets, stream kinds/sizes, and column encodings
+//! to debug writer output or validate round trips.
+
+use std::fs::File;
+use std::io::Read;
+use std::path::PathBuf;
+
+use anyhow::{Context, Result};
+use clap::Parser;
+use orc_rust::compression::{Compression, Decompressor};
+use orc_rust::proto::{column_encoding, stream::Kind, StripeFooter};
+use orc_rust::reader::metadata::{read_metadata, FileMetadata};
+use orc_rust::reader::ChunkReader;
+use orc_rust::stripe::StripeMetadata;
+use prost::Message;
+use serde::Serialize;
+
+#[derive(Debug, Parser)]
+#[command(author, version, about = "Print ORC stripe and stream layout as JSON")]
+struct Args {
+    /// Path to the ORC file
+    file: PathBuf,
+}
+
+/// Top-level JSON document describing one ORC file.
+#[derive(Serialize)]
+struct Layout {
+    /// Path of the inspected file as given on the command line
+    file: String,
+    /// File format version reported by the file metadata
+    format_version: String,
+    /// Compression reported by the file metadata; `None` when it reports none
+    compression: Option<String>,
+    /// Total number of rows in the file
+    rows: u64,
+    /// One entry per stripe, in metadata order
+    stripes: Vec<StripeLayout>,
+}
+
+/// Offsets, section lengths, streams and encodings of a single stripe.
+#[derive(Serialize)]
+struct StripeLayout {
+    /// Position of the stripe in the file metadata
+    index: usize,
+    offset: u64,
+    index_length: u64,
+    data_length: u64,
+    footer_length: u64,
+    rows: u64,
+    streams: Vec<StreamLayout>,
+    encodings: Vec<ColumnEncodingLayout>,
+}
+
+/// One stream listed in a stripe footer.
+#[derive(Serialize)]
+struct StreamLayout {
+    column: u32,
+    kind: String,
+    length: u64,
+    /// Offset computed by accumulating stream lengths from the stripe offset
+    offset: u64,
+}
+
+/// Encoding of one column as listed in a stripe footer.
+#[derive(Serialize)]
+struct ColumnEncodingLayout {
+    /// Position of the encoding entry in the footer's column list
+    column: usize,
+    kind: String,
+    dictionary_size: Option<u32>,
+}
+
+/// Reads, decompresses and decodes the footer of `stripe`.
+///
+/// Each step attaches its own context to the error it may return.
+fn read_stripe_footer<R: ChunkReader>(
+    reader: &R,
+    stripe: &StripeMetadata,
+    compression: Option<Compression>,
+) -> Result<StripeFooter> {
+    let footer_bytes = reader
+        .get_bytes(stripe.footer_offset(), stripe.footer_length())
+        .context("reading stripe footer")?;
+    let mut buffer = Vec::new();
+    Decompressor::new(footer_bytes, compression, vec![])
+        .read_to_end(&mut buffer)
+        .context("decompressing stripe footer")?;
+    StripeFooter::decode(buffer.as_slice()).context("decoding stripe footer")
+}
+
+/// Maps a stream kind to the upper-case name used in the JSON output.
+fn kind_to_str(kind: Kind) -> &'static str {
+    match kind {
+        Kind::Present => "PRESENT",
+        Kind::Data => "DATA",
+        Kind::Length => "LENGTH",
+        Kind::DictionaryData => "DICTIONARY_DATA",
+        Kind::Secondary => "SECONDARY",
+        Kind::RowIndex => "ROW_INDEX",
+        Kind::BloomFilter => "BLOOM_FILTER",
+        Kind::BloomFilterUtf8 => "BLOOM_FILTER_UTF8",
+        Kind::DictionaryCount => "DICTIONARY_COUNT",
+        Kind::EncryptedIndex => "ENCRYPTED_INDEX",
+        Kind::EncryptedData => "ENCRYPTED_DATA",
+        Kind::StripeStatistics => "STRIPE_STATISTICS",
+        Kind::FileStatistics => "FILE_STATISTICS",
+    }
+}
+
+/// Maps a column encoding kind to the upper-case name used in the JSON output.
+fn encoding_to_str(kind: column_encoding::Kind) -> &'static str {
+    match kind {
+        column_encoding::Kind::Direct => "DIRECT",
+        column_encoding::Kind::Dictionary => "DICTIONARY",
+        column_encoding::Kind::DirectV2 => "DIRECT_V2",
+        column_encoding::Kind::DictionaryV2 => "DICTIONARY_V2",
+    }
+}
+
+/// Builds the layout description of one stripe from its metadata and footer.
+fn build_stripe_layout<R: ChunkReader>(
+    reader: &R,
+    metadata: &FileMetadata,
+    stripe_idx: usize,
+    stripe: &StripeMetadata,
+) -> Result<StripeLayout> {
+    let footer = read_stripe_footer(reader, stripe, metadata.compression())?;
+
+    // Stream offsets are not stored in the footer; they are derived here by
+    // treating the streams as laid out back to back, in footer order, starting
+    // at the stripe offset.
+    let mut offset = stripe.offset();
+    let streams = footer
+        .streams
+        .iter()
+        .map(|s| {
+            let stream = StreamLayout {
+                column: s.column(),
+                kind: kind_to_str(s.kind()).to_string(),
+                length: s.length(),
+                offset,
+            };
+            offset += s.length();
+            stream
+        })
+        .collect();
+
+    let encodings = footer
+        .columns
+        .iter()
+        .enumerate()
+        .map(|(idx, enc)| ColumnEncodingLayout {
+            column: idx,
+            kind: encoding_to_str(enc.kind()).to_string(),
+            dictionary_size: enc.dictionary_size,
+        })
+        .collect();
+
+    Ok(StripeLayout {
+        index: stripe_idx,
+        offset: stripe.offset(),
+        index_length: stripe.index_length(),
+        data_length: stripe.data_length(),
+        footer_length: stripe.footer_length(),
+        rows: stripe.number_of_rows(),
+        streams,
+        encodings,
+    })
+}
+
+/// Reads the file metadata and every stripe footer, then prints the layout as
+/// pretty-printed JSON on stdout.
+fn main() -> Result<()> {
+    let args = Args::parse();
+    let mut file = File::open(&args.file)
+        .with_context(|| format!("failed to open {:?}", args.file.display()))?;
+    let metadata = read_metadata(&mut file)?;
+
+    let stripes = metadata
+        .stripe_metadatas()
+        .iter()
+        .enumerate()
+        .map(|(idx, stripe)| build_stripe_layout(&file, &metadata, idx, stripe))
+        .collect::<Result<Vec<_>>>()?;
+
+    let layout = Layout {
+        file: args.file.display().to_string(),
+        format_version: metadata.file_format_version().to_string(),
+        compression: metadata.compression().map(|c| c.to_string()),
+        rows: metadata.number_of_rows(),
+        stripes,
+    };
+
+    serde_json::to_writer_pretty(std::io::stdout(), &layout).context("writing layout")?;
+    // The serializer does not emit a final newline; add one so the output ends
+    // cleanly in a terminal and when piped into line-oriented tools.
+    println!();
+    Ok(())
+}
diff --git a/src/bin/orc-read.rs b/src/bin/orc-read.rs
new file mode 100644
index 0000000..890aed5
--- /dev/null
+++ b/src/bin/orc-read.rs
@@ -0,0 +1,136 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Stream an ORC file to stdout as CSV or JSON lines.
+//!
+//! This is a thin wrapper around `ArrowReaderBuilder` so that CLI behavior mirrors
+//! 
library reads (projection/predicate defaults, batch sizing, etc).
+
+use std::fs::File;
+use std::io::{self, Read};
+
+use anyhow::{Context, Result};
+use arrow::{csv, error::ArrowError, json, record_batch::RecordBatch};
+use bytes::Bytes;
+use clap::Parser;
+use orc_rust::reader::ChunkReader;
+use orc_rust::ArrowReaderBuilder;
+
+#[derive(Debug, Parser)]
+#[command(author, version, about = "Read ORC data and print to stdout")]
+struct Args {
+    /// Path to an ORC file or "-" to read from stdin
+    file: String,
+    /// Number of records to read (0 = all)
+    #[arg(short, long, default_value_t = 0)]
+    num_records: usize,
+    /// Output as JSON lines instead of CSV
+    #[arg(short, long)]
+    json: bool,
+    /// Batch size to use when reading
+    #[arg(long, default_value_t = 8192)]
+    batch_size: usize,
+}
+
+/// Output sink that writes record batches either as CSV or as JSON.
+#[allow(clippy::large_enum_variant)]
+enum OutputWriter<W: io::Write, F: json::writer::JsonFormat> {
+    Csv(csv::Writer<W>),
+    Json(json::Writer<W, F>),
+}
+
+impl<W, F> OutputWriter<W, F>
+where
+    W: io::Write,
+    F: json::writer::JsonFormat,
+{
+    /// Writes one record batch with the underlying writer.
+    fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> {
+        match self {
+            OutputWriter::Csv(w) => w.write(batch),
+            OutputWriter::Json(w) => w.write(batch),
+        }
+    }
+
+    /// Finishes the output; only the JSON writer has a finishing step here.
+    fn finish(&mut self) -> Result<(), ArrowError> {
+        match self {
+            OutputWriter::Csv(_) => Ok(()),
+            OutputWriter::Json(w) => w.finish(),
+        }
+    }
+}
+
+/// Reads record batches from `source` and writes them to `writer`.
+///
+/// At most `args.num_records` rows are written (`0` means no limit). Reading
+/// stops as soon as the limit is reached, so no batch beyond the one that
+/// satisfies the limit is decoded.
+fn run_reader<R: ChunkReader, W: io::Write, F: json::writer::JsonFormat>(
+    source: R,
+    args: &Args,
+    mut writer: OutputWriter<W, F>,
+) -> Result<()> {
+    let reader = ArrowReaderBuilder::try_new(source)?
+        .with_batch_size(args.batch_size)
+        .build();
+
+    // Non-zero on entry in both cases, so the loop only needs to test it after
+    // a batch has been written.
+    let mut remaining = if args.num_records == 0 {
+        usize::MAX
+    } else {
+        args.num_records
+    };
+
+    for batch in reader {
+        let mut batch = batch?;
+        if remaining < batch.num_rows() {
+            batch = batch.slice(0, remaining);
+        }
+        writer.write(&batch)?;
+
+        remaining = remaining.saturating_sub(batch.num_rows());
+        if remaining == 0 {
+            // Leave before the iterator is asked for (and decodes) another batch.
+            break;
+        }
+    }
+
+    writer.finish().context("closing writer")?;
+    Ok(())
+}
+
+/// Parses the arguments, builds the requested writer over locked stdout and
+/// streams the input, which is either a file or all of stdin when the path
+/// is "-".
+fn main() -> Result<()> {
+    let args = Args::parse();
+
+    let stdout = io::stdout();
+    let handle = stdout.lock();
+
+    // The writer does not depend on where the input comes from, so build it once.
+    let writer: OutputWriter<_, json::writer::LineDelimited> = if args.json {
+        OutputWriter::Json(
+            json::WriterBuilder::new().build::<_, json::writer::LineDelimited>(handle),
+        )
+    } else {
+        OutputWriter::Csv(csv::WriterBuilder::new().with_header(true).build(handle))
+    };
+
+    if args.file == "-" {
+        let mut buf = Vec::new();
+        io::stdin().read_to_end(&mut buf).context("reading stdin")?;
+        run_reader(Bytes::from(buf), &args, writer)
+    } else {
+        let file = File::open(&args.file).with_context(|| format!("opening {}", args.file))?;
+        run_reader(file, &args, writer)
+    }
+}
diff --git a/src/bin/orc-rowcount.rs b/src/bin/orc-rowcount.rs
new file mode 100644
index 0000000..bfcea82
--- /dev/null
+++ b/src/bin/orc-rowcount.rs
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Return the number of rows in one or more ORC files.
+//!
+//! Uses metadata only (no row decoding), so it is fast even on large files.
+
+use std::{fs::File, path::PathBuf};
+
+use anyhow::{Context, Result};
+use clap::Parser;
+use orc_rust::reader::metadata::read_metadata;
+
+#[derive(Debug, Parser)]
+#[command(author, version, about = "Return the number of rows in ORC files")]
+struct Args {
+    /// List of ORC files to inspect
+    #[arg(required = true)]
+    files: Vec<PathBuf>,
+}
+
+/// Prints `<path>: <row count>` for every file, in argument order.
+///
+/// Stops at the first file that cannot be opened or whose metadata cannot be
+/// read; lines already printed for earlier files stay on stdout.
+fn main() -> Result<()> {
+    let args = Args::parse();
+
+    for path in &args.files {
+        let mut file =
+            File::open(path).with_context(|| format!("failed to open {:?}", path.display()))?;
+        let metadata = read_metadata(&mut file)?;
+        println!("{}: {}", path.display(), metadata.number_of_rows());
+    }
+
+    Ok(())
+}
diff --git a/src/bin/orc-schema.rs b/src/bin/orc-schema.rs
new file mode 100644
index 0000000..05e6f85
--- /dev/null
+++ b/src/bin/orc-schema.rs
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Print the schema and metadata of an ORC file.
+
+use std::{fs::File, path::PathBuf};
+
+use anyhow::{Context, Result};
+use clap::Parser;
+use orc_rust::reader::metadata::read_metadata;
+
+#[derive(Debug, Parser)]
+#[command(
+    author,
+    version,
+    about = "Print the schema and metadata of an ORC file"
+)]
+struct Args {
+    /// Path to the ORC file
+    file: PathBuf,
+    /// Include stripe offsets and row counts
+    #[arg(short, long)]
+    verbose: bool,
+}
+
+/// Prints the file-level metadata and the schema; with `--verbose` also prints
+/// offset, section lengths and row count of every stripe.
+fn main() -> Result<()> {
+    let args = Args::parse();
+    let path = &args.file;
+    let mut file =
+        File::open(path).with_context(|| format!("failed to open {:?}", path.display()))?;
+    let metadata = read_metadata(&mut file)?;
+    let stripes = metadata.stripe_metadatas();
+
+    // Optional values are rendered as the literal text "None" when absent.
+    let compression = match metadata.compression() {
+        Some(c) => c.to_string(),
+        None => "None".to_string(),
+    };
+    let stride = match metadata.row_index_stride() {
+        Some(stride) => stride.to_string(),
+        None => "None".to_string(),
+    };
+
+    println!("File: {}", path.display());
+    println!("Format version: {}", metadata.file_format_version());
+    println!("Compression: {}", compression);
+    println!("Row index stride: {stride}");
+    println!("Rows: {}", metadata.number_of_rows());
+    println!("Stripes: {}", stripes.len());
+    println!();
+    println!("Schema:\n{}", metadata.root_data_type());
+
+    if !args.verbose {
+        return Ok(());
+    }
+
+    println!("\nStripe layout:");
+    for (idx, stripe) in stripes.iter().enumerate() {
+        println!("Stripe {idx}:");
+        println!(" offset: {}", stripe.offset());
+        println!(" index length: {}", stripe.index_length());
+        println!(" data length: {}", stripe.data_length());
+        println!(" footer length: {}", stripe.footer_length());
+        println!(" rows: {}", stripe.number_of_rows());
+    }
+
+    Ok(())
+}
diff --git a/src/lib.rs b/src/lib.rs
index f7477a3..84110e8 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -60,7 +60,7 @@ mod memory;
 pub mod predicate;
 pub mod projection;
 #[allow(dead_code)]
-mod proto;
+pub mod proto;
 pub mod reader;
 pub mod row_group_filter;
 pub mod row_index;
diff --git a/tests/bin/main.rs b/tests/bin/main.rs
new file mode 100644
index 0000000..3c759de
--- /dev/null
+++ b/tests/bin/main.rs
@@ -0,0 +1,111 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Smoke tests for CLI binaries.
+
+#![cfg(feature = "cli")]
+
+use std::fs::File;
+use std::path::PathBuf;
+use std::process::Command;
+
+use orc_rust::reader::metadata::read_metadata;
+use serde_json::Value;
+
+/// Resolves `name` inside `tests/integration/data` of this crate.
+fn data_path(name: &str) -> PathBuf {
+    PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+        .join("tests")
+        .join("integration")
+        .join("data")
+        .join(name)
+}
+
+/// Runs the binary at path `bin_env` with `args`.
+///
+/// Returns whether it exited successfully together with its stdout (decoded
+/// lossily). On a failing exit status the child's stderr is echoed so the
+/// reason appears in the output of the failing test.
+fn run_cmd(bin_env: &str, args: &[&str]) -> (bool, String) {
+    let output = Command::new(bin_env)
+        .args(args)
+        .output()
+        .unwrap_or_else(|err| panic!("failed to run {bin_env}: {err}"));
+    let ok = output.status.success();
+    if !ok {
+        eprintln!(
+            "{bin_env} stderr:\n{}",
+            String::from_utf8_lossy(&output.stderr)
+        );
+    }
+    (ok, String::from_utf8_lossy(&output.stdout).into_owned())
+}
+
+#[test]
+fn orc_rowcount_matches_metadata() {
+    let file = data_path("TestOrcFile.test1.orc");
+    let mut fh = File::open(&file).unwrap();
+    let expected = read_metadata(&mut fh).unwrap().number_of_rows().to_string();
+
+    let (ok, stdout) = run_cmd(
+        env!("CARGO_BIN_EXE_orc-rowcount"),
+        &[file.to_str().unwrap()],
+    );
+    assert!(ok, "orc-rowcount failed");
+    // The tool prints "<path>: <rows>"; anchor on the suffix so that digits
+    // occurring in the path cannot satisfy the check.
+    assert!(
+        stdout.trim_end().ends_with(&format!(": {expected}")),
+        "expected rowcount {expected}, got {stdout}"
+    );
+}
+
+#[test]
+fn orc_schema_prints_schema() {
+    let file = data_path("TestOrcFile.test1.orc");
+    let (ok, stdout) = run_cmd(env!("CARGO_BIN_EXE_orc-schema"), &[file.to_str().unwrap()]);
+    assert!(ok, "orc-schema failed");
+    assert!(
+        stdout.contains("Schema:"),
+        "schema output missing Schema header"
+    );
+}
+
+#[test]
+fn orc_read_limits_records() {
+    let file = data_path("TestOrcFile.test1.orc");
+    let (ok, stdout) = run_cmd(
+        env!("CARGO_BIN_EXE_orc-read"),
+        &["--json", "--num-records", "2", file.to_str().unwrap()],
+    );
+    assert!(ok, "orc-read failed");
+    let lines: Vec<_> = stdout.lines().collect();
+    assert_eq!(2, lines.len(), "expected exactly 2 JSON lines");
+    for line in lines {
+        serde_json::from_str::<Value>(line).expect("valid JSON line");
+    }
+}
+
+#[test]
+fn orc_layout_json_matches_stripe_count() {
+    let file = data_path("TestOrcFile.test1.orc");
+    let mut fh = File::open(&file).unwrap();
+    let metadata = read_metadata(&mut fh).unwrap();
+
+    let (ok, stdout) = run_cmd(env!("CARGO_BIN_EXE_orc-layout"), &[file.to_str().unwrap()]);
+    assert!(ok, "orc-layout failed");
+    let v: Value = serde_json::from_str(&stdout).unwrap();
+    let stripes = v["stripes"].as_array().expect("stripes is array").len();
+    assert_eq!(metadata.stripe_metadatas().len(), stripes);
+}
+
+#[test]
+fn orc_index_completes() {
+    let file = data_path("TestOrcFile.testPredicatePushdown.orc");
+    let (ok, stdout) = run_cmd(
+        env!("CARGO_BIN_EXE_orc-index"),
+        &[file.to_str().unwrap(), "int1"],
+    );
+    assert!(ok, "orc-index failed");
+    assert!(
+        stdout.contains("Stripe"),
+        "expected stripe output from orc-index"
+    );
+}