diff --git a/Cargo.toml b/Cargo.toml
index e0144faa7a92..7798ea0b239c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -57,6 +57,8 @@ exclude = [
     # significantly changing how it is compiled within the workspace, causing the whole workspace to be compiled from
     # scratch this way, this is a stand-alone package that compiles independently of the others.
     "arrow-pyarrow-integration-testing",
+    # parquet-integration-testing likewise requires different compilation flags
+    "parquet-integration-testing",
     # object_store is excluded because it follows a separate release cycle from the other arrow crates
     "object_store"
 ]
diff --git a/parquet-integration-testing/.gitignore b/parquet-integration-testing/.gitignore
new file mode 100644
index 000000000000..1fcb1529f8e5
--- /dev/null
+++ b/parquet-integration-testing/.gitignore
@@ -0,0 +1 @@
+out
diff --git a/parquet-integration-testing/Cargo.toml b/parquet-integration-testing/Cargo.toml
new file mode 100644
index 000000000000..c85ba8be39bc
--- /dev/null
+++ b/parquet-integration-testing/Cargo.toml
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[package]
+name = "parquet-integration-testing"
+description = "Binaries used for testing parquet-rs compatibility (NOT published to crates.io)"
+publish = false
+edition = "2021"
+
+[dependencies]
+arrow = { path = "../arrow", features = ["prettyprint"] }
+parquet = { path = "../parquet", features = ["arrow"] }
+serde = "1.0.203"
+serde_json = { version = "1.0", default-features = false, features = ["std"] }
+pretty_assertions = "1.4.0"
\ No newline at end of file
diff --git a/parquet-integration-testing/README.md b/parquet-integration-testing/README.md
new file mode 100644
index 000000000000..eca0867c8cc1
--- /dev/null
+++ b/parquet-integration-testing/README.md
@@ -0,0 +1,32 @@
+
+
+# Apache Parquet Rust Integration Testing
+
+The binary in this crate:
+
+1. Reads files from the parquet-testing repo
+2. Creates JSON files with appropriately formatted contents
+3. Compares these JSON files with "known good" golden master files
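+
+For ease of development, the binary also writes the JSON it actually produced
+to the `out` directory. One possible way to refresh the golden files after an
+intentional change (a hypothetical update flow, assuming the binary is run
+from this crate's root as shown below) is to copy the output over the
+expected files:
+
+```shell
+cp out/*.json data/
+```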
+
+## Running
+
+```shell
+cargo run
+```
diff --git a/parquet-integration-testing/data/alltypes_plain.parquet.data.json b/parquet-integration-testing/data/alltypes_plain.parquet.data.json
new file mode 100644
index 000000000000..2c24a72290c8
--- /dev/null
+++ b/parquet-integration-testing/data/alltypes_plain.parquet.data.json
@@ -0,0 +1,285 @@
+{
+  "filename": "alltypes_plain.parquet",
+  "rows": [
+    [
+      {
+        "id": "4"
+      },
+      {
+        "bool_col": "true"
+      },
+      {
+        "tinyint_col": "0"
+      },
+      {
+        "smallint_col": "0"
+      },
+      {
+        "int_col": "0"
+      },
+      {
+        "bigint_col": "0"
+      },
+      {
+        "float_col": "0.0"
+      },
+      {
+        "double_col": "0.0"
+      },
+      {
+        "date_string_col": "30332f30312f3039"
+      },
+      {
+        "string_col": "30"
+      },
+      {
+        "timestamp_col": "2009-03-01T00:00:00"
+      }
+    ],
+    [
+      {
+        "id": "5"
+      },
+      {
+        "bool_col": "false"
+      },
+      {
+        "tinyint_col": "1"
+      },
+      {
+        "smallint_col": "1"
+      },
+      {
+        "int_col": "1"
+      },
+      {
+        "bigint_col": "10"
+      },
+      {
+        "float_col": "1.1"
+      },
+      {
+        "double_col": "10.1"
+      },
+      {
+        "date_string_col": "30332f30312f3039"
+      },
+      {
+        "string_col": "31"
+      },
+      {
+        "timestamp_col": "2009-03-01T00:01:00"
+      }
+    ],
+    [
+      {
+        "id": "6"
+      },
+      {
+        "bool_col": "true"
+      },
+      {
+        "tinyint_col": "0"
+      },
+      {
+        "smallint_col": "0"
+      },
+      {
+        "int_col": "0"
+      },
+      {
+        "bigint_col": "0"
+      },
+      {
+        "float_col": "0.0"
+      },
+      {
+        "double_col": "0.0"
+      },
+      {
+        "date_string_col": "30342f30312f3039"
+      },
+      {
+        "string_col": "30"
+      },
+      {
+        "timestamp_col": "2009-04-01T00:00:00"
+      }
+    ],
+    [
+      {
+        "id": "7"
+      },
+      {
+        "bool_col": "false"
+      },
+      {
+        "tinyint_col": "1"
+      },
+      {
+        "smallint_col": "1"
+      },
+      {
+        "int_col": "1"
+      },
+      {
+        "bigint_col": "10"
+      },
+      {
+        "float_col": "1.1"
+      },
+      {
+        "double_col": "10.1"
+      },
+      {
+        "date_string_col": "30342f30312f3039"
+      },
+      {
+        "string_col": "31"
+      },
+      {
+        "timestamp_col": "2009-04-01T00:01:00"
+      }
+    ],
+    [
+      {
+        "id": "2"
+      },
+      {
+        "bool_col": "true"
+      },
+      {
+        "tinyint_col": "0"
+      },
+      {
+        "smallint_col": "0"
+      },
+      {
+        "int_col": "0"
+      },
+      {
+        "bigint_col": "0"
+      },
+      {
+        "float_col": "0.0"
+      },
+      {
+        "double_col": "0.0"
+      },
+      {
+        "date_string_col": "30322f30312f3039"
+      },
+      {
+        "string_col": "30"
+      },
+      {
+        "timestamp_col": "2009-02-01T00:00:00"
+      }
+    ],
+    [
+      {
+        "id": "3"
+      },
+      {
+        "bool_col": "false"
+      },
+      {
+        "tinyint_col": "1"
+      },
+      {
+        "smallint_col": "1"
+      },
+      {
+        "int_col": "1"
+      },
+      {
+        "bigint_col": "10"
+      },
+      {
+        "float_col": "1.1"
+      },
+      {
+        "double_col": "10.1"
+      },
+      {
+        "date_string_col": "30322f30312f3039"
+      },
+      {
+        "string_col": "31"
+      },
+      {
+        "timestamp_col": "2009-02-01T00:01:00"
+      }
+    ],
+    [
+      {
+        "id": "0"
+      },
+      {
+        "bool_col": "true"
+      },
+      {
+        "tinyint_col": "0"
+      },
+      {
+        "smallint_col": "0"
+      },
+      {
+        "int_col": "0"
+      },
+      {
+        "bigint_col": "0"
+      },
+      {
+        "float_col": "0.0"
+      },
+      {
+        "double_col": "0.0"
+      },
+      {
+        "date_string_col": "30312f30312f3039"
+      },
+      {
+        "string_col": "30"
+      },
+      {
+        "timestamp_col": "2009-01-01T00:00:00"
+      }
+    ],
+    [
+      {
+        "id": "1"
+      },
+      {
+        "bool_col": "false"
+      },
+      {
+        "tinyint_col": "1"
+      },
+      {
+        "smallint_col": "1"
+      },
+      {
+        "int_col": "1"
+      },
+      {
+        "bigint_col": "10"
+      },
+      {
+        "float_col": "1.1"
+      },
+      {
+        "double_col": "10.1"
+      },
+      {
+        "date_string_col": "30312f30312f3039"
+      },
+      {
+        "string_col": "31"
+      },
+      {
+        "timestamp_col": "2009-01-01T00:01:00"
+      }
+    ]
+  ]
+}
\ No newline at end of file
diff --git a/parquet-integration-testing/data/alltypes_plain.parquet.metadata.json b/parquet-integration-testing/data/alltypes_plain.parquet.metadata.json
new file mode 100644
index 000000000000..2c66d3cf2ac3
--- /dev/null
+++ b/parquet-integration-testing/data/alltypes_plain.parquet.metadata.json
@@ -0,0 +1,57 @@
+{
+  "filename": "alltypes_plain.parquet",
+  "row_groups": [
+    {
+      "columns": [
+        {
+          "file_offset": 77,
+          "file_path": null
+        },
+        {
+          "file_offset": 133,
+          "file_path": null
+        },
+        {
+          "file_offset": 215,
+          "file_path": null
+        },
+        {
+          "file_offset": 303,
+          "file_path": null
+        },
+        {
+          "file_offset": 392,
+          "file_path": null
+        },
+        {
+          "file_offset": 484,
+          "file_path": null
+        },
+        {
+          "file_offset": 571,
+          "file_path": null
+        },
+        {
+          "file_offset": 665,
+          "file_path": null
+        },
+        {
+          "file_offset": 793,
+          "file_path": null
+        },
+        {
+          "file_offset": 889,
+          "file_path": null
+        },
+        {
+          "file_offset": 1068,
+          "file_path": null
+        }
+      ],
+      "file_offset": null,
+      "num_rows": 8,
+      "ordinal": null,
+      "total_byte_size": 671
+    }
+  ]
+}
\ No newline at end of file
diff --git a/parquet-integration-testing/src/bin/parquet-integration-testing.rs b/parquet-integration-testing/src/bin/parquet-integration-testing.rs
new file mode 100644
index 000000000000..8317149b1563
--- /dev/null
+++ b/parquet-integration-testing/src/bin/parquet-integration-testing.rs
@@ -0,0 +1,234 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
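+
+//! Test driver for parquet integration testing: reads files from the
+//! `parquet-testing` repository, writes JSON representations of their data
+//! and metadata to `out`, and compares them against the golden files in
+//! `data`.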
+
+use arrow::util::display::array_value_to_string;
+use parquet::arrow::arrow_reader::ArrowReaderBuilder;
+use parquet::file::metadata::ColumnChunkMetaData;
+use pretty_assertions::assert_eq;
+use serde_json::{json, Value};
+use std::fs::File;
+use std::path::{Path, PathBuf};
+
+fn main() {
+    let integration_test = IntegrationTest::new();
+
+    let filenames = vec!["alltypes_plain.parquet"];
+
+    for filename in filenames {
+        integration_test.data_test(filename);
+        integration_test.metadata_test(filename);
+    }
+}
+
+/// Prototype demonstration of checking type support for parquet-rs encodings:
+/// checks read support by reading a file with the specified encoding and
+/// comparing the result to a known good value.
+#[derive(Debug)]
+struct IntegrationTest {
+    parquet_data_path: PathBuf,
+    expected_data_path: PathBuf,
+    output_data_path: PathBuf,
+}
+
+impl IntegrationTest {
+    pub fn new() -> Self {
+        // TODO error handling
+
+        // paths are relative to arrow-rs/parquet-integration-testing
+        let parquet_data_path = PathBuf::from("../parquet-testing/data")
+            .canonicalize()
+            .unwrap();
+        let expected_data_path = PathBuf::from("data").canonicalize().unwrap();
+
+        // the output directory must exist before it can be canonicalized
+        let output_data_path = PathBuf::from("out");
+        std::fs::create_dir_all(&output_data_path).unwrap();
+        let output_data_path = output_data_path.canonicalize().unwrap();
+
+        Self {
+            parquet_data_path,
+            expected_data_path,
+            output_data_path,
+        }
+    }
+
+    /// Reads a parquet file, creates a JSON representation, and compares it to
+    /// the known good value in `data`.
+    ///
+    /// The output JSON looks like this (each row is an array of single-column
+    /// objects, and all values are rendered as strings):
+    ///
+    /// ```text
+    /// {
+    ///   "filename": "filename.parquet",
+    ///   "rows": [
+    ///     [
+    ///       {"column1": "value1"},
+    ///       {"column2": "123"}
+    ///     ],
+    ///     ..
+    ///     [
+    ///       {"column1": "value2"},
+    ///       {"column2": "456"}
+    ///     ]
+    ///   ]
+    /// }
+    /// ```
+    fn data_test(&self, filename: &str) {
+        let parquet_file_path = self.parquet_data_path.join(filename);
+        let expected_file_path = self.expected_data_path.join(format!("{filename}.data.json"));
+
+        // For ease of development, write the actual parsed value to a file (to
+        // permit easy updates, for example)
+        let output_file_path = self.output_data_path.join(format!("{filename}.data.json"));
+
+        println!("Begin data test: {filename}");
+        println!("  Input parquet file: {parquet_file_path:?}");
+        println!("  Expected JSON file: {expected_file_path:?}");
+        println!("  Output JSON file:   {output_file_path:?}");
+
+        let parquet_json = read_parquet_data(&parquet_file_path);
+        let output_file = File::create(&output_file_path).unwrap();
+        serde_json::to_writer_pretty(output_file, &parquet_json).unwrap();
+
+        // read expected file if present, default to {} if not
+        let expected_json = if let Ok(expected_file) = File::open(expected_file_path) {
+            serde_json::from_reader(expected_file).unwrap()
+        } else {
+            json!({})
+        };
+        assert_eq!(parquet_json, expected_json);
+    }
+
+    /// Reads a parquet file, creates a JSON representation of its metadata,
+    /// and compares it to the known good value in `data`.
+    ///
+    /// The output JSON looks like this:
+    ///
+    /// ```text
+    /// {
+    ///   "filename": "filename.parquet",
+    ///   "row_groups": [ .. ]
+    /// }
+    /// ```
+    fn metadata_test(&self, filename: &str) {
+        let parquet_file_path = self.parquet_data_path.join(filename);
+        let expected_file_path = self
+            .expected_data_path
+            .join(format!("{filename}.metadata.json"));
+
+        // For ease of development, write the actual parsed value to a file (to
+        // permit easy updates, for example)
+        let output_file_path = self
+            .output_data_path
+            .join(format!("{filename}.metadata.json"));
+
+        println!("Begin metadata test: {filename}");
+        println!("  Input parquet file: {parquet_file_path:?}");
+        println!("  Expected JSON file: {expected_file_path:?}");
+        println!("  Output JSON file:   {output_file_path:?}");
+
+        let parquet_json = read_parquet_metadata(&parquet_file_path);
+        let output_file = File::create(&output_file_path).unwrap();
+        serde_json::to_writer_pretty(output_file, &parquet_json).unwrap();
+
+        // read expected file if present, default to {} if not
+        let expected_json = if let Ok(expected_file) = File::open(expected_file_path) {
+            serde_json::from_reader(expected_file).unwrap()
+        } else {
+            json!({})
+        };
+        assert_eq!(parquet_json, expected_json);
+    }
+}
+
+/// Reads a parquet file and returns a JSON representation of the data within.
+fn read_parquet_data(parquet_data_path: &Path) -> Value {
+    let file = File::open(parquet_data_path).unwrap();
+    let reader = ArrowReaderBuilder::try_new(file).unwrap().build().unwrap();
+
+    let mut rows = vec![];
+    for batch in reader {
+        let batch = batch.unwrap();
+        let columns = batch.columns();
+        let schema = batch.schema();
+        for i in 0..batch.num_rows() {
+            let mut row = vec![];
+            for (field, column) in schema.fields.iter().zip(columns.iter()) {
+                let name = field.name();
+                let value = array_value_to_string(column.as_ref(), i).unwrap();
+                row.push(json!({name: value}));
+            }
+            rows.push(json!(row));
+        }
+    }
+
+    let filename = parquet_data_path.file_name().unwrap().to_string_lossy();
+
+    json!({
+        "filename": filename,
+        "rows": rows
+    })
+}
+
+/// Reads a parquet file and returns a JSON representation of the
+/// (thrift encoded) metadata within.
+fn read_parquet_metadata(parquet_data_path: &Path) -> Value {
+    let file = File::open(parquet_data_path).unwrap();
+    let metadata = ArrowReaderBuilder::try_new(file).unwrap().metadata().clone();
+
+    // TODO: print out schema
+    let row_groups: Vec<_> = metadata
+        .row_groups()
+        .iter()
+        .map(|rg| {
+            let columns: Vec<_> = rg.columns().iter().map(column_metadata_to_json).collect();
+            json!({
+                "num_rows": rg.num_rows(),
+                "total_byte_size": rg.total_byte_size(),
+                "file_offset": rg.file_offset(),
+                "ordinal": rg.ordinal(),
+                "columns": columns,
+            })
+        })
+        .collect();
+
+    let filename = parquet_data_path.file_name().unwrap().to_string_lossy();
+
+    json!({
+        "filename": filename,
+        "row_groups": row_groups,
+    })
+}
+
+fn column_metadata_to_json(column_metadata: &ColumnChunkMetaData) -> Value {
+    json!({
+        "file_path": column_metadata.file_path(),
+        "file_offset": column_metadata.file_offset(),
+        // TODO: remaining column metadata, e.g.
+        // "num_values": column_metadata.num_values(),
+        // column-type / column-path / descr
+    })
+}
\ No newline at end of file