diff --git a/parquet-variant-compute/Cargo.toml b/parquet-variant-compute/Cargo.toml index 828ad77bd61e..cadfa1cbb63a 100644 --- a/parquet-variant-compute/Cargo.toml +++ b/parquet-variant-compute/Cargo.toml @@ -38,6 +38,7 @@ indexmap = "2.10.0" parquet-variant = { workspace = true } parquet-variant-json = { workspace = true } chrono = { workspace = true } +uuid = { version = "1.18.0", features = ["v4"]} [lib] name = "parquet_variant_compute" diff --git a/parquet-variant-compute/src/arrow_to_variant.rs b/parquet-variant-compute/src/arrow_to_variant.rs index d31bf8a212dc..40d253914afd 100644 --- a/parquet-variant-compute/src/arrow_to_variant.rs +++ b/parquet-variant-compute/src/arrow_to_variant.rs @@ -28,7 +28,7 @@ use arrow::datatypes::{ Time64NanosecondType, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, }; -use arrow::temporal_conversions::{as_date, as_datetime, as_time}; +use arrow::temporal_conversions::as_datetime; use arrow_schema::{ArrowError, DataType, TimeUnit}; use chrono::{DateTime, TimeZone, Utc}; use parquet_variant::{ @@ -38,6 +38,98 @@ use parquet_variant::{ use std::collections::HashMap; use std::ops::Range; +// ============================================================================ +// Shared traits and helpers for Arrow-to-Variant conversion +// ============================================================================ + +/// Zero-cost trait for converting Arrow array values to Variant +pub(crate) trait ArrowToVariant: Array { + fn append_to_variant_builder( + &self, + builder: &mut impl VariantBuilderExt, + index: usize, + ) -> Result<(), ArrowError>; +} + +/// Macro to define ArrowToVariant implementations with optional value transformation +macro_rules! define_arrow_to_variant { + ($array_type:ty $(, |$value:ident| $(-> Result<$result_ty:ty>)? $transform:expr)?) => { + impl ArrowToVariant for $array_type { + #[inline] + fn append_to_variant_builder( + &self, + builder: &mut impl VariantBuilderExt, + index: usize, + ) -> Result<(), ArrowError> { + let value = self.value(index); + $( + let $value = value; + let value = $transform; + $( + let value: $result_ty = value?; + )? + )? + builder.append_value(value); + Ok(()) + } + } + }; +} + +// Primitive type implementations using macro +define_arrow_to_variant!(PrimitiveArray); +define_arrow_to_variant!(PrimitiveArray); +define_arrow_to_variant!(PrimitiveArray); +define_arrow_to_variant!(PrimitiveArray); +define_arrow_to_variant!(PrimitiveArray); +define_arrow_to_variant!(PrimitiveArray); +define_arrow_to_variant!(PrimitiveArray); +define_arrow_to_variant!(PrimitiveArray); +define_arrow_to_variant!(PrimitiveArray); +define_arrow_to_variant!(PrimitiveArray); +define_arrow_to_variant!(PrimitiveArray); + +// Simple type implementations using macro +define_arrow_to_variant!(arrow::array::BooleanArray); +define_arrow_to_variant!(arrow::array::StringArray); +define_arrow_to_variant!(arrow::array::BinaryViewArray); + +// Transformation implementations using macro +define_arrow_to_variant!( + PrimitiveArray, + |days| Date32Type::to_naive_date(days) +); + +define_arrow_to_variant!( + PrimitiveArray, + |micros| -> Result<_> { + arrow::temporal_conversions::time64us_to_time(micros).ok_or_else(|| { + ArrowError::InvalidArgumentError(format!("Invalid Time64 microsecond value: {micros}")) + }) + } +); + +// Note: FixedSizeBinaryArray is NOT implemented here because: +// - Unshred semantics: Only FixedSizeBinary(16) → Variant::Uuid is valid, all other sizes are rejected +// - Cast semantics: All FixedSizeBinary sizes should be accepted as Variant::Binary +// These conflicting requirements mean we should not share this implementation + +/// Shared timestamp conversion using Arrow's robust temporal functions +/// Returns Option for compatibility with define_row_builder macro patterns +pub(crate) fn shared_timestamp_to_variant( + value: i64, + has_timezone: bool, +) -> Option> { + as_datetime::(value).map(|naive_datetime| { + if has_timezone { + let utc_dt: DateTime = Utc.from_utc_datetime(&naive_datetime); + Variant::from(utc_dt) + } else { + Variant::from(naive_datetime) + } + }) +} + // ============================================================================ // Row-oriented builders for efficient Arrow-to-Variant conversion // ============================================================================ @@ -447,19 +539,7 @@ define_row_builder!( has_time_zone: bool, }, |array| -> PrimitiveArray { array.as_primitive() }, - |value| -> Option<_> { - // Convert using Arrow's temporal conversion functions - as_datetime::(value).map(|naive_datetime| { - if *has_time_zone { - // Has timezone -> DateTime -> TimestampMicros/TimestampNanos - let utc_dt: DateTime = Utc.from_utc_datetime(&naive_datetime); - Variant::from(utc_dt) // Uses From> for Variant - } else { - // No timezone -> NaiveDateTime -> TimestampNtzMicros/TimestampNtzNanos - Variant::from(naive_datetime) // Uses From for Variant - } - }) - } + |value| -> Option<_> { shared_timestamp_to_variant::(value, *has_time_zone) } ); define_row_builder!( @@ -472,7 +552,7 @@ define_row_builder!( |array| -> PrimitiveArray { array.as_primitive() }, |value| -> Option<_> { let date_value = i64::from(value); - as_date::(date_value) + arrow::temporal_conversions::as_date::(date_value) } ); @@ -486,7 +566,7 @@ define_row_builder!( |array| -> PrimitiveArray { array.as_primitive() }, |value| -> Option<_> { let time_value = i64::from(value); - as_time::(time_value) + arrow::temporal_conversions::as_time::(time_value) } ); diff --git a/parquet-variant-compute/src/lib.rs b/parquet-variant-compute/src/lib.rs index b3876ef6ab3e..5575571589e3 100644 --- a/parquet-variant-compute/src/lib.rs +++ b/parquet-variant-compute/src/lib.rs @@ -41,6 +41,7 @@ mod from_json; mod shred_variant; mod to_json; mod type_conversion; +mod unshred_variant; mod variant_array; mod variant_array_builder; pub mod variant_get; @@ -54,3 +55,4 @@ pub use from_json::json_to_variant; pub use shred_variant::shred_variant; pub use to_json::variant_to_json; pub use type_conversion::CastOptions; +pub use unshred_variant::unshred_variant; diff --git a/parquet-variant-compute/src/unshred_variant.rs b/parquet-variant-compute/src/unshred_variant.rs new file mode 100644 index 000000000000..d0bf832cf5b1 --- /dev/null +++ b/parquet-variant-compute/src/unshred_variant.rs @@ -0,0 +1,472 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Module for unshredding VariantArray by folding typed_value columns back into the value column. + +use crate::arrow_to_variant::{shared_timestamp_to_variant, ArrowToVariant}; +use crate::{BorrowedShreddingState, VariantArray, VariantValueArrayBuilder}; +use arrow::array::{ + Array, AsArray as _, BinaryViewArray, BooleanArray, FixedSizeBinaryArray, PrimitiveArray, + StringArray, StructArray, +}; +use arrow::buffer::NullBuffer; +use arrow::datatypes::{ + ArrowTimestampType, DataType, Date32Type, Float32Type, Float64Type, Int16Type, Int32Type, + Int64Type, Int8Type, Time64MicrosecondType, TimeUnit, TimestampMicrosecondType, + TimestampNanosecondType, +}; +use arrow::error::{ArrowError, Result}; +use indexmap::IndexMap; +use parquet_variant::{ObjectFieldBuilder, Variant, VariantBuilderExt, VariantMetadata}; +use uuid::Uuid; + +/// Removes all (nested) typed_value columns from a VariantArray by converting them back to binary +/// variant and merging the resulting values back into the value column. +/// +/// This function efficiently converts a shredded VariantArray back to an unshredded form where all +/// data resides in the value column. +/// +/// # Arguments +/// * `array` - The VariantArray to unshred +/// +/// # Returns +/// A new VariantArray with all data in the value column and no typed_value column +/// +/// # Errors +/// - If the shredded data contains spec violations (e.g., field name conflicts) +/// - If unsupported data types are encountered in typed_value columns +pub fn unshred_variant(array: &VariantArray) -> Result { + // Check if already unshredded (optimization for common case) + if array.typed_value_field().is_none() && array.value_field().is_some() { + return Ok(array.clone()); + } + + // NOTE: None/None at top-level is technically invalid, but the shredding spec requires us to + // emit `Variant::Null` when a required value is missing. + let nulls = array.nulls(); + let mut row_builder = UnshredVariantRowBuilder::try_new_opt(array.shredding_state().borrow())? + .unwrap_or_else(|| UnshredVariantRowBuilder::null(nulls)); + + let metadata = array.metadata_field(); + let mut value_builder = VariantValueArrayBuilder::new(array.len()); + for i in 0..array.len() { + if array.is_null(i) { + value_builder.append_null(); + } else { + let metadata = VariantMetadata::new(metadata.value(i)); + let mut value_builder = value_builder.builder_ext(&metadata); + row_builder.append_row(&mut value_builder, &metadata, i)?; + } + } + + let value = value_builder.build()?; + Ok(VariantArray::from_parts( + metadata.clone(), + Some(value), + None, + nulls.cloned(), + )) +} + +/// Row builder for converting shredded VariantArray rows back to unshredded form +enum UnshredVariantRowBuilder<'a> { + PrimitiveInt8(UnshredPrimitiveRowBuilder<'a, PrimitiveArray>), + PrimitiveInt16(UnshredPrimitiveRowBuilder<'a, PrimitiveArray>), + PrimitiveInt32(UnshredPrimitiveRowBuilder<'a, PrimitiveArray>), + PrimitiveInt64(UnshredPrimitiveRowBuilder<'a, PrimitiveArray>), + PrimitiveFloat32(UnshredPrimitiveRowBuilder<'a, PrimitiveArray>), + PrimitiveFloat64(UnshredPrimitiveRowBuilder<'a, PrimitiveArray>), + PrimitiveDate32(UnshredPrimitiveRowBuilder<'a, PrimitiveArray>), + PrimitiveTime64(UnshredPrimitiveRowBuilder<'a, PrimitiveArray>), + TimestampMicrosecond(TimestampUnshredRowBuilder<'a, TimestampMicrosecondType>), + TimestampNanosecond(TimestampUnshredRowBuilder<'a, TimestampNanosecondType>), + PrimitiveBoolean(UnshredPrimitiveRowBuilder<'a, BooleanArray>), + PrimitiveString(UnshredPrimitiveRowBuilder<'a, StringArray>), + PrimitiveBinaryView(UnshredPrimitiveRowBuilder<'a, BinaryViewArray>), + PrimitiveUuid(UnshredUuidRowBuilder<'a>), + Struct(StructUnshredVariantBuilder<'a>), + ValueOnly(ValueOnlyUnshredVariantBuilder<'a>), + Null(NullUnshredVariantBuilder<'a>), +} + +impl<'a> UnshredVariantRowBuilder<'a> { + /// Creates an all-null row builder. + fn null(nulls: Option<&'a NullBuffer>) -> Self { + Self::Null(NullUnshredVariantBuilder::new(nulls)) + } + + /// Appends a single row at the given value index to the supplied builder. + fn append_row( + &mut self, + builder: &mut impl VariantBuilderExt, + metadata: &VariantMetadata, + index: usize, + ) -> Result<()> { + match self { + Self::PrimitiveInt8(b) => b.append_row(builder, metadata, index), + Self::PrimitiveInt16(b) => b.append_row(builder, metadata, index), + Self::PrimitiveInt32(b) => b.append_row(builder, metadata, index), + Self::PrimitiveInt64(b) => b.append_row(builder, metadata, index), + Self::PrimitiveFloat32(b) => b.append_row(builder, metadata, index), + Self::PrimitiveFloat64(b) => b.append_row(builder, metadata, index), + Self::PrimitiveDate32(b) => b.append_row(builder, metadata, index), + Self::PrimitiveTime64(b) => b.append_row(builder, metadata, index), + Self::TimestampMicrosecond(b) => b.append_row(builder, metadata, index), + Self::TimestampNanosecond(b) => b.append_row(builder, metadata, index), + Self::PrimitiveBoolean(b) => b.append_row(builder, metadata, index), + Self::PrimitiveString(b) => b.append_row(builder, metadata, index), + Self::PrimitiveBinaryView(b) => b.append_row(builder, metadata, index), + Self::PrimitiveUuid(b) => b.append_row(builder, metadata, index), + Self::Struct(b) => b.append_row(builder, metadata, index), + Self::ValueOnly(b) => b.append_row(builder, metadata, index), + Self::Null(b) => b.append_row(builder, metadata, index), + } + } + + /// Creates a new UnshredVariantRowBuilder from shredding state + /// Returns None for None/None case - caller decides how to handle based on context + fn try_new_opt(shredding_state: BorrowedShreddingState<'a>) -> Result> { + let value = shredding_state.value_field(); + let typed_value = shredding_state.typed_value_field(); + let Some(typed_value) = typed_value else { + // Copy the value across directly, if present. Else caller decides what to do. + return Ok(value.map(|v| Self::ValueOnly(ValueOnlyUnshredVariantBuilder::new(v)))); + }; + + // Has typed_value -> determine type and create appropriate builder + macro_rules! primitive_builder { + ($enum_variant:ident, $cast_fn:ident) => { + Self::$enum_variant(UnshredPrimitiveRowBuilder::new( + value, + typed_value.$cast_fn(), + )) + }; + } + + let builder = match typed_value.data_type() { + DataType::Int8 => primitive_builder!(PrimitiveInt8, as_primitive), + DataType::Int16 => primitive_builder!(PrimitiveInt16, as_primitive), + DataType::Int32 => primitive_builder!(PrimitiveInt32, as_primitive), + DataType::Int64 => primitive_builder!(PrimitiveInt64, as_primitive), + DataType::Float32 => primitive_builder!(PrimitiveFloat32, as_primitive), + DataType::Float64 => primitive_builder!(PrimitiveFloat64, as_primitive), + DataType::Date32 => primitive_builder!(PrimitiveDate32, as_primitive), + DataType::Time64(TimeUnit::Microsecond) => { + primitive_builder!(PrimitiveTime64, as_primitive) + } + DataType::Time64(time_unit) => { + return Err(ArrowError::InvalidArgumentError(format!( + "Time64({time_unit}) is not a valid variant shredding type", + ))); + } + DataType::Timestamp(TimeUnit::Microsecond, timezone) => { + Self::TimestampMicrosecond(TimestampUnshredRowBuilder::new( + value, + typed_value.as_primitive(), + timezone.is_some(), + )) + } + DataType::Timestamp(TimeUnit::Nanosecond, timezone) => { + Self::TimestampNanosecond(TimestampUnshredRowBuilder::new( + value, + typed_value.as_primitive(), + timezone.is_some(), + )) + } + DataType::Timestamp(time_unit, _) => { + return Err(ArrowError::InvalidArgumentError(format!( + "Timestamp({time_unit}) is not a valid variant shredding type", + ))); + } + DataType::Boolean => primitive_builder!(PrimitiveBoolean, as_boolean), + DataType::Utf8 => primitive_builder!(PrimitiveString, as_string), + DataType::BinaryView => primitive_builder!(PrimitiveBinaryView, as_binary_view), + DataType::FixedSizeBinary(16) => Self::PrimitiveUuid(UnshredUuidRowBuilder::new( + value, + typed_value.as_fixed_size_binary(), + )), + DataType::FixedSizeBinary(size) => { + return Err(ArrowError::InvalidArgumentError(format!( + "FixedSizeBinary({size}) is not a valid variant shredding type", + ))); + } + DataType::Struct(_) => Self::Struct(StructUnshredVariantBuilder::try_new( + value, + typed_value.as_struct(), + )?), + _ => { + return Err(ArrowError::NotYetImplemented(format!( + "Unshredding not yet supported for type: {}", + typed_value.data_type() + ))); + } + }; + Ok(Some(builder)) + } +} + +/// Builder for arrays with neither typed_value nor value (all NULL/Variant::Null) +struct NullUnshredVariantBuilder<'a> { + nulls: Option<&'a NullBuffer>, +} + +impl<'a> NullUnshredVariantBuilder<'a> { + fn new(nulls: Option<&'a NullBuffer>) -> Self { + Self { nulls } + } + + fn append_row( + &mut self, + builder: &mut impl VariantBuilderExt, + _metadata: &VariantMetadata, + index: usize, + ) -> Result<()> { + if self.nulls.is_some_and(|nulls| nulls.is_null(index)) { + builder.append_null(); + } else { + builder.append_value(Variant::Null); + } + Ok(()) + } +} + +/// Builder for arrays that only have value column (already unshredded) +struct ValueOnlyUnshredVariantBuilder<'a> { + value: &'a arrow::array::BinaryViewArray, +} + +impl<'a> ValueOnlyUnshredVariantBuilder<'a> { + fn new(value: &'a BinaryViewArray) -> Self { + Self { value } + } + + fn append_row( + &mut self, + builder: &mut impl VariantBuilderExt, + metadata: &VariantMetadata, + index: usize, + ) -> Result<()> { + if self.value.is_null(index) { + builder.append_null(); + } else { + let variant = Variant::new_with_metadata(metadata.clone(), self.value.value(index)); + builder.append_value(variant); + } + Ok(()) + } +} + +/// Macro that handles the common unshredded case and returns early if handled. +/// If not handled (shredded case), validates and returns the extracted value. +macro_rules! handle_unshredded_case { + ($self:expr, $builder:expr, $metadata:expr, $index:expr, $partial_shredding:expr) => {{ + let value = $self.value.as_ref().filter(|v| v.is_valid($index)); + let value = value.map(|v| Variant::new_with_metadata($metadata.clone(), v.value($index))); + + // If typed_value is null, handle unshredded case and return early + if $self.typed_value.is_null($index) { + match value { + Some(value) => $builder.append_value(value), + None => $builder.append_null(), + } + return Ok(()); + } + + // Only partial shredding allows value and typed_value to both be non-NULL + if !$partial_shredding && value.is_some() { + return Err(ArrowError::InvalidArgumentError( + "Invalid shredded variant: both value and typed_value are non-null".to_string(), + )); + } + + // Return the extracted value for the partial shredded case + value + }}; +} + +/// Generic unshred builder that works with any typed array implementing VariantUnshredExt +struct UnshredPrimitiveRowBuilder<'a, T> { + value: Option<&'a BinaryViewArray>, + typed_value: &'a T, +} + +impl<'a, T: ArrowToVariant> UnshredPrimitiveRowBuilder<'a, T> { + fn new(value: Option<&'a BinaryViewArray>, typed_value: &'a T) -> Self { + Self { value, typed_value } + } + + fn append_row( + &mut self, + builder: &mut impl VariantBuilderExt, + metadata: &VariantMetadata, + index: usize, + ) -> Result<()> { + handle_unshredded_case!(self, builder, metadata, index, false); + + // If we get here, typed_value is valid and value is NULL + self.typed_value.append_to_variant_builder(builder, index) + } +} + +/// Specialized unshred builder for FixedSizeBinaryArray with UUID semantics +/// +/// Most array types use shared `ArrowToVariant` trait from arrow_to_variant.rs, but +/// `FixedSizeBinaryArray` does not implement the trait because of conflicting semantics: +/// - Unshred: `FixedSizeBinary(16) => Variant::Uuid` (reject other sizes) +/// - Cast: `FixedSizeBinary(_) => Variant::Binary` +struct UnshredUuidRowBuilder<'a> { + value: Option<&'a BinaryViewArray>, + typed_value: &'a FixedSizeBinaryArray, +} + +impl<'a> UnshredUuidRowBuilder<'a> { + fn new(value: Option<&'a BinaryViewArray>, typed_value: &'a FixedSizeBinaryArray) -> Self { + Self { value, typed_value } + } + + fn append_row( + &mut self, + builder: &mut impl VariantBuilderExt, + metadata: &VariantMetadata, + index: usize, + ) -> Result<()> { + handle_unshredded_case!(self, builder, metadata, index, false); + + // If we get here, typed_value is valid and value is NULL + let bytes = self.typed_value.value(index); + // Unshred semantics: FixedSizeBinaryArray is always 16 bytes (UUID) + // The size constraint is validated during UnshredVariantRowBuilder creation + let uuid = Uuid::from_slice(bytes).unwrap(); + builder.append_value(uuid); + Ok(()) + } +} + +/// Generic builder for timestamp types that handles timezone-aware conversion +struct TimestampUnshredRowBuilder<'a, T: ArrowTimestampType> { + value: Option<&'a BinaryViewArray>, + typed_value: &'a PrimitiveArray, + has_timezone: bool, +} + +impl<'a, T: ArrowTimestampType> TimestampUnshredRowBuilder<'a, T> { + fn new( + value: Option<&'a BinaryViewArray>, + typed_value: &'a PrimitiveArray, + has_timezone: bool, + ) -> Self { + Self { + value, + typed_value, + has_timezone, + } + } + + fn append_row( + &mut self, + builder: &mut impl VariantBuilderExt, + metadata: &VariantMetadata, + index: usize, + ) -> Result<()> { + handle_unshredded_case!(self, builder, metadata, index, false); + + // If we get here, typed_value is valid and value is NULL + let timestamp = self.typed_value.value(index); + let Some(variant) = shared_timestamp_to_variant::(timestamp, self.has_timezone) else { + return Err(ArrowError::InvalidArgumentError(format!( + "Invalid timestamp value: {timestamp}" + ))); + }; + builder.append_value(variant); + Ok(()) + } +} + +/// Builder for unshredding struct/object types with nested fields +struct StructUnshredVariantBuilder<'a> { + value: Option<&'a arrow::array::BinaryViewArray>, + typed_value: &'a arrow::array::StructArray, + field_unshredders: IndexMap<&'a str, Option>>, +} + +impl<'a> StructUnshredVariantBuilder<'a> { + fn try_new(value: Option<&'a BinaryViewArray>, typed_value: &'a StructArray) -> Result { + // Create unshredders for each field in constructor + let mut field_unshredders = IndexMap::new(); + for (field, field_array) in typed_value.fields().iter().zip(typed_value.columns()) { + // Factory returns None for None/None case -- these are missing fields we should skip + let Some(field_array) = field_array.as_struct_opt() else { + return Err(ArrowError::InvalidArgumentError(format!( + "Invalid shredded variant object field: expected Struct, got {}", + field_array.data_type() + ))); + }; + let field_unshredder = UnshredVariantRowBuilder::try_new_opt(field_array.try_into()?)?; + field_unshredders.insert(field.name().as_ref(), field_unshredder); + } + + Ok(Self { + value, + typed_value, + field_unshredders, + }) + } + + fn append_row( + &mut self, + builder: &mut impl VariantBuilderExt, + metadata: &VariantMetadata, + index: usize, + ) -> Result<()> { + let value = handle_unshredded_case!(self, builder, metadata, index, true); + + // If we get here, typed_value is valid and value may or may not be valid + let mut object_builder = builder.try_new_object()?; + + // Process typed fields (skip empty builders that indicate missing fields) + for (field_name, field_unshredder_opt) in &mut self.field_unshredders { + if let Some(field_unshredder) = field_unshredder_opt { + let mut field_builder = ObjectFieldBuilder::new(field_name, &mut object_builder); + field_unshredder.append_row(&mut field_builder, metadata, index)?; + } + } + + // Process any unshredded fields (partial shredding) + if let Some(value) = value { + let Variant::Object(object) = value else { + return Err(ArrowError::InvalidArgumentError( + "Expected object in value field for partially shredded struct".to_string(), + )); + }; + + for (field_name, field_value) in object.iter() { + if self.field_unshredders.contains_key(field_name) { + return Err(ArrowError::InvalidArgumentError(format!( + "Field '{field_name}' appears in both typed_value and value", + ))); + } + object_builder.insert_bytes(field_name, field_value); + } + } + + object_builder.finish(); + Ok(()) + } +} + +// TODO: This code is covered by tests in `parquet/tests/variant_integration.rs`. Does that suffice? +// Or do we also need targeted stand-alone unit tests for full coverage? diff --git a/parquet/tests/variant_integration.rs b/parquet/tests/variant_integration.rs index a933a3faa1d4..2e44bddb4aed 100644 --- a/parquet/tests/variant_integration.rs +++ b/parquet/tests/variant_integration.rs @@ -26,7 +26,7 @@ use arrow::util::test_util::parquet_test_data; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use parquet_variant::{Variant, VariantMetadata}; -use parquet_variant_compute::VariantArray; +use parquet_variant_compute::{unshred_variant, VariantArray}; use serde::Deserialize; use std::path::Path; use std::sync::LazyLock; @@ -65,8 +65,8 @@ macro_rules! variant_test_case { // - cases 40, 42, 87, 127 and 128 are expected to fail always (they include invalid variants) // - the remaining cases are expected to (eventually) pass -variant_test_case!(1, "Unsupported typed_value type: List("); -variant_test_case!(2, "Unsupported typed_value type: List("); +variant_test_case!(1, "Unshredding not yet supported for type: List("); +variant_test_case!(2, "Unshredding not yet supported for type: List("); // case 3 is empty in cases.json 🤷 // ```json // { @@ -74,7 +74,6 @@ variant_test_case!(2, "Unsupported typed_value type: List("); // }, // ``` variant_test_case!(3, "parquet_file must be set"); -// https://github.com/apache/arrow-rs/issues/8329 variant_test_case!(4); variant_test_case!(5); variant_test_case!(6); @@ -96,35 +95,52 @@ variant_test_case!(21); variant_test_case!(22); variant_test_case!(23); // https://github.com/apache/arrow-rs/issues/8332 -variant_test_case!(24, "Unsupported typed_value type: Decimal128(9, 4)"); -variant_test_case!(25, "Unsupported typed_value type: Decimal128(9, 4)"); -variant_test_case!(26, "Unsupported typed_value type: Decimal128(18, 9)"); -variant_test_case!(27, "Unsupported typed_value type: Decimal128(18, 9)"); -variant_test_case!(28, "Unsupported typed_value type: Decimal128(38, 9)"); -variant_test_case!(29, "Unsupported typed_value type: Decimal128(38, 9)"); +variant_test_case!( + 24, + "Unshredding not yet supported for type: Decimal128(9, 4)" +); +variant_test_case!( + 25, + "Unshredding not yet supported for type: Decimal128(9, 4)" +); +variant_test_case!( + 26, + "Unshredding not yet supported for type: Decimal128(18, 9)" +); +variant_test_case!( + 27, + "Unshredding not yet supported for type: Decimal128(18, 9)" +); +variant_test_case!( + 28, + "Unshredding not yet supported for type: Decimal128(38, 9)" +); +variant_test_case!( + 29, + "Unshredding not yet supported for type: Decimal128(38, 9)" +); variant_test_case!(30); variant_test_case!(31); -// https://github.com/apache/arrow-rs/issues/8334 -variant_test_case!(32, "Unsupported typed_value type: Time64(µs)"); +variant_test_case!(32); variant_test_case!(33); variant_test_case!(34); variant_test_case!(35); variant_test_case!(36); variant_test_case!(37); -// https://github.com/apache/arrow-rs/issues/8336 -variant_test_case!(38, "Unsupported typed_value type: Struct("); +variant_test_case!(38); variant_test_case!(39); // Is an error case (should be failing as the expected error message indicates) -variant_test_case!(40, "Unsupported typed_value type: List("); -variant_test_case!(41, "Unsupported typed_value type: List("); +// TODO: Once we support lists: "both value and typed_value are non-null" +variant_test_case!(40, "Unshredding not yet supported for type: List("); +variant_test_case!(41, "Unshredding not yet supported for type: List("); // Is an error case (should be failing as the expected error message indicates) -variant_test_case!(42, "Invalid variant, conflicting value and typed_value"); -// https://github.com/apache/arrow-rs/issues/8336 -variant_test_case!(43, "Unsupported typed_value type: Struct("); -variant_test_case!(44, "Unsupported typed_value type: Struct("); +variant_test_case!(42, "both value and typed_value are non-null"); +// Is an error case (should be failing as the expected error message indicates) +variant_test_case!(43, "Field 'b' appears in both typed_value and value"); +variant_test_case!(44); // https://github.com/apache/arrow-rs/issues/8337 -variant_test_case!(45, "Unsupported typed_value type: List("); -variant_test_case!(46, "Unsupported typed_value type: Struct("); +variant_test_case!(45, "Unshredding not yet supported for type: List("); +variant_test_case!(46); variant_test_case!(47); variant_test_case!(48); variant_test_case!(49); @@ -161,16 +177,15 @@ variant_test_case!(79); variant_test_case!(80); variant_test_case!(81); variant_test_case!(82); -// https://github.com/apache/arrow-rs/issues/8336 -variant_test_case!(83, "Unsupported typed_value type: Struct("); -variant_test_case!(84, "Unsupported typed_value type: Struct("); +variant_test_case!(83); +// Invalid case, implementations can choose to read the shredded value or error out +variant_test_case!(84); // https://github.com/apache/arrow-rs/issues/8337 -variant_test_case!(85, "Unsupported typed_value type: List("); -variant_test_case!(86, "Unsupported typed_value type: List("); +variant_test_case!(85, "Unshredding not yet supported for type: List("); +variant_test_case!(86, "Unshredding not yet supported for type: List("); // Is an error case (should be failing as the expected error message indicates) -// TODO: Once structs are supported, expect "Invalid variant, non-object value with shredded fields" -variant_test_case!(87, "Unsupported typed_value type: Struct("); -variant_test_case!(88, "Unsupported typed_value type: List("); +variant_test_case!(87, "Expected object in value field"); +variant_test_case!(88, "Unshredding not yet supported for type: List("); variant_test_case!(89); variant_test_case!(90); variant_test_case!(91); @@ -207,23 +222,24 @@ variant_test_case!(121); variant_test_case!(122); variant_test_case!(123); variant_test_case!(124); -variant_test_case!(125, "Unsupported typed_value type: Struct"); -variant_test_case!(126, "Unsupported typed_value type: List("); +// Is an error case (should be failing as the expected error message indicates) +variant_test_case!(125, "Field 'b' appears in both typed_value and value"); +variant_test_case!(126, "Unshredding not yet supported for type: List("); // Is an error case (should be failing as the expected error message indicates) variant_test_case!(127, "Illegal shredded value type: UInt32"); // Is an error case (should be failing as the expected error message indicates) -// TODO: Once structs are supported, expect "Invalid variant, non-object value with shredded fields" -variant_test_case!(128, "Unsupported typed_value type: Struct("); +variant_test_case!(128, "Expected object in value field"); variant_test_case!(129); -variant_test_case!(130, "Unsupported typed_value type: Struct("); +variant_test_case!(130); variant_test_case!(131); -variant_test_case!(132, "Unsupported typed_value type: Struct("); -variant_test_case!(133, "Unsupported typed_value type: Struct("); -variant_test_case!(134, "Unsupported typed_value type: Struct("); -variant_test_case!(135); -variant_test_case!(136, "Unsupported typed_value type: List("); +variant_test_case!(132); +variant_test_case!(133); +variant_test_case!(134); +variant_test_case!(135, "Unshredding not yet supported for type: List("); +variant_test_case!(136, "Unshredding not yet supported for type: List("); +// Is an error case (should be failing as the expected error message indicates) variant_test_case!(137, "Illegal shredded value type: FixedSizeBinary(4)"); -variant_test_case!(138, "Unsupported typed_value type: Struct("); +variant_test_case!(138); /// Test case definition structure matching the format from /// `parquet-testing/parquet_shredded/cases.json` @@ -275,12 +291,13 @@ impl VariantTestCase { let variant_data = self.load_variants(); let variant_array = self.load_parquet(); + // `load_parquet` returns shredded variant values, but the test expectations are provided as + // unshredded variant values. Unshred (failing for invalid input) so we can compare them. + let variant_array = unshred_variant(&variant_array).unwrap(); + // if this is an error case, the expected error message should be set if let Some(expected_error) = &self.error_message { - // just accessing the variant_array should trigger the error - for i in 0..variant_array.len() { - let _ = variant_array.value(i); - } + // Unshredding variant array should have already triggered the error panic!("Expected an error '{expected_error}`, but got no error"); }