Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Rename ShreddingState to conform better to spec
  • Loading branch information
alamb committed Aug 7, 2025
commit f8b2df44c2d77a1047de52766f3684672aeef5cf
4 changes: 2 additions & 2 deletions parquet-variant-compute/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
//!
//! ## Main APIs
//! - [`VariantArray`] : Represents an array of `Variant` values.
//! - [`VariantArrayBuilder`]: For building [`VariantArray`]
//! - [`batch_json_string_to_variant`]: Function to convert a batch of JSON strings to a `VariantArray`.
//! - [`batch_variant_to_json_string`]: Function to convert a `VariantArray` to a batch of JSON strings.
//! - [`cast_to_variant`]: Module to cast other Arrow arrays to `VariantArray`.
Expand All @@ -34,15 +35,14 @@
//! [`VariantPath`]: parquet_variant::VariantPath
//! [Variant issue]: https://github.com/apache/arrow-rs/issues/6736


pub mod cast_to_variant;
mod from_json;
mod to_json;
mod variant_array;
mod variant_array_builder;
pub mod variant_get;

pub use variant_array::VariantArray;
pub use variant_array::{ShreddingState, VariantArray};
pub use variant_array_builder::{VariantArrayBuilder, VariantArrayVariantBuilder};

pub use from_json::batch_json_string_to_variant;
Expand Down
50 changes: 35 additions & 15 deletions parquet-variant-compute/src/variant_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,6 @@ impl VariantArray {
/// int8.
///
/// Currently, only [`BinaryViewArray`] is supported.
///
/// [`BinaryViewArray`]: arrow::array::BinaryViewArray
pub fn try_new(inner: ArrayRef) -> Result<Self, ArrowError> {
let Some(inner) = inner.as_struct_opt() else {
return Err(ArrowError::InvalidArgumentError(
Expand Down Expand Up @@ -171,7 +169,7 @@ impl VariantArray {
ShreddingState::Unshredded { metadata, value } => {
Variant::new(metadata.value(index), value.value(index))
}
ShreddingState::FullyShredded { typed_value, .. } => {
ShreddingState::Typed { typed_value, .. } => {
if typed_value.is_null(index) {
Variant::Null
} else {
Expand Down Expand Up @@ -208,23 +206,45 @@ impl VariantArray {
}
}

/// Variant arrays can be shredded in one of three states, encoded here
/// Represents the shredding state of a [`VariantArray`]
///
/// [`VariantArray`]s can be shredded according to the [Parquet Variant
/// Shredding Spec]. Shredding means that the actual value is stored in a typed
/// `typed_value` field instead of the generic `value` field.
///
/// Both value and typed_value are optional fields used together to encode a
/// single value. Values in the two fields must be interpreted according to the
/// following table (see [Parquet Variant Shredding Spec] for more details):
///
/// | value | typed_value | Meaning |
/// |----------|--------------|---------|
/// | null | null | The value is missing; only valid for shredded object fields |
/// | non-null | null | The value is present and may be any type, including `null` |
/// | null | non-null | The value is present and is the shredded type |
/// | non-null | non-null | The value is present and is a partially shredded object |
///
/// [Parquet Variant Shredding Spec]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding
#[derive(Debug)]
pub enum ShreddingState {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For a future FullyShreddedAllNull variant (neither value nor typed_value present), would we still need to store the metadata even tho it's never actually used? 🤔

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure

I filed a ticket to track adding AllNull:

// TODO: add missing state where there is neither value nor typed_value
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// TODO: add missing state where there is neither value nor typed_value
// TODO: add missing state where there is neither value nor typed_value
// https://github.com/apache/arrow-rs/issues/8088

// Missing { metadata: BinaryViewArray },
/// This variant has no typed_value field
Unshredded {
metadata: BinaryViewArray,
value: BinaryViewArray,
},
/// This variant has a typed_value field and no value field
/// meaning it is fully shredded (aka the value is stored in typed_value)
FullyShredded {
/// meaning it is the shredded type
Typed {
metadata: BinaryViewArray,
typed_value: ArrayRef,
},
/// This variant has both a value field and a typed_value field
/// meaning it is partially shredded: first the typed_value is used, and
/// if that is null, the value field is used.
/// Partially shredded:
/// * value is an object
/// * typed_value is a shredded object.
///
/// Note the spec says "Writers must not produce data where both value and
/// typed_value are non-null, unless the Variant value is an object."
PartiallyShredded {
metadata: BinaryViewArray,
value: BinaryViewArray,
Expand All @@ -246,7 +266,7 @@ impl ShreddingState {
typed_value,
}),
(metadata, Some(value), None) => Ok(Self::Unshredded { metadata, value }),
(metadata, None, Some(typed_value)) => Ok(Self::FullyShredded {
(metadata, None, Some(typed_value)) => Ok(Self::Typed {
metadata,
typed_value,
}),
Expand All @@ -260,7 +280,7 @@ impl ShreddingState {
pub fn metadata_field(&self) -> &BinaryViewArray {
match self {
ShreddingState::Unshredded { metadata, .. } => metadata,
ShreddingState::FullyShredded { metadata, .. } => metadata,
ShreddingState::Typed { metadata, .. } => metadata,
ShreddingState::PartiallyShredded { metadata, .. } => metadata,
}
}
Expand All @@ -269,7 +289,7 @@ impl ShreddingState {
pub fn value_field(&self) -> Option<&BinaryViewArray> {
match self {
ShreddingState::Unshredded { value, .. } => Some(value),
ShreddingState::FullyShredded { .. } => None,
ShreddingState::Typed { .. } => None,
ShreddingState::PartiallyShredded { value, .. } => Some(value),
}
}
Expand All @@ -278,7 +298,7 @@ impl ShreddingState {
pub fn typed_value_field(&self) -> Option<&ArrayRef> {
match self {
ShreddingState::Unshredded { .. } => None,
ShreddingState::FullyShredded { typed_value, .. } => Some(typed_value),
ShreddingState::Typed { typed_value, .. } => Some(typed_value),
ShreddingState::PartiallyShredded { typed_value, .. } => Some(typed_value),
}
}
Expand All @@ -290,10 +310,10 @@ impl ShreddingState {
metadata: metadata.slice(offset, length),
value: value.slice(offset, length),
},
ShreddingState::FullyShredded {
ShreddingState::Typed {
metadata,
typed_value,
} => ShreddingState::FullyShredded {
} => ShreddingState::Typed {
metadata: metadata.slice(offset, length),
typed_value: typed_value.slice(offset, length),
},
Expand Down
2 changes: 1 addition & 1 deletion parquet-variant-compute/src/variant_get/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result<ArrayRef> {
value,
typed_value,
} => output_builder.partially_shredded(variant_array, metadata, value, typed_value),
ShreddingState::FullyShredded {
ShreddingState::Typed {
metadata,
typed_value,
} => output_builder.fully_shredded(variant_array, metadata, typed_value),
Expand Down