Skip to content

Commit 92005ba

Browse files
scovichalamb
andauthored
[Variant] Simpler shredding state (apache#8444)
# Which issue does this PR close? - Related to apache#8336 # Rationale for this change While working on apache#8438, I noticed that the enum variants of `ShreddingState` actually made the code (a lot) more complex than if I just referenced the (optional) value and typed_value columns directly. That made me wonder if `ShreddingState` would be better as a simple two-field struct. # What changes are included in this PR? Change `ShreddingState` to a two-field struct and update the few call sites that noticed. While we're at it, improve the docs about how shredding works. # Are these changes tested? Existing tests cover what is mostly an internal change # Are there any user-facing changes? `ShreddingState` is pub and changed from enum to struct. --------- Co-authored-by: Andrew Lamb <[email protected]>
1 parent 6739996 commit 92005ba

File tree

1 file changed

+47
-70
lines changed

1 file changed

+47
-70
lines changed

parquet-variant-compute/src/variant_array.rs

Lines changed: 47 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -629,47 +629,38 @@ impl From<ShreddedVariantFieldArray> for StructArray {
629629
///
630630
/// | value | typed_value | Meaning |
631631
/// |----------|--------------|---------|
632-
/// | null | null | The value is missing; only valid for shredded object fields |
633-
/// | non-null | null | The value is present and may be any type, including `null` |
634-
/// | null | non-null | The value is present and is the shredded type |
635-
/// | non-null | non-null | The value is present and is a partially shredded object |
632+
/// | NULL | NULL | The value is missing; only valid for shredded object fields |
633+
/// | non-NULL | NULL | The value is present and may be any type, including [`Variant::Null`] |
634+
/// | NULL | non-NULL | The value is present and is the shredded type |
635+
/// | non-NULL | non-NULL | The value is present and is a partially shredded object |
636+
///
637+
///
638+
/// Applying the above rules to entire columns, we obtain the following:
639+
///
640+
/// | value | typed_value | Meaning |
641+
/// |--------|-------------|---------|
642+
/// | -- | -- | **Missing**: The value is always missing; only valid for shredded object fields |
643+
/// | exists | -- | **Unshredded**: If present, the value may be any type, including [`Variant::Null`]
644+
/// | -- | exists | **Perfectly shredded**: If present, the value is always the shredded type |
645+
/// | exists | exists | **Imperfectly shredded**: The value might (not) be present and might (not) be the shredded type |
646+
///
647+
/// NOTE: Partial shredding is a row-wise situation that can arise under imperfect shredding (a
648+
/// column-wise situation): When both columns exist (imperfect shredding) and the typed_value column
649+
/// is a struct, then both columns can be non-NULL for the same row if value is a variant object
650+
/// (partial shredding).
636651
///
637652
/// [Parquet Variant Shredding Spec]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding
638653
#[derive(Clone, Debug)]
639-
pub enum ShreddingState {
640-
/// This variant has no typed_value field
641-
Unshredded { value: BinaryViewArray },
642-
/// This variant has a typed_value field and no value field
643-
/// meaning it is the shredded type
644-
Typed { typed_value: ArrayRef },
645-
/// Imperfectly shredded: Shredded values reside in `typed_value` while those that failed to
646-
/// shred reside in `value`. Missing field values are NULL in both columns, while NULL primitive
647-
/// values have NULL `typed_value` and `Variant::Null` in `value`.
648-
///
649-
/// NOTE: A partially shredded struct is a special kind of imperfect shredding, where
650-
/// `typed_value` and `value` are both non-NULL. The `typed_value` is a struct containing the
651-
/// subset of fields for which shredding was attempted (each field will then have its own value
652-
/// and/or typed_value sub-fields that indicate how shredding actually turned out). Meanwhile,
653-
/// the `value` is a variant object containing the subset of fields for which shredding was
654-
/// not even attempted.
655-
PartiallyShredded {
656-
value: BinaryViewArray,
657-
typed_value: ArrayRef,
658-
},
659-
/// All values are null, only metadata is present.
660-
///
661-
/// This state occurs when neither `value` nor `typed_value` fields exist in the schema.
662-
/// Note: By strict spec interpretation, this should only be valid for shredded object fields,
663-
/// not top-level variants. However, we allow it and treat as Variant::Null for pragmatic
664-
/// handling of missing data.
665-
AllNull,
654+
pub struct ShreddingState {
655+
value: Option<BinaryViewArray>,
656+
typed_value: Option<ArrayRef>,
666657
}
667658

668659
impl ShreddingState {
669-
/// try to create a new `ShreddingState` from the given `value` and `typed_value` fields
660+
/// Create a new `ShreddingState` from the given `value` and `typed_value` fields
670661
///
671662
/// Note you can create a `ShreddingState` from a &[`StructArray`] using
672-
/// `ShreddingState::try_from(&struct_array)`, for example:
663+
/// `ShreddingState::from(&struct_array)`, for example:
673664
///
674665
/// ```no_run
675666
/// # use arrow::array::StructArray;
@@ -678,53 +669,27 @@ impl ShreddingState {
678669
/// # unimplemented!()
679670
/// # }
680671
/// let struct_array: StructArray = get_struct_array();
681-
/// let shredding_state = ShreddingState::try_from(&struct_array).unwrap();
672+
/// let shredding_state = ShreddingState::from(&struct_array);
682673
/// ```
683674
pub fn new(value: Option<BinaryViewArray>, typed_value: Option<ArrayRef>) -> Self {
684-
match (value, typed_value) {
685-
(Some(value), Some(typed_value)) => Self::PartiallyShredded { value, typed_value },
686-
(Some(value), None) => Self::Unshredded { value },
687-
(None, Some(typed_value)) => Self::Typed { typed_value },
688-
(None, None) => Self::AllNull,
689-
}
675+
Self { value, typed_value }
690676
}
691677

692678
/// Return a reference to the value field, if present
693679
pub fn value_field(&self) -> Option<&BinaryViewArray> {
694-
match self {
695-
ShreddingState::Unshredded { value, .. } => Some(value),
696-
ShreddingState::Typed { .. } => None,
697-
ShreddingState::PartiallyShredded { value, .. } => Some(value),
698-
ShreddingState::AllNull => None,
699-
}
680+
self.value.as_ref()
700681
}
701682

702683
/// Return a reference to the typed_value field, if present
703684
pub fn typed_value_field(&self) -> Option<&ArrayRef> {
704-
match self {
705-
ShreddingState::Unshredded { .. } => None,
706-
ShreddingState::Typed { typed_value, .. } => Some(typed_value),
707-
ShreddingState::PartiallyShredded { typed_value, .. } => Some(typed_value),
708-
ShreddingState::AllNull => None,
709-
}
685+
self.typed_value.as_ref()
710686
}
711687

712688
/// Slice all the underlying arrays
713689
pub fn slice(&self, offset: usize, length: usize) -> Self {
714-
match self {
715-
ShreddingState::Unshredded { value } => ShreddingState::Unshredded {
716-
value: value.slice(offset, length),
717-
},
718-
ShreddingState::Typed { typed_value } => ShreddingState::Typed {
719-
typed_value: typed_value.slice(offset, length),
720-
},
721-
ShreddingState::PartiallyShredded { value, typed_value } => {
722-
ShreddingState::PartiallyShredded {
723-
value: value.slice(offset, length),
724-
typed_value: typed_value.slice(offset, length),
725-
}
726-
}
727-
ShreddingState::AllNull => ShreddingState::AllNull,
690+
Self {
691+
value: self.value.as_ref().map(|v| v.slice(offset, length)),
692+
typed_value: self.typed_value.as_ref().map(|tv| tv.slice(offset, length)),
728693
}
729694
}
730695
}
@@ -1060,7 +1025,10 @@ mod test {
10601025
// Verify the shredding state is AllNull
10611026
assert!(matches!(
10621027
variant_array.shredding_state(),
1063-
ShreddingState::AllNull
1028+
ShreddingState {
1029+
value: None,
1030+
typed_value: None
1031+
}
10641032
));
10651033

10661034
// Verify that value() returns Variant::Null (compensating for spec violation)
@@ -1120,7 +1088,10 @@ mod test {
11201088
// Verify the shredding state is AllNull
11211089
assert!(matches!(
11221090
ShreddingState::new(None, None),
1123-
ShreddingState::AllNull
1091+
ShreddingState {
1092+
value: None,
1093+
typed_value: None
1094+
}
11241095
));
11251096
}
11261097

@@ -1137,7 +1108,10 @@ mod test {
11371108
// Verify the shredding state is AllNull
11381109
assert!(matches!(
11391110
variant_array.shredding_state(),
1140-
ShreddingState::AllNull
1111+
ShreddingState {
1112+
value: None,
1113+
typed_value: None
1114+
}
11411115
));
11421116

11431117
// Verify all values are null
@@ -1187,7 +1161,10 @@ mod test {
11871161
// This should be Unshredded, not AllNull, because value field exists in schema
11881162
assert!(matches!(
11891163
variant_array.shredding_state(),
1190-
ShreddingState::Unshredded { .. }
1164+
ShreddingState {
1165+
value: Some(_),
1166+
typed_value: None
1167+
}
11911168
));
11921169
}
11931170
}

0 commit comments

Comments
 (0)