Skip to content
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
try extend.
  • Loading branch information
Rachelint committed May 22, 2025
commit cf053cb1bdaa85e3caa528e4b33f5bc0a7cbc1f0
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ pub struct GroupValuesPrimitive<T: ArrowPrimitiveType> {
values: Vec<T::Native>,
/// The random state used to generate hashes
random_state: RandomState,

append_row_indices: Vec<u32>,
}

impl<T: ArrowPrimitiveType> GroupValuesPrimitive<T> {
Expand All @@ -109,6 +111,7 @@ impl<T: ArrowPrimitiveType> GroupValuesPrimitive<T> {
values: Vec::with_capacity(128),
null_group: None,
random_state: Default::default(),
append_row_indices: Vec::new(),
}
}
}
Expand All @@ -119,13 +122,18 @@ where
{
fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec<usize>) -> Result<()> {
assert_eq!(cols.len(), 1);
let col = cols[0].as_primitive::<T>();

groups.clear();
self.append_row_indices.clear();

for v in cols[0].as_primitive::<T>() {
let mut num_total_groups = self.values.len();
for (row_index, v) in col.iter().enumerate() {
let group_id = match v {
None => *self.null_group.get_or_insert_with(|| {
let group_id = self.values.len();
self.values.push(Default::default());
let group_id = num_total_groups;
self.append_row_indices.push(row_index as u32);
num_total_groups += 1;
group_id
}),
Some(key) => {
Expand All @@ -140,16 +148,28 @@ where
match insert {
hashbrown::hash_table::Entry::Occupied(o) => o.get().0,
hashbrown::hash_table::Entry::Vacant(v) => {
let g = self.values.len();
let g = num_total_groups;
v.insert((g, key));
self.values.push(key);
self.append_row_indices.push(row_index as u32);
num_total_groups += 1;
g
}
}
}
};
groups.push(group_id)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could use `extend` on `groups` instead of `push`.

}

// If all are new groups, we just extend it
if self.append_row_indices.len() == col.len() {
self.values.extend_from_slice(col.values());
} else {
let col_values = col.values();
for &row_index in self.append_row_indices.iter() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can be written as
self.values.extend(self.append_row_indices.iter().map(...))

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually... I found no obvious improvement when switching to extend...
The bottleneck is still the hashtable. I think it is better to just keep the original push logic because it may be simpler, and it is actually efficient enough.

self.values.push(col_values[row_index as usize]);
}
}

Ok(())
}

Expand Down