Skip to content
Merged
Changes from 1 commit
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
987e33b
Pin to pre-release version of arrow 52.2.0
alamb Jul 16, 2024
2c808fb
Update for deprecated method
alamb Jul 16, 2024
8d8732c
Add a config to force using string view in benchmark (#11514)
XiangpengHao Jul 19, 2024
8e0ca1a
Add String view helper functions (#11517)
XiangpengHao Jul 19, 2024
db65772
Add ArrowBytesViewMap and ArrowBytesViewSet (#11515)
XiangpengHao Jul 19, 2024
efcf5c6
Enable `GroupValueBytesView` for aggregation with StringView types (#…
XiangpengHao Jul 20, 2024
34d42bc
Initial support for regex_replace on `StringViewArray` (#11556)
XiangpengHao Jul 22, 2024
bb780b3
Add support for Utf8View for date/temporal codepaths (#11518)
a10y Jul 22, 2024
2b58fd5
GC `StringViewArray` in `CoalesceBatchesStream` (#11587)
XiangpengHao Jul 25, 2024
2b2b8ab
Merge remote-tracking branch 'apache/main' into string-view2
alamb Jul 26, 2024
ea11a9d
Merge remote-tracking branch 'apache/main' into string-view2
alamb Jul 26, 2024
f13bb82
[Bug] fix bug in return type inference of `utf8_to_int_type` (#11662)
XiangpengHao Jul 26, 2024
fb79638
Merge remote-tracking branch 'apache/main' into string-view2
alamb Jul 26, 2024
281fbed
Fix clippy
alamb Jul 26, 2024
5690712
Increase ByteViewMap block size to 2MB (#11674)
XiangpengHao Jul 27, 2024
322c3d2
Change `--string-view` to only apply to parquet formats (#11663)
XiangpengHao Jul 27, 2024
ab8005d
Implement native support StringView for character length (#11676)
XiangpengHao Jul 27, 2024
561aee8
Merge remote-tracking branch 'apache/main' into string-view2
alamb Jul 29, 2024
2e9c8a0
Remove uneeded patches
alamb Jul 29, 2024
f1f22fa
cargo fmt
alamb Jul 29, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Implement native support StringView for character length (#11676)
* native support for character length

* Update datafusion/functions/src/unicode/character_length.rs

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
XiangpengHao and alamb authored Jul 27, 2024
commit ab8005d8d52499dabdccf41b9bf8313f232c6233
131 changes: 68 additions & 63 deletions datafusion/functions/src/unicode/character_length.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,10 @@

use crate::utils::{make_scalar_function, utf8_to_int_type};
use arrow::array::{
ArrayRef, ArrowPrimitiveType, GenericStringArray, OffsetSizeTrait, PrimitiveArray,
Array, ArrayAccessor, ArrayIter, ArrayRef, ArrowPrimitiveType, AsArray,
OffsetSizeTrait, PrimitiveArray,
};
use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type};
use datafusion_common::cast::as_generic_string_array;
use datafusion_common::exec_err;
use datafusion_common::Result;
use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
use std::any::Any;
Expand Down Expand Up @@ -71,17 +70,7 @@ impl ScalarUDFImpl for CharacterLengthFunc {
}

fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
match args[0].data_type() {
DataType::Utf8 => {
make_scalar_function(character_length::<Int32Type>, vec![])(args)
}
DataType::LargeUtf8 => {
make_scalar_function(character_length::<Int64Type>, vec![])(args)
}
other => {
exec_err!("Unsupported data type {other:?} for function character_length")
}
}
make_scalar_function(character_length, vec![])(args)
}

fn aliases(&self) -> &[String] {
Expand All @@ -92,15 +81,32 @@ impl ScalarUDFImpl for CharacterLengthFunc {
/// Returns number of characters in the string.
/// character_length('josé') = 4
/// The implementation counts Unicode scalar values (Rust `char`s) rather than bytes,
/// so a multi-byte UTF-8 character such as 'é' contributes 1 to the result
fn character_length<T: ArrowPrimitiveType>(args: &[ArrayRef]) -> Result<ArrayRef>
/// Array-level entry point: dispatches on the data type of the first argument and
/// forwards the downcast string array to `character_length_general`.
///
/// `Utf8` and `Utf8View` inputs are computed with `Int32Type`, `LargeUtf8` inputs
/// with `Int64Type`.
///
/// # Panics
/// Panics via `unreachable!()` if the first argument has any other data type.
fn character_length(args: &[ArrayRef]) -> Result<ArrayRef> {
    let input = &args[0];
    match input.data_type() {
        DataType::Utf8 => {
            character_length_general::<Int32Type, _>(input.as_string::<i32>())
        }
        DataType::LargeUtf8 => {
            character_length_general::<Int64Type, _>(input.as_string::<i64>())
        }
        DataType::Utf8View => {
            character_length_general::<Int32Type, _>(input.as_string_view())
        }
        _ => unreachable!(),
    }
}

fn character_length_general<'a, T: ArrowPrimitiveType, V: ArrayAccessor<Item = &'a str>>(
array: V,
) -> Result<ArrayRef>
where
T::Native: OffsetSizeTrait,
{
let string_array: &GenericStringArray<T::Native> =
as_generic_string_array::<T::Native>(&args[0])?;

let result = string_array
.iter()
let iter = ArrayIter::new(array);
let result = iter
.map(|string| {
string.map(|string: &str| {
T::Native::from_usize(string.chars().count())
Expand All @@ -116,55 +122,54 @@ where
mod tests {
use crate::unicode::character_length::CharacterLengthFunc;
use crate::utils::test::test_function;
use arrow::array::{Array, Int32Array};
use arrow::datatypes::DataType::Int32;
use arrow::array::{Array, Int32Array, Int64Array};
use arrow::datatypes::DataType::{Int32, Int64};
use datafusion_common::{Result, ScalarValue};
use datafusion_expr::{ColumnarValue, ScalarUDFImpl};

// Checks one `character_length` expectation against every scalar string variant
// exercised here: `Utf8` (i32 result), `LargeUtf8` (i64 result) and `Utf8View`
// (i32 result).
macro_rules! test_character_length {
    // Internal rule: a single `test_function!` call for one `ScalarValue` variant.
    // The result-type tokens are taken as `tt` so they reach `test_function!`
    // exactly as written at the call site.
    (@case $VARIANT:ident, $INPUT:expr, $EXPECTED:expr, $NATIVE:tt, $DATA_TYPE:tt, $ARRAY:tt) => {
        test_function!(
            CharacterLengthFunc::new(),
            &[ColumnarValue::Scalar(ScalarValue::$VARIANT($INPUT))],
            $EXPECTED,
            $NATIVE,
            $DATA_TYPE,
            $ARRAY
        );
    };
    // Public rule: fan the same input/expectation out over the three variants.
    ($INPUT:expr, $EXPECTED:expr) => {
        test_character_length!(@case Utf8, $INPUT, $EXPECTED, i32, Int32, Int32Array);

        test_character_length!(@case LargeUtf8, $INPUT, $EXPECTED, i64, Int64, Int64Array);

        test_character_length!(@case Utf8View, $INPUT, $EXPECTED, i32, Int32, Int32Array);
    };
}

#[test]
fn test_functions() -> Result<()> {
#[cfg(feature = "unicode_expressions")]
test_function!(
CharacterLengthFunc::new(),
&[ColumnarValue::Scalar(ScalarValue::Utf8(Some(
String::from("chars")
)))],
Ok(Some(5)),
i32,
Int32,
Int32Array
);
#[cfg(feature = "unicode_expressions")]
test_function!(
CharacterLengthFunc::new(),
&[ColumnarValue::Scalar(ScalarValue::Utf8(Some(
String::from("josé")
)))],
Ok(Some(4)),
i32,
Int32,
Int32Array
);
#[cfg(feature = "unicode_expressions")]
test_function!(
CharacterLengthFunc::new(),
&[ColumnarValue::Scalar(ScalarValue::Utf8(Some(
String::from("")
)))],
Ok(Some(0)),
i32,
Int32,
Int32Array
);
#[cfg(feature = "unicode_expressions")]
test_function!(
CharacterLengthFunc::new(),
&[ColumnarValue::Scalar(ScalarValue::Utf8(None))],
Ok(None),
i32,
Int32,
Int32Array
);
{
test_character_length!(Some(String::from("chars")), Ok(Some(5)));
test_character_length!(Some(String::from("josé")), Ok(Some(4)));
// test long strings (more than 12 bytes for StringView)
test_character_length!(Some(String::from("joséjoséjoséjosé")), Ok(Some(16)));
test_character_length!(Some(String::from("")), Ok(Some(0)));
test_character_length!(None, Ok(None));
}

#[cfg(not(feature = "unicode_expressions"))]
test_function!(
CharacterLengthFunc::new(),
Expand Down