// datafusion_functions/unicode/character_length.rs
use crate::utils::{make_scalar_function, utf8_to_int_type};
use arrow::array::{
Array, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait, PrimitiveBuilder,
StringArrayType,
};
use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type};
use datafusion_common::Result;
use datafusion_expr::{
ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
};
use datafusion_macros::user_doc;
use std::any::Any;
use std::sync::Arc;
/// Scalar function `character_length`: returns the number of characters in a
/// string argument.
///
/// The `#[user_doc]` attribute below carries the user-facing SQL documentation
/// (description, syntax, example and related functions).
#[user_doc(
doc_section(label = "String Functions"),
description = "Returns the number of characters in a string.",
syntax_example = "character_length(str)",
sql_example = r#"```sql
> select character_length('Ångström');
+------------------------------------+
| character_length(Utf8("Ångström")) |
+------------------------------------+
| 8 |
+------------------------------------+
```"#,
standard_argument(name = "str", prefix = "String"),
related_udf(name = "bit_length"),
related_udf(name = "octet_length")
)]
#[derive(Debug)]
pub struct CharacterLengthFunc {
/// Argument signature handed out by `ScalarUDFImpl::signature`.
signature: Signature,
/// Alternative names handed out by `ScalarUDFImpl::aliases`.
aliases: Vec<String>,
}
impl Default for CharacterLengthFunc {
fn default() -> Self {
Self::new()
}
}
impl CharacterLengthFunc {
pub fn new() -> Self {
use DataType::*;
Self {
signature: Signature::uniform(
1,
vec![Utf8, LargeUtf8, Utf8View],
Volatility::Immutable,
),
aliases: vec![String::from("length"), String::from("char_length")],
}
}
}
impl ScalarUDFImpl for CharacterLengthFunc {
    /// Exposes this value as `&dyn Any`.
    fn as_any(&self) -> &dyn Any {
        self
    }

    /// Primary name of the function.
    fn name(&self) -> &str {
        "character_length"
    }

    /// Signature built in [`CharacterLengthFunc::new`].
    fn signature(&self) -> &Signature {
        let Self { signature, .. } = self;
        signature
    }

    /// Derives the result type from the type of the first argument by
    /// delegating to `utf8_to_int_type`.
    ///
    /// Indexes `arg_types[0]` directly, so an empty slice panics.
    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
        let input_type = &arg_types[0];
        utf8_to_int_type(input_type, "character_length")
    }

    /// Evaluates the function by wrapping the array kernel `character_length`
    /// with `make_scalar_function` (no hints) and applying it to `args`.
    fn invoke_batch(
        &self,
        args: &[ColumnarValue],
        _number_rows: usize,
    ) -> Result<ColumnarValue> {
        let kernel = make_scalar_function(character_length, vec![]);
        kernel(args)
    }

    /// Alternative names under which the function is available.
    fn aliases(&self) -> &[String] {
        self.aliases.as_slice()
    }

    /// User-facing documentation; `doc()` is presumably generated by the
    /// `#[user_doc]` attribute on the struct.
    fn documentation(&self) -> Option<&Documentation> {
        self.doc()
    }
}
/// Array kernel for `character_length`.
///
/// Looks at the data type of the first argument and forwards the matching
/// string array to [`character_length_general`]: `Utf8` and `Utf8View` produce
/// `Int32` lengths, `LargeUtf8` produces `Int64` lengths.
///
/// # Panics
///
/// Panics if `args` is empty or if the first argument is not one of the three
/// string types above.
fn character_length(args: &[ArrayRef]) -> Result<ArrayRef> {
    let input = &args[0];
    match input.data_type() {
        DataType::Utf8 => {
            character_length_general::<Int32Type, _>(input.as_string::<i32>())
        }
        DataType::LargeUtf8 => {
            character_length_general::<Int64Type, _>(input.as_string::<i64>())
        }
        DataType::Utf8View => {
            character_length_general::<Int32Type, _>(input.as_string_view())
        }
        _ => unreachable!("CharacterLengthFunc"),
    }
}
/// Computes the character count of every string in `array`.
///
/// `T` selects the primitive type of the output array and `V` is the string
/// array representation being read. Null inputs yield null outputs.
///
/// The work is split into four specialised loops, chosen once up front:
/// * when the whole array is ASCII, the byte length of each value is used
///   directly (for ASCII text it equals the character count);
/// * when the array has no nulls, the per-row null check is skipped.
fn character_length_general<'a, T, V>(array: V) -> Result<ArrayRef>
where
    T: ArrowPrimitiveType,
    T::Native: OffsetSizeTrait,
    V: StringArrayType<'a>,
{
    let row_count = array.len();
    let mut lengths = PrimitiveBuilder::<T>::with_capacity(row_count);
    let ascii_only = array.is_ascii();
    let has_nulls = array.null_count() != 0;

    match (has_nulls, ascii_only) {
        // No nulls, ASCII only: byte length is the answer.
        (false, true) => {
            for row in 0..row_count {
                let byte_len = array.value(row).len();
                lengths.append_value(T::Native::usize_as(byte_len));
            }
        }
        // No nulls, non-ASCII present: count Unicode scalar values.
        (false, false) => {
            for row in 0..row_count {
                let char_len = array.value(row).chars().count();
                lengths.append_value(T::Native::usize_as(char_len));
            }
        }
        // Nulls present, ASCII only.
        (true, true) => {
            for row in 0..row_count {
                if array.is_null(row) {
                    lengths.append_null();
                    continue;
                }
                let byte_len = array.value(row).len();
                lengths.append_value(T::Native::usize_as(byte_len));
            }
        }
        // Nulls present, non-ASCII present.
        (true, false) => {
            for row in 0..row_count {
                if array.is_null(row) {
                    lengths.append_null();
                    continue;
                }
                let char_len = array.value(row).chars().count();
                lengths.append_value(T::Native::usize_as(char_len));
            }
        }
    }

    let result: ArrayRef = Arc::new(lengths.finish());
    Ok(result)
}
// Unit tests for `CharacterLengthFunc`, driven through the crate's
// `test_function!` helper macro with scalar inputs.
#[cfg(test)]
mod tests {
use crate::unicode::character_length::CharacterLengthFunc;
use crate::utils::test::test_function;
use arrow::array::{Array, Int32Array, Int64Array};
use arrow::datatypes::DataType::{Int32, Int64};
use datafusion_common::{Result, ScalarValue};
use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
// Runs the same input/expected pair against all three supported string
// encodings: `Utf8` and `Utf8View` are checked against an `Int32` result,
// `LargeUtf8` against an `Int64` result.
macro_rules! test_character_length {
($INPUT:expr, $EXPECTED:expr) => {
test_function!(
CharacterLengthFunc::new(),
vec![ColumnarValue::Scalar(ScalarValue::Utf8($INPUT))],
$EXPECTED,
i32,
Int32,
Int32Array
);
test_function!(
CharacterLengthFunc::new(),
vec![ColumnarValue::Scalar(ScalarValue::LargeUtf8($INPUT))],
$EXPECTED,
i64,
Int64,
Int64Array
);
test_function!(
CharacterLengthFunc::new(),
vec![ColumnarValue::Scalar(ScalarValue::Utf8View($INPUT))],
$EXPECTED,
i32,
Int32,
Int32Array
);
};
}
// Covers ASCII text, text containing a multi-byte character ("é"), the empty
// string and a null input.
#[test]
fn test_functions() -> Result<()> {
#[cfg(feature = "unicode_expressions")]
{
test_character_length!(Some(String::from("chars")), Ok(Some(5)));
test_character_length!(Some(String::from("josé")), Ok(Some(4)));
test_character_length!(Some(String::from("joséjoséjoséjosé")), Ok(Some(16)));
test_character_length!(Some(String::from("")), Ok(Some(0)));
test_character_length!(None, Ok(None));
}
// NOTE(review): this branch only compiles when `unicode_expressions` is off.
// `internal_err!` is not imported in this module, and the arguments are passed
// as `&[...]` while every other call passes `vec![...]` - confirm this branch
// still builds, or that the module is never compiled without the feature.
// Nothing visible in this file produces the error message expected below.
#[cfg(not(feature = "unicode_expressions"))]
test_function!(
CharacterLengthFunc::new(),
&[ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("josé"))))],
internal_err!(
"function character_length requires compilation with feature flag: unicode_expressions."
),
i32,
Int32,
Int32Array
);
Ok(())
}
}