datafusion_functions/unicode/
character_length.rs1use crate::utils::{make_scalar_function, utf8_to_int_type};
19use arrow::array::{
20 Array, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait, PrimitiveBuilder,
21 StringArrayType,
22};
23use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type};
24use datafusion_common::Result;
25use datafusion_expr::{
26 ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
27};
28use datafusion_macros::user_doc;
29use std::any::Any;
30use std::sync::Arc;
31
32#[user_doc(
33 doc_section(label = "String Functions"),
34 description = "Returns the number of characters in a string.",
35 syntax_example = "character_length(str)",
36 sql_example = r#"```sql
37> select character_length('Ångström');
38+------------------------------------+
39| character_length(Utf8("Ångström")) |
40+------------------------------------+
41| 8 |
42+------------------------------------+
43```"#,
44 standard_argument(name = "str", prefix = "String"),
45 related_udf(name = "bit_length"),
46 related_udf(name = "octet_length")
47)]
48#[derive(Debug)]
49pub struct CharacterLengthFunc {
50 signature: Signature,
51 aliases: Vec<String>,
52}
53
54impl Default for CharacterLengthFunc {
55 fn default() -> Self {
56 Self::new()
57 }
58}
59
60impl CharacterLengthFunc {
61 pub fn new() -> Self {
62 use DataType::*;
63 Self {
64 signature: Signature::uniform(
65 1,
66 vec![Utf8, LargeUtf8, Utf8View],
67 Volatility::Immutable,
68 ),
69 aliases: vec![String::from("length"), String::from("char_length")],
70 }
71 }
72}
73
74impl ScalarUDFImpl for CharacterLengthFunc {
75 fn as_any(&self) -> &dyn Any {
76 self
77 }
78
79 fn name(&self) -> &str {
80 "character_length"
81 }
82
83 fn signature(&self) -> &Signature {
84 &self.signature
85 }
86
87 fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
88 utf8_to_int_type(&arg_types[0], "character_length")
89 }
90
91 fn invoke_with_args(
92 &self,
93 args: datafusion_expr::ScalarFunctionArgs,
94 ) -> Result<ColumnarValue> {
95 make_scalar_function(character_length, vec![])(&args.args)
96 }
97
98 fn aliases(&self) -> &[String] {
99 &self.aliases
100 }
101
102 fn documentation(&self) -> Option<&Documentation> {
103 self.doc()
104 }
105}
106
107fn character_length(args: &[ArrayRef]) -> Result<ArrayRef> {
111 match args[0].data_type() {
112 DataType::Utf8 => {
113 let string_array = args[0].as_string::<i32>();
114 character_length_general::<Int32Type, _>(string_array)
115 }
116 DataType::LargeUtf8 => {
117 let string_array = args[0].as_string::<i64>();
118 character_length_general::<Int64Type, _>(string_array)
119 }
120 DataType::Utf8View => {
121 let string_array = args[0].as_string_view();
122 character_length_general::<Int32Type, _>(string_array)
123 }
124 _ => unreachable!("CharacterLengthFunc"),
125 }
126}
127
128fn character_length_general<'a, T, V>(array: V) -> Result<ArrayRef>
129where
130 T: ArrowPrimitiveType,
131 T::Native: OffsetSizeTrait,
132 V: StringArrayType<'a>,
133{
134 let mut builder = PrimitiveBuilder::<T>::with_capacity(array.len());
135
136 let is_array_ascii_only = array.is_ascii();
141 if array.null_count() == 0 {
142 if is_array_ascii_only {
143 for i in 0..array.len() {
144 let value = array.value(i);
145 builder.append_value(T::Native::usize_as(value.len()));
146 }
147 } else {
148 for i in 0..array.len() {
149 let value = array.value(i);
150 builder.append_value(T::Native::usize_as(value.chars().count()));
151 }
152 }
153 } else if is_array_ascii_only {
154 for i in 0..array.len() {
155 if array.is_null(i) {
156 builder.append_null();
157 } else {
158 let value = array.value(i);
159 builder.append_value(T::Native::usize_as(value.len()));
160 }
161 }
162 } else {
163 for i in 0..array.len() {
164 if array.is_null(i) {
165 builder.append_null();
166 } else {
167 let value = array.value(i);
168 builder.append_value(T::Native::usize_as(value.chars().count()));
169 }
170 }
171 }
172
173 Ok(Arc::new(builder.finish()) as ArrayRef)
174}
175
#[cfg(test)]
mod tests {
    use crate::unicode::character_length::CharacterLengthFunc;
    use crate::utils::test::test_function;
    use arrow::array::{Array, Int32Array, Int64Array};
    use arrow::datatypes::DataType::{Int32, Int64};
    use datafusion_common::{Result, ScalarValue};
    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};

    // Feeds the same optional string to the function three times, once per
    // supported scalar encoding, and checks the result against `$want`:
    // `Utf8` and `Utf8View` are expected to yield `Int32`, `LargeUtf8` is
    // expected to yield `Int64`.
    macro_rules! test_character_length {
        ($text:expr, $want:expr) => {
            // Utf8 input -> Int32 result
            test_function!(
                CharacterLengthFunc::new(),
                vec![ColumnarValue::Scalar(ScalarValue::Utf8($text))],
                $want,
                i32,
                Int32,
                Int32Array
            );

            // LargeUtf8 input -> Int64 result
            test_function!(
                CharacterLengthFunc::new(),
                vec![ColumnarValue::Scalar(ScalarValue::LargeUtf8($text))],
                $want,
                i64,
                Int64,
                Int64Array
            );

            // Utf8View input -> Int32 result
            test_function!(
                CharacterLengthFunc::new(),
                vec![ColumnarValue::Scalar(ScalarValue::Utf8View($text))],
                $want,
                i32,
                Int32,
                Int32Array
            );
        };
    }

    #[test]
    fn test_functions() -> Result<()> {
        #[cfg(feature = "unicode_expressions")]
        {
            // Pure ASCII input.
            test_character_length!(Some("chars".to_string()), Ok(Some(5)));
            // Multi-byte characters are counted once each, not per byte.
            test_character_length!(Some("josé".to_string()), Ok(Some(4)));
            test_character_length!(Some("joséjoséjoséjosé".to_string()), Ok(Some(16)));
            // Empty and null inputs.
            test_character_length!(Some("".to_string()), Ok(Some(0)));
            test_character_length!(None, Ok(None));
        }

        // Without the feature flag the function is expected to report an error.
        #[cfg(not(feature = "unicode_expressions"))]
        test_function!(
            CharacterLengthFunc::new(),
            &[ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("josé"))))],
            internal_err!(
                "function character_length requires compilation with feature flag: unicode_expressions."
            ),
            i32,
            Int32,
            Int32Array
        );

        Ok(())
    }
}