datafusion_functions/regex/
regexplike.rs1use arrow::array::{Array, ArrayRef, AsArray, GenericStringArray};
21use arrow::compute::kernels::regexp;
22use arrow::datatypes::DataType;
23use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View};
24use datafusion_common::types::logical_string;
25use datafusion_common::{
26 arrow_datafusion_err, exec_err, internal_err, plan_err, DataFusionError, Result,
27 ScalarValue,
28};
29use datafusion_expr::{
30 Coercion, ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature,
31 TypeSignatureClass, Volatility,
32};
33use datafusion_macros::user_doc;
34
35use std::any::Any;
36use std::sync::Arc;
37
/// Scalar function `regexp_like(str, regexp[, flags])`.
///
/// The `user_doc` attribute below carries the user-facing documentation
/// (description, SQL examples and argument descriptions) for the function.
// NOTE(review): `documentation()` returns `self.doc()`, which is presumably
// generated by `user_doc` from this attribute — confirm against the macro.
#[user_doc(
    doc_section(label = "Regular Expression Functions"),
    description = "Returns true if a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has at least one match in a string, false otherwise.",
    syntax_example = "regexp_like(str, regexp[, flags])",
    sql_example = r#"```sql
select regexp_like('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
+--------------------------------------------------------+
| regexp_like(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
+--------------------------------------------------------+
| true |
+--------------------------------------------------------+
SELECT regexp_like('aBc', '(b|d)', 'i');
+--------------------------------------------------+
| regexp_like(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) |
+--------------------------------------------------+
| true |
+--------------------------------------------------+
```
Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
"#,
    standard_argument(name = "str", prefix = "String"),
    standard_argument(name = "regexp", prefix = "Regular"),
    argument(
        name = "flags",
        description = r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
 - **i**: case-insensitive: letters match both upper and lower case
 - **m**: multi-line mode: ^ and $ match begin/end of line
 - **s**: allow . to match \n
 - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
 - **U**: swap the meaning of x* and x*?"#
    )
)]
#[derive(Debug)]
pub struct RegexpLikeFunc {
    /// Argument signatures accepted by the function; handed out by
    /// `ScalarUDFImpl::signature`.
    signature: Signature,
}
74
75impl Default for RegexpLikeFunc {
76 fn default() -> Self {
77 Self::new()
78 }
79}
80
81impl RegexpLikeFunc {
82 pub fn new() -> Self {
83 Self {
84 signature: Signature::one_of(
85 vec![
86 TypeSignature::Coercible(vec![
87 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
88 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
89 ]),
90 TypeSignature::Coercible(vec![
91 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
92 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
93 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
94 ]),
95 ],
96 Volatility::Immutable,
97 ),
98 }
99 }
100}
101
102impl ScalarUDFImpl for RegexpLikeFunc {
103 fn as_any(&self) -> &dyn Any {
104 self
105 }
106
107 fn name(&self) -> &str {
108 "regexp_like"
109 }
110
111 fn signature(&self) -> &Signature {
112 &self.signature
113 }
114
115 fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
116 use DataType::*;
117
118 Ok(match &arg_types[0] {
119 Null => Null,
120 _ => Boolean,
123 })
124 }
125
126 fn invoke_with_args(
127 &self,
128 args: datafusion_expr::ScalarFunctionArgs,
129 ) -> Result<ColumnarValue> {
130 let args = &args.args;
131
132 let len = args
133 .iter()
134 .fold(Option::<usize>::None, |acc, arg| match arg {
135 ColumnarValue::Scalar(_) => acc,
136 ColumnarValue::Array(a) => Some(a.len()),
137 });
138
139 let is_scalar = len.is_none();
140 let inferred_length = len.unwrap_or(1);
141 let args = args
142 .iter()
143 .map(|arg| arg.to_array(inferred_length))
144 .collect::<Result<Vec<_>>>()?;
145
146 let result = regexp_like(&args);
147 if is_scalar {
148 let result = result.and_then(|arr| ScalarValue::try_from_array(&arr, 0));
150 result.map(ColumnarValue::Scalar)
151 } else {
152 result.map(ColumnarValue::Array)
153 }
154 }
155
156 fn documentation(&self) -> Option<&Documentation> {
157 self.doc()
158 }
159}
160
161pub fn regexp_like(args: &[ArrayRef]) -> Result<ArrayRef> {
204 match args.len() {
205 2 => handle_regexp_like(&args[0], &args[1], None),
206 3 => {
207 let flags = match args[2].data_type() {
208 Utf8 => args[2].as_string::<i32>(),
209 LargeUtf8 => {
210 let large_string_array = args[2].as_string::<i64>();
211 let string_vec: Vec<Option<&str>> = (0..large_string_array.len()).map(|i| {
212 if large_string_array.is_null(i) {
213 None
214 } else {
215 Some(large_string_array.value(i))
216 }
217 })
218 .collect();
219
220 &GenericStringArray::<i32>::from(string_vec)
221 },
222 _ => {
223 let string_view_array = args[2].as_string_view();
224 let string_vec: Vec<Option<String>> = (0..string_view_array.len()).map(|i| {
225 if string_view_array.is_null(i) {
226 None
227 } else {
228 Some(string_view_array.value(i).to_string())
229 }
230 })
231 .collect();
232 &GenericStringArray::<i32>::from(string_vec)
233 },
234 };
235
236 if flags.iter().any(|s| s == Some("g")) {
237 return plan_err!("regexp_like() does not support the \"global\" option");
238 }
239
240 handle_regexp_like(&args[0], &args[1], Some(flags))
241 },
242 other => exec_err!(
243 "`regexp_like` was called with {other} arguments. It requires at least 2 and at most 3."
244 ),
245 }
246}
247
248fn handle_regexp_like(
249 values: &ArrayRef,
250 patterns: &ArrayRef,
251 flags: Option<&GenericStringArray<i32>>,
252) -> Result<ArrayRef> {
253 let array = match (values.data_type(), patterns.data_type()) {
254 (Utf8View, Utf8) => {
255 let value = values.as_string_view();
256 let pattern = patterns.as_string::<i32>();
257
258 regexp::regexp_is_match(value, pattern, flags)
259 .map_err(|e| arrow_datafusion_err!(e))?
260 }
261 (Utf8View, Utf8View) => {
262 let value = values.as_string_view();
263 let pattern = patterns.as_string_view();
264
265 regexp::regexp_is_match(value, pattern, flags)
266 .map_err(|e| arrow_datafusion_err!(e))?
267 }
268 (Utf8View, LargeUtf8) => {
269 let value = values.as_string_view();
270 let pattern = patterns.as_string::<i64>();
271
272 regexp::regexp_is_match(value, pattern, flags)
273 .map_err(|e| arrow_datafusion_err!(e))?
274 }
275 (Utf8, Utf8) => {
276 let value = values.as_string::<i32>();
277 let pattern = patterns.as_string::<i32>();
278
279 regexp::regexp_is_match(value, pattern, flags)
280 .map_err(|e| arrow_datafusion_err!(e))?
281 }
282 (Utf8, Utf8View) => {
283 let value = values.as_string::<i32>();
284 let pattern = patterns.as_string_view();
285
286 regexp::regexp_is_match(value, pattern, flags)
287 .map_err(|e| arrow_datafusion_err!(e))?
288 }
289 (Utf8, LargeUtf8) => {
290 let value = values.as_string_view();
291 let pattern = patterns.as_string::<i64>();
292
293 regexp::regexp_is_match(value, pattern, flags)
294 .map_err(|e| arrow_datafusion_err!(e))?
295 }
296 (LargeUtf8, Utf8) => {
297 let value = values.as_string::<i64>();
298 let pattern = patterns.as_string::<i32>();
299
300 regexp::regexp_is_match(value, pattern, flags)
301 .map_err(|e| arrow_datafusion_err!(e))?
302 }
303 (LargeUtf8, Utf8View) => {
304 let value = values.as_string::<i64>();
305 let pattern = patterns.as_string_view();
306
307 regexp::regexp_is_match(value, pattern, flags)
308 .map_err(|e| arrow_datafusion_err!(e))?
309 }
310 (LargeUtf8, LargeUtf8) => {
311 let value = values.as_string::<i64>();
312 let pattern = patterns.as_string::<i64>();
313
314 regexp::regexp_is_match(value, pattern, flags)
315 .map_err(|e| arrow_datafusion_err!(e))?
316 }
317 other => {
318 return internal_err!(
319 "Unsupported data type {other:?} for function `regexp_like`"
320 )
321 }
322 };
323
324 Ok(Arc::new(array) as ArrayRef)
325}
326
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use arrow::array::StringArray;
    use arrow::array::{BooleanBuilder, StringViewArray};

    use crate::regex::regexplike::regexp_like;

    /// Patterns evaluated against the value `"abc"` by the matching tests.
    const PATTERNS: [&str; 5] = ["^(a)", "^(A)", "(b|d)", "(B|D)", "^(b|c)"];

    /// Expected result per pattern when no flags are supplied.
    const CASE_SENSITIVE: [bool; 5] = [true, false, true, false, false];

    /// Expected result per pattern when the `i` flag is supplied.
    const CASE_INSENSITIVE: [bool; 5] = [true, true, true, true, false];

    #[test]
    fn test_case_sensitive_regexp_like_utf8() {
        let values = StringArray::from(vec!["abc"; PATTERNS.len()]);
        let patterns = StringArray::from(PATTERNS.to_vec());

        let mut builder = BooleanBuilder::new();
        builder.append_slice(&CASE_SENSITIVE);
        let expected = builder.finish();

        let actual = regexp_like(&[Arc::new(values), Arc::new(patterns)]).unwrap();

        assert_eq!(actual.as_ref(), &expected);
    }

    #[test]
    fn test_case_sensitive_regexp_like_utf8view() {
        let values = StringViewArray::from(vec!["abc"; PATTERNS.len()]);
        let patterns = StringArray::from(PATTERNS.to_vec());

        let mut builder = BooleanBuilder::new();
        builder.append_slice(&CASE_SENSITIVE);
        let expected = builder.finish();

        let actual = regexp_like(&[Arc::new(values), Arc::new(patterns)]).unwrap();

        assert_eq!(actual.as_ref(), &expected);
    }

    #[test]
    fn test_case_insensitive_regexp_like_utf8() {
        let values = StringArray::from(vec!["abc"; PATTERNS.len()]);
        let patterns = StringArray::from(PATTERNS.to_vec());
        let flags = StringArray::from(vec!["i"; PATTERNS.len()]);

        let mut builder = BooleanBuilder::new();
        builder.append_slice(&CASE_INSENSITIVE);
        let expected = builder.finish();

        let actual =
            regexp_like(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)])
                .unwrap();

        assert_eq!(actual.as_ref(), &expected);
    }

    #[test]
    fn test_case_insensitive_regexp_like_utf8view() {
        let values = StringViewArray::from(vec!["abc"; PATTERNS.len()]);
        let patterns = StringViewArray::from(PATTERNS.to_vec());
        let flags = StringArray::from(vec!["i"; PATTERNS.len()]);

        let mut builder = BooleanBuilder::new();
        builder.append_slice(&CASE_INSENSITIVE);
        let expected = builder.finish();

        let actual =
            regexp_like(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)])
                .unwrap();

        assert_eq!(actual.as_ref(), &expected);
    }

    #[test]
    fn test_unsupported_global_flag_regexp_like() {
        let values = StringArray::from(vec!["abc"]);
        let patterns = StringArray::from(vec!["^(a)"]);
        let flags = StringArray::from(vec!["g"]);

        let error =
            regexp_like(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)])
                .expect_err("unsupported flag should have failed");

        assert_eq!(
            error.strip_backtrace(),
            "Error during planning: regexp_like() does not support the \"global\" option"
        );
    }
}