datafusion_functions/regex/
regexplike.rs1use arrow::array::{Array, ArrayRef, AsArray, GenericStringArray};
21use arrow::compute::kernels::regexp;
22use arrow::datatypes::DataType;
23use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View};
24use datafusion_common::exec_err;
25use datafusion_common::ScalarValue;
26use datafusion_common::{arrow_datafusion_err, plan_err};
27use datafusion_common::{internal_err, DataFusionError, Result};
28use datafusion_expr::{ColumnarValue, Documentation, TypeSignature};
29use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
30use datafusion_macros::user_doc;
31
32use std::any::Any;
33use std::sync::Arc;
34
35#[user_doc(
36 doc_section(label = "Regular Expression Functions"),
37 description = "Returns true if a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has at least one match in a string, false otherwise.",
38 syntax_example = "regexp_like(str, regexp[, flags])",
39 sql_example = r#"```sql
40select regexp_like('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
41+--------------------------------------------------------+
42| regexp_like(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
43+--------------------------------------------------------+
44| true |
45+--------------------------------------------------------+
46SELECT regexp_like('aBc', '(b|d)', 'i');
47+--------------------------------------------------+
48| regexp_like(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) |
49+--------------------------------------------------+
50| true |
51+--------------------------------------------------+
52```
53Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
54"#,
55 standard_argument(name = "str", prefix = "String"),
56 standard_argument(name = "regexp", prefix = "Regular"),
57 argument(
58 name = "flags",
59 description = r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
60 - **i**: case-insensitive: letters match both upper and lower case
61 - **m**: multi-line mode: ^ and $ match begin/end of line
62 - **s**: allow . to match \n
63 - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
64 - **U**: swap the meaning of x* and x*?"#
65 )
66)]
/// Scalar UDF backing the SQL function `regexp_like(str, regexp[, flags])`.
///
/// The struct only carries the accepted argument signature; the matching
/// itself is performed by the free function [`regexp_like`].
#[derive(Debug)]
pub struct RegexpLikeFunc {
    /// Argument signature accepted by the function (two or three string
    /// arguments, as built by `RegexpLikeFunc::new`).
    signature: Signature,
}
71
72impl Default for RegexpLikeFunc {
73 fn default() -> Self {
74 Self::new()
75 }
76}
77
78impl RegexpLikeFunc {
79 pub fn new() -> Self {
80 Self {
81 signature: Signature::one_of(
82 vec![TypeSignature::String(2), TypeSignature::String(3)],
83 Volatility::Immutable,
84 ),
85 }
86 }
87}
88
89impl ScalarUDFImpl for RegexpLikeFunc {
90 fn as_any(&self) -> &dyn Any {
91 self
92 }
93
94 fn name(&self) -> &str {
95 "regexp_like"
96 }
97
98 fn signature(&self) -> &Signature {
99 &self.signature
100 }
101
102 fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
103 use DataType::*;
104
105 Ok(match &arg_types[0] {
106 Null => Null,
107 _ => Boolean,
110 })
111 }
112
113 fn invoke_with_args(
114 &self,
115 args: datafusion_expr::ScalarFunctionArgs,
116 ) -> Result<ColumnarValue> {
117 let args = &args.args;
118
119 let len = args
120 .iter()
121 .fold(Option::<usize>::None, |acc, arg| match arg {
122 ColumnarValue::Scalar(_) => acc,
123 ColumnarValue::Array(a) => Some(a.len()),
124 });
125
126 let is_scalar = len.is_none();
127 let inferred_length = len.unwrap_or(1);
128 let args = args
129 .iter()
130 .map(|arg| arg.to_array(inferred_length))
131 .collect::<Result<Vec<_>>>()?;
132
133 let result = regexp_like(&args);
134 if is_scalar {
135 let result = result.and_then(|arr| ScalarValue::try_from_array(&arr, 0));
137 result.map(ColumnarValue::Scalar)
138 } else {
139 result.map(ColumnarValue::Array)
140 }
141 }
142
143 fn documentation(&self) -> Option<&Documentation> {
144 self.doc()
145 }
146}
147
148pub fn regexp_like(args: &[ArrayRef]) -> Result<ArrayRef> {
191 match args.len() {
192 2 => handle_regexp_like(&args[0], &args[1], None),
193 3 => {
194 let flags = match args[2].data_type() {
195 Utf8 => args[2].as_string::<i32>(),
196 LargeUtf8 => {
197 let large_string_array = args[2].as_string::<i64>();
198 let string_vec: Vec<Option<&str>> = (0..large_string_array.len()).map(|i| {
199 if large_string_array.is_null(i) {
200 None
201 } else {
202 Some(large_string_array.value(i))
203 }
204 })
205 .collect();
206
207 &GenericStringArray::<i32>::from(string_vec)
208 },
209 _ => {
210 let string_view_array = args[2].as_string_view();
211 let string_vec: Vec<Option<String>> = (0..string_view_array.len()).map(|i| {
212 if string_view_array.is_null(i) {
213 None
214 } else {
215 Some(string_view_array.value(i).to_string())
216 }
217 })
218 .collect();
219 &GenericStringArray::<i32>::from(string_vec)
220 },
221 };
222
223 if flags.iter().any(|s| s == Some("g")) {
224 return plan_err!("regexp_like() does not support the \"global\" option");
225 }
226
227 handle_regexp_like(&args[0], &args[1], Some(flags))
228 },
229 other => exec_err!(
230 "`regexp_like` was called with {other} arguments. It requires at least 2 and at most 3."
231 ),
232 }
233}
234
235fn handle_regexp_like(
236 values: &ArrayRef,
237 patterns: &ArrayRef,
238 flags: Option<&GenericStringArray<i32>>,
239) -> Result<ArrayRef> {
240 let array = match (values.data_type(), patterns.data_type()) {
241 (Utf8View, Utf8) => {
242 let value = values.as_string_view();
243 let pattern = patterns.as_string::<i32>();
244
245 regexp::regexp_is_match(value, pattern, flags)
246 .map_err(|e| arrow_datafusion_err!(e))?
247 }
248 (Utf8View, Utf8View) => {
249 let value = values.as_string_view();
250 let pattern = patterns.as_string_view();
251
252 regexp::regexp_is_match(value, pattern, flags)
253 .map_err(|e| arrow_datafusion_err!(e))?
254 }
255 (Utf8View, LargeUtf8) => {
256 let value = values.as_string_view();
257 let pattern = patterns.as_string::<i64>();
258
259 regexp::regexp_is_match(value, pattern, flags)
260 .map_err(|e| arrow_datafusion_err!(e))?
261 }
262 (Utf8, Utf8) => {
263 let value = values.as_string::<i32>();
264 let pattern = patterns.as_string::<i32>();
265
266 regexp::regexp_is_match(value, pattern, flags)
267 .map_err(|e| arrow_datafusion_err!(e))?
268 }
269 (Utf8, Utf8View) => {
270 let value = values.as_string::<i32>();
271 let pattern = patterns.as_string_view();
272
273 regexp::regexp_is_match(value, pattern, flags)
274 .map_err(|e| arrow_datafusion_err!(e))?
275 }
276 (Utf8, LargeUtf8) => {
277 let value = values.as_string_view();
278 let pattern = patterns.as_string::<i64>();
279
280 regexp::regexp_is_match(value, pattern, flags)
281 .map_err(|e| arrow_datafusion_err!(e))?
282 }
283 (LargeUtf8, Utf8) => {
284 let value = values.as_string::<i64>();
285 let pattern = patterns.as_string::<i32>();
286
287 regexp::regexp_is_match(value, pattern, flags)
288 .map_err(|e| arrow_datafusion_err!(e))?
289 }
290 (LargeUtf8, Utf8View) => {
291 let value = values.as_string::<i64>();
292 let pattern = patterns.as_string_view();
293
294 regexp::regexp_is_match(value, pattern, flags)
295 .map_err(|e| arrow_datafusion_err!(e))?
296 }
297 (LargeUtf8, LargeUtf8) => {
298 let value = values.as_string::<i64>();
299 let pattern = patterns.as_string::<i64>();
300
301 regexp::regexp_is_match(value, pattern, flags)
302 .map_err(|e| arrow_datafusion_err!(e))?
303 }
304 other => {
305 return internal_err!(
306 "Unsupported data type {other:?} for function `regexp_like`"
307 )
308 }
309 };
310
311 Ok(Arc::new(array) as ArrayRef)
312}
313
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use arrow::array::{ArrayRef, BooleanArray, StringArray, StringViewArray};

    use crate::regex::regexplike::regexp_like;

    /// Patterns matched against the value `"abc"` in the tests below.
    const PATTERNS: [&str; 5] = ["^(a)", "^(A)", "(b|d)", "(B|D)", "^(b|c)"];

    /// Expected results for `PATTERNS` without flags (case-sensitive).
    const CASE_SENSITIVE: [bool; 5] = [true, false, true, false, false];

    /// Expected results for `PATTERNS` with the `i` flag (case-insensitive).
    const CASE_INSENSITIVE: [bool; 5] = [true, true, true, true, false];

    /// Runs `regexp_like` on `args` and compares the output with `expected`.
    fn assert_matches(args: &[ArrayRef], expected: [bool; 5]) {
        let expected = BooleanArray::from(expected.to_vec());

        let re = regexp_like(args).unwrap();

        assert_eq!(re.as_ref(), &expected);
    }

    #[test]
    fn test_case_sensitive_regexp_like_utf8() {
        let values = StringArray::from(vec!["abc"; 5]);
        let patterns = StringArray::from(PATTERNS.to_vec());

        assert_matches(&[Arc::new(values), Arc::new(patterns)], CASE_SENSITIVE);
    }

    #[test]
    fn test_case_sensitive_regexp_like_utf8view() {
        let values = StringViewArray::from(vec!["abc"; 5]);
        let patterns = StringArray::from(PATTERNS.to_vec());

        assert_matches(&[Arc::new(values), Arc::new(patterns)], CASE_SENSITIVE);
    }

    #[test]
    fn test_case_insensitive_regexp_like_utf8() {
        let values = StringArray::from(vec!["abc"; 5]);
        let patterns = StringArray::from(PATTERNS.to_vec());
        let flags = StringArray::from(vec!["i"; 5]);

        assert_matches(
            &[Arc::new(values), Arc::new(patterns), Arc::new(flags)],
            CASE_INSENSITIVE,
        );
    }

    #[test]
    fn test_case_insensitive_regexp_like_utf8view() {
        let values = StringViewArray::from(vec!["abc"; 5]);
        let patterns = StringViewArray::from(PATTERNS.to_vec());
        let flags = StringArray::from(vec!["i"; 5]);

        assert_matches(
            &[Arc::new(values), Arc::new(patterns), Arc::new(flags)],
            CASE_INSENSITIVE,
        );
    }

    #[test]
    fn test_unsupported_global_flag_regexp_like() {
        let values: ArrayRef = Arc::new(StringArray::from(vec!["abc"]));
        let patterns: ArrayRef = Arc::new(StringArray::from(vec!["^(a)"]));
        let flags: ArrayRef = Arc::new(StringArray::from(vec!["g"]));

        let re_err = regexp_like(&[values, patterns, flags])
            .expect_err("unsupported flag should have failed");

        assert_eq!(
            re_err.strip_backtrace(),
            "Error during planning: regexp_like() does not support the \"global\" option"
        );
    }
}