datafusion_functions/regex/
regexplike.rs

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
17
18//! Regex expressions
19
20use arrow::array::{Array, ArrayRef, AsArray, GenericStringArray};
21use arrow::compute::kernels::regexp;
22use arrow::datatypes::DataType;
23use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View};
24use datafusion_common::exec_err;
25use datafusion_common::ScalarValue;
26use datafusion_common::{arrow_datafusion_err, plan_err};
27use datafusion_common::{internal_err, DataFusionError, Result};
28use datafusion_expr::{ColumnarValue, Documentation, TypeSignature};
29use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
30use datafusion_macros::user_doc;
31
32use std::any::Any;
33use std::sync::Arc;
34
#[user_doc(
    doc_section(label = "Regular Expression Functions"),
    description = "Returns true if a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has at least one match in a string, false otherwise.",
    syntax_example = "regexp_like(str, regexp[, flags])",
    sql_example = r#"```sql
select regexp_like('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
+--------------------------------------------------------+
| regexp_like(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
+--------------------------------------------------------+
| true                                                   |
+--------------------------------------------------------+
SELECT regexp_like('aBc', '(b|d)', 'i');
+--------------------------------------------------+
| regexp_like(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) |
+--------------------------------------------------+
| true                                             |
+--------------------------------------------------+
```
Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
"#,
    standard_argument(name = "str", prefix = "String"),
    standard_argument(name = "regexp", prefix = "Regular"),
    argument(
        name = "flags",
        description = r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
  - **i**: case-insensitive: letters match both upper and lower case
  - **m**: multi-line mode: ^ and $ match begin/end of line
  - **s**: allow . to match \n
  - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
  - **U**: swap the meaning of x* and x*?"#
    )
)]
#[derive(Debug)]
/// Scalar function `regexp_like(str, regexp[, flags])`: reports whether a
/// string contains at least one match of a regular expression.
///
/// NOTE(review): the `doc()` method used by `documentation()` is presumably
/// generated from the `user_doc` attribute above; confirm against the macro.
pub struct RegexpLikeFunc {
    /// Argument signatures accepted by this function, together with its
    /// volatility.
    signature: Signature,
}
71
72impl Default for RegexpLikeFunc {
73    fn default() -> Self {
74        Self::new()
75    }
76}
77
78impl RegexpLikeFunc {
79    pub fn new() -> Self {
80        Self {
81            signature: Signature::one_of(
82                vec![TypeSignature::String(2), TypeSignature::String(3)],
83                Volatility::Immutable,
84            ),
85        }
86    }
87}
88
89impl ScalarUDFImpl for RegexpLikeFunc {
90    fn as_any(&self) -> &dyn Any {
91        self
92    }
93
94    fn name(&self) -> &str {
95        "regexp_like"
96    }
97
98    fn signature(&self) -> &Signature {
99        &self.signature
100    }
101
102    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
103        use DataType::*;
104
105        Ok(match &arg_types[0] {
106            Null => Null,
107            // Type coercion is done by DataFusion based on signature, so if we
108            // get here, the first argument is always a string
109            _ => Boolean,
110        })
111    }
112
113    fn invoke_with_args(
114        &self,
115        args: datafusion_expr::ScalarFunctionArgs,
116    ) -> Result<ColumnarValue> {
117        let args = &args.args;
118
119        let len = args
120            .iter()
121            .fold(Option::<usize>::None, |acc, arg| match arg {
122                ColumnarValue::Scalar(_) => acc,
123                ColumnarValue::Array(a) => Some(a.len()),
124            });
125
126        let is_scalar = len.is_none();
127        let inferred_length = len.unwrap_or(1);
128        let args = args
129            .iter()
130            .map(|arg| arg.to_array(inferred_length))
131            .collect::<Result<Vec<_>>>()?;
132
133        let result = regexp_like(&args);
134        if is_scalar {
135            // If all inputs are scalar, keeps output as scalar
136            let result = result.and_then(|arr| ScalarValue::try_from_array(&arr, 0));
137            result.map(ColumnarValue::Scalar)
138        } else {
139            result.map(ColumnarValue::Array)
140        }
141    }
142
143    fn documentation(&self) -> Option<&Documentation> {
144        self.doc()
145    }
146}
147
148/// Tests a string using a regular expression returning true if at
149/// least one match, false otherwise.
150///
151/// The full list of supported features and syntax can be found at
152/// <https://docs.rs/regex/latest/regex/#syntax>
153///
154/// Supported flags can be found at
155/// <https://docs.rs/regex/latest/regex/#grouping-and-flags>
156///
157/// # Examples
158///
159/// ```ignore
160/// # use datafusion::prelude::*;
161/// # use datafusion::error::Result;
162/// # #[tokio::main]
163/// # async fn main() -> Result<()> {
164/// let ctx = SessionContext::new();
165/// let df = ctx.read_csv("tests/data/regex.csv", CsvReadOptions::new()).await?;
166///
167/// // use the regexp_like function to test col 'values',
168/// // against patterns in col 'patterns' without flags
169/// let df = df.with_column(
170///     "a",
171///     regexp_like(vec![col("values"), col("patterns")])
172/// )?;
173/// // use the regexp_like function to test col 'values',
174/// // against patterns in col 'patterns' with flags
175/// let df = df.with_column(
176///     "b",
177///     regexp_like(vec![col("values"), col("patterns"), col("flags")])
178/// )?;
179/// // literals can be used as well with dataframe calls
180/// let df = df.with_column(
181///     "c",
182///     regexp_like(vec![lit("foobarbequebaz"), lit("(bar)(beque)")])
183/// )?;
184///
185/// df.show().await?;
186///
187/// # Ok(())
188/// # }
189/// ```
190pub fn regexp_like(args: &[ArrayRef]) -> Result<ArrayRef> {
191    match args.len() {
192        2 => handle_regexp_like(&args[0], &args[1], None),
193        3 => {
194            let flags = match args[2].data_type() {
195                Utf8 => args[2].as_string::<i32>(),
196                LargeUtf8 => {
197                    let large_string_array = args[2].as_string::<i64>();
198                    let string_vec: Vec<Option<&str>> = (0..large_string_array.len()).map(|i| {
199                        if large_string_array.is_null(i) {
200                            None
201                        } else {
202                            Some(large_string_array.value(i))
203                        }
204                    })
205                    .collect();
206
207                    &GenericStringArray::<i32>::from(string_vec)
208                },
209                _ => {
210                    let string_view_array = args[2].as_string_view();
211                    let string_vec: Vec<Option<String>> = (0..string_view_array.len()).map(|i| {
212                        if string_view_array.is_null(i) {
213                            None
214                        } else {
215                            Some(string_view_array.value(i).to_string())
216                        }
217                    })
218                    .collect();
219                    &GenericStringArray::<i32>::from(string_vec)
220                },
221            };
222
223            if flags.iter().any(|s| s == Some("g")) {
224                return plan_err!("regexp_like() does not support the \"global\" option");
225            }
226
227            handle_regexp_like(&args[0], &args[1], Some(flags))
228        },
229        other => exec_err!(
230            "`regexp_like` was called with {other} arguments. It requires at least 2 and at most 3."
231        ),
232    }
233}
234
235fn handle_regexp_like(
236    values: &ArrayRef,
237    patterns: &ArrayRef,
238    flags: Option<&GenericStringArray<i32>>,
239) -> Result<ArrayRef> {
240    let array = match (values.data_type(), patterns.data_type()) {
241        (Utf8View, Utf8) => {
242            let value = values.as_string_view();
243            let pattern = patterns.as_string::<i32>();
244
245            regexp::regexp_is_match(value, pattern, flags)
246                .map_err(|e| arrow_datafusion_err!(e))?
247        }
248        (Utf8View, Utf8View) => {
249            let value = values.as_string_view();
250            let pattern = patterns.as_string_view();
251
252            regexp::regexp_is_match(value, pattern, flags)
253                .map_err(|e| arrow_datafusion_err!(e))?
254        }
255        (Utf8View, LargeUtf8) => {
256            let value = values.as_string_view();
257            let pattern = patterns.as_string::<i64>();
258
259            regexp::regexp_is_match(value, pattern, flags)
260                .map_err(|e| arrow_datafusion_err!(e))?
261        }
262        (Utf8, Utf8) => {
263            let value = values.as_string::<i32>();
264            let pattern = patterns.as_string::<i32>();
265
266            regexp::regexp_is_match(value, pattern, flags)
267                .map_err(|e| arrow_datafusion_err!(e))?
268        }
269        (Utf8, Utf8View) => {
270            let value = values.as_string::<i32>();
271            let pattern = patterns.as_string_view();
272
273            regexp::regexp_is_match(value, pattern, flags)
274                .map_err(|e| arrow_datafusion_err!(e))?
275        }
276        (Utf8, LargeUtf8) => {
277            let value = values.as_string_view();
278            let pattern = patterns.as_string::<i64>();
279
280            regexp::regexp_is_match(value, pattern, flags)
281                .map_err(|e| arrow_datafusion_err!(e))?
282        }
283        (LargeUtf8, Utf8) => {
284            let value = values.as_string::<i64>();
285            let pattern = patterns.as_string::<i32>();
286
287            regexp::regexp_is_match(value, pattern, flags)
288                .map_err(|e| arrow_datafusion_err!(e))?
289        }
290        (LargeUtf8, Utf8View) => {
291            let value = values.as_string::<i64>();
292            let pattern = patterns.as_string_view();
293
294            regexp::regexp_is_match(value, pattern, flags)
295                .map_err(|e| arrow_datafusion_err!(e))?
296        }
297        (LargeUtf8, LargeUtf8) => {
298            let value = values.as_string::<i64>();
299            let pattern = patterns.as_string::<i64>();
300
301            regexp::regexp_is_match(value, pattern, flags)
302                .map_err(|e| arrow_datafusion_err!(e))?
303        }
304        other => {
305            return internal_err!(
306                "Unsupported data type {other:?} for function `regexp_like`"
307            )
308        }
309    };
310
311    Ok(Arc::new(array) as ArrayRef)
312}
313
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use arrow::array::{BooleanArray, BooleanBuilder, StringArray, StringViewArray};

    use crate::regex::regexplike::regexp_like;

    /// Patterns matched row by row against the value `"abc"` in the
    /// case-sensitivity tests.
    const PATTERNS: [&str; 5] = ["^(a)", "^(A)", "(b|d)", "(B|D)", "^(b|c)"];

    /// Outcomes for `PATTERNS` when matching is case sensitive.
    const CASE_SENSITIVE: [bool; 5] = [true, false, true, false, false];

    /// Outcomes for `PATTERNS` when the `i` flag is supplied.
    const CASE_INSENSITIVE: [bool; 5] = [true, true, true, true, false];

    /// Builds the expected boolean result array from plain outcomes.
    fn expected_matches(outcomes: &[bool]) -> BooleanArray {
        let mut builder = BooleanBuilder::new();
        for &outcome in outcomes {
            builder.append_value(outcome);
        }
        builder.finish()
    }

    #[test]
    fn test_case_sensitive_regexp_like_utf8() {
        let values = StringArray::from(vec!["abc"; 5]);
        let patterns = StringArray::from(PATTERNS.to_vec());
        let expected = expected_matches(&CASE_SENSITIVE);

        let re = regexp_like(&[Arc::new(values), Arc::new(patterns)]).unwrap();

        assert_eq!(re.as_ref(), &expected);
    }

    #[test]
    fn test_case_sensitive_regexp_like_utf8view() {
        let values = StringViewArray::from(vec!["abc"; 5]);
        let patterns = StringArray::from(PATTERNS.to_vec());
        let expected = expected_matches(&CASE_SENSITIVE);

        let re = regexp_like(&[Arc::new(values), Arc::new(patterns)]).unwrap();

        assert_eq!(re.as_ref(), &expected);
    }

    #[test]
    fn test_case_insensitive_regexp_like_utf8() {
        let values = StringArray::from(vec!["abc"; 5]);
        let patterns = StringArray::from(PATTERNS.to_vec());
        let flags = StringArray::from(vec!["i"; 5]);
        let expected = expected_matches(&CASE_INSENSITIVE);

        let re = regexp_like(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)])
            .unwrap();

        assert_eq!(re.as_ref(), &expected);
    }

    #[test]
    fn test_case_insensitive_regexp_like_utf8view() {
        let values = StringViewArray::from(vec!["abc"; 5]);
        let patterns = StringViewArray::from(PATTERNS.to_vec());
        let flags = StringArray::from(vec!["i"; 5]);
        let expected = expected_matches(&CASE_INSENSITIVE);

        let re = regexp_like(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)])
            .unwrap();

        assert_eq!(re.as_ref(), &expected);
    }

    #[test]
    fn test_unsupported_global_flag_regexp_like() {
        let values = StringArray::from(vec!["abc"]);
        let patterns = StringArray::from(vec!["^(a)"]);
        let flags = StringArray::from(vec!["g"]);

        // The "g" flag must be rejected before any matching is attempted.
        let outcome =
            regexp_like(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)]);
        let re_err = outcome.expect_err("unsupported flag should have failed");

        assert_eq!(
            re_err.strip_backtrace(),
            "Error during planning: regexp_like() does not support the \"global\" option"
        );
    }
}