arrow_string/
regexp.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
//! Defines kernels to match strings against, and extract substrings with, a regular
//! expression for \[Large\]StringArray and StringViewArray
20
21use crate::like::StringArrayType;
22
23use arrow_array::builder::{
24    BooleanBufferBuilder, GenericStringBuilder, ListBuilder, StringViewBuilder,
25};
26use arrow_array::cast::AsArray;
27use arrow_array::*;
28use arrow_buffer::NullBuffer;
29use arrow_data::{ArrayData, ArrayDataBuilder};
30use arrow_schema::{ArrowError, DataType, Field};
31use regex::Regex;
32
33use std::collections::HashMap;
34use std::sync::Arc;
35
36/// Perform SQL `array ~ regex_array` operation on [`StringArray`] / [`LargeStringArray`].
37/// If `regex_array` element has an empty value, the corresponding result value is always true.
38///
39/// `flags_array` are optional [`StringArray`] / [`LargeStringArray`] flag, which allow
40/// special search modes, such as case insensitive and multi-line mode.
41/// See the documentation [here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags)
42/// for more information.
43#[deprecated(since = "54.0.0", note = "please use `regex_is_match` instead")]
44pub fn regexp_is_match_utf8<OffsetSize: OffsetSizeTrait>(
45    array: &GenericStringArray<OffsetSize>,
46    regex_array: &GenericStringArray<OffsetSize>,
47    flags_array: Option<&GenericStringArray<OffsetSize>>,
48) -> Result<BooleanArray, ArrowError> {
49    regexp_is_match(array, regex_array, flags_array)
50}
51
52/// Return BooleanArray indicating which strings in an array match an array of
53/// regular expressions.
54///
55/// This is equivalent to the SQL `array ~ regex_array`, supporting
56/// [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`].
57///
58/// If `regex_array` element has an empty value, the corresponding result value is always true.
59///
60/// `flags_array` are optional [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`] flag,
61/// which allow special search modes, such as case-insensitive and multi-line mode.
62/// See the documentation [here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags)
63/// for more information.
64///
65/// # See Also
66/// * [`regexp_is_match_scalar`] for matching a single regular expression against an array of strings
67/// * [`regexp_match`] for extracting groups from a string array based on a regular expression
68///
69/// # Example
70/// ```
71/// # use arrow_array::{StringArray, BooleanArray};
72/// # use arrow_string::regexp::regexp_is_match;
73/// // First array is the array of strings to match
74/// let array = StringArray::from(vec!["Foo", "Bar", "FooBar", "Baz"]);
75/// // Second array is the array of regular expressions to match against
76/// let regex_array = StringArray::from(vec!["^Foo", "^Foo", "Bar$", "Baz"]);
77/// // Third array is the array of flags to use for each regular expression, if desired
78/// // (the type must be provided to satisfy type inference for the third parameter)
79/// let flags_array: Option<&StringArray> = None;
80/// // The result is a BooleanArray indicating when each string in `array`
81/// // matches the corresponding regular expression in `regex_array`
82/// let result = regexp_is_match(&array, &regex_array, flags_array).unwrap();
83/// assert_eq!(result, BooleanArray::from(vec![true, false, true, true]));
84/// ```
85pub fn regexp_is_match<'a, S1, S2, S3>(
86    array: &'a S1,
87    regex_array: &'a S2,
88    flags_array: Option<&'a S3>,
89) -> Result<BooleanArray, ArrowError>
90where
91    &'a S1: StringArrayType<'a>,
92    &'a S2: StringArrayType<'a>,
93    &'a S3: StringArrayType<'a>,
94{
95    if array.len() != regex_array.len() {
96        return Err(ArrowError::ComputeError(
97            "Cannot perform comparison operation on arrays of different length".to_string(),
98        ));
99    }
100
101    let nulls = NullBuffer::union(array.nulls(), regex_array.nulls());
102
103    let mut patterns: HashMap<String, Regex> = HashMap::new();
104    let mut result = BooleanBufferBuilder::new(array.len());
105
106    let complete_pattern = match flags_array {
107        Some(flags) => Box::new(
108            regex_array
109                .iter()
110                .zip(flags.iter())
111                .map(|(pattern, flags)| {
112                    pattern.map(|pattern| match flags {
113                        Some(flag) => format!("(?{flag}){pattern}"),
114                        None => pattern.to_string(),
115                    })
116                }),
117        ) as Box<dyn Iterator<Item = Option<String>>>,
118        None => Box::new(
119            regex_array
120                .iter()
121                .map(|pattern| pattern.map(|pattern| pattern.to_string())),
122        ),
123    };
124
125    array
126        .iter()
127        .zip(complete_pattern)
128        .map(|(value, pattern)| {
129            match (value, pattern) {
130                // Required for Postgres compatibility:
131                // SELECT 'foobarbequebaz' ~ ''); = true
132                (Some(_), Some(pattern)) if pattern == *"" => {
133                    result.append(true);
134                }
135                (Some(value), Some(pattern)) => {
136                    let existing_pattern = patterns.get(&pattern);
137                    let re = match existing_pattern {
138                        Some(re) => re,
139                        None => {
140                            let re = Regex::new(pattern.as_str()).map_err(|e| {
141                                ArrowError::ComputeError(format!(
142                                    "Regular expression did not compile: {e:?}"
143                                ))
144                            })?;
145                            patterns.entry(pattern).or_insert(re)
146                        }
147                    };
148                    result.append(re.is_match(value));
149                }
150                _ => result.append(false),
151            }
152            Ok(())
153        })
154        .collect::<Result<Vec<()>, ArrowError>>()?;
155
156    let data = unsafe {
157        ArrayDataBuilder::new(DataType::Boolean)
158            .len(array.len())
159            .buffers(vec![result.into()])
160            .nulls(nulls)
161            .build_unchecked()
162    };
163
164    Ok(BooleanArray::from(data))
165}
166
167/// Perform SQL `array ~ regex_array` operation on [`StringArray`] /
168/// [`LargeStringArray`] and a scalar.
169///
170/// See the documentation on [`regexp_is_match_utf8`] for more details.
171#[deprecated(since = "54.0.0", note = "please use `regex_is_match_scalar` instead")]
172pub fn regexp_is_match_utf8_scalar<OffsetSize: OffsetSizeTrait>(
173    array: &GenericStringArray<OffsetSize>,
174    regex: &str,
175    flag: Option<&str>,
176) -> Result<BooleanArray, ArrowError> {
177    regexp_is_match_scalar(array, regex, flag)
178}
179
180/// Return BooleanArray indicating which strings in an array match a single regular expression.
181///
182/// This is equivalent to the SQL `array ~ regex_array`, supporting
183/// [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`] and a scalar.
184///
185/// See the documentation on [`regexp_is_match`] for more details on arguments
186///
187/// # See Also
188/// * [`regexp_is_match`] for matching an array of regular expression against an array of strings
189/// * [`regexp_match`] for extracting groups from a string array based on a regular expression
190///
191/// # Example
192/// ```
193/// # use arrow_array::{StringArray, BooleanArray};
194/// # use arrow_string::regexp::regexp_is_match_scalar;
195/// // array of strings to match
196/// let array = StringArray::from(vec!["Foo", "Bar", "FooBar", "Baz"]);
197/// let regexp = "^Foo"; // regular expression to match against
198/// let flags: Option<&str> = None;  // flags can control the matching behavior
199/// // The result is a BooleanArray indicating when each string in `array`
200/// // matches the regular expression `regexp`
201/// let result = regexp_is_match_scalar(&array, regexp, None).unwrap();
202/// assert_eq!(result, BooleanArray::from(vec![true, false, true, false]));
203/// ```
204pub fn regexp_is_match_scalar<'a, S>(
205    array: &'a S,
206    regex: &str,
207    flag: Option<&str>,
208) -> Result<BooleanArray, ArrowError>
209where
210    &'a S: StringArrayType<'a>,
211{
212    let null_bit_buffer = array.nulls().map(|x| x.inner().sliced());
213    let mut result = BooleanBufferBuilder::new(array.len());
214
215    let pattern = match flag {
216        Some(flag) => format!("(?{flag}){regex}"),
217        None => regex.to_string(),
218    };
219
220    if pattern.is_empty() {
221        result.append_n(array.len(), true);
222    } else {
223        let re = Regex::new(pattern.as_str()).map_err(|e| {
224            ArrowError::ComputeError(format!("Regular expression did not compile: {e:?}"))
225        })?;
226        for i in 0..array.len() {
227            let value = array.value(i);
228            result.append(re.is_match(value));
229        }
230    }
231
232    let buffer = result.into();
233    let data = unsafe {
234        ArrayData::new_unchecked(
235            DataType::Boolean,
236            array.len(),
237            None,
238            null_bit_buffer,
239            0,
240            vec![buffer],
241            vec![],
242        )
243    };
244
245    Ok(BooleanArray::from(data))
246}
247
// Shared body of the row-wise `regexp_match` implementations.
//
// For each (value, pattern[, flags]) row it appends one list entry to
// `$list_builder`:
// * null value or null pattern -> null list
// * empty pattern              -> list containing a single empty string
// * no match                   -> null list
// * match                      -> the captured groups (or the whole match when the
//                                 pattern has no capture groups)
//
// Rows are paired with `zip`, so iteration stops at the shortest input.
// Must be expanded inside a function returning `Result<_, ArrowError>` because a
// pattern that fails to compile is propagated with `?`.
macro_rules! process_regexp_array_match {
    ($array:expr, $regex_array:expr, $flags_array:expr, $list_builder:expr) => {
        // Compiled regexes keyed by full pattern text (flags included) so repeated
        // patterns are compiled only once.
        let mut patterns: HashMap<String, Regex> = HashMap::new();

        let complete_pattern = match $flags_array {
            Some(flags) => Box::new($regex_array.iter().zip(flags.iter()).map(
                |(pattern, flag)| {
                    pattern.map(|pattern| match flag {
                        Some(value) => format!("(?{value}){pattern}"),
                        None => pattern.to_string(),
                    })
                },
            )) as Box<dyn Iterator<Item = Option<String>>>,
            None => Box::new(
                $regex_array
                    .iter()
                    .map(|pattern| pattern.map(|pattern| pattern.to_string())),
            ),
        };

        for (value, pattern) in $array.iter().zip(complete_pattern) {
            match (value, pattern) {
                // Required for Postgres compatibility:
                // SELECT regexp_match('foobarbequebaz', ''); = {""}
                (Some(_), Some(pattern)) if pattern.is_empty() => {
                    $list_builder.values().append_value("");
                    $list_builder.append(true);
                }
                (Some(value), Some(pattern)) => {
                    let re = match patterns.entry(pattern) {
                        ::std::collections::hash_map::Entry::Occupied(cached) => {
                            cached.into_mut()
                        }
                        ::std::collections::hash_map::Entry::Vacant(slot) => {
                            let compiled = Regex::new(slot.key().as_str()).map_err(|e| {
                                ArrowError::ComputeError(format!(
                                    "Regular expression did not compile: {e:?}"
                                ))
                            })?;
                            slot.insert(compiled)
                        }
                    };
                    match re.captures(value) {
                        Some(caps) => {
                            let mut groups = caps.iter();
                            // Group 0 is the whole match; skip it only when the
                            // pattern has explicit capture groups.
                            if caps.len() > 1 {
                                groups.next();
                            }
                            // NOTE: `flatten` drops groups that did not participate
                            // in the match, so later groups shift forward.
                            for m in groups.flatten() {
                                $list_builder.values().append_value(m.as_str());
                            }

                            $list_builder.append(true);
                        }
                        None => $list_builder.append(false),
                    }
                }
                _ => $list_builder.append(false),
            }
        }
    };
}
314
315fn regexp_array_match<OffsetSize: OffsetSizeTrait>(
316    array: &GenericStringArray<OffsetSize>,
317    regex_array: &GenericStringArray<OffsetSize>,
318    flags_array: Option<&GenericStringArray<OffsetSize>>,
319) -> Result<ArrayRef, ArrowError> {
320    let builder: GenericStringBuilder<OffsetSize> = GenericStringBuilder::with_capacity(0, 0);
321    let mut list_builder = ListBuilder::new(builder);
322
323    process_regexp_array_match!(array, regex_array, flags_array, list_builder);
324
325    Ok(Arc::new(list_builder.finish()))
326}
327
328fn regexp_array_match_utf8view(
329    array: &StringViewArray,
330    regex_array: &StringViewArray,
331    flags_array: Option<&StringViewArray>,
332) -> Result<ArrayRef, ArrowError> {
333    let builder = StringViewBuilder::with_capacity(0);
334    let mut list_builder = ListBuilder::new(builder);
335
336    process_regexp_array_match!(array, regex_array, flags_array, list_builder);
337
338    Ok(Arc::new(list_builder.finish()))
339}
340
341fn get_scalar_pattern_flag<'a, OffsetSize: OffsetSizeTrait>(
342    regex_array: &'a dyn Array,
343    flag_array: Option<&'a dyn Array>,
344) -> (Option<&'a str>, Option<&'a str>) {
345    let regex = regex_array.as_string::<OffsetSize>();
346    let regex = regex.is_valid(0).then(|| regex.value(0));
347
348    if let Some(flag_array) = flag_array {
349        let flag = flag_array.as_string::<OffsetSize>();
350        (regex, flag.is_valid(0).then(|| flag.value(0)))
351    } else {
352        (regex, None)
353    }
354}
355
356fn get_scalar_pattern_flag_utf8view<'a>(
357    regex_array: &'a dyn Array,
358    flag_array: Option<&'a dyn Array>,
359) -> (Option<&'a str>, Option<&'a str>) {
360    let regex = regex_array.as_string_view();
361    let regex = regex.is_valid(0).then(|| regex.value(0));
362
363    if let Some(flag_array) = flag_array {
364        let flag = flag_array.as_string_view();
365        (regex, flag.is_valid(0).then(|| flag.value(0)))
366    } else {
367        (regex, None)
368    }
369}
370
// Shared body of the scalar-pattern `regexp_match` implementations.
//
// Applies the single, already compiled `$regex` to every element of `$array`
// and appends one list entry per element to `$list_builder`:
// * null value    -> null list
// * empty pattern -> list containing a single empty string
// * no match      -> null list
// * match         -> the captured groups (or the whole match when the pattern
//                    has no capture groups)
macro_rules! process_regexp_match {
    ($array:expr, $regex:expr, $list_builder:expr) => {{
        // The pattern is the same for every row, so test it for emptiness once.
        let pattern_is_empty = $regex.as_str().is_empty();

        for value in $array.iter() {
            match value {
                // Required for Postgres compatibility:
                // SELECT regexp_match('foobarbequebaz', ''); = {""}
                Some(_) if pattern_is_empty => {
                    $list_builder.values().append_value("");
                    $list_builder.append(true);
                }
                Some(value) => match $regex.captures(value) {
                    Some(caps) => {
                        let mut groups = caps.iter();
                        // Group 0 is the whole match; skip it only when the
                        // pattern has explicit capture groups.
                        if caps.len() > 1 {
                            groups.next();
                        }
                        // NOTE: `flatten` drops groups that did not participate
                        // in the match, so later groups shift forward.
                        for m in groups.flatten() {
                            $list_builder.values().append_value(m.as_str());
                        }
                        $list_builder.append(true);
                    }
                    None => $list_builder.append(false),
                },
                None => $list_builder.append(false),
            }
        }
    }};
}
403
404fn regexp_scalar_match<OffsetSize: OffsetSizeTrait>(
405    array: &GenericStringArray<OffsetSize>,
406    regex: &Regex,
407) -> Result<ArrayRef, ArrowError> {
408    let builder: GenericStringBuilder<OffsetSize> = GenericStringBuilder::with_capacity(0, 0);
409    let mut list_builder = ListBuilder::new(builder);
410
411    process_regexp_match!(array, regex, list_builder);
412
413    Ok(Arc::new(list_builder.finish()))
414}
415
416fn regexp_scalar_match_utf8view(
417    array: &StringViewArray,
418    regex: &Regex,
419) -> Result<ArrayRef, ArrowError> {
420    let builder = StringViewBuilder::with_capacity(0);
421    let mut list_builder = ListBuilder::new(builder);
422
423    process_regexp_match!(array, regex, list_builder);
424
425    Ok(Arc::new(list_builder.finish()))
426}
427
428/// Extract all groups matched by a regular expression for a given String array.
429///
430/// Modelled after the Postgres [regexp_match].
431///
432/// Returns a ListArray of [`GenericStringArray`] with each element containing the leftmost-first
433/// match of the corresponding index in `regex_array` to string in `array`
434///
435/// If there is no match, the list element is NULL.
436///
437/// If a match is found, and the pattern contains no capturing parenthesized subexpressions,
438/// then the list element is a single-element [`GenericStringArray`] containing the substring
439/// matching the whole pattern.
440///
441/// If a match is found, and the pattern contains capturing parenthesized subexpressions, then the
442/// list element is a [`GenericStringArray`] whose n'th element is the substring matching
443/// the n'th capturing parenthesized subexpression of the pattern.
444///
445/// The flags parameter is an optional text string containing zero or more single-letter flags
446/// that change the function's behavior.
447///
448/// # See Also
449/// * [`regexp_is_match`] for matching (rather than extracting) a regular expression against an array of strings
450///
451/// [regexp_match]: https://www.postgresql.org/docs/current/functions-matching.html#FUNCTIONS-POSIX-REGEXP
452pub fn regexp_match(
453    array: &dyn Array,
454    regex_array: &dyn Datum,
455    flags_array: Option<&dyn Datum>,
456) -> Result<ArrayRef, ArrowError> {
457    let (rhs, is_rhs_scalar) = regex_array.get();
458
459    if array.data_type() != rhs.data_type() {
460        return Err(ArrowError::ComputeError(
461            "regexp_match() requires both array and pattern to be either Utf8, Utf8View or LargeUtf8"
462                .to_string(),
463        ));
464    }
465
466    let (flags, is_flags_scalar) = match flags_array {
467        Some(flags) => {
468            let (flags, is_flags_scalar) = flags.get();
469            (Some(flags), Some(is_flags_scalar))
470        }
471        None => (None, None),
472    };
473
474    if is_flags_scalar.is_some() && is_rhs_scalar != is_flags_scalar.unwrap() {
475        return Err(ArrowError::ComputeError(
476            "regexp_match() requires both pattern and flags to be either scalar or array"
477                .to_string(),
478        ));
479    }
480
481    if flags_array.is_some() && rhs.data_type() != flags.unwrap().data_type() {
482        return Err(ArrowError::ComputeError(
483            "regexp_match() requires both pattern and flags to be either Utf8, Utf8View or LargeUtf8"
484                .to_string(),
485        ));
486    }
487
488    if is_rhs_scalar {
489        // Regex and flag is scalars
490        let (regex, flag) = match rhs.data_type() {
491            DataType::Utf8View => get_scalar_pattern_flag_utf8view(rhs, flags),
492            DataType::Utf8 => get_scalar_pattern_flag::<i32>(rhs, flags),
493            DataType::LargeUtf8 => get_scalar_pattern_flag::<i64>(rhs, flags),
494            _ => {
495                return Err(ArrowError::ComputeError(
496                    "regexp_match() requires pattern to be either Utf8, Utf8View or LargeUtf8"
497                        .to_string(),
498                ));
499            }
500        };
501
502        if regex.is_none() {
503            return Ok(new_null_array(
504                &DataType::List(Arc::new(Field::new_list_field(
505                    array.data_type().clone(),
506                    true,
507                ))),
508                array.len(),
509            ));
510        }
511
512        let regex = regex.unwrap();
513
514        let pattern = if let Some(flag) = flag {
515            format!("(?{flag}){regex}")
516        } else {
517            regex.to_string()
518        };
519
520        let re = Regex::new(pattern.as_str()).map_err(|e| {
521            ArrowError::ComputeError(format!("Regular expression did not compile: {e:?}"))
522        })?;
523
524        match array.data_type() {
525            DataType::Utf8View => regexp_scalar_match_utf8view(array.as_string_view(), &re),
526            DataType::Utf8 => regexp_scalar_match(array.as_string::<i32>(), &re),
527            DataType::LargeUtf8 => regexp_scalar_match(array.as_string::<i64>(), &re),
528            _ => Err(ArrowError::ComputeError(
529                "regexp_match() requires array to be either Utf8, Utf8View or LargeUtf8"
530                    .to_string(),
531            )),
532        }
533    } else {
534        match array.data_type() {
535            DataType::Utf8View => {
536                let regex_array = rhs.as_string_view();
537                let flags_array = flags.map(|flags| flags.as_string_view());
538                regexp_array_match_utf8view(array.as_string_view(), regex_array, flags_array)
539            }
540            DataType::Utf8 => {
541                let regex_array = rhs.as_string();
542                let flags_array = flags.map(|flags| flags.as_string());
543                regexp_array_match(array.as_string::<i32>(), regex_array, flags_array)
544            }
545            DataType::LargeUtf8 => {
546                let regex_array = rhs.as_string();
547                let flags_array = flags.map(|flags| flags.as_string());
548                regexp_array_match(array.as_string::<i64>(), regex_array, flags_array)
549            }
550            _ => Err(ArrowError::ComputeError(
551                "regexp_match() requires array to be either Utf8, Utf8View or LargeUtf8"
552                    .to_string(),
553            )),
554        }
555    }
556}
557
558#[cfg(test)]
559mod tests {
560    use super::*;
561
562    macro_rules! test_match_single_group {
563        ($test_name:ident, $values:expr, $patterns:expr, $arr_type:ty, $builder_type:ty, $expected:expr) => {
564            #[test]
565            fn $test_name() {
566                let array: $arr_type = <$arr_type>::from($values);
567                let pattern: $arr_type = <$arr_type>::from($patterns);
568
569                let actual = regexp_match(&array, &pattern, None).unwrap();
570
571                let elem_builder: $builder_type = <$builder_type>::new();
572                let mut expected_builder = ListBuilder::new(elem_builder);
573
574                for val in $expected {
575                    match val {
576                        Some(v) => {
577                            expected_builder.values().append_value(v);
578                            expected_builder.append(true);
579                        }
580                        None => expected_builder.append(false),
581                    }
582                }
583
584                let expected = expected_builder.finish();
585                let result = actual.as_any().downcast_ref::<ListArray>().unwrap();
586                assert_eq!(&expected, result);
587            }
588        };
589    }
590
591    test_match_single_group!(
592        match_single_group_string,
593        vec![
594            Some("abc-005-def"),
595            Some("X-7-5"),
596            Some("X545"),
597            None,
598            Some("foobarbequebaz"),
599            Some("foobarbequebaz"),
600        ],
601        vec![
602            r".*-(\d*)-.*",
603            r".*-(\d*)-.*",
604            r".*-(\d*)-.*",
605            r".*-(\d*)-.*",
606            r"(bar)(bequ1e)",
607            ""
608        ],
609        StringArray,
610        GenericStringBuilder<i32>,
611        [Some("005"), Some("7"), None, None, None, Some("")]
612    );
613    test_match_single_group!(
614        match_single_group_string_view,
615        vec![
616            Some("abc-005-def"),
617            Some("X-7-5"),
618            Some("X545"),
619            None,
620            Some("foobarbequebaz"),
621            Some("foobarbequebaz"),
622        ],
623        vec![
624            r".*-(\d*)-.*",
625            r".*-(\d*)-.*",
626            r".*-(\d*)-.*",
627            r".*-(\d*)-.*",
628            r"(bar)(bequ1e)",
629            ""
630        ],
631        StringViewArray,
632        StringViewBuilder,
633        [Some("005"), Some("7"), None, None, None, Some("")]
634    );
635
636    macro_rules! test_match_single_group_with_flags {
637        ($test_name:ident, $values:expr, $patterns:expr, $flags:expr, $array_type:ty, $builder_type:ty, $expected:expr) => {
638            #[test]
639            fn $test_name() {
640                let array: $array_type = <$array_type>::from($values);
641                let pattern: $array_type = <$array_type>::from($patterns);
642                let flags: $array_type = <$array_type>::from($flags);
643
644                let actual = regexp_match(&array, &pattern, Some(&flags)).unwrap();
645
646                let elem_builder: $builder_type = <$builder_type>::new();
647                let mut expected_builder = ListBuilder::new(elem_builder);
648
649                for val in $expected {
650                    match val {
651                        Some(v) => {
652                            expected_builder.values().append_value(v);
653                            expected_builder.append(true);
654                        }
655                        None => {
656                            expected_builder.append(false);
657                        }
658                    }
659                }
660
661                let expected = expected_builder.finish();
662                let result = actual.as_any().downcast_ref::<ListArray>().unwrap();
663                assert_eq!(&expected, result);
664            }
665        };
666    }
667
668    test_match_single_group_with_flags!(
669        match_single_group_with_flags_string,
670        vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"), None],
671        vec![r"x.*-(\d*)-.*"; 4],
672        vec!["i"; 4],
673        StringArray,
674        GenericStringBuilder<i32>,
675        [None, Some("7"), None, None]
676    );
677    test_match_single_group_with_flags!(
678        match_single_group_with_flags_stringview,
679        vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"), None],
680        vec![r"x.*-(\d*)-.*"; 4],
681        vec!["i"; 4],
682        StringViewArray,
683        StringViewBuilder,
684        [None, Some("7"), None, None]
685    );
686
687    macro_rules! test_match_scalar_pattern {
688        ($test_name:ident, $values:expr, $pattern:expr, $flag:expr, $array_type:ty, $builder_type:ty, $expected:expr) => {
689            #[test]
690            fn $test_name() {
691                let array: $array_type = <$array_type>::from($values);
692
693                let pattern_scalar = Scalar::new(<$array_type>::from(vec![$pattern; 1]));
694                let flag_scalar = Scalar::new(<$array_type>::from(vec![$flag; 1]));
695
696                let actual = regexp_match(&array, &pattern_scalar, Some(&flag_scalar)).unwrap();
697
698                let elem_builder: $builder_type = <$builder_type>::new();
699                let mut expected_builder = ListBuilder::new(elem_builder);
700
701                for val in $expected {
702                    match val {
703                        Some(v) => {
704                            expected_builder.values().append_value(v);
705                            expected_builder.append(true);
706                        }
707                        None => expected_builder.append(false),
708                    }
709                }
710
711                let expected = expected_builder.finish();
712                let result = actual.as_any().downcast_ref::<ListArray>().unwrap();
713                assert_eq!(&expected, result);
714            }
715        };
716    }
717
718    test_match_scalar_pattern!(
719        match_scalar_pattern_string_with_flags,
720        vec![
721            Some("abc-005-def"),
722            Some("x-7-5"),
723            Some("X-0-Y"),
724            Some("X545"),
725            None
726        ],
727        r"x.*-(\d*)-.*",
728        Some("i"),
729        StringArray,
730        GenericStringBuilder<i32>,
731        [None, Some("7"), Some("0"), None, None]
732    );
733    test_match_scalar_pattern!(
734        match_scalar_pattern_stringview_with_flags,
735        vec![
736            Some("abc-005-def"),
737            Some("x-7-5"),
738            Some("X-0-Y"),
739            Some("X545"),
740            None
741        ],
742        r"x.*-(\d*)-.*",
743        Some("i"),
744        StringViewArray,
745        StringViewBuilder,
746        [None, Some("7"), Some("0"), None, None]
747    );
748
749    test_match_scalar_pattern!(
750        match_scalar_pattern_string_no_flags,
751        vec![
752            Some("abc-005-def"),
753            Some("x-7-5"),
754            Some("X-0-Y"),
755            Some("X545"),
756            None
757        ],
758        r"x.*-(\d*)-.*",
759        None::<&str>,
760        StringArray,
761        GenericStringBuilder<i32>,
762        [None, Some("7"), None, None, None]
763    );
764    test_match_scalar_pattern!(
765        match_scalar_pattern_stringview_no_flags,
766        vec![
767            Some("abc-005-def"),
768            Some("x-7-5"),
769            Some("X-0-Y"),
770            Some("X545"),
771            None
772        ],
773        r"x.*-(\d*)-.*",
774        None::<&str>,
775        StringViewArray,
776        StringViewBuilder,
777        [None, Some("7"), None, None, None]
778    );
779
780    macro_rules! test_match_scalar_no_pattern {
781        ($test_name:ident, $values:expr, $array_type:ty, $pattern_type:expr, $builder_type:ty, $expected:expr) => {
782            #[test]
783            fn $test_name() {
784                let array: $array_type = <$array_type>::from($values);
785                let pattern = Scalar::new(new_null_array(&$pattern_type, 1));
786
787                let actual = regexp_match(&array, &pattern, None).unwrap();
788
789                let elem_builder: $builder_type = <$builder_type>::new();
790                let mut expected_builder = ListBuilder::new(elem_builder);
791
792                for val in $expected {
793                    match val {
794                        Some(v) => {
795                            expected_builder.values().append_value(v);
796                            expected_builder.append(true);
797                        }
798                        None => expected_builder.append(false),
799                    }
800                }
801
802                let expected = expected_builder.finish();
803                let result = actual.as_any().downcast_ref::<ListArray>().unwrap();
804                assert_eq!(&expected, result);
805            }
806        };
807    }
808
    // Null scalar pattern of type `Utf8` against a `StringArray`: every row,
    // including the already-null one, is expected to produce a null list entry.
    test_match_scalar_no_pattern!(
        match_scalar_no_pattern_string,
        vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"), None],
        StringArray,
        DataType::Utf8,
        GenericStringBuilder<i32>,
        [None::<&str>, None, None, None]
    );
    // Same case with a `Utf8View` null pattern against a `StringViewArray`.
    test_match_scalar_no_pattern!(
        match_scalar_no_pattern_stringview,
        vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"), None],
        StringViewArray,
        DataType::Utf8View,
        StringViewBuilder,
        [None::<&str>, None, None, None]
    );
825
826    macro_rules! test_match_single_group_not_skip {
827        ($test_name:ident, $values:expr, $pattern:expr, $array_type:ty, $builder_type:ty, $expected:expr) => {
828            #[test]
829            fn $test_name() {
830                let array: $array_type = <$array_type>::from($values);
831                let pattern: $array_type = <$array_type>::from(vec![$pattern]);
832
833                let actual = regexp_match(&array, &pattern, None).unwrap();
834
835                let elem_builder: $builder_type = <$builder_type>::new();
836                let mut expected_builder = ListBuilder::new(elem_builder);
837
838                for val in $expected {
839                    match val {
840                        Some(v) => {
841                            expected_builder.values().append_value(v);
842                            expected_builder.append(true);
843                        }
844                        None => expected_builder.append(false),
845                    }
846                }
847
848                let expected = expected_builder.finish();
849                let result = actual.as_any().downcast_ref::<ListArray>().unwrap();
850                assert_eq!(&expected, result);
851            }
852        };
853    }
854
    // Pattern `foo` contains no capture group; the expected list holds the
    // text `"foo"` for the first row.
    // NOTE(review): the values have two rows while the pattern array and the
    // expected list have one each - presumably `regexp_match` only pairs rows
    // up to the shorter input; confirm against its implementation.
    test_match_single_group_not_skip!(
        match_single_group_not_skip_string,
        vec![Some("foo"), Some("bar")],
        r"foo",
        StringArray,
        GenericStringBuilder<i32>,
        [Some("foo")]
    );
    // Same case exercised through the string-view array and builder types.
    test_match_single_group_not_skip!(
        match_single_group_not_skip_stringview,
        vec![Some("foo"), Some("bar")],
        r"foo",
        StringViewArray,
        StringViewBuilder,
        [Some("foo")]
    );
871
872    macro_rules! test_flag_utf8 {
873        ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => {
874            #[test]
875            fn $test_name() {
876                let left = $left;
877                let right = $right;
878                let res = $op(&left, &right, None).unwrap();
879                let expected = $expected;
880                assert_eq!(expected.len(), res.len());
881                for i in 0..res.len() {
882                    let v = res.value(i);
883                    assert_eq!(v, expected[i]);
884                }
885            }
886        };
887        ($test_name:ident, $left:expr, $right:expr, $flag:expr, $op:expr, $expected:expr) => {
888            #[test]
889            fn $test_name() {
890                let left = $left;
891                let right = $right;
892                let flag = Some($flag);
893                let res = $op(&left, &right, flag.as_ref()).unwrap();
894                let expected = $expected;
895                assert_eq!(expected.len(), res.len());
896                for i in 0..res.len() {
897                    let v = res.value(i);
898                    assert_eq!(v, expected[i]);
899                }
900            }
901        };
902    }
903
904    macro_rules! test_flag_utf8_scalar {
905        ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => {
906            #[test]
907            fn $test_name() {
908                let left = $left;
909                let res = $op(&left, $right, None).unwrap();
910                let expected = $expected;
911                assert_eq!(expected.len(), res.len());
912                for i in 0..res.len() {
913                    let v = res.value(i);
914                    assert_eq!(
915                        v,
916                        expected[i],
917                        "unexpected result when comparing {} at position {} to {} ",
918                        left.value(i),
919                        i,
920                        $right
921                    );
922                }
923            }
924        };
925        ($test_name:ident, $left:expr, $right:expr, $flag:expr, $op:expr, $expected:expr) => {
926            #[test]
927            fn $test_name() {
928                let left = $left;
929                let flag = Some($flag);
930                let res = $op(&left, $right, flag).unwrap();
931                let expected = $expected;
932                assert_eq!(expected.len(), res.len());
933                for i in 0..res.len() {
934                    let v = res.value(i);
935                    assert_eq!(
936                        v,
937                        expected[i],
938                        "unexpected result when comparing {} at position {} to {} ",
939                        left.value(i),
940                        i,
941                        $right
942                    );
943                }
944            }
945        };
946    }
947
    // `StringArray` values against per-row `StringArray` patterns, no flags.
    // The uppercase patterns `^AR` and `OW$` are expected not to match
    // "arrow"; the empty pattern is expected to match.
    test_flag_utf8!(
        test_array_regexp_is_match_utf8,
        StringArray::from(vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"]),
        StringArray::from(vec!["^ar", "^AR", "ow$", "OW$", "foo", ""]),
        regexp_is_match::<StringArray, StringArray, StringArray>,
        [true, false, true, false, false, true]
    );
    // Same inputs with an `i` flag on every row: the uppercase patterns are
    // now expected to match as well.
    test_flag_utf8!(
        test_array_regexp_is_match_utf8_insensitive,
        StringArray::from(vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"]),
        StringArray::from(vec!["^ar", "^AR", "ow$", "OW$", "foo", ""]),
        StringArray::from(vec!["i"; 6]),
        regexp_is_match,
        [true, true, true, true, false, true]
    );

    // Single scalar pattern `^ar` applied to every row, no flag: only the
    // lowercase "arrow" row is expected to match.
    test_flag_utf8_scalar!(
        test_array_regexp_is_match_utf8_scalar,
        StringArray::from(vec!["arrow", "ARROW", "parquet", "PARQUET"]),
        "^ar",
        regexp_is_match_scalar,
        [true, false, false, false]
    );
    // Empty scalar pattern: every row is expected to match.
    test_flag_utf8_scalar!(
        test_array_regexp_is_match_utf8_scalar_empty,
        StringArray::from(vec!["arrow", "ARROW", "parquet", "PARQUET"]),
        "",
        regexp_is_match_scalar,
        [true, true, true, true]
    );
    // Scalar pattern `^ar` with the `i` flag: "ARROW" is expected to match too.
    test_flag_utf8_scalar!(
        test_array_regexp_is_match_utf8_scalar_insensitive,
        StringArray::from(vec!["arrow", "ARROW", "parquet", "PARQUET"]),
        "^ar",
        "i",
        regexp_is_match_scalar,
        [true, true, false, false]
    );
986
987    test_flag_utf8!(
988        tes_array_regexp_is_match,
989        StringViewArray::from(vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"]),
990        StringViewArray::from(vec!["^ar", "^AR", "ow$", "OW$", "foo", ""]),
991        regexp_is_match::<StringViewArray, StringViewArray, StringViewArray>,
992        [true, false, true, false, false, true]
993    );
    // Mixed array types: `StringViewArray` values against `StringArray`
    // patterns, no flags.
    test_flag_utf8!(
        test_array_regexp_is_match_2,
        StringViewArray::from(vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"]),
        StringArray::from(vec!["^ar", "^AR", "ow$", "OW$", "foo", ""]),
        regexp_is_match::<StringViewArray, GenericStringArray<i32>, GenericStringArray<i32>>,
        [true, false, true, false, false, true]
    );
    // All-`StringViewArray` inputs with an `i` flag on each of the seven rows.
    // Patterns that differ from the value only by letter case are expected to
    // match; `foo` is not, and the empty pattern is.
    test_flag_utf8!(
        test_array_regexp_is_match_insensitive,
        StringViewArray::from(vec![
            "Official Rust implementation of Apache Arrow",
            "apache/arrow-rs",
            "apache/arrow-rs",
            "parquet",
            "parquet",
            "row",
            "row",
        ]),
        StringViewArray::from(vec![
            ".*rust implement.*",
            "^ap",
            "^AP",
            "et$",
            "ET$",
            "foo",
            ""
        ]),
        StringViewArray::from(vec!["i"; 7]),
        regexp_is_match::<StringViewArray, StringViewArray, StringViewArray>,
        [true, true, true, true, true, false, true]
    );
    // Three different array types at once: `LargeStringArray` values,
    // `StringViewArray` patterns and `StringArray` flags (`i` on every row).
    test_flag_utf8!(
        test_array_regexp_is_match_insensitive_2,
        LargeStringArray::from(vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrow"]),
        StringViewArray::from(vec!["^ar", "^AR", "ow$", "OW$", "foo", ""]),
        StringArray::from(vec!["i"; 6]),
        regexp_is_match::<GenericStringArray<i64>, StringViewArray, GenericStringArray<i32>>,
        [true, true, true, true, false, true]
    );
1033
    // Scalar pattern `^ap` against `StringViewArray` values, no flag: only the
    // lowercase "apache/arrow-rs" row is expected to match.
    test_flag_utf8_scalar!(
        test_array_regexp_is_match_scalar,
        StringViewArray::from(vec![
            "apache/arrow-rs",
            "APACHE/ARROW-RS",
            "parquet",
            "PARQUET",
        ]),
        "^ap",
        regexp_is_match_scalar::<StringViewArray>,
        [true, false, false, false]
    );
    // Empty scalar pattern: every row is expected to match.
    test_flag_utf8_scalar!(
        test_array_regexp_is_match_scalar_empty,
        StringViewArray::from(vec![
            "apache/arrow-rs",
            "APACHE/ARROW-RS",
            "parquet",
            "PARQUET",
        ]),
        "",
        regexp_is_match_scalar::<StringViewArray>,
        [true, true, true, true]
    );
    // Scalar pattern `^ap` with the `i` flag: the uppercase
    // "APACHE/ARROW-RS" row is expected to match too.
    test_flag_utf8_scalar!(
        test_array_regexp_is_match_scalar_insensitive,
        StringViewArray::from(vec![
            "apache/arrow-rs",
            "APACHE/ARROW-RS",
            "parquet",
            "PARQUET",
        ]),
        "^ap",
        "i",
        regexp_is_match_scalar::<StringViewArray>,
        [true, true, false, false]
    );
1071}