regex_automata/util/
interpolate.rs

1/*!
2Provides routines for interpolating capture group references.
3
4That is, if a replacement string contains references like `$foo` or `${foo1}`,
5then they are replaced with the corresponding capture values for the groups
6named `foo` and `foo1`, respectively. Similarly, syntax like `$1` and `${1}`
7is supported as well, with `1` corresponding to a capture group index and not
8a name.
9
10This module provides the free functions [`string`] and [`bytes`], which
11interpolate Rust Unicode strings and byte strings, respectively.
12
13# Format
14
15These routines support two different kinds of capture references: unbraced and
16braced.
17
For the unbraced format, the format supported is `$ref` where `ref` can be
any non-empty sequence of characters in the class `[0-9A-Za-z_]`. `ref` is
always the longest possible parse. So for example, `$1a` corresponds to the
capture group named `1a` and not the capture group at index `1`. If `ref`
matches `^[0-9]+$`, then it is treated as a capture group index itself and not
a name.
23
24For the braced format, the format supported is `${ref}` where `ref` can be any
25sequence of bytes except for `}`. If no closing brace occurs, then it is not
26considered a capture reference. As with the unbraced format, if `ref` matches
27`^[0-9]+$`, then it is treated as a capture group index and not a name.
28
The braced format is useful for exerting precise control over the name of the
capture reference. For example, `${1}a` corresponds to the capture group
reference `1` followed by the letter `a`, whereas `$1a` (as mentioned above)
corresponds to the capture group reference `1a`. The braced format is also
useful for expressing capture group names that use characters not supported by
the unbraced format. For example, `${foo[bar].baz}` refers to the capture group
named `foo[bar].baz`.
36
37If a capture group reference is found and it does not refer to a valid capture
38group, then it will be replaced with the empty string.
39
40To write a literal `$`, use `$$`.
41
42To be clear, and as exhibited via the type signatures in the routines in this
43module, it is impossible for a replacement string to be invalid. A replacement
44string may not have the intended semantics, but the interpolation procedure
45itself can never fail.
46*/
47
48use alloc::{string::String, vec::Vec};
49
50use crate::util::memchr::memchr;
51
52/// Accepts a replacement string and interpolates capture references with their
53/// corresponding values.
54///
55/// `append` should be a function that appends the string value of a capture
56/// group at a particular index to the string given. If the capture group
57/// index is invalid, then nothing should be appended.
58///
59/// `name_to_index` should be a function that maps a capture group name to a
60/// capture group index. If the given name doesn't exist, then `None` should
61/// be returned.
62///
63/// Finally, `dst` is where the final interpolated contents should be written.
64/// If `replacement` contains no capture group references, then `dst` will be
65/// equivalent to `replacement`.
66///
67/// See the [module documentation](self) for details about the format
68/// supported.
69///
70/// # Example
71///
72/// ```
73/// use regex_automata::util::interpolate;
74///
75/// let mut dst = String::new();
76/// interpolate::string(
77///     "foo $bar baz",
78///     |index, dst| {
79///         if index == 0 {
80///             dst.push_str("BAR");
81///         }
82///     },
83///     |name| {
84///         if name == "bar" {
85///             Some(0)
86///         } else {
87///             None
88///         }
89///     },
90///     &mut dst,
91/// );
92/// assert_eq!("foo BAR baz", dst);
93/// ```
94pub fn string(
95    mut replacement: &str,
96    mut append: impl FnMut(usize, &mut String),
97    mut name_to_index: impl FnMut(&str) -> Option<usize>,
98    dst: &mut String,
99) {
100    while !replacement.is_empty() {
101        match memchr(b'$', replacement.as_bytes()) {
102            None => break,
103            Some(i) => {
104                dst.push_str(&replacement[..i]);
105                replacement = &replacement[i..];
106            }
107        }
108        // Handle escaping of '$'.
109        if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') {
110            dst.push_str("$");
111            replacement = &replacement[2..];
112            continue;
113        }
114        debug_assert!(!replacement.is_empty());
115        let cap_ref = match find_cap_ref(replacement.as_bytes()) {
116            Some(cap_ref) => cap_ref,
117            None => {
118                dst.push_str("$");
119                replacement = &replacement[1..];
120                continue;
121            }
122        };
123        replacement = &replacement[cap_ref.end..];
124        match cap_ref.cap {
125            Ref::Number(i) => append(i, dst),
126            Ref::Named(name) => {
127                if let Some(i) = name_to_index(name) {
128                    append(i, dst);
129                }
130            }
131        }
132    }
133    dst.push_str(replacement);
134}
135
136/// Accepts a replacement byte string and interpolates capture references with
137/// their corresponding values.
138///
139/// `append` should be a function that appends the byte string value of a
140/// capture group at a particular index to the byte string given. If the
141/// capture group index is invalid, then nothing should be appended.
142///
143/// `name_to_index` should be a function that maps a capture group name to a
144/// capture group index. If the given name doesn't exist, then `None` should
145/// be returned.
146///
147/// Finally, `dst` is where the final interpolated contents should be written.
148/// If `replacement` contains no capture group references, then `dst` will be
149/// equivalent to `replacement`.
150///
151/// See the [module documentation](self) for details about the format
152/// supported.
153///
154/// # Example
155///
156/// ```
157/// use regex_automata::util::interpolate;
158///
159/// let mut dst = vec![];
160/// interpolate::bytes(
161///     b"foo $bar baz",
162///     |index, dst| {
163///         if index == 0 {
164///             dst.extend_from_slice(b"BAR");
165///         }
166///     },
167///     |name| {
168///         if name == "bar" {
169///             Some(0)
170///         } else {
171///             None
172///         }
173///     },
174///     &mut dst,
175/// );
176/// assert_eq!(&b"foo BAR baz"[..], dst);
177/// ```
178pub fn bytes(
179    mut replacement: &[u8],
180    mut append: impl FnMut(usize, &mut Vec<u8>),
181    mut name_to_index: impl FnMut(&str) -> Option<usize>,
182    dst: &mut Vec<u8>,
183) {
184    while !replacement.is_empty() {
185        match memchr(b'$', replacement) {
186            None => break,
187            Some(i) => {
188                dst.extend_from_slice(&replacement[..i]);
189                replacement = &replacement[i..];
190            }
191        }
192        // Handle escaping of '$'.
193        if replacement.get(1).map_or(false, |&b| b == b'$') {
194            dst.push(b'$');
195            replacement = &replacement[2..];
196            continue;
197        }
198        debug_assert!(!replacement.is_empty());
199        let cap_ref = match find_cap_ref(replacement) {
200            Some(cap_ref) => cap_ref,
201            None => {
202                dst.push(b'$');
203                replacement = &replacement[1..];
204                continue;
205            }
206        };
207        replacement = &replacement[cap_ref.end..];
208        match cap_ref.cap {
209            Ref::Number(i) => append(i, dst),
210            Ref::Named(name) => {
211                if let Some(i) = name_to_index(name) {
212                    append(i, dst);
213                }
214            }
215        }
216    }
217    dst.extend_from_slice(replacement);
218}
219
220/// `CaptureRef` represents a reference to a capture group inside some text.
221/// The reference is either a capture group name or a number.
222///
223/// It is also tagged with the position in the text following the
224/// capture reference.
225#[derive(Clone, Copy, Debug, Eq, PartialEq)]
226struct CaptureRef<'a> {
227    cap: Ref<'a>,
228    end: usize,
229}
230
/// Identifies the capture group that a reference such as `$2`, `$foo` or
/// `${foo}` points at.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Ref<'a> {
    /// The group is identified by its name.
    Named(&'a str),
    /// The group is identified by its index.
    Number(usize),
}

/// Builds a by-name reference from a string slice.
impl<'a> From<&'a str> for Ref<'a> {
    fn from(name: &'a str) -> Self {
        Self::Named(name)
    }
}

/// Builds a by-index reference from a group index.
impl From<usize> for Ref<'static> {
    fn from(index: usize) -> Self {
        Self::Number(index)
    }
}
251
252/// Parses a possible reference to a capture group name in the given text,
253/// starting at the beginning of `replacement`.
254///
255/// If no such valid reference could be found, None is returned.
256///
257/// Note that this returns a "possible" reference because this routine doesn't
258/// know whether the reference is to a valid group or not. If it winds up not
259/// being a valid reference, then it should be replaced with the empty string.
260fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> {
261    let mut i = 0;
262    let rep: &[u8] = replacement;
263    if rep.len() <= 1 || rep[0] != b'$' {
264        return None;
265    }
266    i += 1;
267    if rep[i] == b'{' {
268        return find_cap_ref_braced(rep, i + 1);
269    }
270    let mut cap_end = i;
271    while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) {
272        cap_end += 1;
273    }
274    if cap_end == i {
275        return None;
276    }
277    // We just verified that the range 0..cap_end is valid ASCII, so it must
278    // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
279    // check via an unchecked conversion or by parsing the number straight from
280    // &[u8].
281    let cap = core::str::from_utf8(&rep[i..cap_end])
282        .expect("valid UTF-8 capture name");
283    Some(CaptureRef {
284        cap: match cap.parse::<usize>() {
285            Ok(i) => Ref::Number(i),
286            Err(_) => Ref::Named(cap),
287        },
288        end: cap_end,
289    })
290}
291
292/// Looks for a braced reference, e.g., `${foo1}`. This assumes that an opening
293/// brace has been found at `i-1` in `rep`. This then looks for a closing
294/// brace and returns the capture reference within the brace.
295fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> {
296    assert_eq!(b'{', rep[i.checked_sub(1).unwrap()]);
297    let start = i;
298    while rep.get(i).map_or(false, |&b| b != b'}') {
299        i += 1;
300    }
301    if !rep.get(i).map_or(false, |&b| b == b'}') {
302        return None;
303    }
304    // When looking at braced names, we don't put any restrictions on the name,
305    // so it's possible it could be invalid UTF-8. But a capture group name
306    // can never be invalid UTF-8, so if we have invalid UTF-8, then we can
307    // safely return None.
308    let cap = match core::str::from_utf8(&rep[start..i]) {
309        Err(_) => return None,
310        Ok(cap) => cap,
311    };
312    Some(CaptureRef {
313        cap: match cap.parse::<usize>() {
314            Ok(i) => Ref::Number(i),
315            Err(_) => Ref::Named(cap),
316        },
317        end: i + 1,
318    })
319}
320
/// Reports whether `b` may appear in a capture name written without braces,
/// i.e., whether it is an ASCII letter, an ASCII digit or an underscore.
fn is_valid_cap_letter(b: u8) -> bool {
    b == b'_' || b.is_ascii_alphanumeric()
}
329
#[cfg(test)]
mod tests {
    use alloc::{string::String, vec, vec::Vec};

    use super::{find_cap_ref, CaptureRef};

    // Defines a test for `find_cap_ref`. With two arguments, the text must
    // not parse as a capture reference. With three, it must parse to the
    // given `CaptureRef`.
    macro_rules! find {
        ($name:ident, $text:expr) => {
            #[test]
            fn $name() {
                assert_eq!(None, find_cap_ref($text.as_bytes()));
            }
        };
        ($name:ident, $text:expr, $capref:expr) => {
            #[test]
            fn $name() {
                assert_eq!(Some($capref), find_cap_ref($text.as_bytes()));
            }
        };
    }

    // Shorthand for building the expected `CaptureRef` from either a name
    // or an index, plus the offset where the reference ends.
    macro_rules! c {
        ($name_or_number:expr, $pos:expr) => {
            CaptureRef { cap: $name_or_number.into(), end: $pos }
        };
    }

    find!(find_cap_ref1, "$foo", c!("foo", 4));
    find!(find_cap_ref2, "${foo}", c!("foo", 6));
    find!(find_cap_ref3, "$0", c!(0, 2));
    find!(find_cap_ref4, "$5", c!(5, 2));
    find!(find_cap_ref5, "$10", c!(10, 3));
    // See https://github.com/rust-lang/regex/pull/585
    // for more on characters following numbers
    find!(find_cap_ref6, "$42a", c!("42a", 4));
    find!(find_cap_ref7, "${42}a", c!(42, 5));
    find!(find_cap_ref8, "${42");
    find!(find_cap_ref9, "${42 ");
    find!(find_cap_ref10, " $0 ");
    find!(find_cap_ref11, "$");
    find!(find_cap_ref12, " ");
    find!(find_cap_ref13, "");
    find!(find_cap_ref14, "$1-$2", c!(1, 2));
    find!(find_cap_ref15, "$1_$2", c!("1_", 3));
    find!(find_cap_ref16, "$x-$y", c!("x", 2));
    find!(find_cap_ref17, "$x_$y", c!("x_", 3));
    find!(find_cap_ref18, "${#}", c!("#", 4));
    find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
    find!(find_cap_ref20, "${¾}", c!("¾", 5));
    find!(find_cap_ref21, "${¾a}", c!("¾a", 6));
    find!(find_cap_ref22, "${a¾}", c!("a¾", 6));
    find!(find_cap_ref23, "${☃}", c!("☃", 6));
    find!(find_cap_ref24, "${a☃}", c!("a☃", 7));
    find!(find_cap_ref25, "${☃a}", c!("☃a", 7));
    find!(find_cap_ref26, "${名字}", c!("名字", 9));

    // Finds the index recorded for `name` in `table`, which must already be
    // sorted by name.
    fn lookup(table: &[(&'static str, usize)], name: &str) -> Option<usize> {
        table
            .binary_search_by_key(&name, |&(key, _)| key)
            .ok()
            .map(|pos| table[pos].1)
    }

    // Runs `interpolate::string` over `replacement`, where `caps[i]` is the
    // text of group `i` and `name_to_index` lists the named groups.
    fn interpolate_string(
        mut name_to_index: Vec<(&'static str, usize)>,
        caps: Vec<&'static str>,
        replacement: &str,
    ) -> String {
        name_to_index.sort_by_key(|&(key, _)| key);

        let mut out = String::new();
        super::string(
            replacement,
            |i, dst| {
                if let Some(group) = caps.get(i) {
                    dst.push_str(group);
                }
            },
            |name| -> Option<usize> { lookup(&name_to_index, name) },
            &mut out,
        );
        out
    }

    // Same as `interpolate_string`, but goes through `interpolate::bytes`
    // and converts the result back into a `String`.
    fn interpolate_bytes(
        mut name_to_index: Vec<(&'static str, usize)>,
        caps: Vec<&'static str>,
        replacement: &str,
    ) -> String {
        name_to_index.sort_by_key(|&(key, _)| key);

        let mut out = vec![];
        super::bytes(
            replacement.as_bytes(),
            |i, dst| {
                if let Some(group) = caps.get(i) {
                    dst.extend_from_slice(group.as_bytes());
                }
            },
            |name| -> Option<usize> { lookup(&name_to_index, name) },
            &mut out,
        );
        String::from_utf8(out).unwrap()
    }

    // Defines a test that checks both `interpolate::string` and
    // `interpolate::bytes` produce `$expected` for the replacement `$hay`.
    macro_rules! interp {
        ($name:ident, $map:expr, $caps:expr, $hay:expr, $expected:expr $(,)*) => {
            #[test]
            fn $name() {
                assert_eq!(
                    $expected,
                    interpolate_string($map, $caps, $hay),
                    "interpolate::string failed",
                );
                assert_eq!(
                    $expected,
                    interpolate_bytes($map, $caps, $hay),
                    "interpolate::bytes failed",
                );
            }
        };
    }

    interp!(
        interp1,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $foo test",
        "test xxx test",
    );

    interp!(
        interp2,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test$footest",
        "test",
    );

    interp!(
        interp3,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test${foo}test",
        "testxxxtest",
    );

    interp!(
        interp4,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test$2test",
        "test",
    );

    interp!(
        interp5,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test${2}test",
        "testxxxtest",
    );

    interp!(
        interp6,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $$foo test",
        "test $foo test",
    );

    interp!(
        interp7,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $foo",
        "test xxx",
    );

    interp!(
        interp8,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "$foo test",
        "xxx test",
    );

    interp!(
        interp9,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test $bar$foo",
        "test yyyxxx",
    );

    interp!(
        interp10,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test $ test",
        "test $ test",
    );

    interp!(
        interp11,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${} test",
        "test  test",
    );

    interp!(
        interp12,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${ } test",
        "test  test",
    );

    interp!(
        interp13,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${a b} test",
        "test  test",
    );

    interp!(
        interp14,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${a} test",
        "test  test",
    );

    // An opening `${` that is never closed is not a reference, so it is
    // copied through literally. The unbraced reference that appears after
    // it is still recognized and expanded as usual.
    interp!(
        interp15,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${wat $bar ok",
        "test ${wat yyy ok",
    );
}