fancy_regex/lib.rs
1// Copyright 2016 The Fancy Regex Authors.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21/*!
22An implementation of regexes, supporting a relatively rich set of features, including backreferences
23and lookaround.
24
25It builds on top of the excellent [regex] crate. If you are not
26familiar with it, make sure you read its documentation and maybe you don't even need fancy-regex.
27
28If your regex or parts of it does not use any special features, the matching is delegated to the
29regex crate. That means it has linear runtime. But if you use "fancy" features such as
30backreferences or look-around, an engine with backtracking needs to be used. In that case, the regex
31can be slow and take exponential time to run because of what is called "catastrophic backtracking".
32This depends on the regex and the input.
33
34# Usage
35
36The API should feel very similar to the regex crate, and involves compiling a regex and then using
37it to find matches in text.
38
39## Example: Matching text
40
41An example with backreferences to check if a text consists of two identical words:
42
43```rust
44use fancy_regex::Regex;
45
46let re = Regex::new(r"^(\w+) (\1)$").unwrap();
47let result = re.is_match("foo foo");
48
49assert!(result.is_ok());
50let did_match = result.unwrap();
51assert!(did_match);
52```
53
54Note that like in the regex crate, the regex needs anchors like `^` and `$` to match against the
55entire input text.
56
57## Example: Finding the position of matches
58
59```rust
60use fancy_regex::Regex;
61
62let re = Regex::new(r"(\d)\1").unwrap();
63let result = re.find("foo 22");
64
65assert!(result.is_ok(), "execution was successful");
66let match_option = result.unwrap();
67
68assert!(match_option.is_some(), "found a match");
69let m = match_option.unwrap();
70
71assert_eq!(m.start(), 4);
72assert_eq!(m.end(), 6);
73assert_eq!(m.as_str(), "22");
74```
75
76## Example: Capturing groups
77
78```rust
79use fancy_regex::Regex;
80
81let re = Regex::new(r"(?<!AU)\$(\d+)").unwrap();
82let result = re.captures("AU$10, $20");
83
84let captures = result.expect("Error running regex").expect("No match found");
85let group = captures.get(1).expect("No group");
86assert_eq!(group.as_str(), "20");
87```
88
89## Example: Splitting text
90
91```rust
92use fancy_regex::Regex;
93
94let re = Regex::new(r"[ \t]+").unwrap();
95let target = "a b \t c\td e";
96let fields: Vec<&str> = re.split(target).map(|x| x.unwrap()).collect();
97assert_eq!(fields, vec!["a", "b", "c", "d", "e"]);
98
99let fields: Vec<&str> = re.splitn(target, 3).map(|x| x.unwrap()).collect();
100assert_eq!(fields, vec!["a", "b", "c\td e"]);
101```
102
103# Syntax
104
105The regex syntax is based on the [regex] crate's, with some additional supported syntax.
106
107Escapes:
108
109`\h`
110: hex digit (`[0-9A-Fa-f]`) \
111`\H`
112: not hex digit (`[^0-9A-Fa-f]`) \
113`\e`
114: escape control character (`\x1B`) \
115`\K`
116: keep text matched so far out of the overall match ([docs](https://www.regular-expressions.info/keep.html))\
117`\G`
118: anchor to where the previous match ended ([docs](https://www.regular-expressions.info/continue.html))
119
120Backreferences:
121
122`\1`
123: match the exact string that the first capture group matched \
124`\2`
125: backref to the second capture group, etc
126
127Named capture groups:
128
129`(?<name>exp)`
130: match *exp*, creating capture group named *name* \
131`\k<name>`
132: match the exact string that the capture group named *name* matched \
133`(?P<name>exp)`
134: same as `(?<name>exp)` for compatibility with Python, etc. \
135`(?P=name)`
136: same as `\k<name>` for compatibility with Python, etc.
137
138Look-around assertions for matching without changing the current position:
139
140`(?=exp)`
141: look-ahead, succeeds if *exp* matches to the right of the current position \
142`(?!exp)`
143: negative look-ahead, succeeds if *exp* doesn't match to the right \
144`(?<=exp)`
145: look-behind, succeeds if *exp* matches to the left of the current position \
146`(?<!exp)`
147: negative look-behind, succeeds if *exp* doesn't match to the left
148
149Atomic groups using `(?>exp)` to prevent backtracking within `exp`, e.g.:
150
151```
152# use fancy_regex::Regex;
153let re = Regex::new(r"^a(?>bc|b)c$").unwrap();
154assert!(re.is_match("abcc").unwrap());
155// Doesn't match because `|b` is never tried because of the atomic group
156assert!(!re.is_match("abc").unwrap());
157```
158
159Conditionals - if/then/else:
160
161`(?(1))`
162: continue only if first capture group matched \
163`(?(<name>))`
164: continue only if capture group named *name* matched \
165`(?(1)true_branch|false_branch)`
166: if the first capture group matched then execute the true_branch regex expression, else execute false_branch ([docs](https://www.regular-expressions.info/conditional.html)) \
167`(?(condition)true_branch|false_branch)`
168: if the condition matches then execute the true_branch regex expression, else execute false_branch from the point just before the condition was evaluated
169
170[regex]: https://crates.io/crates/regex
171*/
172
173#![doc(html_root_url = "https://docs.rs/fancy-regex/0.14.0")]
174#![deny(missing_docs)]
175#![deny(missing_debug_implementations)]
176#![cfg_attr(not(feature = "std"), no_std)]
177
178extern crate alloc;
179
180use alloc::borrow::{Cow, ToOwned};
181use alloc::boxed::Box;
182use alloc::string::{String, ToString};
183use alloc::sync::Arc;
184use alloc::vec;
185use alloc::vec::Vec;
186
187use core::convert::TryFrom;
188use core::fmt::{Debug, Formatter};
189use core::ops::{Index, Range};
190use core::str::FromStr;
191use core::{fmt, usize};
192use regex_automata::meta::Regex as RaRegex;
193use regex_automata::util::captures::Captures as RaCaptures;
194use regex_automata::util::syntax::Config as SyntaxConfig;
195use regex_automata::Input as RaInput;
196
197mod analyze;
198mod compile;
199mod error;
200mod expand;
201mod parse;
202mod replacer;
203mod vm;
204
205use crate::analyze::analyze;
206use crate::compile::compile;
207use crate::parse::{ExprTree, NamedGroups, Parser};
208use crate::vm::{Prog, OPTION_SKIPPED_EMPTY_MATCH};
209
210pub use crate::error::{CompileError, Error, ParseError, Result, RuntimeError};
211pub use crate::expand::Expander;
212pub use crate::replacer::{NoExpand, Replacer, ReplacerRef};
213
214const MAX_RECURSION: usize = 64;
215
216// the public API
217
218/// A builder for a `Regex` to allow configuring options.
219#[derive(Debug)]
220pub struct RegexBuilder(RegexOptions);
221
222/// A compiled regular expression.
223#[derive(Clone)]
224pub struct Regex {
225 inner: RegexImpl,
226 named_groups: Arc<NamedGroups>,
227}
228
229// Separate enum because we don't want to expose any of this
230#[derive(Clone)]
231enum RegexImpl {
232 // Do we want to box this? It's pretty big...
233 Wrap {
234 inner: RaRegex,
235 options: RegexOptions,
236 },
237 Fancy {
238 prog: Prog,
239 n_groups: usize,
240 options: RegexOptions,
241 },
242}
243
244/// A single match of a regex or group in an input text
245#[derive(Copy, Clone, Debug, Eq, PartialEq)]
246pub struct Match<'t> {
247 text: &'t str,
248 start: usize,
249 end: usize,
250}
251
252/// An iterator over all non-overlapping matches for a particular string.
253///
254/// The iterator yields a `Result<Match>`. The iterator stops when no more
255/// matches can be found.
256///
257/// `'r` is the lifetime of the compiled regular expression and `'t` is the
258/// lifetime of the matched string.
259#[derive(Debug)]
260pub struct Matches<'r, 't> {
261 re: &'r Regex,
262 text: &'t str,
263 last_end: usize,
264 last_match: Option<usize>,
265}
266
267impl<'r, 't> Matches<'r, 't> {
268 /// Return the text being searched.
269 pub fn text(&self) -> &'t str {
270 self.text
271 }
272
273 /// Return the underlying regex.
274 pub fn regex(&self) -> &'r Regex {
275 &self.re
276 }
277}
278
279impl<'r, 't> Iterator for Matches<'r, 't> {
280 type Item = Result<Match<'t>>;
281
282 /// Adapted from the `regex` crate. Calls `find_from_pos` repeatedly.
283 /// Ignores empty matches immediately after a match.
284 fn next(&mut self) -> Option<Self::Item> {
285 if self.last_end > self.text.len() {
286 return None;
287 }
288
289 let option_flags = if let Some(last_match) = self.last_match {
290 if self.last_end > last_match {
291 OPTION_SKIPPED_EMPTY_MATCH
292 } else {
293 0
294 }
295 } else {
296 0
297 };
298 let mat =
299 match self
300 .re
301 .find_from_pos_with_option_flags(self.text, self.last_end, option_flags)
302 {
303 Err(error) => return Some(Err(error)),
304 Ok(None) => return None,
305 Ok(Some(mat)) => mat,
306 };
307
308 if mat.start == mat.end {
309 // This is an empty match. To ensure we make progress, start
310 // the next search at the smallest possible starting position
311 // of the next match following this one.
312 self.last_end = next_utf8(self.text, mat.end);
313 // Don't accept empty matches immediately following a match.
314 // Just move on to the next match.
315 if Some(mat.end) == self.last_match {
316 return self.next();
317 }
318 } else {
319 self.last_end = mat.end;
320 }
321
322 self.last_match = Some(mat.end);
323
324 Some(Ok(mat))
325 }
326}
327
328/// An iterator that yields all non-overlapping capture groups matching a
329/// particular regular expression.
330///
331/// The iterator stops when no more matches can be found.
332///
333/// `'r` is the lifetime of the compiled regular expression and `'t` is the
334/// lifetime of the matched string.
335#[derive(Debug)]
336pub struct CaptureMatches<'r, 't>(Matches<'r, 't>);
337
338impl<'r, 't> CaptureMatches<'r, 't> {
339 /// Return the text being searched.
340 pub fn text(&self) -> &'t str {
341 self.0.text
342 }
343
344 /// Return the underlying regex.
345 pub fn regex(&self) -> &'r Regex {
346 &self.0.re
347 }
348}
349
350impl<'r, 't> Iterator for CaptureMatches<'r, 't> {
351 type Item = Result<Captures<'t>>;
352
353 /// Adapted from the `regex` crate. Calls `captures_from_pos` repeatedly.
354 /// Ignores empty matches immediately after a match.
355 fn next(&mut self) -> Option<Self::Item> {
356 if self.0.last_end > self.0.text.len() {
357 return None;
358 }
359
360 let captures = match self.0.re.captures_from_pos(self.0.text, self.0.last_end) {
361 Err(error) => return Some(Err(error)),
362 Ok(None) => return None,
363 Ok(Some(captures)) => captures,
364 };
365
366 let mat = captures
367 .get(0)
368 .expect("`Captures` is expected to have entire match at 0th position");
369 if mat.start == mat.end {
370 self.0.last_end = next_utf8(self.0.text, mat.end);
371 if Some(mat.end) == self.0.last_match {
372 return self.next();
373 }
374 } else {
375 self.0.last_end = mat.end;
376 }
377
378 self.0.last_match = Some(mat.end);
379
380 Some(Ok(captures))
381 }
382}
383
384/// A set of capture groups found for a regex.
385#[derive(Debug)]
386pub struct Captures<'t> {
387 inner: CapturesImpl<'t>,
388 named_groups: Arc<NamedGroups>,
389}
390
391#[derive(Debug)]
392enum CapturesImpl<'t> {
393 Wrap {
394 text: &'t str,
395 locations: RaCaptures,
396 },
397 Fancy {
398 text: &'t str,
399 saves: Vec<usize>,
400 },
401}
402
403/// Iterator for captured groups in order in which they appear in the regex.
404#[derive(Debug)]
405pub struct SubCaptureMatches<'c, 't> {
406 caps: &'c Captures<'t>,
407 i: usize,
408}
409
410/// An iterator over all substrings delimited by a regex.
411///
412/// This iterator yields `Result<&'h str>`, where each item is a substring of the
413/// target string that is delimited by matches of the regular expression. It stops when there
414/// are no more substrings to yield.
415///
416/// `'r` is the lifetime of the compiled regular expression, and `'h` is the
417/// lifetime of the target string being split.
418///
419/// This iterator can be created by the [`Regex::split`] method.
420#[derive(Debug)]
421pub struct Split<'r, 'h> {
422 matches: Matches<'r, 'h>,
423 next_start: usize,
424 target: &'h str,
425}
426
427impl<'r, 'h> Iterator for Split<'r, 'h> {
428 type Item = Result<&'h str>;
429
430 /// Returns the next substring that results from splitting the target string by the regex.
431 ///
432 /// If no more matches are found, returns the remaining part of the string,
433 /// or `None` if all substrings have been yielded.
434 fn next(&mut self) -> Option<Result<&'h str>> {
435 match self.matches.next() {
436 None => {
437 let len = self.target.len();
438 if self.next_start > len {
439 // No more substrings to return
440 None
441 } else {
442 // Return the last part of the target string
443 // Next call will return None
444 let part = &self.target[self.next_start..len];
445 self.next_start = len + 1;
446 Some(Ok(part))
447 }
448 }
449 // Return the next substring
450 Some(Ok(m)) => {
451 let part = &self.target[self.next_start..m.start()];
452 self.next_start = m.end();
453 Some(Ok(part))
454 }
455 Some(Err(e)) => Some(Err(e)),
456 }
457 }
458}
459
460impl<'r, 'h> core::iter::FusedIterator for Split<'r, 'h> {}
461
462/// An iterator over at most `N` substrings delimited by a regex.
463///
464/// This iterator yields `Result<&'h str>`, where each item is a substring of the
465/// target that is delimited by matches of the regular expression. It stops either when
466/// there are no more substrings to yield, or after `N` substrings have been yielded.
467///
468/// The `N`th substring is the remaining part of the target.
469///
470/// `'r` is the lifetime of the compiled regular expression, and `'h` is the
471/// lifetime of the target string being split.
472///
473/// This iterator can be created by the [`Regex::splitn`] method.
474#[derive(Debug)]
475pub struct SplitN<'r, 'h> {
476 splits: Split<'r, 'h>,
477 limit: usize,
478}
479
480impl<'r, 'h> Iterator for SplitN<'r, 'h> {
481 type Item = Result<&'h str>;
482
483 /// Returns the next substring resulting from splitting the target by the regex,
484 /// limited to `N` splits.
485 ///
486 /// Returns `None` if no more matches are found or if the limit is reached after yielding
487 /// the remaining part of the target.
488 fn next(&mut self) -> Option<Result<&'h str>> {
489 if self.limit == 0 {
490 // Limit reached. No more substrings available.
491 return None;
492 }
493
494 // Decrement the limit for each split.
495 self.limit -= 1;
496 if self.limit > 0 {
497 return self.splits.next();
498 }
499
500 // Nth split
501 let len = self.splits.target.len();
502 if self.splits.next_start > len {
503 // No more substrings available.
504 return None;
505 } else {
506 // Return the remaining part of the target
507 let start = self.splits.next_start;
508 self.splits.next_start = len + 1;
509 return Some(Ok(&self.splits.target[start..len]));
510 }
511 }
512
513 fn size_hint(&self) -> (usize, Option<usize>) {
514 (0, Some(self.limit))
515 }
516}
517
518impl<'r, 'h> core::iter::FusedIterator for SplitN<'r, 'h> {}
519
520#[derive(Clone, Debug)]
521struct RegexOptions {
522 pattern: String,
523 syntaxc: SyntaxConfig,
524 backtrack_limit: usize,
525 delegate_size_limit: Option<usize>,
526 delegate_dfa_size_limit: Option<usize>,
527}
528
529impl Default for RegexOptions {
530 fn default() -> Self {
531 RegexOptions {
532 pattern: String::new(),
533 syntaxc: SyntaxConfig::default(),
534 backtrack_limit: 1_000_000,
535 delegate_size_limit: None,
536 delegate_dfa_size_limit: None,
537 }
538 }
539}
540
541impl RegexBuilder {
542 /// Create a new regex builder with a regex pattern.
543 ///
544 /// If the pattern is invalid, the call to `build` will fail later.
545 pub fn new(pattern: &str) -> Self {
546 let mut builder = RegexBuilder(RegexOptions::default());
547 builder.0.pattern = pattern.to_string();
548 builder
549 }
550
551 /// Build the `Regex`.
552 ///
553 /// Returns an [`Error`](enum.Error.html) if the pattern could not be parsed.
554 pub fn build(&self) -> Result<Regex> {
555 Regex::new_options(self.0.clone())
556 }
557
558 /// Override default case insensitive
559 /// this is to enable/disable casing via builder instead of a flag within
560 /// the raw string provided to the regex builder
561 ///
562 /// Default is false
563 pub fn case_insensitive(&mut self, yes: bool) -> &mut Self {
564 let syntaxc = self.0.syntaxc.to_owned();
565 self.0.syntaxc = syntaxc.case_insensitive(yes);
566 self
567 }
568
569 /// Limit for how many times backtracking should be attempted for fancy regexes (where
570 /// backtracking is used). If this limit is exceeded, execution returns an error with
571 /// [`Error::BacktrackLimitExceeded`](enum.Error.html#variant.BacktrackLimitExceeded).
572 /// This is for preventing a regex with catastrophic backtracking to run for too long.
573 ///
574 /// Default is `1_000_000` (1 million).
575 pub fn backtrack_limit(&mut self, limit: usize) -> &mut Self {
576 self.0.backtrack_limit = limit;
577 self
578 }
579
580 /// Set the approximate size limit of the compiled regular expression.
581 ///
582 /// This option is forwarded from the wrapped `regex` crate. Note that depending on the used
583 /// regex features there may be multiple delegated sub-regexes fed to the `regex` crate. As
584 /// such the actual limit is closer to `<number of delegated regexes> * delegate_size_limit`.
585 pub fn delegate_size_limit(&mut self, limit: usize) -> &mut Self {
586 self.0.delegate_size_limit = Some(limit);
587 self
588 }
589
590 /// Set the approximate size of the cache used by the DFA.
591 ///
592 /// This option is forwarded from the wrapped `regex` crate. Note that depending on the used
593 /// regex features there may be multiple delegated sub-regexes fed to the `regex` crate. As
594 /// such the actual limit is closer to `<number of delegated regexes> *
595 /// delegate_dfa_size_limit`.
596 pub fn delegate_dfa_size_limit(&mut self, limit: usize) -> &mut Self {
597 self.0.delegate_dfa_size_limit = Some(limit);
598 self
599 }
600}
601
602impl fmt::Debug for Regex {
603 /// Shows the original regular expression.
604 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
605 write!(f, "{}", self.as_str())
606 }
607}
608
609impl fmt::Display for Regex {
610 /// Shows the original regular expression
611 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
612 write!(f, "{}", self.as_str())
613 }
614}
615
616impl FromStr for Regex {
617 type Err = Error;
618
619 /// Attempts to parse a string into a regular expression
620 fn from_str(s: &str) -> Result<Regex> {
621 Regex::new(s)
622 }
623}
624
625impl Regex {
626 /// Parse and compile a regex with default options, see `RegexBuilder`.
627 ///
628 /// Returns an [`Error`](enum.Error.html) if the pattern could not be parsed.
629 pub fn new(re: &str) -> Result<Regex> {
630 let options = RegexOptions {
631 pattern: re.to_string(),
632 ..RegexOptions::default()
633 };
634 Self::new_options(options)
635 }
636
637 fn new_options(options: RegexOptions) -> Result<Regex> {
638 let raw_tree = Expr::parse_tree(&options.pattern)?;
639
640 // wrapper to search for re at arbitrary start position,
641 // and to capture the match bounds
642 let tree = ExprTree {
643 expr: Expr::Concat(vec![
644 Expr::Repeat {
645 child: Box::new(Expr::Any { newline: true }),
646 lo: 0,
647 hi: usize::MAX,
648 greedy: false,
649 },
650 Expr::Group(Box::new(raw_tree.expr)),
651 ]),
652 ..raw_tree
653 };
654
655 let info = analyze(&tree)?;
656
657 let inner_info = &info.children[1].children[0]; // references inner expr
658 if !inner_info.hard {
659 // easy case, wrap regex
660
661 // we do our own to_str because escapes are different
662 let mut re_cooked = String::new();
663 // same as raw_tree.expr above, but it was moved, so traverse to find it
664 let raw_e = match tree.expr {
665 Expr::Concat(ref v) => match v[1] {
666 Expr::Group(ref child) => child,
667 _ => unreachable!(),
668 },
669 _ => unreachable!(),
670 };
671 raw_e.to_str(&mut re_cooked, 0);
672 let inner = compile::compile_inner(&re_cooked, &options)?;
673 return Ok(Regex {
674 inner: RegexImpl::Wrap { inner, options },
675 named_groups: Arc::new(tree.named_groups),
676 });
677 }
678
679 let prog = compile(&info)?;
680 Ok(Regex {
681 inner: RegexImpl::Fancy {
682 prog,
683 n_groups: info.end_group,
684 options,
685 },
686 named_groups: Arc::new(tree.named_groups),
687 })
688 }
689
690 /// Returns the original string of this regex.
691 pub fn as_str(&self) -> &str {
692 match &self.inner {
693 RegexImpl::Wrap { options, .. } => &options.pattern,
694 RegexImpl::Fancy { options, .. } => &options.pattern,
695 }
696 }
697
698 /// Check if the regex matches the input text.
699 ///
700 /// # Example
701 ///
702 /// Test if some text contains the same word twice:
703 ///
704 /// ```rust
705 /// # use fancy_regex::Regex;
706 ///
707 /// let re = Regex::new(r"(\w+) \1").unwrap();
708 /// assert!(re.is_match("mirror mirror on the wall").unwrap());
709 /// ```
710 pub fn is_match(&self, text: &str) -> Result<bool> {
711 match &self.inner {
712 RegexImpl::Wrap { ref inner, .. } => Ok(inner.is_match(text)),
713 RegexImpl::Fancy {
714 ref prog, options, ..
715 } => {
716 let result = vm::run(prog, text, 0, 0, options)?;
717 Ok(result.is_some())
718 }
719 }
720 }
721
722 /// Returns an iterator for each successive non-overlapping match in `text`.
723 ///
724 /// If you have capturing groups in your regex that you want to extract, use the [Regex::captures_iter()]
725 /// method.
726 ///
727 /// # Example
728 ///
729 /// Find all words followed by an exclamation point:
730 ///
731 /// ```rust
732 /// # use fancy_regex::Regex;
733 ///
734 /// let re = Regex::new(r"\w+(?=!)").unwrap();
735 /// let mut matches = re.find_iter("so fancy! even with! iterators!");
736 /// assert_eq!(matches.next().unwrap().unwrap().as_str(), "fancy");
737 /// assert_eq!(matches.next().unwrap().unwrap().as_str(), "with");
738 /// assert_eq!(matches.next().unwrap().unwrap().as_str(), "iterators");
739 /// assert!(matches.next().is_none());
740 /// ```
741 pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> {
742 Matches {
743 re: &self,
744 text,
745 last_end: 0,
746 last_match: None,
747 }
748 }
749
750 /// Find the first match in the input text.
751 ///
752 /// If you have capturing groups in your regex that you want to extract, use the [Regex::captures()]
753 /// method.
754 ///
755 /// # Example
756 ///
757 /// Find a word that is followed by an exclamation point:
758 ///
759 /// ```rust
760 /// # use fancy_regex::Regex;
761 ///
762 /// let re = Regex::new(r"\w+(?=!)").unwrap();
763 /// assert_eq!(re.find("so fancy!").unwrap().unwrap().as_str(), "fancy");
764 /// ```
765 pub fn find<'t>(&self, text: &'t str) -> Result<Option<Match<'t>>> {
766 self.find_from_pos(text, 0)
767 }
768
769 /// Returns the first match in `text`, starting from the specified byte position `pos`.
770 ///
771 /// # Examples
772 ///
773 /// Finding match starting at a position:
774 ///
775 /// ```
776 /// # use fancy_regex::Regex;
777 /// let re = Regex::new(r"(?m:^)(\d+)").unwrap();
778 /// let text = "1 test 123\n2 foo";
779 /// let mat = re.find_from_pos(text, 7).unwrap().unwrap();
780 ///
781 /// assert_eq!(mat.start(), 11);
782 /// assert_eq!(mat.end(), 12);
783 /// ```
784 ///
785 /// Note that in some cases this is not the same as using the `find`
786 /// method and passing a slice of the string, see [Regex::captures_from_pos()] for details.
787 pub fn find_from_pos<'t>(&self, text: &'t str, pos: usize) -> Result<Option<Match<'t>>> {
788 self.find_from_pos_with_option_flags(text, pos, 0)
789 }
790
791 fn find_from_pos_with_option_flags<'t>(
792 &self,
793 text: &'t str,
794 pos: usize,
795 option_flags: u32,
796 ) -> Result<Option<Match<'t>>> {
797 match &self.inner {
798 RegexImpl::Wrap { inner, .. } => Ok(inner
799 .search(&RaInput::new(text).span(pos..text.len()))
800 .map(|m| Match::new(text, m.start(), m.end()))),
801 RegexImpl::Fancy { prog, options, .. } => {
802 let result = vm::run(prog, text, pos, option_flags, options)?;
803 Ok(result.map(|saves| Match::new(text, saves[0], saves[1])))
804 }
805 }
806 }
807
808 /// Returns an iterator over all the non-overlapping capture groups matched in `text`.
809 ///
810 /// # Examples
811 ///
812 /// Finding all matches and capturing parts of each:
813 ///
814 /// ```rust
815 /// # use fancy_regex::Regex;
816 ///
817 /// let re = Regex::new(r"(\d{4})-(\d{2})").unwrap();
818 /// let text = "It was between 2018-04 and 2020-01";
819 /// let mut all_captures = re.captures_iter(text);
820 ///
821 /// let first = all_captures.next().unwrap().unwrap();
822 /// assert_eq!(first.get(1).unwrap().as_str(), "2018");
823 /// assert_eq!(first.get(2).unwrap().as_str(), "04");
824 /// assert_eq!(first.get(0).unwrap().as_str(), "2018-04");
825 ///
826 /// let second = all_captures.next().unwrap().unwrap();
827 /// assert_eq!(second.get(1).unwrap().as_str(), "2020");
828 /// assert_eq!(second.get(2).unwrap().as_str(), "01");
829 /// assert_eq!(second.get(0).unwrap().as_str(), "2020-01");
830 ///
831 /// assert!(all_captures.next().is_none());
832 /// ```
833 pub fn captures_iter<'r, 't>(&'r self, text: &'t str) -> CaptureMatches<'r, 't> {
834 CaptureMatches(self.find_iter(text))
835 }
836
837 /// Returns the capture groups for the first match in `text`.
838 ///
839 /// If no match is found, then `Ok(None)` is returned.
840 ///
841 /// # Examples
842 ///
843 /// Finding matches and capturing parts of the match:
844 ///
845 /// ```rust
846 /// # use fancy_regex::Regex;
847 ///
848 /// let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap();
849 /// let text = "The date was 2018-04-07";
850 /// let captures = re.captures(text).unwrap().unwrap();
851 ///
852 /// assert_eq!(captures.get(1).unwrap().as_str(), "2018");
853 /// assert_eq!(captures.get(2).unwrap().as_str(), "04");
854 /// assert_eq!(captures.get(3).unwrap().as_str(), "07");
855 /// assert_eq!(captures.get(0).unwrap().as_str(), "2018-04-07");
856 /// ```
857 pub fn captures<'t>(&self, text: &'t str) -> Result<Option<Captures<'t>>> {
858 self.captures_from_pos(text, 0)
859 }
860
861 /// Returns the capture groups for the first match in `text`, starting from
862 /// the specified byte position `pos`.
863 ///
864 /// # Examples
865 ///
866 /// Finding captures starting at a position:
867 ///
868 /// ```
869 /// # use fancy_regex::Regex;
870 /// let re = Regex::new(r"(?m:^)(\d+)").unwrap();
871 /// let text = "1 test 123\n2 foo";
872 /// let captures = re.captures_from_pos(text, 7).unwrap().unwrap();
873 ///
874 /// let group = captures.get(1).unwrap();
875 /// assert_eq!(group.as_str(), "2");
876 /// assert_eq!(group.start(), 11);
877 /// assert_eq!(group.end(), 12);
878 /// ```
879 ///
880 /// Note that in some cases this is not the same as using the `captures`
881 /// method and passing a slice of the string, see the capture that we get
882 /// when we do this:
883 ///
884 /// ```
885 /// # use fancy_regex::Regex;
886 /// let re = Regex::new(r"(?m:^)(\d+)").unwrap();
887 /// let text = "1 test 123\n2 foo";
888 /// let captures = re.captures(&text[7..]).unwrap().unwrap();
889 /// assert_eq!(captures.get(1).unwrap().as_str(), "123");
890 /// ```
891 ///
892 /// This matched the number "123" because it's at the beginning of the text
893 /// of the string slice.
894 ///
895 pub fn captures_from_pos<'t>(&self, text: &'t str, pos: usize) -> Result<Option<Captures<'t>>> {
896 let named_groups = self.named_groups.clone();
897 match &self.inner {
898 RegexImpl::Wrap { inner, .. } => {
899 let mut locations = inner.create_captures();
900 inner.captures(RaInput::new(text).span(pos..text.len()), &mut locations);
901 Ok(locations.is_match().then(|| Captures {
902 inner: CapturesImpl::Wrap { text, locations },
903 named_groups,
904 }))
905 }
906 RegexImpl::Fancy {
907 prog,
908 n_groups,
909 options,
910 ..
911 } => {
912 let result = vm::run(prog, text, pos, 0, options)?;
913 Ok(result.map(|mut saves| {
914 saves.truncate(n_groups * 2);
915 Captures {
916 inner: CapturesImpl::Fancy { text, saves },
917 named_groups,
918 }
919 }))
920 }
921 }
922 }
923
924 /// Returns the number of captures, including the implicit capture of the entire expression.
925 pub fn captures_len(&self) -> usize {
926 match &self.inner {
927 RegexImpl::Wrap { inner, .. } => inner.captures_len(),
928 RegexImpl::Fancy { n_groups, .. } => *n_groups,
929 }
930 }
931
932 /// Returns an iterator over the capture names.
933 pub fn capture_names(&self) -> CaptureNames {
934 let mut names = Vec::new();
935 names.resize(self.captures_len(), None);
936 for (name, &i) in self.named_groups.iter() {
937 names[i] = Some(name.as_str());
938 }
939 CaptureNames(names.into_iter())
940 }
941
942 // for debugging only
943 #[doc(hidden)]
944 pub fn debug_print(&self) {
945 match &self.inner {
946 #[cfg(feature = "std")]
947 RegexImpl::Wrap { inner, .. } => println!("wrapped {:?}", inner),
948 #[cfg(not(feature = "std"))]
949 RegexImpl::Wrap { .. } => {}
950 RegexImpl::Fancy { prog, .. } => prog.debug_print(),
951 }
952 }
953
954 /// Replaces the leftmost-first match with the replacement provided.
955 /// The replacement can be a regular string (where `$N` and `$name` are
956 /// expanded to match capture groups) or a function that takes the matches'
957 /// `Captures` and returns the replaced string.
958 ///
959 /// If no match is found, then a copy of the string is returned unchanged.
960 ///
961 /// # Replacement string syntax
962 ///
963 /// All instances of `$name` in the replacement text is replaced with the
964 /// corresponding capture group `name`.
965 ///
966 /// `name` may be an integer corresponding to the index of the
967 /// capture group (counted by order of opening parenthesis where `0` is the
968 /// entire match) or it can be a name (consisting of letters, digits or
969 /// underscores) corresponding to a named capture group.
970 ///
971 /// If `name` isn't a valid capture group (whether the name doesn't exist
972 /// or isn't a valid index), then it is replaced with the empty string.
973 ///
974 /// The longest possible name is used. e.g., `$1a` looks up the capture
975 /// group named `1a` and not the capture group at index `1`. To exert more
976 /// precise control over the name, use braces, e.g., `${1}a`.
977 ///
978 /// To write a literal `$` use `$$`.
979 ///
980 /// # Examples
981 ///
982 /// Note that this function is polymorphic with respect to the replacement.
983 /// In typical usage, this can just be a normal string:
984 ///
985 /// ```rust
986 /// # use fancy_regex::Regex;
987 /// let re = Regex::new("[^01]+").unwrap();
988 /// assert_eq!(re.replace("1078910", ""), "1010");
989 /// ```
990 ///
991 /// But anything satisfying the `Replacer` trait will work. For example,
992 /// a closure of type `|&Captures| -> String` provides direct access to the
993 /// captures corresponding to a match. This allows one to access
994 /// capturing group matches easily:
995 ///
996 /// ```rust
997 /// # use fancy_regex::{Regex, Captures};
998 /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap();
999 /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| {
1000 /// format!("{} {}", &caps[2], &caps[1])
1001 /// });
1002 /// assert_eq!(result, "Bruce Springsteen");
1003 /// ```
1004 ///
1005 /// But this is a bit cumbersome to use all the time. Instead, a simple
1006 /// syntax is supported that expands `$name` into the corresponding capture
1007 /// group. Here's the last example, but using this expansion technique
1008 /// with named capture groups:
1009 ///
1010 /// ```rust
1011 /// # use fancy_regex::Regex;
1012 /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap();
1013 /// let result = re.replace("Springsteen, Bruce", "$first $last");
1014 /// assert_eq!(result, "Bruce Springsteen");
1015 /// ```
1016 ///
1017 /// Note that using `$2` instead of `$first` or `$1` instead of `$last`
1018 /// would produce the same result. To write a literal `$` use `$$`.
1019 ///
1020 /// Sometimes the replacement string requires use of curly braces to
1021 /// delineate a capture group replacement and surrounding literal text.
1022 /// For example, if we wanted to join two words together with an
1023 /// underscore:
1024 ///
1025 /// ```rust
1026 /// # use fancy_regex::Regex;
1027 /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap();
1028 /// let result = re.replace("deep fried", "${first}_$second");
1029 /// assert_eq!(result, "deep_fried");
1030 /// ```
1031 ///
1032 /// Without the curly braces, the capture group name `first_` would be
1033 /// used, and since it doesn't exist, it would be replaced with the empty
1034 /// string.
1035 ///
1036 /// Finally, sometimes you just want to replace a literal string with no
1037 /// regard for capturing group expansion. This can be done by wrapping a
1038 /// byte string with `NoExpand`:
1039 ///
1040 /// ```rust
1041 /// # use fancy_regex::Regex;
1042 /// use fancy_regex::NoExpand;
1043 ///
1044 /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap();
1045 /// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last"));
1046 /// assert_eq!(result, "$2 $last");
1047 /// ```
1048 pub fn replace<'t, R: Replacer>(&self, text: &'t str, rep: R) -> Cow<'t, str> {
1049 self.replacen(text, 1, rep)
1050 }
1051
1052 /// Replaces all non-overlapping matches in `text` with the replacement
1053 /// provided. This is the same as calling `replacen` with `limit` set to
1054 /// `0`.
1055 ///
1056 /// See the documentation for `replace` for details on how to access
1057 /// capturing group matches in the replacement string.
1058 pub fn replace_all<'t, R: Replacer>(&self, text: &'t str, rep: R) -> Cow<'t, str> {
1059 self.replacen(text, 0, rep)
1060 }
1061
1062 /// Replaces at most `limit` non-overlapping matches in `text` with the
1063 /// replacement provided. If `limit` is 0, then all non-overlapping matches
1064 /// are replaced.
1065 ///
1066 /// Will panic if any errors are encountered. Use `try_replacen`, which this
1067 /// function unwraps, if you want to handle errors.
1068 ///
1069 /// See the documentation for `replace` for details on how to access
1070 /// capturing group matches in the replacement string.
1071 ///
1072 pub fn replacen<'t, R: Replacer>(&self, text: &'t str, limit: usize, rep: R) -> Cow<'t, str> {
1073 self.try_replacen(text, limit, rep).unwrap()
1074 }
1075
1076 /// Replaces at most `limit` non-overlapping matches in `text` with the
1077 /// replacement provided. If `limit` is 0, then all non-overlapping matches
1078 /// are replaced.
1079 ///
1080 /// Propagates any errors encountered, such as `RuntimeError::BacktrackLimitExceeded`.
1081 ///
1082 /// See the documentation for `replace` for details on how to access
1083 /// capturing group matches in the replacement string.
1084 pub fn try_replacen<'t, R: Replacer>(
1085 &self,
1086 text: &'t str,
1087 limit: usize,
1088 mut rep: R,
1089 ) -> Result<Cow<'t, str>> {
1090 // If we know that the replacement doesn't have any capture expansions,
1091 // then we can fast path. The fast path can make a tremendous
1092 // difference:
1093 //
1094 // 1) We use `find_iter` instead of `captures_iter`. Not asking for
1095 // captures generally makes the regex engines faster.
1096 // 2) We don't need to look up all of the capture groups and do
1097 // replacements inside the replacement string. We just push it
1098 // at each match and be done with it.
1099 if let Some(rep) = rep.no_expansion() {
1100 let mut it = self.find_iter(text).enumerate().peekable();
1101 if it.peek().is_none() {
1102 return Ok(Cow::Borrowed(text));
1103 }
1104 let mut new = String::with_capacity(text.len());
1105 let mut last_match = 0;
1106 for (i, m) in it {
1107 let m = m?;
1108
1109 if limit > 0 && i >= limit {
1110 break;
1111 }
1112 new.push_str(&text[last_match..m.start()]);
1113 new.push_str(&rep);
1114 last_match = m.end();
1115 }
1116 new.push_str(&text[last_match..]);
1117 return Ok(Cow::Owned(new));
1118 }
1119
1120 // The slower path, which we use if the replacement needs access to
1121 // capture groups.
1122 let mut it = self.captures_iter(text).enumerate().peekable();
1123 if it.peek().is_none() {
1124 return Ok(Cow::Borrowed(text));
1125 }
1126 let mut new = String::with_capacity(text.len());
1127 let mut last_match = 0;
1128 for (i, cap) in it {
1129 let cap = cap?;
1130
1131 if limit > 0 && i >= limit {
1132 break;
1133 }
1134 // unwrap on 0 is OK because captures only reports matches
1135 let m = cap.get(0).unwrap();
1136 new.push_str(&text[last_match..m.start()]);
1137 rep.replace_append(&cap, &mut new);
1138 last_match = m.end();
1139 }
1140 new.push_str(&text[last_match..]);
1141 Ok(Cow::Owned(new))
1142 }
1143
1144 /// Splits the string by matches of the regex.
1145 ///
1146 /// Returns an iterator over the substrings of the target string
1147 /// that *aren't* matched by the regex.
1148 ///
1149 /// # Example
1150 ///
1151 /// To split a string delimited by arbitrary amounts of spaces or tabs:
1152 ///
1153 /// ```rust
1154 /// # use fancy_regex::Regex;
1155 /// let re = Regex::new(r"[ \t]+").unwrap();
1156 /// let target = "a b \t c\td e";
1157 /// let fields: Vec<&str> = re.split(target).map(|x| x.unwrap()).collect();
1158 /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]);
1159 /// ```
1160 pub fn split<'r, 'h>(&'r self, target: &'h str) -> Split<'r, 'h> {
1161 Split {
1162 matches: self.find_iter(target),
1163 next_start: 0,
1164 target,
1165 }
1166 }
1167
1168 /// Splits the string by matches of the regex at most `limit` times.
1169 ///
1170 /// Returns an iterator over the substrings of the target string
1171 /// that *aren't* matched by the regex.
1172 ///
1173 /// The `N`th substring is the remaining part of the target.
1174 ///
1175 /// # Example
1176 ///
1177 /// To split a string delimited by arbitrary amounts of spaces or tabs
1178 /// 3 times:
1179 ///
1180 /// ```rust
1181 /// # use fancy_regex::Regex;
1182 /// let re = Regex::new(r"[ \t]+").unwrap();
1183 /// let target = "a b \t c\td e";
1184 /// let fields: Vec<&str> = re.splitn(target, 3).map(|x| x.unwrap()).collect();
1185 /// assert_eq!(fields, vec!["a", "b", "c\td e"]);
1186 /// ```
1187 pub fn splitn<'r, 'h>(&'r self, target: &'h str, limit: usize) -> SplitN<'r, 'h> {
1188 SplitN {
1189 splits: self.split(target),
1190 limit: limit,
1191 }
1192 }
1193}
1194
1195impl TryFrom<&str> for Regex {
1196 type Error = Error;
1197
1198 /// Attempts to parse a string into a regular expression
1199 fn try_from(s: &str) -> Result<Self> {
1200 Self::new(s)
1201 }
1202}
1203
1204impl TryFrom<String> for Regex {
1205 type Error = Error;
1206
1207 /// Attempts to parse a string into a regular expression
1208 fn try_from(s: String) -> Result<Self> {
1209 Self::new(&s)
1210 }
1211}
1212
1213impl<'t> Match<'t> {
1214 /// Returns the starting byte offset of the match in the text.
1215 #[inline]
1216 pub fn start(&self) -> usize {
1217 self.start
1218 }
1219
1220 /// Returns the ending byte offset of the match in the text.
1221 #[inline]
1222 pub fn end(&self) -> usize {
1223 self.end
1224 }
1225
1226 /// Returns the range over the starting and ending byte offsets of the match in text.
1227 #[inline]
1228 pub fn range(&self) -> Range<usize> {
1229 self.start..self.end
1230 }
1231
1232 /// Returns the matched text.
1233 #[inline]
1234 pub fn as_str(&self) -> &'t str {
1235 &self.text[self.start..self.end]
1236 }
1237
1238 /// Creates a new match from the given text and byte offsets.
1239 fn new(text: &'t str, start: usize, end: usize) -> Match<'t> {
1240 Match { text, start, end }
1241 }
1242}
1243
1244impl<'t> From<Match<'t>> for &'t str {
1245 fn from(m: Match<'t>) -> &'t str {
1246 m.as_str()
1247 }
1248}
1249
1250impl<'t> From<Match<'t>> for Range<usize> {
1251 fn from(m: Match<'t>) -> Range<usize> {
1252 m.range()
1253 }
1254}
1255
1256#[allow(clippy::len_without_is_empty)] // follow regex's API
1257impl<'t> Captures<'t> {
1258 /// Get the capture group by its index in the regex.
1259 ///
1260 /// If there is no match for that group or the index does not correspond to a group, `None` is
1261 /// returned. The index 0 returns the whole match.
1262 pub fn get(&self, i: usize) -> Option<Match<'t>> {
1263 match &self.inner {
1264 CapturesImpl::Wrap { text, locations } => locations.get_group(i).map(|span| Match {
1265 text,
1266 start: span.start,
1267 end: span.end,
1268 }),
1269 CapturesImpl::Fancy { text, ref saves } => {
1270 let slot = i * 2;
1271 if slot >= saves.len() {
1272 return None;
1273 }
1274 let lo = saves[slot];
1275 if lo == usize::MAX {
1276 return None;
1277 }
1278 let hi = saves[slot + 1];
1279 Some(Match {
1280 text,
1281 start: lo,
1282 end: hi,
1283 })
1284 }
1285 }
1286 }
1287
1288 /// Returns the match for a named capture group. Returns `None` the capture
1289 /// group did not match or if there is no group with the given name.
1290 pub fn name(&self, name: &str) -> Option<Match<'t>> {
1291 self.named_groups.get(name).and_then(|i| self.get(*i))
1292 }
1293
1294 /// Expands all instances of `$group` in `replacement` to the corresponding
1295 /// capture group `name`, and writes them to the `dst` buffer given.
1296 ///
1297 /// `group` may be an integer corresponding to the index of the
1298 /// capture group (counted by order of opening parenthesis where `\0` is the
1299 /// entire match) or it can be a name (consisting of letters, digits or
1300 /// underscores) corresponding to a named capture group.
1301 ///
1302 /// If `group` isn't a valid capture group (whether the name doesn't exist
1303 /// or isn't a valid index), then it is replaced with the empty string.
1304 ///
1305 /// The longest possible name is used. e.g., `$1a` looks up the capture
1306 /// group named `1a` and not the capture group at index `1`. To exert more
1307 /// precise control over the name, use braces, e.g., `${1}a`.
1308 ///
1309 /// To write a literal `$`, use `$$`.
1310 ///
1311 /// For more control over expansion, see [`Expander`].
1312 ///
1313 /// [`Expander`]: expand/struct.Expander.html
1314 pub fn expand(&self, replacement: &str, dst: &mut String) {
1315 Expander::default().append_expansion(dst, replacement, self);
1316 }
1317
1318 /// Iterate over the captured groups in order in which they appeared in the regex. The first
1319 /// capture corresponds to the whole match.
1320 pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> {
1321 SubCaptureMatches { caps: self, i: 0 }
1322 }
1323
1324 /// How many groups were captured. This is always at least 1 because group 0 returns the whole
1325 /// match.
1326 pub fn len(&self) -> usize {
1327 match &self.inner {
1328 CapturesImpl::Wrap { locations, .. } => locations.group_len(),
1329 CapturesImpl::Fancy { saves, .. } => saves.len() / 2,
1330 }
1331 }
1332}
1333
1334/// Get a group by index.
1335///
1336/// `'t` is the lifetime of the matched text.
1337///
1338/// The text can't outlive the `Captures` object if this method is
1339/// used, because of how `Index` is defined (normally `a[i]` is part
1340/// of `a` and can't outlive it); to do that, use `get()` instead.
1341///
1342/// # Panics
1343///
1344/// If there is no group at the given index.
1345impl<'t> Index<usize> for Captures<'t> {
1346 type Output = str;
1347
1348 fn index(&self, i: usize) -> &str {
1349 self.get(i)
1350 .map(|m| m.as_str())
1351 .unwrap_or_else(|| panic!("no group at index '{}'", i))
1352 }
1353}
1354
1355/// Get a group by name.
1356///
1357/// `'t` is the lifetime of the matched text and `'i` is the lifetime
1358/// of the group name (the index).
1359///
1360/// The text can't outlive the `Captures` object if this method is
1361/// used, because of how `Index` is defined (normally `a[i]` is part
1362/// of `a` and can't outlive it); to do that, use `name` instead.
1363///
1364/// # Panics
1365///
1366/// If there is no group named by the given value.
1367impl<'t, 'i> Index<&'i str> for Captures<'t> {
1368 type Output = str;
1369
1370 fn index<'a>(&'a self, name: &'i str) -> &'a str {
1371 self.name(name)
1372 .map(|m| m.as_str())
1373 .unwrap_or_else(|| panic!("no group named '{}'", name))
1374 }
1375}
1376
1377impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
1378 type Item = Option<Match<'t>>;
1379
1380 fn next(&mut self) -> Option<Option<Match<'t>>> {
1381 if self.i < self.caps.len() {
1382 let result = self.caps.get(self.i);
1383 self.i += 1;
1384 Some(result)
1385 } else {
1386 None
1387 }
1388 }
1389}
1390
1391// TODO: might be nice to implement ExactSizeIterator etc for SubCaptures
1392
1393/// Regular expression AST. This is public for now but may change.
1394#[derive(Debug, PartialEq, Eq)]
1395pub enum Expr {
1396 /// An empty expression, e.g. the last branch in `(a|b|)`
1397 Empty,
1398 /// Any character, regex `.`
1399 Any {
1400 /// Whether it also matches newlines or not
1401 newline: bool,
1402 },
1403 /// An assertion
1404 Assertion(Assertion),
1405 /// The string as a literal, e.g. `a`
1406 Literal {
1407 /// The string to match
1408 val: String,
1409 /// Whether match is case-insensitive or not
1410 casei: bool,
1411 },
1412 /// Concatenation of multiple expressions, must match in order, e.g. `a.` is a concatenation of
1413 /// the literal `a` and `.` for any character
1414 Concat(Vec<Expr>),
1415 /// Alternative of multiple expressions, one of them must match, e.g. `a|b` is an alternative
1416 /// where either the literal `a` or `b` must match
1417 Alt(Vec<Expr>),
1418 /// Capturing group of expression, e.g. `(a.)` matches `a` and any character and "captures"
1419 /// (remembers) the match
1420 Group(Box<Expr>),
1421 /// Look-around (e.g. positive/negative look-ahead or look-behind) with an expression, e.g.
1422 /// `(?=a)` means the next character must be `a` (but the match is not consumed)
1423 LookAround(Box<Expr>, LookAround),
1424 /// Repeat of an expression, e.g. `a*` or `a+` or `a{1,3}`
1425 Repeat {
1426 /// The expression that is being repeated
1427 child: Box<Expr>,
1428 /// The minimum number of repetitions
1429 lo: usize,
1430 /// The maximum number of repetitions (or `usize::MAX`)
1431 hi: usize,
1432 /// Greedy means as much as possible is matched, e.g. `.*b` would match all of `abab`.
1433 /// Non-greedy means as little as possible, e.g. `.*?b` would match only `ab` in `abab`.
1434 greedy: bool,
1435 },
1436 /// Delegate a regex to the regex crate. This is used as a simplification so that we don't have
1437 /// to represent all the expressions in the AST, e.g. character classes.
1438 Delegate {
1439 /// The regex
1440 inner: String,
1441 /// How many characters the regex matches
1442 size: usize, // TODO: move into analysis result
1443 /// Whether the matching is case-insensitive or not
1444 casei: bool,
1445 },
1446 /// Back reference to a capture group, e.g. `\1` in `(abc|def)\1` references the captured group
1447 /// and the whole regex matches either `abcabc` or `defdef`.
1448 Backref(usize),
1449 /// Atomic non-capturing group, e.g. `(?>ab|a)` in text that contains `ab` will match `ab` and
1450 /// never backtrack and try `a`, even if matching fails after the atomic group.
1451 AtomicGroup(Box<Expr>),
1452 /// Keep matched text so far out of overall match
1453 KeepOut,
1454 /// Anchor to match at the position where the previous match ended
1455 ContinueFromPreviousMatchEnd,
1456 /// Conditional expression based on whether the numbered capture group matched or not
1457 BackrefExistsCondition(usize),
1458 /// If/Then/Else Condition. If there is no Then/Else, these will just be empty expressions.
1459 Conditional {
1460 /// The conditional expression to evaluate
1461 condition: Box<Expr>,
1462 /// What to execute if the condition is true
1463 true_branch: Box<Expr>,
1464 /// What to execute if the condition is false
1465 false_branch: Box<Expr>,
1466 },
1467}
1468
1469/// Type of look-around assertion as used for a look-around expression.
1470#[derive(Debug, PartialEq, Eq, Clone, Copy)]
1471pub enum LookAround {
1472 /// Look-ahead assertion, e.g. `(?=a)`
1473 LookAhead,
1474 /// Negative look-ahead assertion, e.g. `(?!a)`
1475 LookAheadNeg,
1476 /// Look-behind assertion, e.g. `(?<=a)`
1477 LookBehind,
1478 /// Negative look-behind assertion, e.g. `(?<!a)`
1479 LookBehindNeg,
1480}
1481
1482/// An iterator over capture names in a [Regex]. The iterator
1483/// returns the name of each group, or [None] if the group has
1484/// no name. Because capture group 0 cannot have a name, the
1485/// first item returned is always [None].
1486pub struct CaptureNames<'r>(vec::IntoIter<Option<&'r str>>);
1487
1488impl Debug for CaptureNames<'_> {
1489 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
1490 f.write_str("<CaptureNames>")
1491 }
1492}
1493
1494impl<'r> Iterator for CaptureNames<'r> {
1495 type Item = Option<&'r str>;
1496
1497 fn next(&mut self) -> Option<Self::Item> {
1498 self.0.next()
1499 }
1500}
1501
1502// silly to write my own, but this is super-fast for the common 1-digit
1503// case.
1504fn push_usize(s: &mut String, x: usize) {
1505 if x >= 10 {
1506 push_usize(s, x / 10);
1507 s.push((b'0' + (x % 10) as u8) as char);
1508 } else {
1509 s.push((b'0' + (x as u8)) as char);
1510 }
1511}
1512
1513fn is_special(c: char) -> bool {
1514 match c {
1515 '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' | '}' | '^' | '$'
1516 | '#' => true,
1517 _ => false,
1518 }
1519}
1520
1521fn push_quoted(buf: &mut String, s: &str) {
1522 for c in s.chars() {
1523 if is_special(c) {
1524 buf.push('\\');
1525 }
1526 buf.push(c);
1527 }
1528}
1529
1530/// Escapes special characters in `text` with '\\'. Returns a string which, when interpreted
1531/// as a regex, matches exactly `text`.
1532pub fn escape(text: &str) -> Cow<str> {
1533 // Using bytes() is OK because all special characters are single bytes.
1534 match text.bytes().filter(|&b| is_special(b as char)).count() {
1535 0 => Cow::Borrowed(text),
1536 n => {
1537 // The capacity calculation is exact because '\\' is a single byte.
1538 let mut buf = String::with_capacity(text.len() + n);
1539 push_quoted(&mut buf, text);
1540 Cow::Owned(buf)
1541 }
1542 }
1543}
1544
1545/// Type of assertions
1546#[derive(Debug, PartialEq, Eq, Clone, Copy)]
1547pub enum Assertion {
1548 /// Start of input text
1549 StartText,
1550 /// End of input text
1551 EndText,
1552 /// Start of a line
1553 StartLine {
1554 /// CRLF mode
1555 crlf: bool,
1556 },
1557 /// End of a line
1558 EndLine {
1559 /// CRLF mode
1560 crlf: bool,
1561 },
1562 /// Left word boundary
1563 LeftWordBoundary,
1564 /// Right word boundary
1565 RightWordBoundary,
1566 /// Both word boundaries
1567 WordBoundary,
1568 /// Not word boundary
1569 NotWordBoundary,
1570}
1571
1572impl Assertion {
1573 pub(crate) fn is_hard(&self) -> bool {
1574 use Assertion::*;
1575 matches!(
1576 self,
1577 // these will make regex-automata use PikeVM
1578 LeftWordBoundary | RightWordBoundary | WordBoundary | NotWordBoundary
1579 )
1580 }
1581}
1582
1583impl Expr {
1584 /// Parse the regex and return an expression (AST) and a bit set with the indexes of groups
1585 /// that are referenced by backrefs.
1586 pub fn parse_tree(re: &str) -> Result<ExprTree> {
1587 Parser::parse(re)
1588 }
1589
1590 /// Convert expression to a regex string in the regex crate's syntax.
1591 ///
1592 /// # Panics
1593 ///
1594 /// Panics for expressions that are hard, i.e. can not be handled by the regex crate.
1595 pub fn to_str(&self, buf: &mut String, precedence: u8) {
1596 match *self {
1597 Expr::Empty => (),
1598 Expr::Any { newline } => buf.push_str(if newline { "(?s:.)" } else { "." }),
1599 Expr::Literal { ref val, casei } => {
1600 if casei {
1601 buf.push_str("(?i:");
1602 }
1603 push_quoted(buf, val);
1604 if casei {
1605 buf.push_str(")");
1606 }
1607 }
1608 Expr::Assertion(Assertion::StartText) => buf.push('^'),
1609 Expr::Assertion(Assertion::EndText) => buf.push('$'),
1610 Expr::Assertion(Assertion::StartLine { crlf: false }) => buf.push_str("(?m:^)"),
1611 Expr::Assertion(Assertion::EndLine { crlf: false }) => buf.push_str("(?m:$)"),
1612 Expr::Assertion(Assertion::StartLine { crlf: true }) => buf.push_str("(?Rm:^)"),
1613 Expr::Assertion(Assertion::EndLine { crlf: true }) => buf.push_str("(?Rm:$)"),
1614 Expr::Concat(ref children) => {
1615 if precedence > 1 {
1616 buf.push_str("(?:");
1617 }
1618 for child in children {
1619 child.to_str(buf, 2);
1620 }
1621 if precedence > 1 {
1622 buf.push(')')
1623 }
1624 }
1625 Expr::Alt(ref children) => {
1626 if precedence > 0 {
1627 buf.push_str("(?:");
1628 }
1629 for (i, child) in children.iter().enumerate() {
1630 if i != 0 {
1631 buf.push('|');
1632 }
1633 child.to_str(buf, 1);
1634 }
1635 if precedence > 0 {
1636 buf.push(')');
1637 }
1638 }
1639 Expr::Group(ref child) => {
1640 buf.push('(');
1641 child.to_str(buf, 0);
1642 buf.push(')');
1643 }
1644 Expr::Repeat {
1645 ref child,
1646 lo,
1647 hi,
1648 greedy,
1649 } => {
1650 if precedence > 2 {
1651 buf.push_str("(?:");
1652 }
1653 child.to_str(buf, 3);
1654 match (lo, hi) {
1655 (0, 1) => buf.push('?'),
1656 (0, usize::MAX) => buf.push('*'),
1657 (1, usize::MAX) => buf.push('+'),
1658 (lo, hi) => {
1659 buf.push('{');
1660 push_usize(buf, lo);
1661 if lo != hi {
1662 buf.push(',');
1663 if hi != usize::MAX {
1664 push_usize(buf, hi);
1665 }
1666 }
1667 buf.push('}');
1668 }
1669 }
1670 if !greedy {
1671 buf.push('?');
1672 }
1673 if precedence > 2 {
1674 buf.push(')');
1675 }
1676 }
1677 Expr::Delegate {
1678 ref inner, casei, ..
1679 } => {
1680 // at the moment, delegate nodes are just atoms
1681 if casei {
1682 buf.push_str("(?i:");
1683 }
1684 buf.push_str(inner);
1685 if casei {
1686 buf.push_str(")");
1687 }
1688 }
1689 _ => panic!("attempting to format hard expr"),
1690 }
1691 }
1692}
1693
1694// precondition: ix > 0
1695fn prev_codepoint_ix(s: &str, mut ix: usize) -> usize {
1696 let bytes = s.as_bytes();
1697 loop {
1698 ix -= 1;
1699 // fancy bit magic for ranges 0..0x80 + 0xc0..
1700 if (bytes[ix] as i8) >= -0x40 {
1701 break;
1702 }
1703 }
1704 ix
1705}
1706
1707fn codepoint_len(b: u8) -> usize {
1708 match b {
1709 b if b < 0x80 => 1,
1710 b if b < 0xe0 => 2,
1711 b if b < 0xf0 => 3,
1712 _ => 4,
1713 }
1714}
1715
1716/// Returns the smallest possible index of the next valid UTF-8 sequence
1717/// starting after `i`.
1718/// Adapted from a function with the same name in the `regex` crate.
1719fn next_utf8(text: &str, i: usize) -> usize {
1720 let b = match text.as_bytes().get(i) {
1721 None => return i + 1,
1722 Some(&b) => b,
1723 };
1724 i + codepoint_len(b)
1725}
1726
1727// If this returns false, then there is no possible backref in the re
1728
1729// Both potential implementations are turned off, because we currently
1730// always need to do a deeper analysis because of 1-character
1731// look-behind. If we could call a find_from_pos method of regex::Regex,
1732// it would make sense to bring this back.
1733/*
1734pub fn detect_possible_backref(re: &str) -> bool {
1735 let mut last = b'\x00';
1736 for b in re.as_bytes() {
1737 if b'0' <= *b && *b <= b'9' && last == b'\\' { return true; }
1738 last = *b;
1739 }
1740 false
1741}
1742
1743pub fn detect_possible_backref(re: &str) -> bool {
1744 let mut bytes = re.as_bytes();
1745 loop {
1746 match memchr::memchr(b'\\', &bytes[..bytes.len() - 1]) {
1747 Some(i) => {
1748 bytes = &bytes[i + 1..];
1749 let c = bytes[0];
1750 if b'0' <= c && c <= b'9' { return true; }
1751 }
1752 None => return false
1753 }
1754 }
1755}
1756*/
1757
1758/// The internal module only exists so that the toy example can access internals for debugging and
1759/// experimenting.
1760#[doc(hidden)]
1761pub mod internal {
1762 pub use crate::analyze::analyze;
1763 pub use crate::compile::compile;
1764 pub use crate::vm::{run_default, run_trace, Insn, Prog};
1765}
1766
1767#[cfg(test)]
1768mod tests {
1769 use alloc::borrow::Cow;
1770 use alloc::boxed::Box;
1771 use alloc::string::String;
1772 use alloc::{format, vec};
1773
1774 use crate::parse::make_literal;
1775 use crate::{Expr, Regex};
1776
1777 //use detect_possible_backref;
1778
1779 // tests for to_str
1780
1781 fn to_str(e: Expr) -> String {
1782 let mut s = String::new();
1783 e.to_str(&mut s, 0);
1784 s
1785 }
1786
1787 #[test]
1788 fn to_str_concat_alt() {
1789 let e = Expr::Concat(vec![
1790 Expr::Alt(vec![make_literal("a"), make_literal("b")]),
1791 make_literal("c"),
1792 ]);
1793 assert_eq!(to_str(e), "(?:a|b)c");
1794 }
1795
1796 #[test]
1797 fn to_str_rep_concat() {
1798 let e = Expr::Repeat {
1799 child: Box::new(Expr::Concat(vec![make_literal("a"), make_literal("b")])),
1800 lo: 2,
1801 hi: 3,
1802 greedy: true,
1803 };
1804 assert_eq!(to_str(e), "(?:ab){2,3}");
1805 }
1806
1807 #[test]
1808 fn to_str_group_alt() {
1809 let e = Expr::Group(Box::new(Expr::Alt(vec![
1810 make_literal("a"),
1811 make_literal("b"),
1812 ])));
1813 assert_eq!(to_str(e), "(a|b)");
1814 }
1815
1816 #[test]
1817 fn as_str_debug() {
1818 let s = r"(a+)b\1";
1819 let regex = Regex::new(s).unwrap();
1820 assert_eq!(s, regex.as_str());
1821 assert_eq!(s, format!("{:?}", regex));
1822 }
1823
1824 #[test]
1825 fn display() {
1826 let s = r"(a+)b\1";
1827 let regex = Regex::new(s).unwrap();
1828 assert_eq!(s, format!("{}", regex));
1829 }
1830
1831 #[test]
1832 fn from_str() {
1833 let s = r"(a+)b\1";
1834 let regex = s.parse::<Regex>().unwrap();
1835 assert_eq!(regex.as_str(), s);
1836 }
1837
1838 #[test]
1839 fn to_str_repeat() {
1840 fn repeat(lo: usize, hi: usize, greedy: bool) -> Expr {
1841 Expr::Repeat {
1842 child: Box::new(make_literal("a")),
1843 lo,
1844 hi,
1845 greedy,
1846 }
1847 }
1848
1849 assert_eq!(to_str(repeat(2, 2, true)), "a{2}");
1850 assert_eq!(to_str(repeat(2, 2, false)), "a{2}?");
1851 assert_eq!(to_str(repeat(2, 3, true)), "a{2,3}");
1852 assert_eq!(to_str(repeat(2, 3, false)), "a{2,3}?");
1853 assert_eq!(to_str(repeat(2, usize::MAX, true)), "a{2,}");
1854 assert_eq!(to_str(repeat(2, usize::MAX, false)), "a{2,}?");
1855 assert_eq!(to_str(repeat(0, 1, true)), "a?");
1856 assert_eq!(to_str(repeat(0, 1, false)), "a??");
1857 assert_eq!(to_str(repeat(0, usize::MAX, true)), "a*");
1858 assert_eq!(to_str(repeat(0, usize::MAX, false)), "a*?");
1859 assert_eq!(to_str(repeat(1, usize::MAX, true)), "a+");
1860 assert_eq!(to_str(repeat(1, usize::MAX, false)), "a+?");
1861 }
1862
1863 #[test]
1864 fn escape() {
1865 // Check that strings that need no quoting are borrowed, and that non-special punctuation
1866 // is not quoted.
1867 match crate::escape("@foo") {
1868 Cow::Borrowed(s) => assert_eq!(s, "@foo"),
1869 _ => panic!("Value should be borrowed."),
1870 }
1871
1872 // Check typical usage.
1873 assert_eq!(crate::escape("fo*o").into_owned(), "fo\\*o");
1874
1875 // Check that multibyte characters are handled correctly.
1876 assert_eq!(crate::escape("fø*ø").into_owned(), "fø\\*ø");
1877 }
1878
1879 /*
1880 #[test]
1881 fn detect_backref() {
1882 assert_eq!(detect_possible_backref("a0a1a2"), false);
1883 assert_eq!(detect_possible_backref("a0a1\\a2"), false);
1884 assert_eq!(detect_possible_backref("a0a\\1a2"), true);
1885 assert_eq!(detect_possible_backref("a0a1a2\\"), false);
1886 }
1887 */
1888}