tantivy_fst/regex/
mod.rs

1use crate::Automaton;
2use regex_syntax;
3use std::fmt;
4
5mod compile;
6mod dfa;
7mod error;
8mod sparse;
9
10pub use self::error::Error;
11
12/// A regular expression for searching FSTs with Unicode support.
13///
14/// Regular expressions are compiled down to a deterministic finite automaton
15/// that can efficiently search any finite state transducer. Notably, most
16/// regular expressions only need to explore a small portion of a finite state
17/// transducer without loading all of it into memory.
18///
19/// # Syntax
20///
21/// `Regex` supports fully featured regular expressions. Namely, it supports
22/// all of the same constructs as the standard `regex` crate except for the
23/// following things:
24///
25/// 1. Lazy quantifiers, since a regular expression automaton only reports
26///    whether a key matches at all, and not its location. Namely, lazy
27///    quantifiers such as `+?` only modify the location of a match, but never
28///    change a non-match into a match or a match into a non-match.
29/// 2. Word boundaries (i.e., `\b`). Because such things are hard to do in
30///    a deterministic finite automaton, but not impossible. As such, these
31///    may be allowed some day.
32/// 3. Other zero width assertions like `^` and `$`. These are easier to
33///    support than word boundaries, but are still tricky and usually aren't
34///    as useful when searching dictionaries.
35///
36/// Otherwise, the [full syntax of the `regex`
37/// crate](http://doc.rust-lang.org/regex/regex/index.html#syntax)
38/// is supported. This includes all Unicode support and relevant flags.
39/// (The `U` and `m` flags are no-ops because of (1) and (3) above,
40/// respectively.)
41///
42/// # Matching semantics
43///
44/// A regular expression matches a key in a finite state transducer if and only
45/// if it matches from the start of a key all the way to end. Stated
46/// differently, every regular expression `(re)` is matched as if it were
47/// `^(re)$`. This means that if you want to do a substring match, then you
48/// must use `.*substring.*`.
49///
50/// **Caution**: Starting a regular expression with `.*` means that it could
51/// potentially match *any* key in a finite state transducer. This implies that
52/// all keys could be visited, which could be slow. It is possible that this
53/// crate will grow facilities for detecting regular expressions that will
54/// scan a large portion of a transducer and optionally disallow them.
55///
56pub struct Regex {
57    original: String,
58    dfa: dfa::Dfa,
59}
60
61#[derive(Eq, PartialEq)]
62pub enum Inst {
63    Match,
64    Jump(usize),
65    Split(usize, usize),
66    Range(u8, u8),
67}
68
69impl Regex {
70    /// Create a new regular expression query.
71    ///
72    /// The query finds all terms matching the regular expression.
73    ///
74    /// If the regular expression is malformed or if it results in an automaton
75    /// that is too big, then an error is returned.
76    ///
77    /// A `Regex` value satisfies the `Automaton` trait, which means it can be
78    /// used with the `search` method of any finite state transducer.
79    #[inline]
80    pub fn new(re: &str) -> Result<Regex, Error> {
81        Regex::with_size_limit(10 * (1 << 20), re)
82    }
83
84    fn with_size_limit(size: usize, re: &str) -> Result<Regex, Error> {
85        let hir = regex_syntax::Parser::new().parse(re)?;
86        let insts = self::compile::Compiler::new(size).compile(&hir)?;
87        let dfa = self::dfa::DfaBuilder::new(insts).build()?;
88        Ok(Regex {
89            original: re.to_owned(),
90            dfa,
91        })
92    }
93}
94
95impl Automaton for Regex {
96    type State = Option<usize>;
97
98    #[inline]
99    fn start(&self) -> Option<usize> {
100        Some(0)
101    }
102
103    #[inline]
104    fn is_match(&self, state: &Option<usize>) -> bool {
105        state.map(|state| self.dfa.is_match(state)).unwrap_or(false)
106    }
107
108    #[inline]
109    fn can_match(&self, state: &Option<usize>) -> bool {
110        state.is_some()
111    }
112
113    #[inline]
114    fn accept(&self, state: &Option<usize>, byte: u8) -> Option<usize> {
115        state.and_then(|state| self.dfa.accept(state, byte))
116    }
117}
118
119impl fmt::Debug for Regex {
120    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
121        writeln!(f, "Regex({:?})", self.original)?;
122        self.dfa.fmt(f)
123    }
124}
125
126impl fmt::Debug for Inst {
127    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
128        use self::Inst::*;
129        match *self {
130            Match => write!(f, "Match"),
131            Jump(ip) => write!(f, "JUMP {}", ip),
132            Split(ip1, ip2) => write!(f, "SPLIT {}, {}", ip1, ip2),
133            Range(s, e) => write!(f, "RANGE {:X}-{:X}", s, e),
134        }
135    }
136}