tantivy_fst/regex/mod.rs
1use crate::Automaton;
2use regex_syntax;
3use std::fmt;
4
5mod compile;
6mod dfa;
7mod error;
8mod sparse;
9
10pub use self::error::Error;
11
12/// A regular expression for searching FSTs with Unicode support.
13///
14/// Regular expressions are compiled down to a deterministic finite automaton
15/// that can efficiently search any finite state transducer. Notably, most
16/// regular expressions only need to explore a small portion of a finite state
17/// transducer without loading all of it into memory.
18///
19/// # Syntax
20///
21/// `Regex` supports fully featured regular expressions. Namely, it supports
22/// all of the same constructs as the standard `regex` crate except for the
23/// following things:
24///
25/// 1. Lazy quantifiers, since a regular expression automaton only reports
26/// whether a key matches at all, and not its location. Namely, lazy
27/// quantifiers such as `+?` only modify the location of a match, but never
28/// change a non-match into a match or a match into a non-match.
29/// 2. Word boundaries (i.e., `\b`). Because such things are hard to do in
30/// a deterministic finite automaton, but not impossible. As such, these
31/// may be allowed some day.
32/// 3. Other zero width assertions like `^` and `$`. These are easier to
33/// support than word boundaries, but are still tricky and usually aren't
34/// as useful when searching dictionaries.
35///
36/// Otherwise, the [full syntax of the `regex`
37/// crate](http://doc.rust-lang.org/regex/regex/index.html#syntax)
38/// is supported. This includes all Unicode support and relevant flags.
39/// (The `U` and `m` flags are no-ops because of (1) and (3) above,
40/// respectively.)
41///
42/// # Matching semantics
43///
44/// A regular expression matches a key in a finite state transducer if and only
45/// if it matches from the start of a key all the way to end. Stated
46/// differently, every regular expression `(re)` is matched as if it were
47/// `^(re)$`. This means that if you want to do a substring match, then you
48/// must use `.*substring.*`.
49///
50/// **Caution**: Starting a regular expression with `.*` means that it could
51/// potentially match *any* key in a finite state transducer. This implies that
52/// all keys could be visited, which could be slow. It is possible that this
53/// crate will grow facilities for detecting regular expressions that will
54/// scan a large portion of a transducer and optionally disallow them.
55///
56pub struct Regex {
57 original: String,
58 dfa: dfa::Dfa,
59}
60
61#[derive(Eq, PartialEq)]
62pub enum Inst {
63 Match,
64 Jump(usize),
65 Split(usize, usize),
66 Range(u8, u8),
67}
68
69impl Regex {
70 /// Create a new regular expression query.
71 ///
72 /// The query finds all terms matching the regular expression.
73 ///
74 /// If the regular expression is malformed or if it results in an automaton
75 /// that is too big, then an error is returned.
76 ///
77 /// A `Regex` value satisfies the `Automaton` trait, which means it can be
78 /// used with the `search` method of any finite state transducer.
79 #[inline]
80 pub fn new(re: &str) -> Result<Regex, Error> {
81 Regex::with_size_limit(10 * (1 << 20), re)
82 }
83
84 fn with_size_limit(size: usize, re: &str) -> Result<Regex, Error> {
85 let hir = regex_syntax::Parser::new().parse(re)?;
86 let insts = self::compile::Compiler::new(size).compile(&hir)?;
87 let dfa = self::dfa::DfaBuilder::new(insts).build()?;
88 Ok(Regex {
89 original: re.to_owned(),
90 dfa,
91 })
92 }
93}
94
95impl Automaton for Regex {
96 type State = Option<usize>;
97
98 #[inline]
99 fn start(&self) -> Option<usize> {
100 Some(0)
101 }
102
103 #[inline]
104 fn is_match(&self, state: &Option<usize>) -> bool {
105 state.map(|state| self.dfa.is_match(state)).unwrap_or(false)
106 }
107
108 #[inline]
109 fn can_match(&self, state: &Option<usize>) -> bool {
110 state.is_some()
111 }
112
113 #[inline]
114 fn accept(&self, state: &Option<usize>, byte: u8) -> Option<usize> {
115 state.and_then(|state| self.dfa.accept(state, byte))
116 }
117}
118
119impl fmt::Debug for Regex {
120 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
121 writeln!(f, "Regex({:?})", self.original)?;
122 self.dfa.fmt(f)
123 }
124}
125
126impl fmt::Debug for Inst {
127 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
128 use self::Inst::*;
129 match *self {
130 Match => write!(f, "Match"),
131 Jump(ip) => write!(f, "JUMP {}", ip),
132 Split(ip1, ip2) => write!(f, "SPLIT {}, {}", ip1, ip2),
133 Range(s, e) => write!(f, "RANGE {:X}-{:X}", s, e),
134 }
135 }
136}