sea_query/
token.rs

1//! Tokenizer for processing SQL.
2
3use std::fmt::Write;
4use std::iter::Iterator;
5
/// A character-level cursor over an input string.
///
/// Holds the input pre-decoded as a `Vec<char>` plus the current read
/// position `p`; the `Iterator` impl below yields [`Token`]s by advancing `p`.
#[derive(Debug, Default)]
pub struct Tokenizer {
    /// The input, decoded into Unicode scalar values for O(1) indexing.
    pub chars: Vec<char>,
    /// Current read position (index into `chars`).
    pub p: usize,
}
11
/// A lexical unit produced by [`Tokenizer`].
///
/// Every variant carries the exact source text it was scanned from, so
/// concatenating the `Display` output of a token stream reproduces the input.
#[derive(Debug, PartialEq, Eq)]
pub enum Token {
    /// A quoted literal including its delimiters, e.g. `` `tbl` ``, `'a'`, `"x"`, `[y]`.
    Quoted(String),
    /// A run of alphanumerics (plus `_`/`$` after the first char): keywords, numbers, identifiers.
    Unquoted(String),
    /// A run of whitespace (space, tab, CR, LF).
    Space(String),
    /// A single character that is neither whitespace nor alphanumeric, e.g. `*`, `=`, `?`.
    Punctuation(String),
}
19
20impl Tokenizer {
21    pub fn new(string: &str) -> Self {
22        Self {
23            chars: string.chars().collect(),
24            p: 0,
25        }
26    }
27
28    pub fn iter(self) -> impl Iterator<Item = Token> {
29        self
30    }
31
32    fn get(&self) -> char {
33        self.chars[self.p]
34    }
35
36    fn inc(&mut self) {
37        self.p += 1;
38    }
39
40    fn end(&self) -> bool {
41        self.p == self.chars.len()
42    }
43
44    fn space(&mut self) -> Option<Token> {
45        let mut string = String::new();
46        while !self.end() {
47            let c = self.get();
48            if Self::is_space(c) {
49                write!(string, "{c}").unwrap();
50            } else {
51                break;
52            }
53            self.inc();
54        }
55        if !string.is_empty() {
56            Some(Token::Space(string))
57        } else {
58            None
59        }
60    }
61
62    fn unquoted(&mut self) -> Option<Token> {
63        let mut string = String::new();
64        let mut first = true;
65        while !self.end() {
66            let c = self.get();
67            if Self::is_alphanumeric(c) {
68                write!(string, "{c}").unwrap();
69                first = false;
70                self.inc();
71            } else if !first && Self::is_identifier(c) {
72                write!(string, "{c}").unwrap();
73                self.inc();
74            } else {
75                break;
76            }
77        }
78        if !string.is_empty() {
79            Some(Token::Unquoted(string))
80        } else {
81            None
82        }
83    }
84
85    fn quoted(&mut self) -> Option<Token> {
86        let mut string = String::new();
87        let mut first = true;
88        let mut escape = false;
89        let mut start = ' ';
90        while !self.end() {
91            let c = self.get();
92            if first && Self::is_string_delimiter_start(c) {
93                write!(string, "{c}").unwrap();
94                first = false;
95                start = c;
96                self.inc();
97            } else if !first && !escape && Self::is_string_delimiter_end_for(start, c) {
98                write!(string, "{c}").unwrap();
99                self.inc();
100                if self.end() {
101                    break;
102                }
103                if !Self::is_string_escape_for(start, self.get()) {
104                    break;
105                } else {
106                    write!(string, "{}", self.get()).unwrap();
107                    self.inc();
108                }
109            } else if !first {
110                escape = !escape && Self::is_escape_char(c);
111                write!(string, "{c}").unwrap();
112                self.inc();
113            } else {
114                break;
115            }
116        }
117        if !string.is_empty() {
118            Some(Token::Quoted(string))
119        } else {
120            None
121        }
122    }
123
124    /// unquote a quoted string
125    fn unquote(mut self) -> String {
126        let mut string = String::new();
127        let mut first = true;
128        let mut escape = false;
129        let mut start = ' ';
130        while !self.end() {
131            let c = self.get();
132            if first && Self::is_string_delimiter_start(c) {
133                first = false;
134                start = c;
135                self.inc();
136            } else if !first && !escape && Self::is_string_delimiter_end_for(start, c) {
137                self.inc();
138                if self.end() {
139                    break;
140                }
141                if !Self::is_string_escape_for(start, self.get()) {
142                    break;
143                } else {
144                    write!(string, "{c}").unwrap();
145                    self.inc();
146                }
147            } else if !first {
148                escape = !escape && Self::is_escape_char(c);
149                write!(string, "{c}").unwrap();
150                self.inc();
151            } else {
152                break;
153            }
154        }
155        string
156    }
157
158    fn punctuation(&mut self) -> Option<Token> {
159        let mut string = String::new();
160        if !self.end() {
161            let c = self.get();
162            if !Self::is_space(c) && !Self::is_alphanumeric(c) {
163                write!(string, "{c}").unwrap();
164                self.inc();
165            }
166        }
167        if !string.is_empty() {
168            Some(Token::Punctuation(string))
169        } else {
170            None
171        }
172    }
173
174    fn is_space(c: char) -> bool {
175        matches!(c, ' ' | '\t' | '\r' | '\n')
176    }
177
178    fn is_identifier(c: char) -> bool {
179        matches!(c, '_' | '$')
180    }
181
182    fn is_alphanumeric(c: char) -> bool {
183        c.is_alphabetic() || c.is_ascii_digit()
184    }
185
186    fn is_string_delimiter_start(c: char) -> bool {
187        matches!(c, '`' | '[' | '\'' | '"')
188    }
189
190    fn is_string_escape_for(start: char, c: char) -> bool {
191        match start {
192            '`' => c == '`',
193            '\'' => c == '\'',
194            '"' => c == '"',
195            _ => false,
196        }
197    }
198
199    fn is_string_delimiter_end_for(start: char, c: char) -> bool {
200        match start {
201            '`' => c == '`',
202            '[' => c == ']',
203            '\'' => c == '\'',
204            '"' => c == '"',
205            _ => false,
206        }
207    }
208
209    fn is_escape_char(c: char) -> bool {
210        c == '\\'
211    }
212}
213
214impl Iterator for Tokenizer {
215    type Item = Token;
216
217    fn next(&mut self) -> Option<Self::Item> {
218        if let Some(space) = self.space() {
219            return Some(space);
220        }
221        if let Some(unquoted) = self.unquoted() {
222            return Some(unquoted);
223        }
224        if let Some(quoted) = self.quoted() {
225            return Some(quoted);
226        }
227        if let Some(punctuation) = self.punctuation() {
228            return Some(punctuation);
229        }
230        None
231    }
232}
233
234impl Token {
235    pub fn is_quoted(&self) -> bool {
236        matches!(self, Self::Quoted(_))
237    }
238
239    pub fn is_unquoted(&self) -> bool {
240        matches!(self, Self::Unquoted(_))
241    }
242
243    pub fn is_space(&self) -> bool {
244        matches!(self, Self::Space(_))
245    }
246
247    pub fn is_punctuation(&self) -> bool {
248        matches!(self, Self::Punctuation(_))
249    }
250
251    pub fn as_str(&self) -> &str {
252        match self {
253            Self::Quoted(string) => string,
254            Self::Unquoted(string) => string,
255            Self::Space(string) => string,
256            Self::Punctuation(string) => string,
257        }
258    }
259
260    pub fn unquote(&self) -> Option<String> {
261        if self.is_quoted() {
262            let tokenizer = Tokenizer::new(self.as_str());
263            Some(tokenizer.unquote())
264        } else {
265            None
266        }
267    }
268}
269
270impl std::fmt::Display for Token {
271    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
272        write!(
273            f,
274            "{}",
275            match self {
276                Token::Unquoted(string) => string,
277                Token::Space(string) => string,
278                Token::Quoted(string) => string,
279                Token::Punctuation(string) => string,
280            }
281        )
282    }
283}
284
#[cfg(test)]
mod tests {
    // Most tests also assert the round-trip property: joining the Display
    // output of all tokens must reproduce the original input string.
    use super::*;
    use pretty_assertions::assert_eq;

    // Empty input yields no tokens.
    #[test]
    fn test_0() {
        let tokenizer = Tokenizer::new("");
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![]);
    }

    // Keywords, `*` punctuation, and a backtick-quoted identifier.
    #[test]
    fn test_1() {
        let string = "SELECT * FROM `character`";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Unquoted("SELECT".to_string()),
                Token::Space(" ".to_string()),
                Token::Punctuation("*".to_string()),
                Token::Space(" ".to_string()),
                Token::Unquoted("FROM".to_string()),
                Token::Space(" ".to_string()),
                Token::Quoted("`character`".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    // A `?` placeholder outside quotes is punctuation.
    #[test]
    fn test_2() {
        let string = "SELECT * FROM `character` WHERE id = ?";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Unquoted("SELECT".to_string()),
                Token::Space(" ".to_string()),
                Token::Punctuation("*".to_string()),
                Token::Space(" ".to_string()),
                Token::Unquoted("FROM".to_string()),
                Token::Space(" ".to_string()),
                Token::Quoted("`character`".to_string()),
                Token::Space(" ".to_string()),
                Token::Unquoted("WHERE".to_string()),
                Token::Space(" ".to_string()),
                Token::Unquoted("id".to_string()),
                Token::Space(" ".to_string()),
                Token::Punctuation("=".to_string()),
                Token::Space(" ".to_string()),
                Token::Punctuation("?".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    // A `?` inside a quoted literal stays part of the literal.
    #[test]
    fn test_3() {
        let string = r#"? = "?" "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("?".to_string()),
                Token::Space(" ".to_string()),
                Token::Punctuation("=".to_string()),
                Token::Space(" ".to_string()),
                Token::Quoted(r#""?""#.to_string()),
                Token::Space(" ".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    // A backslash-escaped double quote does not terminate the literal.
    #[test]
    fn test_4() {
        let string = r#""a\"bc""#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Quoted("\"a\\\"bc\"".to_string())]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    // Letters and digits fuse into a single unquoted token.
    #[test]
    fn test_5() {
        let string = "abc123";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Unquoted(string.to_string())]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    // No decimal-number support: `.` and `*` split the digit runs.
    #[test]
    fn test_6() {
        let string = "2.3*4";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Unquoted("2".to_string()),
                Token::Punctuation(".".to_string()),
                Token::Unquoted("3".to_string()),
                Token::Punctuation("*".to_string()),
                Token::Unquoted("4".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    // An escaped backslash (`\\`) does not escape the closing quote.
    #[test]
    fn test_7() {
        let string = r#""a\\" B"#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Quoted("\"a\\\\\"".to_string()),
                Token::Space(" ".to_string()),
                Token::Unquoted("B".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    // A double quote inside a backtick literal is ordinary content.
    #[test]
    fn test_8() {
        let string = r#"`a"b` "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Quoted("`a\"b`".to_string()),
                Token::Space(" ".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    // Bracket quoting: `[` opens, `]` closes.
    #[test]
    fn test_9() {
        let string = r"[ab] ";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Quoted("[ab]".to_string()),
                Token::Space(" ".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    // A double quote inside a single-quoted literal is ordinary content.
    #[test]
    fn test_10() {
        let string = r#" 'a"b' "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" ".to_string()),
                Token::Quoted("'a\"b'".to_string()),
                Token::Space(" ".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    // A doubled backtick escapes itself inside a backtick literal.
    #[test]
    fn test_11() {
        let string = r" `a``b` ";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" ".to_string()),
                Token::Quoted("`a``b`".to_string()),
                Token::Space(" ".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    // A doubled single quote escapes itself inside a single-quoted literal.
    #[test]
    fn test_12() {
        let string = r" 'a''b' ";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" ".to_string()),
                Token::Quoted("'a''b'".to_string()),
                Token::Space(" ".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    // Each punctuation character is its own one-char token.
    #[test]
    fn test_13() {
        let string = r"(?)";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("(".to_string()),
                Token::Punctuation("?".to_string()),
                Token::Punctuation(")".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    // A leading `$` is punctuation; the digit after it is unquoted.
    #[test]
    fn test_14() {
        let string = r"($1 = $2)";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("(".to_string()),
                Token::Punctuation("$".to_string()),
                Token::Unquoted("1".to_string()),
                Token::Space(" ".to_string()),
                Token::Punctuation("=".to_string()),
                Token::Space(" ".to_string()),
                Token::Punctuation("$".to_string()),
                Token::Unquoted("2".to_string()),
                Token::Punctuation(")".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    // Whitespace inside a quoted literal stays part of the literal.
    #[test]
    fn test_15() {
        let string = r#" "Hello World" "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" ".to_string()),
                Token::Quoted("\"Hello World\"".to_string()),
                Token::Space(" ".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    // `_` and `$` are accepted after a leading alphanumeric character.
    #[test]
    fn test_16() {
        let string = "abc_$123";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Unquoted(string.to_string())]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    // A leading `$` is punctuation; the rest forms one unquoted word.
    #[test]
    fn test_17() {
        let string = "$abc$123";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("$".to_string()),
                Token::Unquoted("abc$123".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    // Leading `_` and `$` are each emitted as punctuation.
    #[test]
    fn test_18() {
        let string = "_$abc_123$";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("_".to_string()),
                Token::Punctuation("$".to_string()),
                Token::Unquoted("abc_123$".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    // unquote strips delimiters but keeps backslash escapes verbatim.
    #[test]
    fn test_19() {
        let string = r#""a\"bc""#;
        let tokenizer = Tokenizer::new(string);
        assert_eq!(tokenizer.unquote(), "a\\\"bc".to_owned());
    }

    // unquote collapses a doubled delimiter into a single one.
    #[test]
    fn test_20() {
        let string = r#""a""bc""#;
        let tokenizer = Tokenizer::new(string);
        assert_eq!(tokenizer.unquote(), "a\"bc".to_owned());
    }

    // Token::unquote on a quoted token; `\n` stays two characters.
    #[test]
    fn test_21() {
        assert_eq!(
            Token::Quoted("'a\\nb'".to_owned()).unquote().unwrap(),
            "a\\nb".to_owned()
        );
    }

    // A backslash escape sequence round-trips through tokenization.
    #[test]
    fn test_22() {
        let string = r#" "Hello\nWorld" "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" ".to_string()),
                Token::Quoted("\"Hello\\nWorld\"".to_string()),
                Token::Space(" ".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }
}