use std::fmt::Write;
use std::iter::Iterator;

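/// A character-level cursor over an input string that splits it into
/// [`Token`]s: runs of whitespace, unquoted words, quoted strings, and
/// single punctuation characters.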
#[derive(Debug, Default)]
pub struct Tokenizer {
    pub chars: Vec<char>,
    pub p: usize,
}

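/// A single lexical unit produced by [`Tokenizer`]. Each variant keeps the
/// exact source text it was built from, so concatenating the `Display`
/// output of all tokens reproduces the original input.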
#[derive(Debug, PartialEq, Eq)]
pub enum Token {
    Quoted(String),
    Unquoted(String),
    Space(String),
    Punctuation(String),
}

impl Tokenizer {
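    /// Creates a tokenizer over the characters of `string`.
    ///
    /// ```ignore
    /// // Illustrative sketch; the import path depends on where this module lives.
    /// let tokens: Vec<Token> = Tokenizer::new("SELECT * FROM `character`").iter().collect();
    /// assert_eq!(tokens[0], Token::Unquoted("SELECT".to_string()));
    /// assert_eq!(tokens[6], Token::Quoted("`character`".to_string()));
    /// ```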
    pub fn new(string: &str) -> Self {
        Self {
            chars: string.chars().collect(),
            p: 0,
        }
    }

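    /// Consumes the tokenizer and yields [`Token`]s until the input is
    /// exhausted (the `Iterator` implementation below does the actual work).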
    pub fn iter(self) -> impl Iterator<Item = Token> {
        self
    }

    fn get(&self) -> char {
        self.chars[self.p]
    }

    fn inc(&mut self) {
        self.p += 1;
    }

    fn end(&self) -> bool {
        self.p == self.chars.len()
    }

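    // Consumes a maximal run of whitespace characters, if any.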
    fn space(&mut self) -> Option<Token> {
        let mut string = String::new();
        while !self.end() {
            let c = self.get();
            if Self::is_space(c) {
                write!(string, "{c}").unwrap();
            } else {
                break;
            }
            self.inc();
        }
        if !string.is_empty() {
            Some(Token::Space(string))
        } else {
            None
        }
    }

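    // Consumes an unquoted word: it must start with an alphanumeric character
    // and may continue with alphanumerics, '_' or '$'.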
    fn unquoted(&mut self) -> Option<Token> {
        let mut string = String::new();
        let mut first = true;
        while !self.end() {
            let c = self.get();
            if Self::is_alphanumeric(c) {
                write!(string, "{c}").unwrap();
                first = false;
                self.inc();
            } else if !first && Self::is_identifier(c) {
                write!(string, "{c}").unwrap();
                self.inc();
            } else {
                break;
            }
        }
        if !string.is_empty() {
            Some(Token::Unquoted(string))
        } else {
            None
        }
    }

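    // Consumes a quoted string delimited by `...`, [...], '...' or "...".
    // A backslash escapes the next character, and a doubled closing delimiter
    // (e.g. '' or ``) is treated as an escaped delimiter rather than the end.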
    fn quoted(&mut self) -> Option<Token> {
        let mut string = String::new();
        let mut first = true;
        let mut escape = false;
        let mut start = ' ';
        while !self.end() {
            let c = self.get();
            if first && Self::is_string_delimiter_start(c) {
                write!(string, "{c}").unwrap();
                first = false;
                start = c;
                self.inc();
            } else if !first && !escape && Self::is_string_delimiter_end_for(start, c) {
                write!(string, "{c}").unwrap();
                self.inc();
                if self.end() {
                    break;
                }
                if !Self::is_string_escape_for(start, self.get()) {
                    break;
                } else {
                    write!(string, "{}", self.get()).unwrap();
                    self.inc();
                }
            } else if !first {
                escape = !escape && Self::is_escape_char(c);
                write!(string, "{c}").unwrap();
                self.inc();
            } else {
                break;
            }
        }
        if !string.is_empty() {
            Some(Token::Quoted(string))
        } else {
            None
        }
    }

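    // Like `quoted`, but drops the surrounding delimiters and collapses a
    // doubled closing delimiter into a single character; backslash escapes
    // are kept verbatim.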
    fn unquote(mut self) -> String {
        let mut string = String::new();
        let mut first = true;
        let mut escape = false;
        let mut start = ' ';
        while !self.end() {
            let c = self.get();
            if first && Self::is_string_delimiter_start(c) {
                first = false;
                start = c;
                self.inc();
            } else if !first && !escape && Self::is_string_delimiter_end_for(start, c) {
                self.inc();
                if self.end() {
                    break;
                }
                if !Self::is_string_escape_for(start, self.get()) {
                    break;
                } else {
                    write!(string, "{c}").unwrap();
                    self.inc();
                }
            } else if !first {
                escape = !escape && Self::is_escape_char(c);
                write!(string, "{c}").unwrap();
                self.inc();
            } else {
                break;
            }
        }
        string
    }

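    // Consumes a single character that is neither whitespace nor alphanumeric.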
    fn punctuation(&mut self) -> Option<Token> {
        let mut string = String::new();
        if !self.end() {
            let c = self.get();
            if !Self::is_space(c) && !Self::is_alphanumeric(c) {
                write!(string, "{c}").unwrap();
                self.inc();
            }
        }
        if !string.is_empty() {
            Some(Token::Punctuation(string))
        } else {
            None
        }
    }

    fn is_space(c: char) -> bool {
        matches!(c, ' ' | '\t' | '\r' | '\n')
    }

    fn is_identifier(c: char) -> bool {
        matches!(c, '_' | '$')
    }

    fn is_alphanumeric(c: char) -> bool {
        c.is_alphabetic() || c.is_ascii_digit()
    }

    fn is_string_delimiter_start(c: char) -> bool {
        matches!(c, '`' | '[' | '\'' | '"')
    }

    fn is_string_escape_for(start: char, c: char) -> bool {
        match start {
            '`' => c == '`',
            '\'' => c == '\'',
            '"' => c == '"',
            _ => false,
        }
    }

    fn is_string_delimiter_end_for(start: char, c: char) -> bool {
        match start {
            '`' => c == '`',
            '[' => c == ']',
            '\'' => c == '\'',
            '"' => c == '"',
            _ => false,
        }
    }

    fn is_escape_char(c: char) -> bool {
        c == '\\'
    }
}

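// Tokens are tried in a fixed order: whitespace, then unquoted words, then
// quoted strings, and finally single punctuation characters.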
impl Iterator for Tokenizer {
    type Item = Token;

    fn next(&mut self) -> Option<Self::Item> {
        if let Some(space) = self.space() {
            return Some(space);
        }
        if let Some(unquoted) = self.unquoted() {
            return Some(unquoted);
        }
        if let Some(quoted) = self.quoted() {
            return Some(quoted);
        }
        if let Some(punctuation) = self.punctuation() {
            return Some(punctuation);
        }
        None
    }
}

impl Token {
    pub fn is_quoted(&self) -> bool {
        matches!(self, Self::Quoted(_))
    }

    pub fn is_unquoted(&self) -> bool {
        matches!(self, Self::Unquoted(_))
    }

    pub fn is_space(&self) -> bool {
        matches!(self, Self::Space(_))
    }

    pub fn is_punctuation(&self) -> bool {
        matches!(self, Self::Punctuation(_))
    }

    pub fn as_str(&self) -> &str {
        match self {
            Self::Quoted(string) => string,
            Self::Unquoted(string) => string,
            Self::Space(string) => string,
            Self::Punctuation(string) => string,
        }
    }

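    /// For a [`Token::Quoted`] value, returns the contents with the
    /// delimiters removed and doubled delimiters collapsed; returns `None`
    /// for every other variant.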
    pub fn unquote(&self) -> Option<String> {
        if self.is_quoted() {
            let tokenizer = Tokenizer::new(self.as_str());
            Some(tokenizer.unquote())
        } else {
            None
        }
    }
}

impl std::fmt::Display for Token {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "{}",
            match self {
                Token::Unquoted(string) => string,
                Token::Space(string) => string,
                Token::Quoted(string) => string,
                Token::Punctuation(string) => string,
            }
        )
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    #[test]
    fn test_0() {
        let tokenizer = Tokenizer::new("");
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![]);
    }

    #[test]
    fn test_1() {
        let string = "SELECT * FROM `character`";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Unquoted("SELECT".to_string()),
                Token::Space(" ".to_string()),
                Token::Punctuation("*".to_string()),
                Token::Space(" ".to_string()),
                Token::Unquoted("FROM".to_string()),
                Token::Space(" ".to_string()),
                Token::Quoted("`character`".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    #[test]
    fn test_2() {
        let string = "SELECT * FROM `character` WHERE id = ?";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Unquoted("SELECT".to_string()),
                Token::Space(" ".to_string()),
                Token::Punctuation("*".to_string()),
                Token::Space(" ".to_string()),
                Token::Unquoted("FROM".to_string()),
                Token::Space(" ".to_string()),
                Token::Quoted("`character`".to_string()),
                Token::Space(" ".to_string()),
                Token::Unquoted("WHERE".to_string()),
                Token::Space(" ".to_string()),
                Token::Unquoted("id".to_string()),
                Token::Space(" ".to_string()),
                Token::Punctuation("=".to_string()),
                Token::Space(" ".to_string()),
                Token::Punctuation("?".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    #[test]
    fn test_3() {
        let string = r#"? = "?" "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("?".to_string()),
                Token::Space(" ".to_string()),
                Token::Punctuation("=".to_string()),
                Token::Space(" ".to_string()),
                Token::Quoted(r#""?""#.to_string()),
                Token::Space(" ".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    #[test]
    fn test_4() {
        let string = r#""a\"bc""#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Quoted("\"a\\\"bc\"".to_string())]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    #[test]
    fn test_5() {
        let string = "abc123";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Unquoted(string.to_string())]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    #[test]
    fn test_6() {
        let string = "2.3*4";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Unquoted("2".to_string()),
                Token::Punctuation(".".to_string()),
                Token::Unquoted("3".to_string()),
                Token::Punctuation("*".to_string()),
                Token::Unquoted("4".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    #[test]
    fn test_7() {
        let string = r#""a\\" B"#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Quoted("\"a\\\\\"".to_string()),
                Token::Space(" ".to_string()),
                Token::Unquoted("B".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    #[test]
    fn test_8() {
        let string = r#"`a"b` "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Quoted("`a\"b`".to_string()),
                Token::Space(" ".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    #[test]
    fn test_9() {
        let string = r"[ab] ";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Quoted("[ab]".to_string()),
                Token::Space(" ".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    #[test]
    fn test_10() {
        let string = r#" 'a"b' "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" ".to_string()),
                Token::Quoted("'a\"b'".to_string()),
                Token::Space(" ".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    #[test]
    fn test_11() {
        let string = r" `a``b` ";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" ".to_string()),
                Token::Quoted("`a``b`".to_string()),
                Token::Space(" ".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    #[test]
    fn test_12() {
        let string = r" 'a''b' ";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" ".to_string()),
                Token::Quoted("'a''b'".to_string()),
                Token::Space(" ".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    #[test]
    fn test_13() {
        let string = r"(?)";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("(".to_string()),
                Token::Punctuation("?".to_string()),
                Token::Punctuation(")".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    #[test]
    fn test_14() {
        let string = r"($1 = $2)";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("(".to_string()),
                Token::Punctuation("$".to_string()),
                Token::Unquoted("1".to_string()),
                Token::Space(" ".to_string()),
                Token::Punctuation("=".to_string()),
                Token::Space(" ".to_string()),
                Token::Punctuation("$".to_string()),
                Token::Unquoted("2".to_string()),
                Token::Punctuation(")".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    #[test]
    fn test_15() {
        let string = r#" "Hello World" "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" ".to_string()),
                Token::Quoted("\"Hello World\"".to_string()),
                Token::Space(" ".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    #[test]
    fn test_16() {
        let string = "abc_$123";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(tokens, vec![Token::Unquoted(string.to_string())]);
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    #[test]
    fn test_17() {
        let string = "$abc$123";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("$".to_string()),
                Token::Unquoted("abc$123".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    #[test]
    fn test_18() {
        let string = "_$abc_123$";
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Punctuation("_".to_string()),
                Token::Punctuation("$".to_string()),
                Token::Unquoted("abc_123$".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }

    #[test]
    fn test_19() {
        let string = r#""a\"bc""#;
        let tokenizer = Tokenizer::new(string);
        assert_eq!(tokenizer.unquote(), "a\\\"bc".to_owned());
    }

    #[test]
    fn test_20() {
        let string = r#""a""bc""#;
        let tokenizer = Tokenizer::new(string);
        assert_eq!(tokenizer.unquote(), "a\"bc".to_owned());
    }

    #[test]
    fn test_21() {
        assert_eq!(
            Token::Quoted("'a\\nb'".to_owned()).unquote().unwrap(),
            "a\\nb".to_owned()
        );
    }

    #[test]
    fn test_22() {
        let string = r#" "Hello\nWorld" "#;
        let tokenizer = Tokenizer::new(string);
        let tokens: Vec<Token> = tokenizer.iter().collect();
        assert_eq!(
            tokens,
            vec![
                Token::Space(" ".to_string()),
                Token::Quoted("\"Hello\\nWorld\"".to_string()),
                Token::Space(" ".to_string()),
            ]
        );
        assert_eq!(
            string,
            tokens.iter().map(|x| x.to_string()).collect::<String>()
        );
    }
}