1use std::char;
9use std::iter::Peekable;
10use std::str::Chars;
11
12use crate::serialize::txt::errors::{LexerError, LexerErrorKind, LexerResult};
13
/// A streaming tokenizer for DNS master-file-style text (`@`, `$` directives,
/// `( ... )` lists, `;` comments, quoted strings).
pub struct Lexer<'a> {
    // Peekable iterator over the remaining input characters.
    txt: Peekable<Chars<'a>>,
    // Current state of the tokenizer's state machine.
    state: State,
}
19
20impl<'a> Lexer<'a> {
21 pub fn new(txt: &str) -> Lexer<'_> {
23 Lexer {
24 txt: txt.chars().peekable(),
25 state: State::StartLine,
26 }
27 }
28
29 pub fn next_token(&mut self) -> LexerResult<Option<Token>> {
31 let mut char_data_vec: Option<Vec<String>> = None;
32 let mut char_data: Option<String> = None;
33
34 for i in 0..4096 {
35 assert!(i < 4095); let ch: Option<char> = self.peek();
40
41 match self.state {
48 State::StartLine => {
49 match ch {
50 Some('\r') | Some('\n') => {
51 self.state = State::EOL;
52 }
53 Some(ch) if ch.is_whitespace() => self.state = State::Blank,
55 Some(_) => self.state = State::RestOfLine,
56 None => {
57 self.state = State::EOF;
58 }
59 }
60 }
61 State::RestOfLine => {
62 match ch {
63 Some('@') => self.state = State::At,
64 Some('(') => {
65 self.txt.next();
66 char_data_vec = Some(Vec::new());
67 self.state = State::List;
68 }
69 Some(ch @ ')') => return Err(LexerErrorKind::IllegalCharacter(ch).into()),
70 Some('$') => {
71 self.txt.next();
72 char_data = Some(String::new());
73 self.state = State::Dollar;
74 }
75 Some('\r') | Some('\n') => {
76 self.state = State::EOL;
77 }
78 Some('"') => {
79 self.txt.next();
80 char_data = Some(String::new());
81 self.state = State::Quote;
82 }
83 Some(';') => self.state = State::Comment { is_list: false },
84 Some(ch) if ch.is_whitespace() => {
85 self.txt.next();
86 } Some(ch) if !ch.is_control() && !ch.is_whitespace() => {
88 char_data = Some(String::new());
89 self.state = State::CharData { is_list: false };
90 }
91 Some(ch) => return Err(LexerErrorKind::UnrecognizedChar(ch).into()),
92 None => {
93 self.state = State::EOF;
94 }
95 }
96 }
97 State::Blank => {
98 self.txt.next();
100 self.state = State::RestOfLine;
101 return Ok(Some(Token::Blank));
102 }
103 State::Comment { is_list } => {
104 match ch {
105 Some('\r') | Some('\n') => {
106 self.state = if is_list { State::List } else { State::EOL };
107 } Some(_) => {
109 self.txt.next();
110 } None => {
112 self.state = State::EOF;
113 }
114 }
115 }
116 State::Quote => {
117 match ch {
118 Some('"') => {
120 self.state = State::RestOfLine;
121 self.txt.next();
122 return Ok(Some(Token::CharData(
123 char_data.take().unwrap_or_else(|| "".into()),
124 )));
125 }
126 Some('\\') => {
127 Self::push_to_str(&mut char_data, self.escape_seq()?)?;
128 }
129 Some(ch) => {
130 self.txt.next();
131 Self::push_to_str(&mut char_data, ch)?;
132 }
133 None => return Err(LexerErrorKind::UnclosedQuotedString.into()),
134 }
135 }
136 State::Dollar => {
137 match ch {
138 Some(ch @ 'A'..='Z') => {
140 self.txt.next();
141 Self::push_to_str(&mut char_data, ch)?;
142 }
143 Some(_) | None => {
145 self.state = State::RestOfLine;
146 let dollar: String = char_data.take().ok_or_else(|| {
147 LexerError::from(LexerErrorKind::IllegalState(
148 "char_data \
149 is None",
150 ))
151 })?;
152
153 return Ok(Some(match dollar.as_str() {
154 "INCLUDE" => Token::Include,
155 "ORIGIN" => Token::Origin,
156 "TTL" => Token::Ttl,
157 _ => {
158 return Err(LexerErrorKind::UnrecognizedDollar(
159 char_data.take().unwrap_or_else(|| "".into()),
160 )
161 .into())
162 }
163 }));
164 }
165 }
166 }
167 State::List => match ch {
168 Some(';') => {
169 self.txt.next();
170 self.state = State::Comment { is_list: true }
171 }
172 Some(')') => {
173 self.txt.next();
174 self.state = State::RestOfLine;
175 return char_data_vec
176 .take()
177 .ok_or_else(|| {
178 LexerErrorKind::IllegalState("char_data_vec is None").into()
179 })
180 .map(|v| Some(Token::List(v)));
181 }
182 Some(ch) if ch.is_whitespace() => {
183 self.txt.next();
184 }
185 Some(ch) if !ch.is_control() && !ch.is_whitespace() => {
186 char_data = Some(String::new());
187 self.state = State::CharData { is_list: true }
188 }
189 Some(ch) => return Err(LexerErrorKind::UnrecognizedChar(ch).into()),
190 None => return Err(LexerErrorKind::UnclosedList.into()),
191 },
192 State::CharData { is_list } => {
193 match ch {
194 Some(ch @ ')') if !is_list => {
195 return Err(LexerErrorKind::IllegalCharacter(ch).into())
196 }
197 Some(ch) if ch.is_whitespace() || ch == ')' || ch == ';' => {
198 if is_list {
199 char_data_vec
200 .as_mut()
201 .ok_or_else(|| {
202 LexerError::from(LexerErrorKind::IllegalState(
203 "char_data_vec is None",
204 ))
205 })
206 .and_then(|v| {
207 let char_data = char_data.take().ok_or(
208 LexerErrorKind::IllegalState("char_data is None"),
209 )?;
210
211 v.push(char_data);
212 Ok(())
213 })?;
214 self.state = State::List;
215 } else {
216 self.state = State::RestOfLine;
217 let result = char_data.take().ok_or_else(|| {
218 LexerErrorKind::IllegalState("char_data is None").into()
219 });
220 let opt = result.map(|s| Some(Token::CharData(s)));
221 return opt;
222 }
223 }
224 Some(ch) if !ch.is_control() && !ch.is_whitespace() => {
227 self.txt.next();
228 Self::push_to_str(&mut char_data, ch)?;
229 }
230 Some(ch) => return Err(LexerErrorKind::UnrecognizedChar(ch).into()),
231 None => {
232 self.state = State::EOF;
233 return char_data
234 .take()
235 .ok_or_else(|| {
236 LexerErrorKind::IllegalState("char_data is None").into()
237 })
238 .map(|s| Some(Token::CharData(s)));
239 }
240 }
241 }
242 State::At => {
243 self.txt.next();
244 self.state = State::RestOfLine;
245 return Ok(Some(Token::At));
246 }
247 State::EOL => match ch {
248 Some('\r') => {
249 self.txt.next();
250 }
251 Some('\n') => {
252 self.txt.next();
253 self.state = State::StartLine;
254 return Ok(Some(Token::EOL));
255 }
256 Some(ch) => return Err(LexerErrorKind::IllegalCharacter(ch).into()),
257 None => return Err(LexerErrorKind::EOF.into()),
258 },
259 State::EOF => {
261 self.txt.next(); return Ok(None);
263 }
264 }
265 }
266
267 unreachable!("The above match statement should have found a terminal state");
268 }
269
270 fn push_to_str(collect: &mut Option<String>, ch: char) -> LexerResult<()> {
271 collect
272 .as_mut()
273 .ok_or_else(|| LexerErrorKind::IllegalState("collect is None").into())
274 .map(|s| {
275 s.push(ch);
276 })
277 }
278
279 fn escape_seq(&mut self) -> LexerResult<char> {
280 self.txt.next(); let ch = self
283 .peek()
284 .ok_or_else(|| LexerError::from(LexerErrorKind::EOF))?;
285
286 if !ch.is_control() {
287 if ch.is_numeric() {
288 let d1: u32 = self
290 .txt
291 .next()
292 .ok_or_else(|| LexerError::from(LexerErrorKind::EOF))
293 .map(|c| {
294 c.to_digit(10)
295 .ok_or_else(|| LexerError::from(LexerErrorKind::IllegalCharacter(c)))
296 })??; let d2: u32 = self
298 .txt
299 .next()
300 .ok_or_else(|| LexerError::from(LexerErrorKind::EOF))
301 .map(|c| {
302 c.to_digit(10)
303 .ok_or_else(|| LexerError::from(LexerErrorKind::IllegalCharacter(c)))
304 })??; let d3: u32 = self
306 .txt
307 .next()
308 .ok_or_else(|| LexerError::from(LexerErrorKind::EOF))
309 .map(|c| {
310 c.to_digit(10)
311 .ok_or_else(|| LexerError::from(LexerErrorKind::IllegalCharacter(c)))
312 })??; let val: u32 = (d1 << 16) + (d2 << 8) + d3;
315 let ch: char = char::from_u32(val)
316 .ok_or_else(|| LexerError::from(LexerErrorKind::UnrecognizedOctet(val)))?;
317
318 Ok(ch)
319 } else {
320 self.txt.next(); Ok(ch)
323 }
324 } else {
325 Err(LexerErrorKind::IllegalCharacter(ch).into())
326 }
327 }
328
329 fn peek(&mut self) -> Option<char> {
330 self.txt.peek().cloned()
331 }
332}
333
/// States of the lexer's state machine.
#[doc(hidden)]
#[derive(Copy, Clone, PartialEq, Debug)]
pub(crate) enum State {
    /// At the beginning of a line, before any character has been consumed.
    StartLine,
    /// Somewhere past the start of a line, between tokens.
    RestOfLine,
    /// Significant leading whitespace was seen; a `Blank` token is emitted.
    Blank,
    /// Inside a parenthesized `( ... )` list.
    List,
    /// Accumulating character data; `is_list` records whether the data is an
    /// entry of a surrounding list.
    CharData { is_list: bool },
    /// Inside a `;` comment; `is_list` records whether a list is still open.
    Comment { is_list: bool },
    /// An `@` was seen; an `At` token is emitted.
    At,
    /// Inside a double-quoted string.
    Quote,
    /// Reading the uppercase name of a `$` directive.
    Dollar,
    /// At an end-of-line sequence (`\n` or `\r\n`).
    EOL,
    /// The end of the input has been reached.
    EOF,
}
350
/// Tokens produced by the lexer.
#[derive(Eq, PartialEq, Debug, Clone)]
pub enum Token {
    /// Significant whitespace at the start of a line.
    Blank,
    /// The entries of a parenthesized `( ... )` list.
    List(Vec<String>),
    /// A run of unquoted character data, or the contents of a quoted string.
    CharData(String),
    /// The `@` symbol.
    At,
    /// The `$INCLUDE` directive.
    Include,
    /// The `$ORIGIN` directive.
    Origin,
    /// The `$TTL` directive.
    Ttl,
    /// An end-of-line sequence.
    EOL,
}
371
#[cfg(test)]
mod lex_test {
    use super::*;

    /// Asserts that lexing succeeded and returns the next token (or `None`
    /// at end of input).
    #[allow(clippy::uninlined_format_args)]
    fn next_token(lexer: &mut Lexer<'_>) -> Option<Token> {
        let result = lexer.next_token();
        assert!(result.is_ok(), "{:?}", result);
        result.unwrap()
    }

    // Leading whitespace on a line yields a Blank token; interior whitespace
    // only separates CharData tokens.
    #[test]
    fn blank() {
        let mut lexer = Lexer::new(" dead beef");
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("dead".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("beef".to_string())
        );

        let mut lexer = Lexer::new("dead beef");
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("dead".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("beef".to_string())
        );

        let mut lexer = Lexer::new("dead beef\r\n after");
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("dead".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("beef".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("after".to_string())
        );

        // Empty list followed by a comment, then an indented line.
        let mut lexer = Lexer::new(
            "dead beef ();comment
         after",
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("dead".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("beef".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::List(vec![]));
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("after".to_string())
        );
    }

    // Outside quotes, a backslash is ordinary character data.
    #[test]
    fn escape() {
        assert_eq!(
            Lexer::new("a\\Aa").next_token().unwrap().unwrap(),
            Token::CharData("a\\Aa".to_string())
        );
        assert_eq!(
            Lexer::new("a\\$").next_token().unwrap().unwrap(),
            Token::CharData("a\\$".to_string())
        );
        assert_eq!(
            Lexer::new("a\\077").next_token().unwrap().unwrap(),
            Token::CharData("a\\077".to_string())
        );
    }

    // Inside quotes, escapes are interpreted and special characters lose
    // their meaning.
    #[test]
    fn quoted_txt() {
        assert_eq!(
            Lexer::new("\"Quoted\"").next_token().unwrap().unwrap(),
            Token::CharData("Quoted".to_string())
        );
        assert_eq!(
            Lexer::new("\";@$\"").next_token().unwrap().unwrap(),
            Token::CharData(";@$".to_string())
        );
        assert_eq!(
            Lexer::new("\"some \\A\"").next_token().unwrap().unwrap(),
            Token::CharData("some A".to_string())
        );
        assert_eq!(
            Lexer::new("\"a\\Aa\"").next_token().unwrap().unwrap(),
            Token::CharData("aAa".to_string())
        );
        assert_eq!(
            Lexer::new("\"a\\$\"").next_token().unwrap().unwrap(),
            Token::CharData("a$".to_string())
        );
        // Pins the current numeric-escape combination (see escape_seq).
        assert_eq!(
            Lexer::new("\"a\\077\"").next_token().unwrap().unwrap(),
            Token::CharData("a\u{707}".to_string())
        );

        // Truncated escapes and unterminated strings are errors.
        assert!(Lexer::new("\"a\\\"").next_token().is_err());
        assert!(Lexer::new("\"a\\0\"").next_token().is_err());
        assert!(Lexer::new("\"a\\07\"").next_token().is_err());

        // Quoted strings may span lines.
        let mut lexer = Lexer::new("\"multi\nline\ntext\"");

        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("multi\nline\ntext".to_string())
        );
        assert_eq!(next_token(&mut lexer), None);

        let mut lexer = Lexer::new("\"multi\r\nline\r\ntext\"\r\n");

        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("multi\r\nline\r\ntext".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer), None);

        assert!(Lexer::new("\"multi").next_token().is_err());
    }

    // Non-ASCII characters pass through as character data.
    #[test]
    fn unicode() {
        assert_eq!(
            Lexer::new("♥").next_token().unwrap().unwrap(),
            Token::CharData("♥".to_string())
        );
    }

    // Assorted single-token inputs.
    #[test]
    fn lex() {
        assert_eq!(
            next_token(&mut Lexer::new(".")).unwrap(),
            Token::CharData(".".to_string())
        );
        assert_eq!(
            next_token(&mut Lexer::new(" .")).unwrap(),
            Token::Blank
        );
        assert_eq!(
            next_token(&mut Lexer::new("abc")).unwrap(),
            Token::CharData("abc".to_string())
        );
        assert_eq!(
            next_token(&mut Lexer::new("abc.")).unwrap(),
            Token::CharData("abc.".to_string())
        );
        assert_eq!(next_token(&mut Lexer::new(";abc")), None);
        assert_eq!(next_token(&mut Lexer::new(";;@$-\"")), None);
        assert_eq!(next_token(&mut Lexer::new("@")).unwrap(), Token::At);
        assert_eq!(
            next_token(&mut Lexer::new("123")).unwrap(),
            Token::CharData("123".to_string())
        );
        assert_eq!(
            next_token(&mut Lexer::new("$INCLUDE")).unwrap(),
            Token::Include
        );
        assert_eq!(
            next_token(&mut Lexer::new("$ORIGIN")).unwrap(),
            Token::Origin
        );
        assert_eq!(next_token(&mut Lexer::new("$TTL")).unwrap(), Token::Ttl);
        assert_eq!(next_token(&mut Lexer::new("\n")), Some(Token::EOL));
        assert_eq!(next_token(&mut Lexer::new("\r\n")), Some(Token::EOL));
    }

    // Parenthesized lists: whitespace and newlines are skipped inside,
    // comments are allowed, and unbalanced parens are errors.
    #[test]
    fn list() {
        let mut lexer = Lexer::new("(");
        assert!(lexer.next_token().is_err());

        assert!(Lexer::new(")").next_token().is_err());

        let mut lexer = Lexer::new("()");
        assert_eq!(next_token(&mut lexer).unwrap(), Token::List(vec![]));
        assert_eq!(next_token(&mut lexer), None);

        let mut lexer = Lexer::new("(abc)");
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::List(vec!["abc".to_string()])
        );
        assert_eq!(next_token(&mut lexer), None);

        let mut lexer = Lexer::new("(\nabc\n)");
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::List(vec!["abc".to_string()])
        );
        assert_eq!(next_token(&mut lexer), None);

        let mut lexer = Lexer::new("(\nabc\nabc)");
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::List(vec!["abc".to_string(), "abc".to_string()])
        );
        assert_eq!(next_token(&mut lexer), None);

        let mut lexer = Lexer::new("(\nabc;comment\n)");
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::List(vec!["abc".to_string()])
        );
        assert_eq!(next_token(&mut lexer), None);
    }

    // End-to-end lexing of an RFC 1035-style SOA zone snippet.
    #[test]
    #[allow(clippy::cognitive_complexity)]
    fn soa() {
        let mut lexer = Lexer::new(
            "@   IN  SOA     VENERA      Action\\.domains (
                            20     ; SERIAL
                            7200   ; REFRESH
                            600    ; RETRY
                            3600000; EXPIRE
                            60)    ; MINIMUM

        NS      A.ISI.EDU.
        NS      VENERA
        NS      VAXA
        MX      10      VENERA
        MX      20      VAXA

A       A       26.3.0.103

VENERA  A       10.1.0.52
        A       128.9.0.32

$INCLUDE <SUBSYS>ISI-MAILBOXES.TXT",
        );

        assert_eq!(next_token(&mut lexer).unwrap(), Token::At);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("IN".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("SOA".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("VENERA".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("Action\\.domains".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::List(vec![
                "20".to_string(),
                "7200".to_string(),
                "600".to_string(),
                "3600000".to_string(),
                "60".to_string(),
            ])
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("NS".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("A.ISI.EDU.".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("NS".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("VENERA".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("NS".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("VAXA".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("MX".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("10".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("VENERA".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("MX".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("20".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("VAXA".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("A".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("A".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("26.3.0.103".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("VENERA".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("A".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("10.1.0.52".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("A".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("128.9.0.32".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Include);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("<SUBSYS>ISI-MAILBOXES.TXT".to_string())
        );
        assert!(next_token(&mut lexer).is_none());
    }
}