1use alloc::borrow::Cow;
9use alloc::string::String;
10use alloc::vec::Vec;
11use core::{char, iter::Peekable};
12
13use crate::serialize::txt::errors::{LexerError, LexerErrorKind, LexerResult};
14
15pub(crate) struct Lexer<'a> {
17 txt: Peekable<CowChars<'a>>,
18 state: State,
19}
20
21impl<'a> Lexer<'a> {
22 pub(crate) fn new(txt: impl Into<Cow<'a, str>>) -> Self {
24 Lexer {
25 txt: CowChars {
26 data: txt.into(),
27 offset: 0,
28 }
29 .peekable(),
30 state: State::StartLine,
31 }
32 }
33
34 pub(crate) fn next_token(&mut self) -> LexerResult<Option<Token>> {
36 let mut char_data_vec: Option<Vec<String>> = None;
37 let mut char_data: Option<String> = None;
38
39 for i in 0..4_096 {
40 assert!(i < 4095); let ch: Option<char> = self.peek();
45
46 match self.state {
53 State::StartLine => {
54 match ch {
55 Some('\r') | Some('\n') => {
56 self.state = State::EOL;
57 }
58 Some(ch) if ch.is_whitespace() => self.state = State::Blank,
60 Some(_) => self.state = State::RestOfLine,
61 None => {
62 self.state = State::EOF;
63 }
64 }
65 }
66 State::RestOfLine => {
67 match ch {
68 Some('@') => self.state = State::At,
69 Some('(') => {
70 self.txt.next();
71 char_data_vec = Some(Vec::new());
72 self.state = State::List;
73 }
74 Some(ch @ ')') => return Err(LexerErrorKind::IllegalCharacter(ch).into()),
75 Some('$') => {
76 self.txt.next();
77 char_data = Some(String::new());
78 self.state = State::Dollar;
79 }
80 Some('\r') | Some('\n') => {
81 self.state = State::EOL;
82 }
83 Some('"') => {
84 self.txt.next();
85 char_data = Some(String::new());
86 self.state = State::Quote;
87 }
88 Some(';') => self.state = State::Comment { is_list: false },
89 Some(ch) if ch.is_whitespace() => {
90 self.txt.next();
91 } Some(ch) if !ch.is_control() && !ch.is_whitespace() => {
93 char_data = Some(String::new());
94 self.state = State::CharData { is_list: false };
95 }
96 Some(ch) => return Err(LexerErrorKind::UnrecognizedChar(ch).into()),
97 None => {
98 self.state = State::EOF;
99 }
100 }
101 }
102 State::Blank => {
103 self.txt.next();
105 self.state = State::RestOfLine;
106 return Ok(Some(Token::Blank));
107 }
108 State::Comment { is_list } => {
109 match ch {
110 Some('\r') | Some('\n') => {
111 self.state = if is_list { State::List } else { State::EOL };
112 } Some(_) => {
114 self.txt.next();
115 } None => {
117 self.state = State::EOF;
118 }
119 }
120 }
121 State::Quote => {
122 match ch {
123 Some('"') => {
125 self.state = State::RestOfLine;
126 self.txt.next();
127 return Ok(Some(Token::CharData(
128 char_data.take().unwrap_or_else(|| "".into()),
129 )));
130 }
131 Some('\\') => {
132 Self::push_to_str(&mut char_data, self.escape_seq()?)?;
133 }
134 Some(ch) => {
135 self.txt.next();
136 Self::push_to_str(&mut char_data, ch)?;
137 }
138 None => return Err(LexerErrorKind::UnclosedQuotedString.into()),
139 }
140 }
141 State::Dollar => {
142 match ch {
143 Some(ch @ 'A'..='Z') => {
145 self.txt.next();
146 Self::push_to_str(&mut char_data, ch)?;
147 }
148 Some(_) | None => {
150 self.state = State::RestOfLine;
151 let dollar: String = char_data.take().ok_or_else(|| {
152 LexerError::from(LexerErrorKind::IllegalState(
153 "char_data \
154 is None",
155 ))
156 })?;
157
158 return Ok(Some(match dollar.as_str() {
159 "INCLUDE" => Token::Include,
160 "ORIGIN" => Token::Origin,
161 "TTL" => Token::Ttl,
162 _ => {
163 return Err(LexerErrorKind::UnrecognizedDollar(
164 char_data.take().unwrap_or_else(|| "".into()),
165 )
166 .into());
167 }
168 }));
169 }
170 }
171 }
172 State::List => match ch {
173 Some(';') => {
174 self.txt.next();
175 self.state = State::Comment { is_list: true }
176 }
177 Some(')') => {
178 self.txt.next();
179 self.state = State::RestOfLine;
180 return char_data_vec
181 .take()
182 .ok_or_else(|| {
183 LexerErrorKind::IllegalState("char_data_vec is None").into()
184 })
185 .map(|v| Some(Token::List(v)));
186 }
187 Some(ch) if ch.is_whitespace() => {
188 self.txt.next();
189 }
190 Some(ch) if !ch.is_control() && !ch.is_whitespace() => {
191 char_data = Some(String::new());
192 self.state = State::CharData { is_list: true }
193 }
194 Some(ch) => return Err(LexerErrorKind::UnrecognizedChar(ch).into()),
195 None => return Err(LexerErrorKind::UnclosedList.into()),
196 },
197 State::CharData { is_list } => {
198 match ch {
199 Some(ch @ ')') if !is_list => {
200 return Err(LexerErrorKind::IllegalCharacter(ch).into());
201 }
202 Some(ch) if ch.is_whitespace() || ch == ')' || ch == ';' => {
203 if is_list {
204 char_data_vec
205 .as_mut()
206 .ok_or_else(|| {
207 LexerError::from(LexerErrorKind::IllegalState(
208 "char_data_vec is None",
209 ))
210 })
211 .and_then(|v| {
212 let char_data = char_data.take().ok_or(
213 LexerErrorKind::IllegalState("char_data is None"),
214 )?;
215
216 v.push(char_data);
217 Ok(())
218 })?;
219 self.state = State::List;
220 } else {
221 self.state = State::RestOfLine;
222 let result = char_data.take().ok_or_else(|| {
223 LexerErrorKind::IllegalState("char_data is None").into()
224 });
225 let opt = result.map(|s| Some(Token::CharData(s)));
226 return opt;
227 }
228 }
229 Some(ch) if !ch.is_control() && !ch.is_whitespace() => {
232 self.txt.next();
233 Self::push_to_str(&mut char_data, ch)?;
234 }
235 Some(ch) => return Err(LexerErrorKind::UnrecognizedChar(ch).into()),
236 None => {
237 self.state = State::EOF;
238 return char_data
239 .take()
240 .ok_or_else(|| {
241 LexerErrorKind::IllegalState("char_data is None").into()
242 })
243 .map(|s| Some(Token::CharData(s)));
244 }
245 }
246 }
247 State::At => {
248 self.txt.next();
249 self.state = State::RestOfLine;
250 return Ok(Some(Token::At));
251 }
252 State::EOL => match ch {
253 Some('\r') => {
254 self.txt.next();
255 }
256 Some('\n') => {
257 self.txt.next();
258 self.state = State::StartLine;
259 return Ok(Some(Token::EOL));
260 }
261 Some(ch) => return Err(LexerErrorKind::IllegalCharacter(ch).into()),
262 None => return Err(LexerErrorKind::EOF.into()),
263 },
264 State::EOF => {
266 self.txt.next(); return Ok(None);
268 }
269 }
270 }
271
272 unreachable!("The above match statement should have found a terminal state");
273 }
274
275 fn push_to_str(collect: &mut Option<String>, ch: char) -> LexerResult<()> {
276 collect
277 .as_mut()
278 .ok_or_else(|| LexerErrorKind::IllegalState("collect is None").into())
279 .map(|s| {
280 s.push(ch);
281 })
282 }
283
284 fn escape_seq(&mut self) -> LexerResult<char> {
285 self.txt.next(); let ch = self
288 .peek()
289 .ok_or_else(|| LexerError::from(LexerErrorKind::EOF))?;
290
291 if !ch.is_control() {
292 if ch.is_numeric() {
293 let d1: u32 = self
295 .txt
296 .next()
297 .ok_or_else(|| LexerError::from(LexerErrorKind::EOF))
298 .map(|c| {
299 c.to_digit(10)
300 .ok_or_else(|| LexerError::from(LexerErrorKind::IllegalCharacter(c)))
301 })??; let d2: u32 = self
303 .txt
304 .next()
305 .ok_or_else(|| LexerError::from(LexerErrorKind::EOF))
306 .map(|c| {
307 c.to_digit(10)
308 .ok_or_else(|| LexerError::from(LexerErrorKind::IllegalCharacter(c)))
309 })??; let d3: u32 = self
311 .txt
312 .next()
313 .ok_or_else(|| LexerError::from(LexerErrorKind::EOF))
314 .map(|c| {
315 c.to_digit(10)
316 .ok_or_else(|| LexerError::from(LexerErrorKind::IllegalCharacter(c)))
317 })??; let val: u32 = (d1 << 16) + (d2 << 8) + d3;
320 let ch: char = char::from_u32(val)
321 .ok_or_else(|| LexerError::from(LexerErrorKind::UnrecognizedOctet(val)))?;
322
323 Ok(ch)
324 } else {
325 self.txt.next(); Ok(ch)
328 }
329 } else {
330 Err(LexerErrorKind::IllegalCharacter(ch).into())
331 }
332 }
333
334 fn peek(&mut self) -> Option<char> {
335 self.txt.peek().copied()
336 }
337}
338
/// Character iterator that holds its text as a `Cow<str>` and remembers how
/// far it has read.
struct CowChars<'a> {
    /// The text being iterated.
    data: Cow<'a, str>,
    /// Byte offset into `data` of the next character to yield.
    offset: usize,
}

impl Iterator for CowChars<'_> {
    type Item = char;

    /// Yields the character at the current offset and advances past it, or
    /// returns `None` (leaving the offset untouched) at the end of the text.
    fn next(&mut self) -> Option<char> {
        let current = self.data[self.offset..].chars().next()?;
        // Step over exactly the bytes this character occupies in UTF-8.
        self.offset += current.len_utf8();
        Some(current)
    }
}
358
/// States of the lexer's state machine.
#[doc(hidden)]
#[derive(Copy, Clone, PartialEq, Debug)]
pub(crate) enum State {
    /// At the first character of a line.
    StartLine,
    /// Somewhere after the start of a line, between tokens.
    RestOfLine,
    /// On whitespace that begins a line.
    Blank,
    /// Inside a parenthesised list.
    List,
    /// Collecting an unquoted run of characters; `is_list` records whether it
    /// is an entry of a parenthesised list.
    CharData { is_list: bool },
    /// Inside a `;` comment; `is_list` records whether the comment appeared
    /// inside a parenthesised list.
    Comment { is_list: bool },
    /// On an `@` character.
    At,
    /// Inside a double-quoted string.
    Quote,
    /// Collecting the name that follows a `$`.
    Dollar,
    /// On a line terminator.
    EOL,
    /// The input is exhausted.
    EOF,
}
375
/// Tokens produced by the lexer.
#[derive(Eq, PartialEq, Debug, Clone)]
pub enum Token {
    /// Whitespace at the beginning of a line.
    Blank,
    /// The entries of a parenthesised list: `( a b c )`.
    List(Vec<String>),
    /// A run of unquoted characters, or the contents of a quoted string.
    CharData(String),
    /// The `@` character.
    At,
    /// The `$INCLUDE` directive.
    Include,
    /// The `$ORIGIN` directive.
    Origin,
    /// The `$TTL` directive.
    Ttl,
    /// The end of a line.
    EOL,
}
396
#[cfg(test)]
mod lex_test {
    use alloc::string::ToString;

    use super::*;

    /// Pulls the next token from `lexer`, failing the test with the debug
    /// rendering of the result if lexing returns an error.
    #[allow(clippy::uninlined_format_args)]
    fn next_token(lexer: &mut Lexer<'_>) -> Option<Token> {
        lexer
            .next_token()
            .unwrap_or_else(|err| panic!("{:?}", Err::<Option<Token>, _>(err)))
    }

    /// Shorthand for a `Token::CharData` holding `text`.
    fn data(text: &str) -> Token {
        Token::CharData(text.to_string())
    }

    /// Lexes `input` and returns its first token, panicking on error or on
    /// empty input.
    fn first_token(input: &str) -> Token {
        Lexer::new(input).next_token().unwrap().unwrap()
    }

    /// Asserts that the next tokens from `lexer` are exactly `expected`, in order.
    fn assert_next(lexer: &mut Lexer<'_>, expected: &[Token]) {
        for want in expected {
            assert_eq!(next_token(lexer).as_ref(), Some(want));
        }
    }

    /// Asserts that `input` lexes to one list holding `items`, followed by end
    /// of input.
    fn assert_single_list(input: &str, items: &[&str]) {
        let mut lexer = Lexer::new(input);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::List(items.iter().map(|item| item.to_string()).collect())
        );
        assert_eq!(next_token(&mut lexer), None);
    }

    #[test]
    fn blank() {
        // Leading whitespace is reported as a Blank token.
        let mut lexer = Lexer::new(" dead beef");
        assert_next(&mut lexer, &[Token::Blank, data("dead"), data("beef")]);

        // No leading whitespace, no Blank token.
        let mut lexer = Lexer::new("dead beef");
        assert_next(&mut lexer, &[data("dead"), data("beef")]);

        let mut lexer = Lexer::new("dead beef\r\n after");
        assert_next(
            &mut lexer,
            &[
                data("dead"),
                data("beef"),
                Token::EOL,
                Token::Blank,
                data("after"),
            ],
        );

        // An empty list and a trailing comment before an indented line.
        let mut lexer = Lexer::new("dead beef ();comment\n after");
        assert_next(
            &mut lexer,
            &[
                data("dead"),
                data("beef"),
                Token::List(vec![]),
                Token::EOL,
                Token::Blank,
                data("after"),
            ],
        );
    }

    #[test]
    fn escape() {
        // Outside of quotes, backslash sequences are passed through untouched.
        for raw in &["a\\Aa", "a\\$", "a\\077"] {
            assert_eq!(first_token(raw), data(raw));
        }
    }

    #[test]
    fn quoted_txt() {
        let accepted = [
            ("\"Quoted\"", "Quoted"),
            ("\";@$\"", ";@$"),
            ("\"some \\A\"", "some A"),
            ("\"a\\Aa\"", "aAa"),
            ("\"a\\$\"", "a$"),
            ("\"a\\077\"", "a\u{707}"),
        ];
        for (input, want) in accepted.iter() {
            assert_eq!(first_token(input), data(want));
        }

        // An escaped closing quote, and numeric escapes that are too short.
        for input in &["\"a\\\"", "\"a\\0\"", "\"a\\07\""] {
            assert!(Lexer::new(*input).next_token().is_err());
        }

        let mut lexer = Lexer::new("\"multi\nline\ntext\"");
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            data("multi\nline\ntext")
        );
        assert_eq!(next_token(&mut lexer), None);

        let mut lexer = Lexer::new("\"multi\r\nline\r\ntext\"\r\n");
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            data("multi\r\nline\r\ntext")
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer), None);

        // A quote that is never closed.
        assert!(Lexer::new("\"multi").next_token().is_err());
    }

    #[test]
    fn unicode() {
        assert_eq!(first_token("♥"), data("♥"));
    }

    #[test]
    fn lex() {
        let cases = [
            (".", Some(data("."))),
            (" .", Some(Token::Blank)),
            ("abc", Some(data("abc"))),
            ("abc.", Some(data("abc."))),
            (";abc", None),
            (";;@$-\"", None),
            ("@", Some(Token::At)),
            ("123", Some(data("123"))),
            ("$INCLUDE", Some(Token::Include)),
            ("$ORIGIN", Some(Token::Origin)),
            ("$TTL", Some(Token::Ttl)),
            ("\n", Some(Token::EOL)),
            ("\r\n", Some(Token::EOL)),
        ];

        for (input, want) in cases.iter() {
            let mut lexer = Lexer::new(*input);
            assert_eq!(&next_token(&mut lexer), want, "input: {:?}", input);
        }
    }

    #[test]
    fn list() {
        // An unclosed list and a stray closing parenthesis are both errors.
        let mut lexer = Lexer::new("(");
        assert!(lexer.next_token().is_err());

        assert!(Lexer::new(")").next_token().is_err());

        assert_single_list("()", &[]);
        assert_single_list("(abc)", &["abc"]);
        assert_single_list("(\nabc\n)", &["abc"]);
        assert_single_list("(\nabc\nabc)", &["abc", "abc"]);
        assert_single_list("(\nabc;comment\n)", &["abc"]);
    }

    #[test]
    fn soa() {
        // One literal per input line so the leading whitespace (which decides
        // whether a Blank token is produced) is explicit.
        let zone = concat!(
            "@ IN SOA VENERA Action\\.domains (\n",
            " 20 ; SERIAL\n",
            " 7200 ; REFRESH\n",
            " 600 ; RETRY\n",
            " 3600000; EXPIRE\n",
            " 60) ; MINIMUM\n",
            "\n",
            " NS A.ISI.EDU.\n",
            " NS VENERA\n",
            " NS VAXA\n",
            " MX 10 VENERA\n",
            " MX 20 VAXA\n",
            "\n",
            "A A 26.3.0.103\n",
            "\n",
            "VENERA A 10.1.0.52\n",
            " A 128.9.0.32\n",
            "\n",
            "$INCLUDE <SUBSYS>ISI-MAILBOXES.TXT",
        );
        let mut lexer = Lexer::new(zone);

        let expected = [
            // SOA record with its multi-line list
            Token::At,
            data("IN"),
            data("SOA"),
            data("VENERA"),
            data("Action\\.domains"),
            Token::List(vec![
                "20".to_string(),
                "7200".to_string(),
                "600".to_string(),
                "3600000".to_string(),
                "60".to_string(),
            ]),
            Token::EOL,
            Token::EOL,
            // NS records
            Token::Blank,
            data("NS"),
            data("A.ISI.EDU."),
            Token::EOL,
            Token::Blank,
            data("NS"),
            data("VENERA"),
            Token::EOL,
            Token::Blank,
            data("NS"),
            data("VAXA"),
            Token::EOL,
            // MX records
            Token::Blank,
            data("MX"),
            data("10"),
            data("VENERA"),
            Token::EOL,
            Token::Blank,
            data("MX"),
            data("20"),
            data("VAXA"),
            Token::EOL,
            Token::EOL,
            // A records
            data("A"),
            data("A"),
            data("26.3.0.103"),
            Token::EOL,
            Token::EOL,
            data("VENERA"),
            data("A"),
            data("10.1.0.52"),
            Token::EOL,
            Token::Blank,
            data("A"),
            data("128.9.0.32"),
            Token::EOL,
            Token::EOL,
            // $INCLUDE directive
            Token::Include,
            data("<SUBSYS>ISI-MAILBOXES.TXT"),
        ];
        assert_next(&mut lexer, &expected);

        assert!(next_token(&mut lexer).is_none());
    }
}