1use std::borrow::Cow;
9use std::{char, iter::Peekable};
10
11use crate::serialize::txt::errors::{LexerError, LexerErrorKind, LexerResult};
12
/// A zone-file lexer: turns input text into a stream of `Token`s via
/// `next_token`.
pub(crate) struct Lexer<'a> {
    // peekable char stream over the (possibly borrowed) input text
    txt: Peekable<CowChars<'a>>,
    // current state of the tokenizer state machine
    state: State,
}
18
19impl<'a> Lexer<'a> {
20 pub(crate) fn new(txt: impl Into<Cow<'a, str>>) -> Self {
22 Lexer {
23 txt: CowChars {
24 data: txt.into(),
25 offset: 0,
26 }
27 .peekable(),
28 state: State::StartLine,
29 }
30 }
31
    /// Reads the next `Token` from the input stream.
    ///
    /// Returns `Ok(None)` once the end of the input has been reached, and an
    /// error for malformed input (unclosed quotes/lists, illegal characters).
    pub(crate) fn next_token(&mut self) -> LexerResult<Option<Token>> {
        // accumulator for entries of a parenthesized `( ... )` list
        let mut char_data_vec: Option<Vec<String>> = None;
        // accumulator for the characters of the current word / quoted string
        let mut char_data: Option<String> = None;

        // Bounded loop as a safety net: a well-formed token is produced in far
        // fewer iterations; the assert turns a runaway state machine into a
        // loud panic rather than an infinite loop.
        for i in 0..4096 {
            assert!(i < 4095);
            let ch: Option<char> = self.peek();

            match self.state {
                State::StartLine => {
                    match ch {
                        Some('\r') | Some('\n') => {
                            self.state = State::EOL;
                        }
                        // leading whitespace at the start of a line is
                        // significant: it produces a Blank token
                        Some(ch) if ch.is_whitespace() => self.state = State::Blank,
                        Some(_) => self.state = State::RestOfLine,
                        None => {
                            self.state = State::EOF;
                        }
                    }
                }
                State::RestOfLine => {
                    match ch {
                        Some('@') => self.state = State::At,
                        Some('(') => {
                            self.txt.next();
                            char_data_vec = Some(Vec::new());
                            self.state = State::List;
                        }
                        // a close paren with no matching open is an error
                        Some(ch @ ')') => return Err(LexerErrorKind::IllegalCharacter(ch).into()),
                        Some('$') => {
                            self.txt.next();
                            char_data = Some(String::new());
                            self.state = State::Dollar;
                        }
                        Some('\r') | Some('\n') => {
                            self.state = State::EOL;
                        }
                        Some('"') => {
                            self.txt.next();
                            char_data = Some(String::new());
                            self.state = State::Quote;
                        }
                        Some(';') => self.state = State::Comment { is_list: false },
                        // mid-line whitespace is skipped (no Blank token)
                        Some(ch) if ch.is_whitespace() => {
                            self.txt.next();
                        }
                        Some(ch) if !ch.is_control() && !ch.is_whitespace() => {
                            char_data = Some(String::new());
                            self.state = State::CharData { is_list: false };
                        }
                        Some(ch) => return Err(LexerErrorKind::UnrecognizedChar(ch).into()),
                        None => {
                            self.state = State::EOF;
                        }
                    }
                }
                State::Blank => {
                    // consume the single whitespace char that triggered Blank
                    self.txt.next();
                    self.state = State::RestOfLine;
                    return Ok(Some(Token::Blank));
                }
                State::Comment { is_list } => {
                    match ch {
                        Some('\r') | Some('\n') => {
                            // comments end at EOL; a comment inside a list
                            // resumes the list
                            self.state = if is_list { State::List } else { State::EOL };
                        }
                        Some(_) => {
                            self.txt.next();
                        }
                        None => {
                            self.state = State::EOF;
                        }
                    }
                }
                State::Quote => {
                    match ch {
                        // closing quote: emit the collected characters
                        Some('"') => {
                            self.state = State::RestOfLine;
                            self.txt.next();
                            return Ok(Some(Token::CharData(
                                char_data.take().unwrap_or_else(|| "".into()),
                            )));
                        }
                        // escapes are only interpreted inside quotes
                        Some('\\') => {
                            Self::push_to_str(&mut char_data, self.escape_seq()?)?;
                        }
                        Some(ch) => {
                            self.txt.next();
                            Self::push_to_str(&mut char_data, ch)?;
                        }
                        None => return Err(LexerErrorKind::UnclosedQuotedString.into()),
                    }
                }
                State::Dollar => {
                    match ch {
                        // directive names are upper-case ASCII
                        Some(ch @ 'A'..='Z') => {
                            self.txt.next();
                            Self::push_to_str(&mut char_data, ch)?;
                        }
                        Some(_) | None => {
                            self.state = State::RestOfLine;
                            let dollar: String = char_data.take().ok_or_else(|| {
                                LexerError::from(LexerErrorKind::IllegalState(
                                    "char_data \
                                     is None",
                                ))
                            })?;

                            return Ok(Some(match dollar.as_str() {
                                "INCLUDE" => Token::Include,
                                "ORIGIN" => Token::Origin,
                                "TTL" => Token::Ttl,
                                _ => {
                                    // NOTE(review): `char_data` was already
                                    // taken into `dollar` above, so this take()
                                    // is always None and the error carries an
                                    // empty string; `dollar` looks like what
                                    // was intended here -- confirm and fix.
                                    return Err(LexerErrorKind::UnrecognizedDollar(
                                        char_data.take().unwrap_or_else(|| "".into()),
                                    )
                                    .into())
                                }
                            }));
                        }
                    }
                }
                State::List => match ch {
                    Some(';') => {
                        self.txt.next();
                        self.state = State::Comment { is_list: true }
                    }
                    // close of the list: emit all collected entries
                    Some(')') => {
                        self.txt.next();
                        self.state = State::RestOfLine;
                        return char_data_vec
                            .take()
                            .ok_or_else(|| {
                                LexerErrorKind::IllegalState("char_data_vec is None").into()
                            })
                            .map(|v| Some(Token::List(v)));
                    }
                    // whitespace (including newlines) separates list entries
                    Some(ch) if ch.is_whitespace() => {
                        self.txt.next();
                    }
                    Some(ch) if !ch.is_control() && !ch.is_whitespace() => {
                        char_data = Some(String::new());
                        self.state = State::CharData { is_list: true }
                    }
                    Some(ch) => return Err(LexerErrorKind::UnrecognizedChar(ch).into()),
                    None => return Err(LexerErrorKind::UnclosedList.into()),
                },
                State::CharData { is_list } => {
                    match ch {
                        Some(ch @ ')') if !is_list => {
                            return Err(LexerErrorKind::IllegalCharacter(ch).into())
                        }
                        // whitespace / ')' / ';' terminate the word; the
                        // terminator itself is NOT consumed here
                        Some(ch) if ch.is_whitespace() || ch == ')' || ch == ';' => {
                            if is_list {
                                // inside a list the word becomes a list entry
                                char_data_vec
                                    .as_mut()
                                    .ok_or_else(|| {
                                        LexerError::from(LexerErrorKind::IllegalState(
                                            "char_data_vec is None",
                                        ))
                                    })
                                    .and_then(|v| {
                                        let char_data = char_data.take().ok_or(
                                            LexerErrorKind::IllegalState("char_data is None"),
                                        )?;

                                        v.push(char_data);
                                        Ok(())
                                    })?;
                                self.state = State::List;
                            } else {
                                // otherwise the word is emitted on its own
                                self.state = State::RestOfLine;
                                let result = char_data.take().ok_or_else(|| {
                                    LexerErrorKind::IllegalState("char_data is None").into()
                                });
                                let opt = result.map(|s| Some(Token::CharData(s)));
                                return opt;
                            }
                        }
                        Some(ch) if !ch.is_control() && !ch.is_whitespace() => {
                            self.txt.next();
                            Self::push_to_str(&mut char_data, ch)?;
                        }
                        Some(ch) => return Err(LexerErrorKind::UnrecognizedChar(ch).into()),
                        None => {
                            // end of input flushes the in-progress word
                            self.state = State::EOF;
                            return char_data
                                .take()
                                .ok_or_else(|| {
                                    LexerErrorKind::IllegalState("char_data is None").into()
                                })
                                .map(|s| Some(Token::CharData(s)));
                        }
                    }
                }
                State::At => {
                    self.txt.next();
                    self.state = State::RestOfLine;
                    return Ok(Some(Token::At));
                }
                State::EOL => match ch {
                    // swallow the CR of a CRLF pair; the LF emits the token
                    Some('\r') => {
                        self.txt.next();
                    }
                    Some('\n') => {
                        self.txt.next();
                        self.state = State::StartLine;
                        return Ok(Some(Token::EOL));
                    }
                    // a CR not followed by LF is rejected here
                    Some(ch) => return Err(LexerErrorKind::IllegalCharacter(ch).into()),
                    None => return Err(LexerErrorKind::EOF.into()),
                },
                State::EOF => {
                    self.txt.next();
                    return Ok(None);
                }
            }
        }

        unreachable!("The above match statement should have found a terminal state");
    }
272
273 fn push_to_str(collect: &mut Option<String>, ch: char) -> LexerResult<()> {
274 collect
275 .as_mut()
276 .ok_or_else(|| LexerErrorKind::IllegalState("collect is None").into())
277 .map(|s| {
278 s.push(ch);
279 })
280 }
281
282 fn escape_seq(&mut self) -> LexerResult<char> {
283 self.txt.next(); let ch = self
286 .peek()
287 .ok_or_else(|| LexerError::from(LexerErrorKind::EOF))?;
288
289 if !ch.is_control() {
290 if ch.is_numeric() {
291 let d1: u32 = self
293 .txt
294 .next()
295 .ok_or_else(|| LexerError::from(LexerErrorKind::EOF))
296 .map(|c| {
297 c.to_digit(10)
298 .ok_or_else(|| LexerError::from(LexerErrorKind::IllegalCharacter(c)))
299 })??; let d2: u32 = self
301 .txt
302 .next()
303 .ok_or_else(|| LexerError::from(LexerErrorKind::EOF))
304 .map(|c| {
305 c.to_digit(10)
306 .ok_or_else(|| LexerError::from(LexerErrorKind::IllegalCharacter(c)))
307 })??; let d3: u32 = self
309 .txt
310 .next()
311 .ok_or_else(|| LexerError::from(LexerErrorKind::EOF))
312 .map(|c| {
313 c.to_digit(10)
314 .ok_or_else(|| LexerError::from(LexerErrorKind::IllegalCharacter(c)))
315 })??; let val: u32 = (d1 << 16) + (d2 << 8) + d3;
318 let ch: char = char::from_u32(val)
319 .ok_or_else(|| LexerError::from(LexerErrorKind::UnrecognizedOctet(val)))?;
320
321 Ok(ch)
322 } else {
323 self.txt.next(); Ok(ch)
326 }
327 } else {
328 Err(LexerErrorKind::IllegalCharacter(ch).into())
329 }
330 }
331
    /// Returns the next character of the input without consuming it.
    fn peek(&mut self) -> Option<char> {
        self.txt.peek().copied()
    }
335}
336
/// A `char` iterator over a `Cow<str>` that owns (or borrows) its data and
/// tracks the current byte offset, so the `Lexer` does not need to hold a
/// borrow of the underlying string.
struct CowChars<'a> {
    data: Cow<'a, str>,
    offset: usize,
}

impl Iterator for CowChars<'_> {
    type Item = char;

    fn next(&mut self) -> Option<char> {
        // decode the char at the current byte offset, then advance the
        // offset by that char's UTF-8 width
        let ch = self.data[self.offset..].chars().next()?;
        self.offset += ch.len_utf8();
        Some(ch)
    }
}
356
/// Internal states of the lexer state machine.
#[doc(hidden)]
#[derive(Copy, Clone, PartialEq, Debug)]
pub(crate) enum State {
    /// at the beginning of a line (leading whitespace is significant here)
    StartLine,
    /// reading tokens after the start of the line
    RestOfLine,
    /// emitting a Blank token for leading whitespace
    Blank,
    /// inside a parenthesized `( ... )` list
    List,
    /// accumulating a word; `is_list` is true while inside a list
    CharData { is_list: bool },
    /// skipping a `;` comment; `is_list` preserves the list context
    Comment { is_list: bool },
    /// saw an `@`
    At,
    /// inside a `"`-quoted string
    Quote,
    /// reading a `$DIRECTIVE` name
    Dollar,
    /// at an end-of-line sequence (`\n` or `\r\n`)
    EOL,
    /// end of input
    EOF,
}
373
/// A token emitted by `Lexer::next_token`.
#[derive(Eq, PartialEq, Debug, Clone)]
pub enum Token {
    /// leading whitespace at the start of a line
    Blank,
    /// a parenthesized `( ... )` list of words
    List(Vec<String>),
    /// a word or the contents of a quoted string
    CharData(String),
    /// the `@` character
    At,
    /// the `$INCLUDE` directive
    Include,
    /// the `$ORIGIN` directive
    Origin,
    /// the `$TTL` directive
    Ttl,
    /// end of line
    EOL,
}
394
395#[cfg(test)]
396mod lex_test {
397 use super::*;
398
    /// Test helper: unwraps the lexer result, failing the test with the
    /// error's Debug output for easier diagnosis.
    #[allow(clippy::uninlined_format_args)]
    fn next_token(lexer: &mut Lexer<'_>) -> Option<Token> {
        let result = lexer.next_token();
        assert!(result.is_ok(), "{:?}", result);
        result.unwrap()
    }
405
    #[test]
    fn blank() {
        // leading whitespace at the start of a line produces a Blank token
        let mut lexer = Lexer::new(" dead beef");
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("dead".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("beef".to_string())
        );

        // no leading whitespace: no Blank token
        let mut lexer = Lexer::new("dead beef");
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("dead".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("beef".to_string())
        );

        // whitespace after a CRLF starts a new line, so Blank fires again
        let mut lexer = Lexer::new("dead beef\r\n after");
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("dead".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("beef".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("after".to_string())
        );

        // an empty list and a trailing comment do not disturb Blank detection
        let mut lexer = Lexer::new(
            "dead beef ();comment
         after",
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("dead".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("beef".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::List(vec![]));
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("after".to_string())
        );
    }
467
468 #[test]
469 fn escape() {
470 assert_eq!(
471 Lexer::new("a\\Aa").next_token().unwrap().unwrap(),
472 Token::CharData("a\\Aa".to_string())
473 );
474 assert_eq!(
475 Lexer::new("a\\$").next_token().unwrap().unwrap(),
476 Token::CharData("a\\$".to_string())
477 );
478 assert_eq!(
479 Lexer::new("a\\077").next_token().unwrap().unwrap(),
480 Token::CharData("a\\077".to_string())
481 );
482 }
483
    #[test]
    fn quoted_txt() {
        // quotes are stripped; contents become a single CharData token
        assert_eq!(
            Lexer::new("\"Quoted\"").next_token().unwrap().unwrap(),
            Token::CharData("Quoted".to_string())
        );
        // special characters lose their meaning inside quotes
        assert_eq!(
            Lexer::new("\";@$\"").next_token().unwrap().unwrap(),
            Token::CharData(";@$".to_string())
        );
        // backslash escapes ARE interpreted inside quotes
        assert_eq!(
            Lexer::new("\"some \\A\"").next_token().unwrap().unwrap(),
            Token::CharData("some A".to_string())
        );
        assert_eq!(
            Lexer::new("\"a\\Aa\"").next_token().unwrap().unwrap(),
            Token::CharData("aAa".to_string())
        );
        assert_eq!(
            Lexer::new("\"a\\$\"").next_token().unwrap().unwrap(),
            Token::CharData("a$".to_string())
        );
        // NOTE(review): this pins the current (d1<<16)+(d2<<8)+d3 numeric
        // escape arithmetic (077 -> U+0707); RFC 1035 \DDD is a decimal
        // octet, which would yield 'M' (77) instead -- confirm intent.
        assert_eq!(
            Lexer::new("\"a\\077\"").next_token().unwrap().unwrap(),
            Token::CharData("a\u{707}".to_string())
        );

        // incomplete escapes and truncated digit runs are errors
        assert!(Lexer::new("\"a\\\"").next_token().is_err());
        assert!(Lexer::new("\"a\\0\"").next_token().is_err());
        assert!(Lexer::new("\"a\\07\"").next_token().is_err());

        // newlines are allowed verbatim inside quoted strings
        let mut lexer = Lexer::new("\"multi\nline\ntext\"");

        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("multi\nline\ntext".to_string())
        );
        assert_eq!(next_token(&mut lexer), None);

        let mut lexer = Lexer::new("\"multi\r\nline\r\ntext\"\r\n");

        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("multi\r\nline\r\ntext".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer), None);

        // an unterminated quote is an error
        assert!(Lexer::new("\"multi").next_token().is_err());
    }
534
535 #[test]
536 fn unicode() {
537 assert_eq!(
538 Lexer::new("♥").next_token().unwrap().unwrap(),
539 Token::CharData("♥".to_string())
540 );
541 }
542
    #[test]
    fn lex() {
        // basic single-token inputs
        assert_eq!(
            next_token(&mut Lexer::new(".")).unwrap(),
            Token::CharData(".".to_string())
        );
        assert_eq!(
            next_token(&mut Lexer::new(" .")).unwrap(),
            Token::Blank
        );
        assert_eq!(
            next_token(&mut Lexer::new("abc")).unwrap(),
            Token::CharData("abc".to_string())
        );
        assert_eq!(
            next_token(&mut Lexer::new("abc.")).unwrap(),
            Token::CharData("abc.".to_string())
        );
        // a comment-only input produces no tokens at all
        assert_eq!(next_token(&mut Lexer::new(";abc")), None);
        assert_eq!(next_token(&mut Lexer::new(";;@$-\"")), None);
        assert_eq!(next_token(&mut Lexer::new("@")).unwrap(), Token::At);
        assert_eq!(
            next_token(&mut Lexer::new("123")).unwrap(),
            Token::CharData("123".to_string())
        );
        // the three recognized $ directives
        assert_eq!(
            next_token(&mut Lexer::new("$INCLUDE")).unwrap(),
            Token::Include
        );
        assert_eq!(
            next_token(&mut Lexer::new("$ORIGIN")).unwrap(),
            Token::Origin
        );
        assert_eq!(next_token(&mut Lexer::new("$TTL")).unwrap(), Token::Ttl);
        // LF and CRLF both produce a single EOL token
        assert_eq!(next_token(&mut Lexer::new("\n")), Some(Token::EOL));
        assert_eq!(next_token(&mut Lexer::new("\r\n")), Some(Token::EOL));
    }
581
    #[test]
    fn list() {
        // an unclosed list is an error
        let mut lexer = Lexer::new("(");
        assert!(lexer.next_token().is_err());

        // a close paren with no matching open is an error
        assert!(Lexer::new(")").next_token().is_err());

        // an empty list is valid
        let mut lexer = Lexer::new("()");
        assert_eq!(next_token(&mut lexer).unwrap(), Token::List(vec![]));
        assert_eq!(next_token(&mut lexer), None);

        let mut lexer = Lexer::new("(abc)");
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::List(vec!["abc".to_string()])
        );
        assert_eq!(next_token(&mut lexer), None);

        // newlines inside a list do NOT terminate it
        let mut lexer = Lexer::new("(\nabc\n)");
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::List(vec!["abc".to_string()])
        );
        assert_eq!(next_token(&mut lexer), None);

        let mut lexer = Lexer::new("(\nabc\nabc)");
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::List(vec!["abc".to_string(), "abc".to_string()])
        );
        assert_eq!(next_token(&mut lexer), None);

        // comments inside a list are skipped and the list resumes
        let mut lexer = Lexer::new("(\nabc;comment\n)");
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::List(vec!["abc".to_string()])
        );
        assert_eq!(next_token(&mut lexer), None);
    }
621
    // Lexes the RFC 1035 sample SOA zone; `\` at end of a source line is a
    // Rust string-literal continuation (it removes the newline and the next
    // line's leading whitespace at compile time), so those joins control
    // where Blank tokens do and do not appear.
    #[test]
    #[allow(clippy::cognitive_complexity)]
    fn soa() {
        let mut lexer = Lexer::new(
            "@   IN  SOA     VENERA      Action\\.domains (
                                 \
                                 20     ; SERIAL
                                 7200   ; REFRESH
                                 \
                                 600    ; RETRY
                                 3600000; EXPIRE
                                 \
                                 60)    ; MINIMUM

        NS      A.ISI.EDU.
        NS      VENERA
        \
        NS      VAXA
        MX      10      VENERA
        MX      20      VAXA

\
A       A       26.3.0.103

VENERA  A       10.1.0.52
        A       \
        128.9.0.32

$INCLUDE <SUBSYS>ISI-MAILBOXES.TXT",
        );

        // SOA header line
        assert_eq!(next_token(&mut lexer).unwrap(), Token::At);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("IN".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("SOA".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("VENERA".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("Action\\.domains".to_string())
        );
        // the parenthesized SOA values arrive as one List token,
        // with the per-line comments stripped
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::List(vec![
                "20".to_string(),
                "7200".to_string(),
                "600".to_string(),
                "3600000".to_string(),
                "60".to_string(),
            ])
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        // NS records: indented lines start with Blank
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("NS".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("A.ISI.EDU.".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("NS".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("VENERA".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("NS".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("VAXA".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        // MX records
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("MX".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("10".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("VENERA".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("MX".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("20".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("VAXA".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        // the `\`-joined "A A ..." line has no leading whitespace: no Blank
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("A".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("A".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("26.3.0.103".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        // VENERA address records
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("VENERA".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("A".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("10.1.0.52".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("A".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("128.9.0.32".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        // trailing $INCLUDE directive
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Include);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("<SUBSYS>ISI-MAILBOXES.TXT".to_string())
        );
        assert!(next_token(&mut lexer).is_none());
    }
786}