1mod builder;
4mod entry;
5mod file_format_option;
6pub(crate) mod record;
7
8use std::{error, str};
9
10use indexmap::IndexMap;
11
12pub use self::{
13 builder::Builder, entry::Entry, file_format_option::FileFormatOption, record::parse_record,
14};
15use super::{
16 file_format::FileFormat,
17 record::value::{
18 map::{AlternativeAllele, Contig, Filter, Format, Info},
19 Map,
20 },
21 AlternativeAlleles, Contigs, Filters, Formats, Header, Infos, OtherRecords, Record,
22 SampleNames, StringMaps,
23};
24
/// How far the parser has progressed through the header.
#[derive(Debug, Default, Eq, PartialEq)]
enum State {
    /// Nothing has been consumed yet; the next line is parsed as the file format record.
    #[default]
    Empty,
    /// The file format line has been consumed; records or the `#CHROM` line may follow.
    Ready,
    /// The `#CHROM` line has been consumed; any further input is rejected.
    Done,
}
32
/// A VCF header parser.
///
/// Lines are fed in one at a time through `parse_partial` and accumulated in the fields below
/// until `finish` assembles them into a [`Header`].
#[derive(Debug, Default, Eq, PartialEq)]
pub struct Parser {
    /// Whether the file format is taken from the input or replaced by a fixed one.
    file_format_option: FileFormatOption,
    /// Progress through the header; decides how the next line is interpreted.
    state: State,
    /// The file format in effect, set when the first line is parsed.
    file_format: FileFormat,
    /// INFO records collected so far, keyed by ID.
    infos: Infos,
    /// FILTER records collected so far, keyed by ID.
    filters: Filters,
    /// FORMAT records collected so far, keyed by ID.
    formats: Formats,
    /// ALT records collected so far, keyed by ID.
    alternative_alleles: AlternativeAlleles,
    /// Contig records collected so far, keyed by ID.
    contigs: Contigs,
    /// Sample names read from the `#CHROM` line.
    sample_names: SampleNames,
    /// All remaining records, grouped by key.
    other_records: OtherRecords,
}
47
48impl Parser {
49 pub fn builder() -> Builder {
51 Builder::default()
52 }
53
54 pub fn parse(&self, s: &str) -> Result<Header, ParseError> {
56 let mut parser = Self::default();
57
58 for line in s.lines() {
59 parser.parse_partial(line.as_bytes())?;
60 }
61
62 parser.finish()
63 }
64
65 pub fn parse_partial(&mut self, src: &[u8]) -> Result<Entry<'_>, ParseError> {
67 if self.state == State::Done {
68 return Err(ParseError::ExpectedEof);
69 }
70
71 if self.state == State::Empty {
72 let file_format = match parse_file_format(src) {
73 Ok(f) => match self.file_format_option {
74 FileFormatOption::Auto => f,
75 FileFormatOption::FileFormat(g) => g,
76 },
77 Err(e) => return Err(e),
78 };
79
80 self.file_format = file_format;
81 self.state = State::Ready;
82
83 return Ok(Entry::FileFormat(file_format));
84 }
85
86 if src.starts_with(b"#CHROM") {
87 parse_header(src, &mut self.sample_names)?;
88 self.state = State::Done;
89 return Ok(Entry::Header);
90 }
91
92 let record =
93 record::parse_record(src, self.file_format).map_err(ParseError::InvalidRecord)?;
94
95 match record {
96 Record::FileFormat(_) => Err(ParseError::UnexpectedFileFormat),
97 Record::Info(id, info) => try_insert_info(&mut self.infos, id, info),
98 Record::Filter(id, filter) => try_insert_filter(&mut self.filters, id, filter),
99 Record::Format(id, format) => try_insert_format(&mut self.formats, id, format),
100 Record::AlternativeAllele(id, alternative_allele) => {
101 try_insert_alternative_allele(&mut self.alternative_alleles, id, alternative_allele)
102 }
103 Record::Contig(id, contig) => try_insert_contig(&mut self.contigs, id, contig),
104 Record::Other(key, value) => insert_other_record(&mut self.other_records, key, value),
105 }
106 }
107
108 pub fn finish(self) -> Result<Header, ParseError> {
110 match self.state {
111 State::Empty => Err(ParseError::Empty),
112 State::Ready => Err(ParseError::MissingHeader),
113 State::Done => Ok(Header {
114 file_format: self.file_format,
115 infos: self.infos,
116 filters: self.filters,
117 formats: self.formats,
118 alternative_alleles: self.alternative_alleles,
119 contigs: self.contigs,
120 sample_names: self.sample_names,
121 other_records: self.other_records,
122 string_maps: StringMaps::default(),
123 }),
124 }
125 }
126}
127
/// An error returned when a raw VCF header fails to parse.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ParseError {
    /// The parser was finished before any line was parsed.
    Empty,
    /// The `#CHROM` line is not valid UTF-8.
    InvalidUtf8(str::Utf8Error),
    /// The first line is not a file format record.
    MissingFileFormat,
    /// A file format record appears after the first line.
    UnexpectedFileFormat,
    /// A record is invalid.
    InvalidRecord(record::ParseError),
    /// An INFO ID is duplicated.
    DuplicateInfoId(String),
    /// A FILTER ID is duplicated.
    DuplicateFilterId(String),
    /// A FORMAT ID is duplicated.
    DuplicateFormatId(String),
    /// An ALT ID is duplicated.
    DuplicateAlternativeAlleleId(String),
    /// A contig ID is duplicated.
    DuplicateContigId(String),
    /// A value could not be added to the collection of an other record.
    InvalidRecordValue(super::record::value::collection::AddError),
    /// The parser was finished before the `#CHROM` line was parsed.
    MissingHeader,
    /// A column name in the `#CHROM` line is wrong or missing.
    ///
    /// The fields are `(actual, expected)`; `actual` is empty when the column is missing.
    InvalidHeader(String, String),
    /// A sample name appears more than once in the `#CHROM` line.
    DuplicateSampleName(String),
    /// More input was given after the `#CHROM` line.
    ExpectedEof,
    /// A string map entry is not at the position given by its `IDX`.
    ///
    /// The fields are `(actual, expected)`, each an `(IDX, entry)` pair. This variant is not
    /// constructed in this module.
    StringMapPositionMismatch((usize, String), (usize, String)),
}
167
168impl error::Error for ParseError {
169 fn source(&self) -> Option<&(dyn error::Error + 'static)> {
170 match self {
171 Self::InvalidUtf8(e) => Some(e),
172 Self::InvalidRecord(e) => Some(e),
173 Self::InvalidRecordValue(e) => Some(e),
174 _ => None,
175 }
176 }
177}
178
179impl std::fmt::Display for ParseError {
180 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
181 match self {
182 Self::Empty => f.write_str("empty input"),
183 Self::InvalidUtf8(_) => f.write_str("invalid UTF-8"),
184 Self::MissingFileFormat => f.write_str("missing fileformat"),
185 Self::UnexpectedFileFormat => f.write_str("unexpected file format"),
186 Self::InvalidRecord(_) => f.write_str("invalid record"),
187 Self::DuplicateInfoId(id) => write!(f, "duplicate INFO ID: {id}"),
188 Self::DuplicateFilterId(id) => write!(f, "duplicate FILTER ID: {id}"),
189 Self::DuplicateFormatId(id) => write!(f, "duplicate FORMAT ID: {id}"),
190 Self::DuplicateAlternativeAlleleId(id) => write!(f, "duplicate ALT ID: {id}"),
191 Self::DuplicateContigId(id) => write!(f, "duplicate contig ID: {id}"),
192 Self::InvalidRecordValue(_) => f.write_str("invalid record value"),
193 Self::MissingHeader => f.write_str("missing header"),
194 Self::InvalidHeader(actual, expected) => {
195 write!(f, "invalid header: expected {expected}, got {actual}")
196 }
197 Self::DuplicateSampleName(sample_name) => {
198 write!(f, "duplicate sample name: {sample_name}")
199 }
200 Self::ExpectedEof => f.write_str("expected EOF"),
201 Self::StringMapPositionMismatch(actual, expected) => write!(
202 f,
203 "string map position mismatch: expected {} (IDX={}), got {} (IDX={})",
204 expected.1, expected.0, actual.1, actual.0,
205 ),
206 }
207 }
208}
209
210fn parse_file_format(src: &[u8]) -> Result<FileFormat, ParseError> {
211 let record =
212 record::parse_record(src, FileFormat::default()).map_err(ParseError::InvalidRecord)?;
213
214 match record {
215 Record::FileFormat(file_format) => Ok(file_format),
216 _ => Err(ParseError::MissingFileFormat),
217 }
218}
219
220fn try_insert_info(
221 infos: &mut Infos,
222 id: String,
223 info: Map<Info>,
224) -> Result<Entry<'_>, ParseError> {
225 use indexmap::map::Entry;
226
227 match infos.entry(id) {
228 Entry::Vacant(entry) => {
229 let i = entry.index();
230
231 entry.insert(info);
232
233 Ok(infos
235 .get_index(i)
236 .map(|(k, v)| self::Entry::Info(k, v))
237 .unwrap())
238 }
239 Entry::Occupied(entry) => Err(ParseError::DuplicateInfoId(entry.key().into())),
240 }
241}
242
243fn try_insert_filter(
244 filters: &mut Filters,
245 id: String,
246 filter: Map<Filter>,
247) -> Result<Entry<'_>, ParseError> {
248 use indexmap::map::Entry;
249
250 match filters.entry(id) {
251 Entry::Vacant(entry) => {
252 let i = entry.index();
253
254 entry.insert(filter);
255
256 Ok(filters
258 .get_index(i)
259 .map(|(k, v)| self::Entry::Filter(k, v))
260 .unwrap())
261 }
262 Entry::Occupied(entry) => Err(ParseError::DuplicateFilterId(entry.key().into())),
263 }
264}
265
266fn try_insert_format(
267 formats: &mut Formats,
268 id: String,
269 format: Map<Format>,
270) -> Result<Entry<'_>, ParseError> {
271 use indexmap::map::Entry;
272
273 match formats.entry(id) {
274 Entry::Vacant(entry) => {
275 let i = entry.index();
276
277 entry.insert(format);
278
279 Ok(formats
281 .get_index(i)
282 .map(|(k, v)| self::Entry::Format(k, v))
283 .unwrap())
284 }
285 Entry::Occupied(entry) => Err(ParseError::DuplicateFormatId(entry.key().into())),
286 }
287}
288
289fn try_insert_alternative_allele(
290 alternative_alleles: &mut AlternativeAlleles,
291 id: String,
292 alternative_allele: Map<AlternativeAllele>,
293) -> Result<Entry<'_>, ParseError> {
294 use indexmap::map::Entry;
295
296 match alternative_alleles.entry(id) {
297 Entry::Vacant(entry) => {
298 let i = entry.index();
299
300 entry.insert(alternative_allele);
301
302 Ok(alternative_alleles
304 .get_index(i)
305 .map(|(k, v)| self::Entry::AlternativeAllele(k, v))
306 .unwrap())
307 }
308 Entry::Occupied(entry) => Err(ParseError::DuplicateAlternativeAlleleId(entry.key().into())),
309 }
310}
311
312fn try_insert_contig(
313 contigs: &mut Contigs,
314 id: String,
315 contig: Map<Contig>,
316) -> Result<Entry<'_>, ParseError> {
317 use indexmap::map::Entry;
318
319 match contigs.entry(id) {
320 Entry::Vacant(entry) => {
321 let i = entry.index();
322
323 entry.insert(contig);
324
325 Ok(contigs
327 .get_index(i)
328 .map(|(k, v)| self::Entry::Contig(k, v))
329 .unwrap())
330 }
331 Entry::Occupied(entry) => Err(ParseError::DuplicateContigId(entry.key().into())),
332 }
333}
334
335fn insert_other_record(
336 other_records: &mut OtherRecords,
337 key: super::record::key::Other,
338 value: super::record::Value,
339) -> Result<Entry<'_>, ParseError> {
340 let collection = other_records.entry(key).or_insert_with(|| match value {
341 super::record::Value::String(_) => {
342 super::record::value::Collection::Unstructured(Vec::new())
343 }
344 super::record::Value::Map(..) => {
345 super::record::value::Collection::Structured(IndexMap::new())
346 }
347 });
348
349 collection
350 .add(value)
351 .map_err(ParseError::InvalidRecordValue)?;
352
353 Ok(Entry::Other)
354}
355
356fn parse_header(src: &[u8], sample_names: &mut SampleNames) -> Result<(), ParseError> {
357 static HEADERS: &[&str] = &[
358 "#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO",
359 ];
360 static FORMAT_HEADER: &str = "FORMAT";
361
362 const DELIMITER: char = '\t';
363
364 let line = str::from_utf8(src).map_err(ParseError::InvalidUtf8)?;
365 let mut fields = line.split(DELIMITER);
366
367 for &expected in HEADERS.iter() {
368 if let Some(actual) = fields.next() {
369 if actual != expected {
370 return Err(ParseError::InvalidHeader(actual.into(), expected.into()));
371 }
372 } else {
373 return Err(ParseError::InvalidHeader(String::from(""), expected.into()));
374 }
375 }
376
377 if let Some(field) = fields.next() {
378 if field != FORMAT_HEADER {
379 return Err(ParseError::InvalidHeader(
380 field.into(),
381 FORMAT_HEADER.into(),
382 ));
383 }
384
385 for sample_name in fields {
386 if !sample_names.insert(sample_name.into()) {
387 return Err(ParseError::DuplicateSampleName(sample_name.into()));
388 }
389 }
390 }
391
392 Ok(())
393}
394
#[cfg(test)]
mod tests {
    use super::*;

    // The `#CHROM` lines in these fixtures spell the column delimiter as `\t`. `parse_header`
    // splits on tabs only, and a literal tab in a fixture is easily turned into spaces unnoticed.

    /// A full header round-trips into the equivalent builder-constructed `Header`.
    #[test]
    fn test_from_str() -> Result<(), Box<dyn std::error::Error>> {
        use crate::{
            header::record::{value::map::Other, Value},
            variant::record::{info, samples},
        };

        let s = concat!(
            "##fileformat=VCFv4.3\n",
            "##fileDate=20200506\n",
            "##source=noodles-vcf\n",
            "##contig=<ID=sq0,length=8>\n",
            "##contig=<ID=sq1,length=13>\n",
            "##contig=<ID=sq2,length=21>\n",
            "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with data\">\n",
            "##FILTER=<ID=q10,Description=\"Quality below 10\">\n",
            "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
            "##ALT=<ID=DEL,Description=\"Deletion\">\n",
            "##META=<ID=Assay,Type=String,Number=.,Values=[WholeGenome, Exome]>\n",
            "##SAMPLE=<ID=sample0,Assay=WholeGenome>\n",
            "##PEDIGREE=<ID=cid,Father=fid,Mother=mid>\n",
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample0\n",
        );

        let actual = Parser::default().parse(s)?;

        let expected = Header::builder()
            .set_file_format(FileFormat::new(4, 3))
            .insert("fileDate".parse()?, Value::String(String::from("20200506")))?
            .insert(
                "source".parse()?,
                Value::String(String::from("noodles-vcf")),
            )?
            .add_contig("sq0", Map::<Contig>::builder().set_length(8).build()?)
            .add_contig("sq1", Map::<Contig>::builder().set_length(13).build()?)
            .add_contig("sq2", Map::<Contig>::builder().set_length(21).build()?)
            .add_info(
                info::field::key::SAMPLES_WITH_DATA_COUNT,
                Map::<Info>::from(info::field::key::SAMPLES_WITH_DATA_COUNT),
            )
            .add_filter("q10", Map::<Filter>::new("Quality below 10"))
            .add_format(
                samples::keys::key::GENOTYPE,
                Map::<Format>::from(samples::keys::key::GENOTYPE),
            )
            .add_alternative_allele("DEL", Map::<AlternativeAllele>::new("Deletion"))
            .insert(
                "META".parse()?,
                Value::Map(
                    String::from("Assay"),
                    Map::<Other>::builder()
                        .insert("Type".parse()?, "String")
                        .insert("Number".parse()?, ".")
                        .insert("Values".parse()?, "[WholeGenome, Exome]")
                        .build()?,
                ),
            )?
            .insert(
                "SAMPLE".parse()?,
                Value::Map(
                    String::from("sample0"),
                    Map::<Other>::builder()
                        .insert("Assay".parse()?, "WholeGenome")
                        .build()?,
                ),
            )?
            .insert(
                "PEDIGREE".parse()?,
                Value::Map(
                    String::from("cid"),
                    Map::<Other>::builder()
                        .insert("Father".parse()?, "fid")
                        .insert("Mother".parse()?, "mid")
                        .build()?,
                ),
            )?
            .add_sample_name("sample0")
            .build();

        assert_eq!(actual, expected);

        Ok(())
    }

    /// The first line must be the file format record.
    #[test]
    fn test_from_str_without_file_format() {
        let s = "##ALT=<ID=DEL,Description=\"Deletion\">\n";

        assert_eq!(
            Parser::default().parse(s),
            Err(ParseError::MissingFileFormat)
        );
    }

    /// Nothing may follow the `#CHROM` line.
    #[test]
    fn test_from_str_with_data_after_header() {
        let s = concat!(
            "##fileformat=VCFv4.3\n",
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n",
            "##contig=<ID=sq0,length=8>\n",
        );

        assert_eq!(Parser::default().parse(s), Err(ParseError::ExpectedEof));
    }

    /// A file format record is only allowed on the first line.
    #[test]
    fn test_from_str_with_multiple_fileformats() {
        let s = concat!("##fileformat=VCFv4.3\n", "##fileformat=VCFv4.3\n");

        assert_eq!(
            Parser::default().parse(s),
            Err(ParseError::UnexpectedFileFormat)
        );
    }

    /// Input that ends before the `#CHROM` line is rejected.
    #[test]
    fn test_from_str_with_missing_headers() {
        let s = "##fileformat=VCFv4.3\n";

        assert_eq!(Parser::default().parse(s), Err(ParseError::MissingHeader));
    }

    /// Wrong, missing, and out-of-place column names are each reported.
    #[test]
    fn test_from_str_with_invalid_headers() {
        // A misspelled column name.
        let s = concat!(
            "##fileformat=VCFv4.3\n",
            "#CHROM\tPOS\tID\tREF\tALT\tQUALITY\tFILTER\tINFO\n",
        );

        assert_eq!(
            Parser::default().parse(s),
            Err(ParseError::InvalidHeader(
                String::from("QUALITY"),
                String::from("QUAL")
            ))
        );

        // A line that stops before all required columns are present.
        let s = concat!("##fileformat=VCFv4.3\n", "#CHROM\tPOS\tID\n");

        assert_eq!(
            Parser::default().parse(s),
            Err(ParseError::InvalidHeader(
                String::from(""),
                String::from("REF")
            ))
        );

        // A sample name without the preceding FORMAT column.
        let s = concat!(
            "##fileformat=VCFv4.3\n",
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tsample0\n",
        );

        assert_eq!(
            Parser::default().parse(s),
            Err(ParseError::InvalidHeader(
                String::from("sample0"),
                String::from("FORMAT")
            ))
        );
    }

    /// A repeated ID is rejected for every ID-keyed record kind.
    #[test]
    fn test_from_str_with_duplicate_map_id() {
        let s = concat!(
            "##fileformat=VCFv4.3\n",
            "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with data\">\n",
            "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with data\">\n",
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n",
        );

        assert!(matches!(
            Parser::default().parse(s),
            Err(ParseError::DuplicateInfoId(_))
        ));

        let s = concat!(
            "##fileformat=VCFv4.3\n",
            "##FILTER=<ID=q10,Description=\"Quality below 10\">\n",
            "##FILTER=<ID=q10,Description=\"Quality below 10\">\n",
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n",
        );

        assert_eq!(
            Parser::default().parse(s),
            Err(ParseError::DuplicateFilterId(String::from("q10")))
        );

        let s = concat!(
            "##fileformat=VCFv4.3\n",
            "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
            "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n",
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n",
        );

        assert_eq!(
            Parser::default().parse(s),
            Err(ParseError::DuplicateFormatId(String::from(
                crate::variant::record::samples::keys::key::GENOTYPE
            )))
        );

        let s = concat!(
            "##fileformat=VCFv4.3\n",
            "##ALT=<ID=DEL,Description=\"Deletion\">\n",
            "##ALT=<ID=DEL,Description=\"Deletion\">\n",
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n",
        );

        assert!(matches!(
            Parser::default().parse(s),
            Err(ParseError::DuplicateAlternativeAlleleId(_))
        ));

        let s = concat!(
            "##fileformat=VCFv4.3\n",
            "##contig=<ID=sq0,length=8>\n",
            "##contig=<ID=sq0,length=8>\n",
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n",
        );

        assert!(matches!(
            Parser::default().parse(s),
            Err(ParseError::DuplicateContigId(_))
        ));

        // Other records report duplicates through the collection's own error.
        let s = concat!(
            "##fileformat=VCFv4.3\n",
            "##SAMPLE=<ID=sample0,Assay=WholeGenome>\n",
            "##SAMPLE=<ID=sample0,Assay=WholeGenome>\n",
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n",
        );

        assert!(matches!(
            Parser::default().parse(s),
            Err(ParseError::InvalidRecordValue(_))
        ));
    }

    /// A sample name may appear only once in the `#CHROM` line.
    #[test]
    fn test_from_str_with_duplicate_sample_names() {
        let s = concat!(
            "##fileformat=VCFv4.3\n",
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample0\tsample0\n",
        );

        assert_eq!(
            Parser::default().parse(s),
            Err(ParseError::DuplicateSampleName(String::from("sample0")))
        );
    }
}