noodles_vcf/header/
parser.rs

1//! VCF header parser.
2
3mod builder;
4mod entry;
5mod file_format_option;
6pub(crate) mod record;
7
8use std::{error, str};
9
10use indexmap::IndexMap;
11
12pub use self::{
13    builder::Builder, entry::Entry, file_format_option::FileFormatOption, record::parse_record,
14};
15use super::{
16    file_format::FileFormat,
17    record::value::{
18        map::{AlternativeAllele, Contig, Filter, Format, Info},
19        Map,
20    },
21    AlternativeAlleles, Contigs, Filters, Formats, Header, Infos, OtherRecords, Record,
22    SampleNames, StringMaps,
23};
24
// Tracks how far the parser has progressed through the raw header lines.
#[derive(Debug, Default, Eq, PartialEq)]
enum State {
    // No line has been consumed yet; the next line is parsed as the file format.
    #[default]
    Empty,
    // The file format line was consumed; records or the `#CHROM` line may follow.
    Ready,
    // The `#CHROM` line was consumed; any further line is rejected.
    Done,
}
32
/// A VCF header parser.
///
/// The parser consumes a raw header one line at a time, accumulating the parsed records until
/// the `#CHROM` line is seen, after which the collected data can be turned into a [`Header`].
#[derive(Debug, Default, Eq, PartialEq)]
pub struct Parser {
    // Whether the file format read from the input is used as is or replaced by a fixed one.
    file_format_option: FileFormatOption,
    // Parsing progress.
    state: State,
    // The file format in effect; set when the first line is parsed.
    file_format: FileFormat,
    // Collected INFO records, keyed by ID.
    infos: Infos,
    // Collected FILTER records, keyed by ID.
    filters: Filters,
    // Collected FORMAT records, keyed by ID.
    formats: Formats,
    // Collected ALT records, keyed by ID.
    alternative_alleles: AlternativeAlleles,
    // Collected contig records, keyed by ID.
    contigs: Contigs,
    // Sample names read from the `#CHROM` line.
    sample_names: SampleNames,
    // All remaining records, grouped by key.
    other_records: OtherRecords,
}
47
48impl Parser {
49    /// Creates a VCF header parser builder.
50    pub fn builder() -> Builder {
51        Builder::default()
52    }
53
54    /// Parses a raw VCF header.
55    pub fn parse(&self, s: &str) -> Result<Header, ParseError> {
56        let mut parser = Self::default();
57
58        for line in s.lines() {
59            parser.parse_partial(line.as_bytes())?;
60        }
61
62        parser.finish()
63    }
64
65    /// Parses and adds a raw record to the header.
66    pub fn parse_partial(&mut self, src: &[u8]) -> Result<Entry<'_>, ParseError> {
67        if self.state == State::Done {
68            return Err(ParseError::ExpectedEof);
69        }
70
71        if self.state == State::Empty {
72            let file_format = match parse_file_format(src) {
73                Ok(f) => match self.file_format_option {
74                    FileFormatOption::Auto => f,
75                    FileFormatOption::FileFormat(g) => g,
76                },
77                Err(e) => return Err(e),
78            };
79
80            self.file_format = file_format;
81            self.state = State::Ready;
82
83            return Ok(Entry::FileFormat(file_format));
84        }
85
86        if src.starts_with(b"#CHROM") {
87            parse_header(src, &mut self.sample_names)?;
88            self.state = State::Done;
89            return Ok(Entry::Header);
90        }
91
92        let record =
93            record::parse_record(src, self.file_format).map_err(ParseError::InvalidRecord)?;
94
95        match record {
96            Record::FileFormat(_) => Err(ParseError::UnexpectedFileFormat),
97            Record::Info(id, info) => try_insert_info(&mut self.infos, id, info),
98            Record::Filter(id, filter) => try_insert_filter(&mut self.filters, id, filter),
99            Record::Format(id, format) => try_insert_format(&mut self.formats, id, format),
100            Record::AlternativeAllele(id, alternative_allele) => {
101                try_insert_alternative_allele(&mut self.alternative_alleles, id, alternative_allele)
102            }
103            Record::Contig(id, contig) => try_insert_contig(&mut self.contigs, id, contig),
104            Record::Other(key, value) => insert_other_record(&mut self.other_records, key, value),
105        }
106    }
107
108    /// Builds the VCF header.
109    pub fn finish(self) -> Result<Header, ParseError> {
110        match self.state {
111            State::Empty => Err(ParseError::Empty),
112            State::Ready => Err(ParseError::MissingHeader),
113            State::Done => Ok(Header {
114                file_format: self.file_format,
115                infos: self.infos,
116                filters: self.filters,
117                formats: self.formats,
118                alternative_alleles: self.alternative_alleles,
119                contigs: self.contigs,
120                sample_names: self.sample_names,
121                other_records: self.other_records,
122                string_maps: StringMaps::default(),
123            }),
124        }
125    }
126}
127
/// An error returned when a raw VCF header fails to parse.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ParseError {
    /// The input is empty.
    Empty,
    /// The input contains invalid UTF-8.
    InvalidUtf8(str::Utf8Error),
    /// The file format (`fileformat`) is missing.
    MissingFileFormat,
    /// The file format (`fileformat`) appears on a line other than the first.
    UnexpectedFileFormat,
    /// A record is invalid.
    InvalidRecord(record::ParseError),
    /// An info ID is duplicated.
    DuplicateInfoId(String),
    /// A filter ID is duplicated.
    DuplicateFilterId(String),
    /// A format ID is duplicated.
    DuplicateFormatId(String),
    /// An alternative allele ID is duplicated.
    DuplicateAlternativeAlleleId(String),
    /// A contig ID is duplicated.
    DuplicateContigId(String),
    /// A record has an invalid value.
    InvalidRecordValue(super::record::value::collection::AddError),
    /// The header line (`#CHROM`...) is missing.
    MissingHeader,
    /// The header line (`#CHROM`...) is invalid.
    ///
    /// The fields are the actual column name followed by the expected column name.
    InvalidHeader(String, String),
    /// A sample name is duplicated.
    ///
    /// § 1.5 Header line syntax (2021-01-13): "Duplicate sample IDs are not allowed."
    DuplicateSampleName(String),
    /// More data unexpectedly appears after the header line (`#CHROM`...).
    ExpectedEof,
    /// The position of the entry in the string map does not match the absolute position defined
    /// by the `IDX` field of a record.
    ///
    /// The fields are the actual `(IDX, name)` pair followed by the expected `(IDX, name)` pair.
    StringMapPositionMismatch((usize, String), (usize, String)),
}
167
168impl error::Error for ParseError {
169    fn source(&self) -> Option<&(dyn error::Error + 'static)> {
170        match self {
171            Self::InvalidUtf8(e) => Some(e),
172            Self::InvalidRecord(e) => Some(e),
173            Self::InvalidRecordValue(e) => Some(e),
174            _ => None,
175        }
176    }
177}
178
179impl std::fmt::Display for ParseError {
180    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
181        match self {
182            Self::Empty => f.write_str("empty input"),
183            Self::InvalidUtf8(_) => f.write_str("invalid UTF-8"),
184            Self::MissingFileFormat => f.write_str("missing fileformat"),
185            Self::UnexpectedFileFormat => f.write_str("unexpected file format"),
186            Self::InvalidRecord(_) => f.write_str("invalid record"),
187            Self::DuplicateInfoId(id) => write!(f, "duplicate INFO ID: {id}"),
188            Self::DuplicateFilterId(id) => write!(f, "duplicate FILTER ID: {id}"),
189            Self::DuplicateFormatId(id) => write!(f, "duplicate FORMAT ID: {id}"),
190            Self::DuplicateAlternativeAlleleId(id) => write!(f, "duplicate ALT ID: {id}"),
191            Self::DuplicateContigId(id) => write!(f, "duplicate contig ID: {id}"),
192            Self::InvalidRecordValue(_) => f.write_str("invalid record value"),
193            Self::MissingHeader => f.write_str("missing header"),
194            Self::InvalidHeader(actual, expected) => {
195                write!(f, "invalid header: expected {expected}, got {actual}")
196            }
197            Self::DuplicateSampleName(sample_name) => {
198                write!(f, "duplicate sample name: {sample_name}")
199            }
200            Self::ExpectedEof => f.write_str("expected EOF"),
201            Self::StringMapPositionMismatch(actual, expected) => write!(
202                f,
203                "string map position mismatch: expected {} (IDX={}), got {} (IDX={})",
204                expected.1, expected.0, actual.1, actual.0,
205            ),
206        }
207    }
208}
209
210fn parse_file_format(src: &[u8]) -> Result<FileFormat, ParseError> {
211    let record =
212        record::parse_record(src, FileFormat::default()).map_err(ParseError::InvalidRecord)?;
213
214    match record {
215        Record::FileFormat(file_format) => Ok(file_format),
216        _ => Err(ParseError::MissingFileFormat),
217    }
218}
219
220fn try_insert_info(
221    infos: &mut Infos,
222    id: String,
223    info: Map<Info>,
224) -> Result<Entry<'_>, ParseError> {
225    use indexmap::map::Entry;
226
227    match infos.entry(id) {
228        Entry::Vacant(entry) => {
229            let i = entry.index();
230
231            entry.insert(info);
232
233            // SAFETY: The entry was inserted at `i`.
234            Ok(infos
235                .get_index(i)
236                .map(|(k, v)| self::Entry::Info(k, v))
237                .unwrap())
238        }
239        Entry::Occupied(entry) => Err(ParseError::DuplicateInfoId(entry.key().into())),
240    }
241}
242
243fn try_insert_filter(
244    filters: &mut Filters,
245    id: String,
246    filter: Map<Filter>,
247) -> Result<Entry<'_>, ParseError> {
248    use indexmap::map::Entry;
249
250    match filters.entry(id) {
251        Entry::Vacant(entry) => {
252            let i = entry.index();
253
254            entry.insert(filter);
255
256            // SAFETY: The entry was inserted at `i`.
257            Ok(filters
258                .get_index(i)
259                .map(|(k, v)| self::Entry::Filter(k, v))
260                .unwrap())
261        }
262        Entry::Occupied(entry) => Err(ParseError::DuplicateFilterId(entry.key().into())),
263    }
264}
265
266fn try_insert_format(
267    formats: &mut Formats,
268    id: String,
269    format: Map<Format>,
270) -> Result<Entry<'_>, ParseError> {
271    use indexmap::map::Entry;
272
273    match formats.entry(id) {
274        Entry::Vacant(entry) => {
275            let i = entry.index();
276
277            entry.insert(format);
278
279            // SAFETY: The entry was inserted at `i`.
280            Ok(formats
281                .get_index(i)
282                .map(|(k, v)| self::Entry::Format(k, v))
283                .unwrap())
284        }
285        Entry::Occupied(entry) => Err(ParseError::DuplicateFormatId(entry.key().into())),
286    }
287}
288
289fn try_insert_alternative_allele(
290    alternative_alleles: &mut AlternativeAlleles,
291    id: String,
292    alternative_allele: Map<AlternativeAllele>,
293) -> Result<Entry<'_>, ParseError> {
294    use indexmap::map::Entry;
295
296    match alternative_alleles.entry(id) {
297        Entry::Vacant(entry) => {
298            let i = entry.index();
299
300            entry.insert(alternative_allele);
301
302            // SAFETY: The entry was inserted at `i`.
303            Ok(alternative_alleles
304                .get_index(i)
305                .map(|(k, v)| self::Entry::AlternativeAllele(k, v))
306                .unwrap())
307        }
308        Entry::Occupied(entry) => Err(ParseError::DuplicateAlternativeAlleleId(entry.key().into())),
309    }
310}
311
312fn try_insert_contig(
313    contigs: &mut Contigs,
314    id: String,
315    contig: Map<Contig>,
316) -> Result<Entry<'_>, ParseError> {
317    use indexmap::map::Entry;
318
319    match contigs.entry(id) {
320        Entry::Vacant(entry) => {
321            let i = entry.index();
322
323            entry.insert(contig);
324
325            // SAFETY: The entry was inserted at `i`.
326            Ok(contigs
327                .get_index(i)
328                .map(|(k, v)| self::Entry::Contig(k, v))
329                .unwrap())
330        }
331        Entry::Occupied(entry) => Err(ParseError::DuplicateContigId(entry.key().into())),
332    }
333}
334
335fn insert_other_record(
336    other_records: &mut OtherRecords,
337    key: super::record::key::Other,
338    value: super::record::Value,
339) -> Result<Entry<'_>, ParseError> {
340    let collection = other_records.entry(key).or_insert_with(|| match value {
341        super::record::Value::String(_) => {
342            super::record::value::Collection::Unstructured(Vec::new())
343        }
344        super::record::Value::Map(..) => {
345            super::record::value::Collection::Structured(IndexMap::new())
346        }
347    });
348
349    collection
350        .add(value)
351        .map_err(ParseError::InvalidRecordValue)?;
352
353    Ok(Entry::Other)
354}
355
356fn parse_header(src: &[u8], sample_names: &mut SampleNames) -> Result<(), ParseError> {
357    static HEADERS: &[&str] = &[
358        "#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO",
359    ];
360    static FORMAT_HEADER: &str = "FORMAT";
361
362    const DELIMITER: char = '\t';
363
364    let line = str::from_utf8(src).map_err(ParseError::InvalidUtf8)?;
365    let mut fields = line.split(DELIMITER);
366
367    for &expected in HEADERS.iter() {
368        if let Some(actual) = fields.next() {
369            if actual != expected {
370                return Err(ParseError::InvalidHeader(actual.into(), expected.into()));
371            }
372        } else {
373            return Err(ParseError::InvalidHeader(String::from(""), expected.into()));
374        }
375    }
376
377    if let Some(field) = fields.next() {
378        if field != FORMAT_HEADER {
379            return Err(ParseError::InvalidHeader(
380                field.into(),
381                FORMAT_HEADER.into(),
382            ));
383        }
384
385        for sample_name in fields {
386            if !sample_names.insert(sample_name.into()) {
387                return Err(ParseError::DuplicateSampleName(sample_name.into()));
388            }
389        }
390    }
391
392    Ok(())
393}
394
#[cfg(test)]
mod tests {
    use super::*;

    // Parses a complete header and compares it with the equivalent built header.
    #[test]
    fn test_from_str() -> Result<(), Box<dyn std::error::Error>> {
        use crate::{
            header::record::{value::map::Other, Value},
            variant::record::{info, samples},
        };

        let s = r#"##fileformat=VCFv4.3
##fileDate=20200506
##source=noodles-vcf
##contig=<ID=sq0,length=8>
##contig=<ID=sq1,length=13>
##contig=<ID=sq2,length=21>
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
##FILTER=<ID=q10,Description="Quality below 10">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##ALT=<ID=DEL,Description="Deletion">
##META=<ID=Assay,Type=String,Number=.,Values=[WholeGenome, Exome]>
##SAMPLE=<ID=sample0,Assay=WholeGenome>
##PEDIGREE=<ID=cid,Father=fid,Mother=mid>
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample0
"#;

        let actual = Parser::default().parse(s)?;

        let expected = Header::builder()
            .set_file_format(FileFormat::new(4, 3))
            .insert("fileDate".parse()?, Value::String(String::from("20200506")))?
            .insert(
                "source".parse()?,
                Value::String(String::from("noodles-vcf")),
            )?
            .add_contig("sq0", Map::<Contig>::builder().set_length(8).build()?)
            .add_contig("sq1", Map::<Contig>::builder().set_length(13).build()?)
            .add_contig("sq2", Map::<Contig>::builder().set_length(21).build()?)
            .add_info(
                info::field::key::SAMPLES_WITH_DATA_COUNT,
                Map::<Info>::from(info::field::key::SAMPLES_WITH_DATA_COUNT),
            )
            .add_filter("q10", Map::<Filter>::new("Quality below 10"))
            .add_format(
                samples::keys::key::GENOTYPE,
                Map::<Format>::from(samples::keys::key::GENOTYPE),
            )
            .add_alternative_allele("DEL", Map::<AlternativeAllele>::new("Deletion"))
            .insert(
                "META".parse()?,
                Value::Map(
                    String::from("Assay"),
                    Map::<Other>::builder()
                        .insert("Type".parse()?, "String")
                        .insert("Number".parse()?, ".")
                        .insert("Values".parse()?, "[WholeGenome, Exome]")
                        .build()?,
                ),
            )?
            .insert(
                "SAMPLE".parse()?,
                Value::Map(
                    String::from("sample0"),
                    Map::<Other>::builder()
                        .insert("Assay".parse()?, "WholeGenome")
                        .build()?,
                ),
            )?
            .insert(
                "PEDIGREE".parse()?,
                Value::Map(
                    String::from("cid"),
                    Map::<Other>::builder()
                        .insert("Father".parse()?, "fid")
                        .insert("Mother".parse()?, "mid")
                        .build()?,
                ),
            )?
            .add_sample_name("sample0")
            .build();

        assert_eq!(actual, expected);

        Ok(())
    }

    // The first line must be the file format record.
    #[test]
    fn test_from_str_without_file_format() {
        let s = r#"##ALT=<ID=DEL,Description="Deletion">
"#;

        assert_eq!(
            Parser::default().parse(s),
            Err(ParseError::MissingFileFormat)
        );
    }

    // Nothing may follow the `#CHROM` line.
    #[test]
    fn test_from_str_with_data_after_header() {
        let s = r#"##fileformat=VCFv4.3
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
##contig=<ID=sq0,length=8>
"#;

        assert_eq!(Parser::default().parse(s), Err(ParseError::ExpectedEof));
    }

    // The file format record may only appear once, on the first line.
    #[test]
    fn test_from_str_with_multiple_fileformats() {
        let s = "\
##fileformat=VCFv4.3
##fileformat=VCFv4.3
";

        assert_eq!(
            Parser::default().parse(s),
            Err(ParseError::UnexpectedFileFormat)
        );
    }

    // The `#CHROM` line is mandatory.
    #[test]
    fn test_from_str_with_missing_headers() {
        let s = "##fileformat=VCFv4.3
";
        assert_eq!(Parser::default().parse(s), Err(ParseError::MissingHeader));
    }

    // Wrong, missing, and unexpected column names in the `#CHROM` line.
    #[test]
    fn test_from_str_with_invalid_headers() {
        let s = "##fileformat=VCFv4.3
#CHROM	POS	ID	REF	ALT	QUALITY	FILTER	INFO
";

        assert_eq!(
            Parser::default().parse(s),
            Err(ParseError::InvalidHeader(
                String::from("QUALITY"),
                String::from("QUAL")
            ))
        );

        let s = "##fileformat=VCFv4.3
#CHROM	POS	ID
";

        assert_eq!(
            Parser::default().parse(s),
            Err(ParseError::InvalidHeader(
                String::from(""),
                String::from("REF")
            ))
        );

        let s = "##fileformat=VCFv4.3
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	sample0
";

        assert_eq!(
            Parser::default().parse(s),
            Err(ParseError::InvalidHeader(
                String::from("sample0"),
                String::from("FORMAT")
            ))
        );
    }

    // One case per record kind; each asserts the exact duplicated ID where the error carries it.
    #[test]
    fn test_from_str_with_duplicate_map_id() {
        let s = r#"##fileformat=VCFv4.3
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
"#;

        assert_eq!(
            Parser::default().parse(s),
            Err(ParseError::DuplicateInfoId(String::from("NS")))
        );

        let s = r#"##fileformat=VCFv4.3
##FILTER=<ID=q10,Description="Quality below 10">
##FILTER=<ID=q10,Description="Quality below 10">
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
"#;

        assert_eq!(
            Parser::default().parse(s),
            Err(ParseError::DuplicateFilterId(String::from("q10")))
        );

        let s = r#"##fileformat=VCFv4.3
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
"#;

        assert_eq!(
            Parser::default().parse(s),
            Err(ParseError::DuplicateFormatId(String::from(
                crate::variant::record::samples::keys::key::GENOTYPE
            )))
        );

        let s = r#"##fileformat=VCFv4.3
##ALT=<ID=DEL,Description="Deletion">
##ALT=<ID=DEL,Description="Deletion">
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
"#;

        assert_eq!(
            Parser::default().parse(s),
            Err(ParseError::DuplicateAlternativeAlleleId(String::from(
                "DEL"
            )))
        );

        let s = r#"##fileformat=VCFv4.3
##contig=<ID=sq0,length=8>
##contig=<ID=sq0,length=8>
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
"#;

        assert_eq!(
            Parser::default().parse(s),
            Err(ParseError::DuplicateContigId(String::from("sq0")))
        );

        // Duplicates among other records are reported by the collection itself.
        let s = r#"##fileformat=VCFv4.3
##SAMPLE=<ID=sample0,Assay=WholeGenome>
##SAMPLE=<ID=sample0,Assay=WholeGenome>
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
"#;

        assert!(matches!(
            Parser::default().parse(s),
            Err(ParseError::InvalidRecordValue(_))
        ));
    }

    // Sample names in the `#CHROM` line must be unique.
    #[test]
    fn test_from_str_with_duplicate_sample_names() {
        let s = "##fileformat=VCFv4.3
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample0	sample0
";

        assert_eq!(
            Parser::default().parse(s),
            Err(ParseError::DuplicateSampleName(String::from("sample0")))
        );
    }
}