noodles_vcf/header/
string_maps.rs

1//! An indexed map of VCF strings.
2
3mod string_map;
4
5use std::str::{FromStr, Lines};
6
7pub use self::string_map::StringMap;
8use crate::{
9    header::{
10        parser::{parse_record, Entry},
11        FileFormat, ParseError, Record,
12    },
13    Header,
14};
15
/// An indexed map of VCF strings (FILTER, FORMAT, and INFO).
///
/// This is an alias of [`StringMap`] that names the dictionary holding the IDs of FILTER,
/// FORMAT, and INFO header records.
pub type StringStringMap = StringMap;
18
/// An indexed map of VCF contig names.
///
/// This is an alias of [`StringMap`] that names the dictionary holding the IDs of contig header
/// records.
pub type ContigStringMap = StringMap;
21
/// An indexed map of VCF strings.
///
/// This includes both the dictionary of strings and dictionary of contigs.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct StringMaps {
    // Dictionary of FILTER, FORMAT, and INFO IDs; `Default` seeds it with "PASS".
    string_string_map: StringStringMap,
    // Dictionary of contig IDs, kept separately from the dictionary of strings.
    contig_string_map: ContigStringMap,
}
30
31impl StringMaps {
32    /// Returns an indexed map of VCF strings (FILTER, FORMAT, and INFO).
33    ///
34    /// The filter ID "PASS" is always the first entry in the string string map.
35    ///
36    /// # Examples
37    ///
38    /// ```
39    /// use noodles_vcf::{
40    ///     self as vcf,
41    ///     header::{
42    ///         record::value::{map::{Contig, Filter, Format, Info}, Map},
43    ///         StringMaps,
44    ///     },
45    ///     variant::record::{info, samples},
46    /// };
47    ///
48    /// let header = vcf::Header::builder()
49    ///     .add_info(info::field::key::TOTAL_DEPTH, Map::<Info>::from(info::field::key::TOTAL_DEPTH))
50    ///     .add_filter("q10", Map::<Filter>::new("Quality below 10"))
51    ///     .add_format(samples::keys::key::READ_DEPTH, Map::<Format>::from(samples::keys::key::READ_DEPTH))
52    ///     .add_contig("sq0", Map::<Contig>::new())
53    ///     .build();
54    ///
55    /// let string_maps = StringMaps::try_from(&header)?;
56    /// let string_string_map = string_maps.strings();
57    ///
58    /// assert_eq!(string_string_map.get_index(0), Some("PASS"));
59    /// assert_eq!(string_string_map.get_index(1), Some("DP"));
60    /// assert_eq!(string_string_map.get_index(2), Some("q10"));
61    /// assert!(string_string_map.get_index(3).is_none());
62    /// # Ok::<_, Box<dyn std::error::Error>>(())
63    /// ```
64    pub fn strings(&self) -> &StringStringMap {
65        &self.string_string_map
66    }
67
68    fn strings_mut(&mut self) -> &mut StringStringMap {
69        &mut self.string_string_map
70    }
71
72    /// Returns an indexed map of contig names.
73    ///
74    /// # Examples
75    ///
76    /// ```
77    /// use noodles_vcf::{
78    ///     self as vcf,
79    ///     header::{
80    ///         record::value::{map::{Contig, Filter, Format, Info}, Map},
81    ///         StringMaps,
82    ///     },
83    ///     variant::record::{info, samples},
84    /// };
85    ///
86    /// let header = vcf::Header::builder()
87    ///     .add_info(info::field::key::TOTAL_DEPTH, Map::<Info>::from(info::field::key::TOTAL_DEPTH))
88    ///     .add_filter("q10", Map::<Filter>::new("Quality below 10"))
89    ///     .add_format(samples::keys::key::READ_DEPTH, Map::<Format>::from(samples::keys::key::READ_DEPTH))
90    ///     .add_contig("sq0", Map::<Contig>::new())
91    ///     .build();
92    ///
93    /// let string_maps = StringMaps::try_from(&header)?;
94    /// let contig_string_map = string_maps.contigs();
95    ///
96    /// assert_eq!(contig_string_map.get_index(0), Some("sq0"));
97    /// assert!(contig_string_map.get_index(1).is_none());
98    /// # Ok::<_, Box<dyn std::error::Error>>(())
99    /// ```
100    pub fn contigs(&self) -> &ContigStringMap {
101        &self.contig_string_map
102    }
103
104    fn contigs_mut(&mut self) -> &mut ContigStringMap {
105        &mut self.contig_string_map
106    }
107
108    #[doc(hidden)]
109    pub fn insert_entry(&mut self, entry: &Entry<'_>) -> Result<(), ParseError> {
110        match entry {
111            Entry::Contig(id, contig) => insert(self.contigs_mut(), id, contig.idx()),
112            Entry::Filter(id, filter) => insert(self.strings_mut(), id, filter.idx()),
113            Entry::Format(id, format) => insert(self.strings_mut(), id, format.idx()),
114            Entry::Info(id, info) => insert(self.strings_mut(), id, info.idx()),
115            _ => Ok(()),
116        }
117    }
118}
119
120impl Default for StringMaps {
121    fn default() -> Self {
122        // ยง 6.2.1 Dictionary of strings (2021-01-13): "Note that 'PASS' is always implicitly
123        // encoded as the first entry in the header dictionary."
124        let mut string_string_map = StringMap::default();
125        string_string_map.insert(String::from("PASS"));
126
127        let contig_string_map = StringMap::default();
128
129        Self {
130            string_string_map,
131            contig_string_map,
132        }
133    }
134}
135
136impl FromStr for StringMaps {
137    type Err = ParseError;
138
139    fn from_str(s: &str) -> Result<Self, Self::Err> {
140        let mut string_maps = Self::default();
141
142        let mut lines = s.lines();
143        let file_format = parse_file_format(&mut lines)?;
144
145        for line in &mut lines {
146            if line.starts_with("#CHROM") {
147                break;
148            }
149
150            let record =
151                parse_record(line.as_bytes(), file_format).map_err(ParseError::InvalidRecord)?;
152
153            match record {
154                Record::Contig(id, contig) => {
155                    insert(string_maps.contigs_mut(), id.as_ref(), contig.idx())?;
156                }
157                Record::Filter(id, filter) => {
158                    insert(string_maps.strings_mut(), &id, filter.idx())?;
159                }
160                Record::Format(id, format) => {
161                    insert(string_maps.strings_mut(), id.as_ref(), format.idx())?;
162                }
163                Record::Info(id, info) => {
164                    insert(string_maps.strings_mut(), id.as_ref(), info.idx())?;
165                }
166                _ => {}
167            }
168        }
169
170        Ok(string_maps)
171    }
172}
173
174fn parse_file_format(lines: &mut Lines<'_>) -> Result<FileFormat, ParseError> {
175    let record = lines
176        .next()
177        .ok_or(ParseError::MissingFileFormat)
178        .and_then(|line| {
179            parse_record(line.as_bytes(), Default::default()).map_err(ParseError::InvalidRecord)
180        })?;
181
182    match record {
183        Record::FileFormat(file_format) => Ok(file_format),
184        _ => Err(ParseError::MissingFileFormat),
185    }
186}
187
188fn insert(string_map: &mut StringMap, id: &str, idx: Option<usize>) -> Result<(), ParseError> {
189    if let Some(i) = idx {
190        if let Some((j, entry)) = string_map.get_full(id) {
191            let actual = (i, id.into());
192            let expected = (j, entry.into());
193
194            if actual != expected {
195                return Err(ParseError::StringMapPositionMismatch(actual, expected));
196            }
197        } else {
198            string_map.insert_at(i, id.into());
199        }
200    } else {
201        string_map.insert(id.into());
202    }
203
204    Ok(())
205}
206
207impl TryFrom<&Header> for StringMaps {
208    type Error = ParseError;
209
210    fn try_from(header: &Header) -> Result<Self, Self::Error> {
211        let mut string_maps = StringMaps::default();
212
213        for (id, contig) in header.contigs() {
214            insert(string_maps.contigs_mut(), id.as_ref(), contig.idx())?;
215        }
216
217        for (id, info) in header.infos() {
218            insert(string_maps.strings_mut(), id.as_ref(), info.idx())?;
219        }
220
221        for (id, filter) in header.filters() {
222            insert(string_maps.strings_mut(), id, filter.idx())?;
223        }
224
225        for (id, format) in header.formats() {
226            insert(string_maps.strings_mut(), id.as_ref(), format.idx())?;
227        }
228
229        Ok(string_maps)
230    }
231}
232
233#[cfg(test)]
234mod tests {
235    use super::*;
236
237    #[test]
238    fn test_default() {
239        let actual = StringMaps::default();
240
241        let mut string_string_map = StringMap::default();
242        string_string_map.insert("PASS".into());
243
244        let contig_string_map = StringMap::default();
245
246        let expected = StringMaps {
247            string_string_map,
248            contig_string_map,
249        };
250
251        assert_eq!(actual, expected);
252    }
253
    // Parses a complete header in which every FILTER, FORMAT, and INFO record carries an explicit
    // IDX. The contigs carry none and end up at indices 0, 1, and 2. FORMAT DP repeats the index
    // of INFO DP, so it adds no new entry. The fileDate and ALT records add no entries.
    #[test]
    fn test_from_str() {
        let s = r#"##fileformat=VCFv4.3
##fileDate=20210412
##contig=<ID=sq0,length=8>
##contig=<ID=sq1,length=13>
##contig=<ID=sq2,length=21>
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data",IDX=1>
##INFO=<ID=DP,Number=1,Type=Integer,Description="Combined depth across samples",IDX=2>
##FILTER=<ID=PASS,Description="All filters passed",IDX=0>
##FILTER=<ID=q10,Description="Quality below 10",IDX=3>
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype",IDX=4>
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth",IDX=2>
##ALT=<ID=DEL,Description="Deletion">
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample0
"#;

        let string_string_map = StringMap {
            indices: [
                (String::from("PASS"), 0),
                (String::from("NS"), 1),
                (String::from("DP"), 2),
                (String::from("q10"), 3),
                (String::from("GT"), 4),
            ]
            .into_iter()
            .collect(),
            entries: vec![
                Some(String::from("PASS")),
                Some(String::from("NS")),
                Some(String::from("DP")),
                Some(String::from("q10")),
                Some(String::from("GT")),
            ],
        };

        let contig_string_map = StringMap {
            indices: [
                (String::from("sq0"), 0),
                (String::from("sq1"), 1),
                (String::from("sq2"), 2),
            ]
            .into_iter()
            .collect(),
            entries: vec![
                Some(String::from("sq0")),
                Some(String::from("sq1")),
                Some(String::from("sq2")),
            ],
        };

        let expected = StringMaps {
            string_string_map,
            contig_string_map,
        };

        assert_eq!(s.parse(), Ok(expected));
    }
312
313    #[test]
314    fn test_from_str_with_file_format() {
315        // FORMAT MQ is an `Integer` in VCF 4.2 and `Float` in VCF 4.3.
316        let s = r#"##fileformat=VCFv4.2
317##FORMAT=<ID=MQ,Number=1,Type=Integer,Description="RMS mapping quality">
318#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample0
319"#;
320
321        let mut string_string_map = StringMap::default();
322        string_string_map.insert(String::from("PASS"));
323        string_string_map.insert(String::from("MQ"));
324
325        let contig_string_map = StringMap::default();
326
327        let expected = StringMaps {
328            string_string_map,
329            contig_string_map,
330        };
331
332        assert_eq!(s.parse(), Ok(expected));
333    }
334
    // Mixes records with and without an explicit IDX. No record claims index 2, so that slot
    // stays `None`. FILTER q20 has no IDX and ends up at index 5. FILTER NS has no IDX and leaves
    // the existing NS entry at index 1 as the only one.
    #[test]
    fn test_from_str_with_mixed_positions() {
        let s = r#"##fileformat=VCFv4.3
##fileDate=20210412
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data",IDX=1>
##FILTER=<ID=PASS,Description="All filters passed",IDX=0>
##FILTER=<ID=q10,Description="Quality below 10",IDX=3>
##FILTER=<ID=q15,Description="Quality below 15",IDX=4>
##FILTER=<ID=q20,Description="Quality below 20">
##FILTER=<ID=NS,Description="">
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample0
"#;

        let string_string_map = StringMap {
            indices: [
                (String::from("PASS"), 0),
                (String::from("NS"), 1),
                (String::from("q10"), 3),
                (String::from("q15"), 4),
                (String::from("q20"), 5),
            ]
            .into_iter()
            .collect(),
            entries: vec![
                Some(String::from("PASS")),
                Some(String::from("NS")),
                None,
                Some(String::from("q10")),
                Some(String::from("q15")),
                Some(String::from("q20")),
            ],
        };

        let contig_string_map = StringMap::default();

        let expected = StringMaps {
            string_string_map,
            contig_string_map,
        };

        assert_eq!(s.parse(), Ok(expected));
    }
377
    // An explicit IDX that disagrees with the index already held by the same ID is rejected. The
    // error carries the given (index, ID) pair first and the existing pair second.
    #[test]
    fn test_from_str_with_a_position_mismatch() {
        // "PASS" is already at index 0 in the default dictionary of strings.
        let s = r#"##fileformat=VCFv4.3
##FILTER=<ID=PASS,Description="All filters passed",IDX=8>
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample0
"#;

        assert_eq!(
            s.parse::<StringMaps>(),
            Err(ParseError::StringMapPositionMismatch(
                (8, String::from("PASS")),
                (0, String::from("PASS"))
            ))
        );

        // INFO DP claims index 1 first, so FORMAT DP cannot claim index 2.
        let s = r#"##fileformat=VCFv4.3
##INFO=<ID=DP,Number=1,Type=Integer,Description="Combined depth across samples",IDX=1>
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth",IDX=2>
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample0
"#;

        assert_eq!(
            s.parse::<StringMaps>(),
            Err(ParseError::StringMapPositionMismatch(
                (2, String::from("DP")),
                (1, String::from("DP"))
            ))
        );
    }
407
    // Converts a built header in which no map sets an index. The expected dictionary of strings
    // lists "PASS" first, then the INFO IDs (NS, DP), the remaining FILTER ID (q10), and the
    // remaining FORMAT ID (GT). FORMAT DP shares the entry of INFO DP, and the alternative allele
    // adds no entry.
    #[test]
    fn test_try_from_vcf_header_for_string_maps() -> Result<(), Box<dyn std::error::Error>> {
        use crate::{
            header::record::value::{
                map::{AlternativeAllele, Contig, Filter, Format, Info},
                Map,
            },
            variant::record::{info, samples},
        };

        let header = Header::builder()
            .add_contig("sq0", Map::<Contig>::new())
            .add_contig("sq1", Map::<Contig>::new())
            .add_contig("sq2", Map::<Contig>::new())
            .add_info(
                info::field::key::SAMPLES_WITH_DATA_COUNT,
                Map::<Info>::from(info::field::key::SAMPLES_WITH_DATA_COUNT),
            )
            .add_info(
                info::field::key::TOTAL_DEPTH,
                Map::<Info>::from(info::field::key::TOTAL_DEPTH),
            )
            .add_filter("PASS", Map::<Filter>::pass())
            .add_filter("q10", Map::<Filter>::new("Quality below 10"))
            .add_format(
                samples::keys::key::GENOTYPE,
                Map::<Format>::from(samples::keys::key::GENOTYPE),
            )
            .add_format(
                samples::keys::key::READ_DEPTH,
                Map::<Format>::from(samples::keys::key::READ_DEPTH),
            )
            .add_alternative_allele("DEL", Map::<AlternativeAllele>::new("Deletion"))
            .build();

        let actual = StringMaps::try_from(&header)?;

        let string_string_map = StringMap {
            indices: [
                (String::from("PASS"), 0),
                (String::from("NS"), 1),
                (String::from("DP"), 2),
                (String::from("q10"), 3),
                (String::from("GT"), 4),
            ]
            .into_iter()
            .collect(),
            entries: vec![
                Some(String::from("PASS")),
                Some(String::from("NS")),
                Some(String::from("DP")),
                Some(String::from("q10")),
                Some(String::from("GT")),
            ],
        };

        let contig_string_map = StringMap {
            indices: [
                (String::from("sq0"), 0),
                (String::from("sq1"), 1),
                (String::from("sq2"), 2),
            ]
            .into_iter()
            .collect(),
            entries: vec![
                Some(String::from("sq0")),
                Some(String::from("sq1")),
                Some(String::from("sq2")),
            ],
        };

        let expected = StringMaps {
            string_string_map,
            contig_string_map,
        };

        assert_eq!(actual, expected);

        Ok(())
    }
488
    // Converts a built header that mixes maps with and without an explicit index. The expected
    // result matches the one for the equivalent raw header: index 2 stays `None`, q20 (no index)
    // ends up at index 5, and the FILTER named NS (no index) leaves the INFO NS entry at index 1
    // as the only one.
    #[test]
    fn test_try_from_vcf_header_for_string_maps_with_idx() -> Result<(), Box<dyn std::error::Error>>
    {
        use crate::{
            header::record::value::{
                map::{Filter, Info},
                Map,
            },
            variant::record::info,
        };

        // INFO NS with its index set to 1.
        let ns = {
            let mut map = Map::<Info>::from(info::field::key::SAMPLES_WITH_DATA_COUNT);
            *map.idx_mut() = Some(1);
            map
        };

        let header = Header::builder()
            .add_filter(
                "PASS",
                Map::<Filter>::builder()
                    .set_description("All filters passed")
                    .set_idx(0)
                    .build()?,
            )
            .add_filter(
                "q10",
                Map::<Filter>::builder()
                    .set_description("Quality below 10")
                    .set_idx(3)
                    .build()?,
            )
            .add_filter(
                "q15",
                Map::<Filter>::builder()
                    .set_description("Quality below 15")
                    .set_idx(4)
                    .build()?,
            )
            .add_filter("q20", Map::<Filter>::new("Quality below 20"))
            .add_filter("NS", Map::<Filter>::new(""))
            .add_info(info::field::key::SAMPLES_WITH_DATA_COUNT, ns)
            .build();

        let actual = StringMaps::try_from(&header)?;

        let string_string_map = StringMap {
            indices: [
                (String::from("PASS"), 0),
                (String::from("NS"), 1),
                (String::from("q10"), 3),
                (String::from("q15"), 4),
                (String::from("q20"), 5),
            ]
            .into_iter()
            .collect(),
            entries: vec![
                Some(String::from("PASS")),
                Some(String::from("NS")),
                None,
                Some(String::from("q10")),
                Some(String::from("q15")),
                Some(String::from("q20")),
            ],
        };

        let contig_string_map = StringMap::default();

        let expected = StringMaps {
            string_string_map,
            contig_string_map,
        };

        assert_eq!(actual, expected);

        Ok(())
    }
566
567    #[test]
568    fn test_parse_file_format() {
569        let s = "##fileformat=VCFv4.3\n";
570        let mut lines = s.lines();
571        assert_eq!(parse_file_format(&mut lines), Ok(FileFormat::new(4, 3)));
572
573        let s = "";
574        let mut lines = s.lines();
575        assert_eq!(
576            parse_file_format(&mut lines),
577            Err(ParseError::MissingFileFormat)
578        );
579
580        let s = "##fileDate=20211119";
581        let mut lines = s.lines();
582        assert_eq!(
583            parse_file_format(&mut lines),
584            Err(ParseError::MissingFileFormat)
585        );
586
587        let s = "fileformat=VCFv4.3";
588        let mut lines = s.lines();
589        assert!(matches!(
590            parse_file_format(&mut lines),
591            Err(ParseError::InvalidRecord(_))
592        ));
593
594        let s = "##fileformat=VCF43\n";
595        let mut lines = s.lines();
596        assert!(matches!(
597            parse_file_format(&mut lines),
598            Err(ParseError::InvalidRecord(_))
599        ));
600    }
601}