noodles_sam/header/
parser.rs

1mod context;
2mod record;
3
4use std::{error, fmt, hash::Hash, str};
5
6use bstr::BString;
7use indexmap::IndexMap;
8
9pub(crate) use self::context::Context;
10use self::record::parse_record;
11use super::{
12    record::value::{
13        map::{self, header::Version},
14        Map,
15    },
16    Header, Programs, ReadGroups, Record, ReferenceSequences,
17};
18
19/// An error returned when a raw SAM header fails to parse.
20#[derive(Clone, Debug, Eq, PartialEq)]
21pub enum ParseError {
22    /// A header record is not on the first line.
23    UnexpectedHeader,
24    /// The record is invalid.
25    InvalidRecord(record::ParseError),
26    /// A reference sequence name is duplicated.
27    DuplicateReferenceSequenceName(BString),
28    /// A read group ID is duplicated.
29    DuplicateReadGroupId(BString),
30    /// A program ID is duplicated.
31    DuplicateProgramId(BString),
32    /// A comment record is invalid.
33    InvalidComment,
34}
35
36impl error::Error for ParseError {
37    fn source(&self) -> Option<&(dyn error::Error + 'static)> {
38        match self {
39            Self::InvalidRecord(e) => Some(e),
40            _ => None,
41        }
42    }
43}
44
45impl fmt::Display for ParseError {
46    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
47        match self {
48            Self::UnexpectedHeader => write!(f, "unexpected header (HD) record"),
49            Self::InvalidRecord(_) => f.write_str("invalid record"),
50            Self::DuplicateReferenceSequenceName(name) => {
51                write!(f, "duplicate reference sequence name: {name}")
52            }
53            Self::DuplicateReadGroupId(id) => write!(f, "duplicate read group ID: {id}"),
54            Self::DuplicateProgramId(id) => write!(f, "duplicate program ID: {id}"),
55            Self::InvalidComment => f.write_str("invalid comment record"),
56        }
57    }
58}
59
60/// A SAM header parser.
61#[derive(Default)]
62pub struct Parser {
63    ctx: Context,
64    header: Option<Map<map::Header>>,
65    reference_sequences: ReferenceSequences,
66    read_groups: ReadGroups,
67    programs: Programs,
68    comments: Vec<BString>,
69}
70
71impl Parser {
72    fn is_empty(&self) -> bool {
73        self.header.is_none()
74            && self.reference_sequences.is_empty()
75            && self.read_groups.is_empty()
76            && self.programs.as_ref().is_empty()
77            && self.comments.is_empty()
78    }
79
80    /// Parses and adds a raw record to the header.
81    ///
82    /// # Examples
83    ///
84    /// ```
85    /// use noodles_sam as sam;
86    /// let mut parser = sam::header::Parser::default();
87    /// parser.parse_partial(b"@HD\tVN:1.6")?;
88    /// # Ok::<_, sam::header::ParseError>(())
89    /// ```
90    pub fn parse_partial(&mut self, src: &[u8]) -> Result<(), ParseError> {
91        if self.is_empty() {
92            if let Some(version) = extract_version(src) {
93                self.ctx = Context::from(version);
94            }
95        }
96
97        let record = parse_record(src, &self.ctx).map_err(ParseError::InvalidRecord)?;
98
99        match record {
100            Record::Header(header) => {
101                if self.is_empty() {
102                    self.header = Some(header);
103                } else {
104                    return Err(ParseError::UnexpectedHeader);
105                }
106            }
107            Record::ReferenceSequence(name, reference_sequence) => try_insert(
108                &mut self.reference_sequences,
109                name,
110                reference_sequence,
111                ParseError::DuplicateReferenceSequenceName,
112            )?,
113            Record::ReadGroup(id, read_group) => try_insert(
114                &mut self.read_groups,
115                id,
116                read_group,
117                ParseError::DuplicateReadGroupId,
118            )?,
119            Record::Program(id, program) => try_insert(
120                self.programs.as_mut(),
121                id,
122                program,
123                ParseError::DuplicateProgramId,
124            )?,
125            Record::Comment(comment) => self.comments.push(comment),
126        }
127
128        Ok(())
129    }
130
131    /// Builds the SAM header.
132    ///
133    /// # Examples
134    ///
135    /// ```
136    /// use noodles_sam as sam;
137    /// let parser = sam::header::Parser::default();
138    /// let header = parser.finish();
139    /// assert!(header.is_empty());
140    /// # Ok::<_, sam::header::ParseError>(())
141    /// ```
142    pub fn finish(self) -> Header {
143        Header {
144            header: self.header,
145            reference_sequences: self.reference_sequences,
146            read_groups: self.read_groups,
147            programs: self.programs,
148            comments: self.comments,
149        }
150    }
151}
152
153fn extract_version(src: &[u8]) -> Option<Version> {
154    use self::record::value::map::header::parse_version;
155
156    const RECORD_PREFIX: &[u8] = b"@HD\t";
157    const DELIMITER: u8 = b'\t';
158    const FIELD_PREFIX: &[u8] = b"VN:";
159
160    if let Some(raw_value) = src.strip_prefix(RECORD_PREFIX) {
161        for raw_field in raw_value.split(|&b| b == DELIMITER) {
162            if let Some(s) = raw_field.strip_prefix(FIELD_PREFIX) {
163                return parse_version(s).ok();
164            }
165        }
166    }
167
168    None
169}
170
171fn try_insert<K, V, F, E>(map: &mut IndexMap<K, V>, key: K, value: V, f: F) -> Result<(), E>
172where
173    K: Hash + Eq + Clone,
174    F: FnOnce(K) -> E,
175{
176    use indexmap::map::Entry;
177
178    match map.entry(key) {
179        Entry::Vacant(e) => {
180            e.insert(value);
181            Ok(())
182        }
183        Entry::Occupied(e) => Err(f(e.key().clone())),
184    }
185}
186
187/// Parses a raw SAM header.
188///
189/// # Examples
190///
191/// ```
192/// use noodles_sam as sam;
193///
194/// let s = "\
195/// @HD\tVN:1.6\tSO:coordinate
196/// @SQ\tSN:sq0\tLN:8
197/// @SQ\tSN:sq1\tLN:13
198/// ";
199///
200/// let header: sam::Header = s.parse()?;
201///
202/// assert!(header.header().is_some());
203/// assert_eq!(header.reference_sequences().len(), 2);
204/// assert!(header.read_groups().is_empty());
205/// assert!(header.programs().as_ref().is_empty());
206/// assert!(header.comments().is_empty());
207/// # Ok::<(), sam::header::ParseError>(())
208/// ```
209pub(super) fn parse(s: &str) -> Result<Header, ParseError> {
210    let mut parser = Parser::default();
211
212    for line in s.lines() {
213        parser.parse_partial(line.as_bytes())?;
214    }
215
216    Ok(parser.finish())
217}
218
#[cfg(test)]
mod tests {
    use super::*;

    /// A header containing one record of every kind parses into the equivalent built header.
    #[test]
    fn test_parse() -> Result<(), Box<dyn std::error::Error>> {
        use std::num::NonZeroUsize;

        use crate::header::record::value::map::{
            self,
            header::{self, Version},
            program, Map, Program, ReadGroup, ReferenceSequence,
        };

        let sq0_len = NonZeroUsize::try_from(8_usize)?;
        let sq1_len = NonZeroUsize::try_from(13_usize)?;

        let raw_header = concat!(
            "@HD\tVN:1.6\tSO:coordinate\n",
            "@SQ\tSN:sq0\tLN:8\n",
            "@SQ\tSN:sq1\tLN:13\n",
            "@RG\tID:rg0\n",
            "@PG\tID:pg0\tPN:noodles\n",
            "@CO\tndls\n",
        );

        let actual = parse(raw_header)?;

        let hd = Map::<map::Header>::builder()
            .set_version(Version::new(1, 6))
            .insert(header::tag::SORT_ORDER, "coordinate")
            .build()?;

        let pg0 = Map::<Program>::builder()
            .insert(program::tag::NAME, "noodles")
            .build()?;

        let expected = Header::builder()
            .set_header(hd)
            .add_reference_sequence("sq0", Map::<ReferenceSequence>::new(sq0_len))
            .add_reference_sequence("sq1", Map::<ReferenceSequence>::new(sq1_len))
            .add_read_group("rg0", Map::<ReadGroup>::default())
            .add_program("pg0", pg0)
            .add_comment("ndls")
            .build();

        assert_eq!(actual, expected);

        Ok(())
    }

    /// Empty input yields a header with no records of any kind.
    #[test]
    fn test_parse_with_empty_input() -> Result<(), ParseError> {
        let parsed = parse("")?;

        assert!(parsed.header().is_none());
        assert!(parsed.reference_sequences().is_empty());
        assert!(parsed.read_groups().is_empty());
        assert!(parsed.programs().as_ref().is_empty());
        assert!(parsed.comments().is_empty());

        Ok(())
    }

    /// A header (`@HD`) record is optional.
    #[test]
    fn test_parse_without_hd() -> Result<(), ParseError> {
        let parsed = parse("@SQ\tSN:sq0\tLN:8\n")?;

        assert!(parsed.header().is_none());
        assert_eq!(parsed.reference_sequences().len(), 1);

        Ok(())
    }

    /// A second header (`@HD`) record is rejected.
    #[test]
    fn test_parse_with_multiple_hd() {
        let raw_header = "@HD\tVN:1.6\tSO:coordinate\n@HD\tVN:1.6\tSO:coordinate\n";
        let expected = Err(ParseError::UnexpectedHeader);

        assert_eq!(parse(raw_header), expected);
    }

    /// Repeating a reference sequence name is rejected and the name is reported.
    #[test]
    fn test_parse_with_duplicate_reference_sequence_names() {
        let raw_header = "@SQ\tSN:sq0\tLN:8\n@SQ\tSN:sq0\tLN:8\n";
        let duplicated = BString::from("sq0");

        assert_eq!(
            parse(raw_header),
            Err(ParseError::DuplicateReferenceSequenceName(duplicated))
        );
    }

    /// Repeating a read group ID is rejected and the ID is reported.
    #[test]
    fn test_parse_with_duplicate_read_group_ids() {
        let raw_header = "@RG\tID:rg0\n@RG\tID:rg0\n";
        let duplicated = BString::from("rg0");

        assert_eq!(
            parse(raw_header),
            Err(ParseError::DuplicateReadGroupId(duplicated))
        );
    }

    /// Repeating a program ID is rejected and the ID is reported.
    #[test]
    fn test_parse_with_duplicate_program_ids() {
        let raw_header = "@PG\tID:pg0\n@PG\tID:pg0\n";
        let duplicated = BString::from("pg0");

        assert_eq!(
            parse(raw_header),
            Err(ParseError::DuplicateProgramId(duplicated))
        );
    }

    /// The version is only extracted from `@HD` records that carry a parsable `VN` field.
    #[test]
    fn test_extract_version() {
        let accepted: [&[u8]; 2] = [b"@HD\tVN:1.6", b"@HD\tSO:coordinate\tVN:1.6"];

        for raw_record in accepted {
            assert_eq!(extract_version(raw_record), Some(Version::new(1, 6)));
        }

        let rejected: [&[u8]; 3] = [
            b"@HD\tVN:NA",
            b"@SQ\tSN:sq0\tLN:8\tVN:1.6",
            b"@CO\tVN:1.6",
        ];

        for raw_record in rejected {
            assert!(extract_version(raw_record).is_none());
        }
    }
}