1mod context;
2mod record;
3
4use std::{error, fmt, hash::Hash, str};
5
6use bstr::BString;
7use indexmap::IndexMap;
8
9pub(crate) use self::context::Context;
10use self::record::parse_record;
11use super::{
12 record::value::{
13 map::{self, header::Version},
14 Map,
15 },
16 Header, Programs, ReadGroups, Record, ReferenceSequences,
17};
18
19#[derive(Clone, Debug, Eq, PartialEq)]
21pub enum ParseError {
22 UnexpectedHeader,
24 InvalidRecord(record::ParseError),
26 DuplicateReferenceSequenceName(BString),
28 DuplicateReadGroupId(BString),
30 DuplicateProgramId(BString),
32 InvalidComment,
34}
35
36impl error::Error for ParseError {
37 fn source(&self) -> Option<&(dyn error::Error + 'static)> {
38 match self {
39 Self::InvalidRecord(e) => Some(e),
40 _ => None,
41 }
42 }
43}
44
45impl fmt::Display for ParseError {
46 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
47 match self {
48 Self::UnexpectedHeader => write!(f, "unexpected header (HD) record"),
49 Self::InvalidRecord(_) => f.write_str("invalid record"),
50 Self::DuplicateReferenceSequenceName(name) => {
51 write!(f, "duplicate reference sequence name: {name}")
52 }
53 Self::DuplicateReadGroupId(id) => write!(f, "duplicate read group ID: {id}"),
54 Self::DuplicateProgramId(id) => write!(f, "duplicate program ID: {id}"),
55 Self::InvalidComment => f.write_str("invalid comment record"),
56 }
57 }
58}
59
60#[derive(Default)]
62pub struct Parser {
63 ctx: Context,
64 header: Option<Map<map::Header>>,
65 reference_sequences: ReferenceSequences,
66 read_groups: ReadGroups,
67 programs: Programs,
68 comments: Vec<BString>,
69}
70
71impl Parser {
72 fn is_empty(&self) -> bool {
73 self.header.is_none()
74 && self.reference_sequences.is_empty()
75 && self.read_groups.is_empty()
76 && self.programs.as_ref().is_empty()
77 && self.comments.is_empty()
78 }
79
80 pub fn parse_partial(&mut self, src: &[u8]) -> Result<(), ParseError> {
91 if self.is_empty() {
92 if let Some(version) = extract_version(src) {
93 self.ctx = Context::from(version);
94 }
95 }
96
97 let record = parse_record(src, &self.ctx).map_err(ParseError::InvalidRecord)?;
98
99 match record {
100 Record::Header(header) => {
101 if self.is_empty() {
102 self.header = Some(header);
103 } else {
104 return Err(ParseError::UnexpectedHeader);
105 }
106 }
107 Record::ReferenceSequence(name, reference_sequence) => try_insert(
108 &mut self.reference_sequences,
109 name,
110 reference_sequence,
111 ParseError::DuplicateReferenceSequenceName,
112 )?,
113 Record::ReadGroup(id, read_group) => try_insert(
114 &mut self.read_groups,
115 id,
116 read_group,
117 ParseError::DuplicateReadGroupId,
118 )?,
119 Record::Program(id, program) => try_insert(
120 self.programs.as_mut(),
121 id,
122 program,
123 ParseError::DuplicateProgramId,
124 )?,
125 Record::Comment(comment) => self.comments.push(comment),
126 }
127
128 Ok(())
129 }
130
131 pub fn finish(self) -> Header {
143 Header {
144 header: self.header,
145 reference_sequences: self.reference_sequences,
146 read_groups: self.read_groups,
147 programs: self.programs,
148 comments: self.comments,
149 }
150 }
151}
152
153fn extract_version(src: &[u8]) -> Option<Version> {
154 use self::record::value::map::header::parse_version;
155
156 const RECORD_PREFIX: &[u8] = b"@HD\t";
157 const DELIMITER: u8 = b'\t';
158 const FIELD_PREFIX: &[u8] = b"VN:";
159
160 if let Some(raw_value) = src.strip_prefix(RECORD_PREFIX) {
161 for raw_field in raw_value.split(|&b| b == DELIMITER) {
162 if let Some(s) = raw_field.strip_prefix(FIELD_PREFIX) {
163 return parse_version(s).ok();
164 }
165 }
166 }
167
168 None
169}
170
171fn try_insert<K, V, F, E>(map: &mut IndexMap<K, V>, key: K, value: V, f: F) -> Result<(), E>
172where
173 K: Hash + Eq + Clone,
174 F: FnOnce(K) -> E,
175{
176 use indexmap::map::Entry;
177
178 match map.entry(key) {
179 Entry::Vacant(e) => {
180 e.insert(value);
181 Ok(())
182 }
183 Entry::Occupied(e) => Err(f(e.key().clone())),
184 }
185}
186
187pub(super) fn parse(s: &str) -> Result<Header, ParseError> {
210 let mut parser = Parser::default();
211
212 for line in s.lines() {
213 parser.parse_partial(line.as_bytes())?;
214 }
215
216 Ok(parser.finish())
217}
218
#[cfg(test)]
mod tests {
    use super::*;

    // A header containing one record of every kind parses into the equivalent built header.
    #[test]
    fn test_parse() -> Result<(), Box<dyn std::error::Error>> {
        use std::num::NonZeroUsize;

        use crate::header::record::value::map::{
            self,
            header::{self, Version},
            program, Map, Program, ReadGroup, ReferenceSequence,
        };

        const SQ0_LN: NonZeroUsize = match NonZeroUsize::new(8) {
            Some(length) => length,
            None => unreachable!(),
        };

        const SQ1_LN: NonZeroUsize = match NonZeroUsize::new(13) {
            Some(length) => length,
            None => unreachable!(),
        };

        let raw = concat!(
            "@HD\tVN:1.6\tSO:coordinate\n",
            "@SQ\tSN:sq0\tLN:8\n",
            "@SQ\tSN:sq1\tLN:13\n",
            "@RG\tID:rg0\n",
            "@PG\tID:pg0\tPN:noodles\n",
            "@CO\tndls\n",
        );

        let actual = parse(raw)?;

        let hd = Map::<map::Header>::builder()
            .set_version(Version::new(1, 6))
            .insert(header::tag::SORT_ORDER, "coordinate")
            .build()?;

        let pg0 = Map::<Program>::builder()
            .insert(program::tag::NAME, "noodles")
            .build()?;

        let expected = Header::builder()
            .set_header(hd)
            .add_reference_sequence("sq0", Map::<ReferenceSequence>::new(SQ0_LN))
            .add_reference_sequence("sq1", Map::<ReferenceSequence>::new(SQ1_LN))
            .add_read_group("rg0", Map::<ReadGroup>::default())
            .add_program("pg0", pg0)
            .add_comment("ndls")
            .build();

        assert_eq!(actual, expected);

        Ok(())
    }

    // No input lines means nothing is recorded in any section of the header.
    #[test]
    fn test_parse_with_empty_input() -> Result<(), ParseError> {
        let parsed = parse("")?;

        assert!(parsed.header().is_none());
        assert!(parsed.reference_sequences().is_empty());
        assert!(parsed.read_groups().is_empty());
        assert!(parsed.programs().as_ref().is_empty());
        assert!(parsed.comments().is_empty());

        Ok(())
    }

    // A header (`HD`) record is optional.
    #[test]
    fn test_parse_without_hd() -> Result<(), ParseError> {
        let parsed = parse("@SQ\tSN:sq0\tLN:8\n")?;

        assert!(parsed.header().is_none());
        assert_eq!(parsed.reference_sequences().len(), 1);

        Ok(())
    }

    // A second header (`HD`) record is rejected.
    #[test]
    fn test_parse_with_multiple_hd() {
        let raw = concat!(
            "@HD\tVN:1.6\tSO:coordinate\n",
            "@HD\tVN:1.6\tSO:coordinate\n",
        );

        assert_eq!(parse(raw), Err(ParseError::UnexpectedHeader));
    }

    // Repeating a reference sequence name reports the repeated name.
    #[test]
    fn test_parse_with_duplicate_reference_sequence_names() {
        let raw = concat!("@SQ\tSN:sq0\tLN:8\n", "@SQ\tSN:sq0\tLN:8\n");

        let expected = Err(ParseError::DuplicateReferenceSequenceName(BString::from(
            "sq0",
        )));

        assert_eq!(parse(raw), expected);
    }

    // Repeating a read group ID reports the repeated ID.
    #[test]
    fn test_parse_with_duplicate_read_group_ids() {
        let raw = concat!("@RG\tID:rg0\n", "@RG\tID:rg0\n");

        let expected = Err(ParseError::DuplicateReadGroupId(BString::from("rg0")));

        assert_eq!(parse(raw), expected);
    }

    // Repeating a program ID reports the repeated ID.
    #[test]
    fn test_parse_with_duplicate_program_ids() {
        let raw = concat!("@PG\tID:pg0\n", "@PG\tID:pg0\n");

        let expected = Err(ParseError::DuplicateProgramId(BString::from("pg0")));

        assert_eq!(parse(raw), expected);
    }

    // The version is only taken from `@HD` records, from whichever field carries `VN:`.
    #[test]
    fn test_extract_version() {
        let v1_6 = Some(Version::new(1, 6));

        assert_eq!(extract_version(b"@HD\tVN:1.6"), v1_6);
        assert_eq!(extract_version(b"@HD\tSO:coordinate\tVN:1.6"), v1_6);

        // An unparseable version and non-`HD` records yield nothing.
        assert_eq!(extract_version(b"@HD\tVN:NA"), None);
        assert_eq!(extract_version(b"@SQ\tSN:sq0\tLN:8\tVN:1.6"), None);
        assert_eq!(extract_version(b"@CO\tVN:1.6"), None);
    }
}