rust_htslib/bam/
header.rs

1// Copyright 2014 Johannes Köster.
2// Licensed under the MIT license (http://opensource.org/licenses/MIT)
3// This file may not be copied, modified, or distributed
4// except according to those terms.
5
6use crate::bam::HeaderView;
7use lazy_static::lazy_static;
8use linear_map::LinearMap;
9use regex::Regex;
10use std::borrow::Cow;
11use std::collections::HashMap;
12
/// A BAM header.
///
/// The header text is kept as a list of raw byte records; `to_bytes`
/// joins them with newline characters to produce the full header text.
#[derive(Debug, Clone)]
pub struct Header {
    // Raw header text. Each pushed record or comment adds one entry. An entry
    // created by `from_template` holds the template's whole text, so a single
    // entry may itself contain several newline-separated lines.
    records: Vec<Vec<u8>>,
}
18
19impl Default for Header {
20    fn default() -> Self {
21        Self::new()
22    }
23}
24
25impl Header {
26    /// Create a new header.
27    pub fn new() -> Self {
28        Header {
29            records: Vec::new(),
30        }
31    }
32
33    pub fn from_template(header: &HeaderView) -> Self {
34        let mut record = header.as_bytes().to_owned();
35        // Strip off any trailing newline character.
36        // Otherwise there could be a blank line in the
37        // header which samtools (<=1.6) will complain
38        // about
39        while let Some(&last_char) = record.last() {
40            if last_char == b'\n' {
41                record.pop();
42            } else {
43                break;
44            }
45        }
46        Header {
47            records: vec![record],
48        }
49    }
50
51    /// Add a record to the header.
52    pub fn push_record(&mut self, record: &HeaderRecord<'_>) -> &mut Self {
53        self.records.push(record.to_bytes());
54        self
55    }
56
57    /// Add a comment to the header.
58    pub fn push_comment(&mut self, comment: &[u8]) -> &mut Self {
59        self.records.push([&b"@CO"[..], comment].join(&b'\t'));
60        self
61    }
62
63    pub fn to_bytes(&self) -> Vec<u8> {
64        self.records.join(&b'\n')
65    }
66
67    /// This returns a header as a HashMap.
68    /// Comment lines starting with "@CO" will NOT be included in the HashMap.
69    /// Comment lines can be obtained by the `comments` function.
70    pub fn to_hashmap(&self) -> HashMap<String, Vec<LinearMap<String, String>>> {
71        let mut header_map = HashMap::default();
72
73        lazy_static! {
74            static ref REC_TYPE_RE: Regex = Regex::new(r"@([A-Z][A-Z])").unwrap();
75            static ref TAG_RE: Regex = Regex::new(r"([A-Za-z][A-Za-z0-9]):([ -~]*)").unwrap();
76        }
77
78        let header_string = String::from_utf8(self.to_bytes()).unwrap();
79
80        for line in header_string.split('\n').filter(|x| !x.is_empty()) {
81            let parts: Vec<_> = line.split('\t').filter(|x| !x.is_empty()).collect();
82            // assert!(rec_type_re.is_match(parts[0]));
83            let record_type = REC_TYPE_RE
84                .captures(parts[0])
85                .unwrap()
86                .get(1)
87                .unwrap()
88                .as_str()
89                .to_owned();
90            if record_type.eq("CO") {
91                continue;
92            }
93            let mut field = LinearMap::default();
94            for part in parts.iter().skip(1) {
95                let cap = TAG_RE.captures(part).unwrap();
96                let tag = cap.get(1).unwrap().as_str().to_owned();
97                let value = cap.get(2).unwrap().as_str().to_owned();
98                field.insert(tag, value);
99            }
100            header_map
101                .entry(record_type)
102                .or_insert_with(Vec::new)
103                .push(field);
104        }
105        header_map
106    }
107
108    /// Returns an iterator of comment lines.
109    pub fn comments(&self) -> impl Iterator<Item = Cow<str>> {
110        self.records.iter().flat_map(|r| {
111            r.split(|x| x == &b'\n')
112                .filter(|x| x.starts_with(b"@CO\t"))
113                .map(|x| String::from_utf8_lossy(&x[4..]))
114        })
115    }
116}
117
/// Header record.
///
/// A single header line: a record type followed by `TAG:VALUE` fields.
/// Tag names are borrowed for the lifetime `'a`; the record type and the
/// tag values are owned by the record.
#[derive(Debug, Clone)]
pub struct HeaderRecord<'a> {
    // Record type including the leading `@`, e.g. `@HD`.
    rec_type: Vec<u8>,
    // Tags in insertion order as (tag name, value bytes).
    tags: Vec<(&'a [u8], Vec<u8>)>,
}

impl<'a> HeaderRecord<'a> {
    /// Create a new header record.
    /// See SAM format specification for possible record types.
    ///
    /// `rec_type` is given without the leading `@` (e.g. `b"HD"`). It is
    /// copied into the record, so the slice does not need to outlive it.
    pub fn new(rec_type: &[u8]) -> Self {
        HeaderRecord {
            rec_type: [&b"@"[..], rec_type].concat(),
            tags: Vec::new(),
        }
    }

    /// Add a new tag to the record.
    ///
    /// Returns `self` so that calls can be chained.
    ///
    /// # Arguments
    ///
    /// * `tag` - the tag identifier
    /// * `value` - the value. Can be any type convertible into a string. Preferably numbers or
    ///   strings.
    pub fn push_tag<V: ToString>(&mut self, tag: &'a [u8], value: V) -> &mut Self {
        self.tags.push((tag, value.to_string().into_bytes()));
        self
    }

    /// Serialize the record as `@TYPE` followed by one `\tTAG:VALUE` field
    /// per tag, in insertion order. No trailing newline is appended.
    fn to_bytes(&self) -> Vec<u8> {
        // Each tag contributes its name, its value, a tab and a colon.
        let tags_len: usize = self
            .tags
            .iter()
            .map(|(tag, value)| tag.len() + value.len() + 2)
            .sum();
        let mut out = Vec::with_capacity(self.rec_type.len() + tags_len);
        out.extend_from_slice(&self.rec_type);
        for (tag, value) in &self.tags {
            out.push(b'\t');
            out.extend_from_slice(tag);
            out.push(b':');
            out.extend_from_slice(value);
        }
        out
    }
}
159
#[cfg(test)]
mod tests {
    use super::HeaderRecord;

    /// `push_tag` has to accept integers and strings as values, both owned
    /// and behind a reference, and serialize them in insertion order.
    #[test]
    fn test_push_tag() {
        let text = String::from("x");
        let mut rec = HeaderRecord::new(b"HD");

        // Integer values: by value, then by reference.
        rec.push_tag(b"X1", 0).push_tag(b"X2", &0);

        // String values: `&str`, `&String`, and finally the owned `String`.
        rec.push_tag(b"X3", text.as_str()).push_tag(b"X4", &text);
        rec.push_tag(b"X5", text);

        assert_eq!(rec.to_bytes(), b"@HD\tX1:0\tX2:0\tX3:x\tX4:x\tX5:x");
    }
}