noodles_sam/io/
reader.rs

1//! SAM reader.
2
3mod builder;
4pub mod header;
5pub(crate) mod query;
6mod record;
7pub(crate) mod record_buf;
8mod record_bufs;
9
10use std::{
11    io::{self, BufRead},
12    iter,
13};
14
15use noodles_bgzf as bgzf;
16use noodles_core::Region;
17use noodles_csi::BinningIndex;
18
19pub(crate) use self::record::read_record;
20pub use self::{builder::Builder, record_bufs::RecordBufs};
21use self::{header::read_header, query::Query, record_buf::read_record_buf};
22use crate::{alignment::RecordBuf, header::ReferenceSequences, Header, Record};
23
24/// A SAM reader.
25///
26/// The SAM format is comprised of two parts: 1) a header and 2) a list of records.
27///
28/// Each header line is prefixed with an `@` (at sign). The header is optional and may be empty.
29///
30/// SAM records are line-based and follow directly after the header or the start of the file until
31/// EOF.
32///
33/// # Examples
34///
35/// ```no_run
36/// use noodles_sam as sam;
37///
38/// let mut reader = sam::io::reader::Builder::default().build_from_path("sample.sam")?;
39/// let header = reader.read_header()?;
40///
41/// for result in reader.records() {
42///     let record = result?;
43///     // ...
44/// }
45/// # Ok::<_, std::io::Error>(())
46/// ```
47#[derive(Debug)]
48pub struct Reader<R> {
49    inner: R,
50    buf: Vec<u8>,
51}
52
53impl<R> Reader<R> {
54    /// Returns a reference to the underlying reader.
55    ///
56    /// # Examples
57    ///
58    /// ```
59    /// use noodles_sam as sam;
60    /// let data = [];
61    /// let reader = sam::io::Reader::new(&data[..]);
62    /// assert!(reader.get_ref().is_empty());
63    /// ```
64    pub fn get_ref(&self) -> &R {
65        &self.inner
66    }
67
68    /// Returns a mutable reference to the underlying reader.
69    ///
70    /// # Examples
71    ///
72    /// ```
73    /// use noodles_sam as sam;
74    /// let data = [];
75    /// let mut reader = sam::io::Reader::new(&data[..]);
76    /// assert!(reader.get_mut().is_empty());
77    /// ```
78    pub fn get_mut(&mut self) -> &mut R {
79        &mut self.inner
80    }
81
82    /// Returns the underlying reader.
83    ///
84    /// # Examples
85    ///
86    /// ```
87    /// use noodles_sam as sam;
88    /// let data = [];
89    /// let reader = sam::io::Reader::new(&data[..]);
90    /// assert!(reader.into_inner().is_empty());
91    /// ```
92    pub fn into_inner(self) -> R {
93        self.inner
94    }
95}
96
97impl<R> Reader<R>
98where
99    R: BufRead,
100{
101    /// Creates a SAM reader.
102    ///
103    /// # Examples
104    ///
105    /// ```
106    /// # use std::io;
107    /// use noodles_sam as sam;
108    /// let reader = sam::io::Reader::new(io::empty());
109    /// ```
110    pub fn new(inner: R) -> Self {
111        Self::from(inner)
112    }
113
114    /// Returns a SAM header reader.
115    ///
116    /// This creates an adapter that reads at most the length of the header, i.e., all lines
117    /// prefixed with a `@` (at sign).
118    ///
119    /// It is more ergonomic to read and parse the header using [`Self::read_header`], but using
120    /// this adapter allows for control of how the header is read, e.g., to read the raw SAM
121    /// header.
122    ///
123    /// The position of the stream is expected to be at the start.
124    ///
125    /// # Examples
126    ///
127    /// ```
128    /// # use std::io::Read;
129    /// use noodles_sam as sam;
130    ///
131    /// let data = b"@HD\tVN:1.6
132    /// *\t4\t*\t0\t255\t*\t*\t0\t0\t*\t*
133    /// ";
134    ///
135    /// let mut reader = sam::io::Reader::new(&data[..]);
136    /// let mut header_reader = reader.header_reader();
137    ///
138    /// let mut raw_header = String::new();
139    /// header_reader.read_to_string(&mut raw_header)?;
140    ///
141    /// assert_eq!(raw_header, "@HD\tVN:1.6\n");
142    /// # Ok::<_, std::io::Error>(())
143    /// ```
144    pub fn header_reader(&mut self) -> header::Reader<&mut R> {
145        header::Reader::new(&mut self.inner)
146    }
147
148    /// Reads the SAM header.
149    ///
150    /// The position of the stream is expected to be at the start.
151    ///
152    /// The SAM header is optional, and if it is missing, an empty [`Header`] is returned.
153    ///
154    /// # Examples
155    ///
156    /// ```
157    /// # use std::io;
158    /// use noodles_sam as sam;
159    ///
160    /// let data = b"@HD\tVN:1.6
161    /// *\t4\t*\t0\t255\t*\t*\t0\t0\t*\t*
162    /// ";
163    ///
164    /// let mut reader = sam::io::Reader::new(&data[..]);
165    /// let actual = reader.read_header()?;
166    ///
167    /// let expected = sam::Header::builder()
168    ///     .set_header(Default::default())
169    ///     .build();
170    ///
171    /// assert_eq!(actual, expected);
172    /// # Ok::<(), io::Error>(())
173    /// ```
174    pub fn read_header(&mut self) -> io::Result<Header> {
175        read_header(&mut self.inner)
176    }
177
178    /// Reads a record into an alignment record buffer.
179    ///
180    /// This reads a line from the underlying stream until a newline is reached and parses that
181    /// line into the given record.
182    ///
183    /// The stream is expected to be directly after the header or at the start of another record.
184    ///
185    /// It is more ergonomic to read records using an iterator (see [`Self::records`] and
186    /// [`Self::query`]), but using this method directly allows reuse of a [`RecordBuf`].
187    ///
188    /// If successful, the number of bytes read is returned. If the number of bytes read is 0, the
189    /// stream reached EOF.
190    ///
191    /// # Examples
192    ///
193    /// ```
194    /// use noodles_sam::{self as sam, alignment::RecordBuf};
195    ///
196    /// let data = b"@HD\tVN:1.6
197    /// *\t4\t*\t0\t255\t*\t*\t0\t0\t*\t*
198    /// ";
199    ///
200    /// let mut reader = sam::io::Reader::new(&data[..]);
201    /// let header = reader.read_header()?;
202    ///
203    /// let mut record = RecordBuf::default();
204    /// reader.read_record_buf(&header, &mut record)?;
205    ///
206    /// assert_eq!(record, RecordBuf::default());
207    /// # Ok::<_, std::io::Error>(())
208    /// ```
209    pub fn read_record_buf(
210        &mut self,
211        header: &Header,
212        record: &mut RecordBuf,
213    ) -> io::Result<usize> {
214        read_record_buf(&mut self.inner, &mut self.buf, header, record)
215    }
216
217    /// Returns an iterator over alignment record buffers starting from the current stream
218    /// position.
219    ///
220    /// The stream is expected to be directly after the header or at the start of another record.
221    ///
222    /// # Examples
223    ///
224    /// ```
225    /// use noodles_sam as sam;
226    ///
227    /// let data = b"@HD\tVN:1.6
228    /// *\t4\t*\t0\t255\t*\t*\t0\t0\t*\t*
229    /// ";
230    ///
231    /// let mut reader = sam::io::Reader::new(&data[..]);
232    /// let header = reader.read_header()?;
233    ///
234    /// let mut records = reader.record_bufs(&header);
235    /// assert!(records.next().is_some());
236    /// assert!(records.next().is_none());
237    /// # Ok::<_, std::io::Error>(())
238    /// ```
239    pub fn record_bufs<'a>(&'a mut self, header: &'a Header) -> RecordBufs<'a, R> {
240        RecordBufs::new(self, header)
241    }
242
243    /// Reads a record.
244    ///
245    /// This reads SAM fields from the underlying stream into the given record's buffer until a
246    /// newline is reached. No fields are decoded, meaning the record is not necessarily valid.
247    /// However, the structure of the buffer is guaranteed to be record-like.
248    ///
249    /// The stream is expected to be directly after the header or at the start of another record.
250    ///
251    /// If successful, the number of bytes read is returned. If the number of bytes read is 0, the
252    /// stream reached EOF.
253    ///
254    /// # Examples
255    ///
256    /// ```
257    /// # use std::io;
258    /// use noodles_sam as sam;
259    ///
260    /// let data = b"@HD\tVN:1.6
261    /// *\t4\t*\t0\t255\t*\t*\t0\t0\t*\t*
262    /// ";
263    ///
264    /// let mut reader = sam::io::Reader::new(&data[..]);
265    /// reader.read_header()?;
266    ///
267    /// let mut record = sam::Record::default();
268    /// reader.read_record(&mut record)?;
269    /// # Ok::<(), io::Error>(())
270    /// ```
271    pub fn read_record(&mut self, record: &mut Record) -> io::Result<usize> {
272        read_record(&mut self.inner, record)
273    }
274
275    /// Returns an iterator over records.
276    ///
277    /// The stream is expected to be directly after the reference sequences or at the start of
278    /// another record.
279    ///
280    /// # Examples
281    ///
282    /// ```
283    /// use noodles_sam as sam;
284    ///
285    /// let data = b"@HD\tVN:1.6
286    /// *\t4\t*\t0\t255\t*\t*\t0\t0\t*\t*
287    /// ";
288    ///
289    /// let mut reader = sam::io::Reader::new(&data[..]);
290    /// reader.read_header()?;
291    ///
292    /// for result in reader.records() {
293    ///     let record = result?;
294    ///     // ...
295    /// }
296    /// # Ok::<_, std::io::Error>(())
297    /// ```
298    pub fn records(&mut self) -> impl Iterator<Item = io::Result<Record>> + '_ {
299        let mut record = Record::default();
300
301        iter::from_fn(move || match self.read_record(&mut record) {
302            Ok(0) => None,
303            Ok(_) => Some(Ok(record.clone())),
304            Err(e) => Some(Err(e)),
305        })
306    }
307}
308
309impl<R> Reader<R>
310where
311    R: bgzf::io::BufRead + bgzf::io::Seek,
312{
313    // Seeks to the first record by setting the cursor to the beginning of the stream and
314    // (re)reading the header.
315    fn seek_to_first_record(&mut self) -> io::Result<bgzf::VirtualPosition> {
316        self.get_mut()
317            .seek_to_virtual_position(bgzf::VirtualPosition::default())?;
318
319        self.read_header()?;
320
321        Ok(self.get_ref().virtual_position())
322    }
323
324    /// Returns an iterator over records that intersect the given region.
325    ///
326    /// To query for unmapped records, use [`Self::query_unmapped`].
327    ///
328    /// # Examples
329    ///
330    /// ```no_run
331    /// # use std::{fs::File, io};
332    /// use noodles_bgzf as bgzf;
333    /// use noodles_csi as csi;
334    /// use noodles_sam as sam;
335    ///
336    /// let mut reader = File::open("sample.sam.gz")
337    ///     .map(bgzf::Reader::new)
338    ///     .map(sam::io::Reader::new)?;
339    ///
340    /// let header = reader.read_header()?;
341    ///
342    /// let index = csi::fs::read("sample.sam.gz.csi")?;
343    /// let region = "sq0:8-13".parse()?;
344    /// let query = reader.query(&header, &index, &region)?;
345    ///
346    /// for result in query {
347    ///     let record = result?;
348    ///     // ...
349    /// }
350    /// # Ok::<(), Box<dyn std::error::Error>>(())
351    /// ```
352    pub fn query<'r, 'h: 'r, I>(
353        &'r mut self,
354        header: &'h Header,
355        index: &I,
356        region: &Region,
357    ) -> io::Result<impl Iterator<Item = io::Result<Record>> + 'r>
358    where
359        I: BinningIndex,
360    {
361        let reference_sequence_id = resolve_region(header.reference_sequences(), region)?;
362        let chunks = index.query(reference_sequence_id, region.interval())?;
363
364        Ok(Query::new(
365            self.get_mut(),
366            chunks,
367            header,
368            reference_sequence_id,
369            region.interval(),
370        ))
371    }
372
373    /// Returns an iterator of unmapped records after querying for the unmapped region.
374    ///
375    /// ```no_run
376    /// # use std::{fs::File, io};
377    /// use noodles_bgzf as bgzf;
378    /// use noodles_csi as csi;
379    /// use noodles_sam as sam;
380    ///
381    /// let mut reader = File::open("sample.sam.gz")
382    ///     .map(bgzf::Reader::new)
383    ///     .map(sam::io::Reader::new)?;
384    ///
385    /// reader.read_header()?;
386    ///
387    /// let index = csi::fs::read("sample.sam.gz.csi")?;
388    /// let query = reader.query_unmapped(&index)?;
389    ///
390    /// for result in query {
391    ///     let record = result?;
392    ///     // ...
393    /// }
394    /// # Ok::<_, io::Error>(())
395    /// ```
396    pub fn query_unmapped<I>(
397        &mut self,
398        index: &I,
399    ) -> io::Result<impl Iterator<Item = io::Result<Record>> + '_>
400    where
401        I: BinningIndex,
402    {
403        if let Some(pos) = index.last_first_record_start_position() {
404            self.get_mut().seek_to_virtual_position(pos)?;
405        } else {
406            self.seek_to_first_record()?;
407        }
408
409        let mut record = Record::default();
410
411        Ok(iter::from_fn(move || loop {
412            match self.read_record(&mut record) {
413                Ok(0) => return None,
414                Ok(_) => {
415                    let result = record.flags().map(|flags| flags.is_unmapped());
416
417                    match result {
418                        Ok(true) => return Some(Ok(record.clone())),
419                        Ok(false) => {}
420                        Err(e) => return Some(Err(e)),
421                    }
422                }
423                Err(e) => return Some(Err(e)),
424            }
425        }))
426    }
427}
428
429impl<R> From<R> for Reader<R>
430where
431    R: BufRead,
432{
433    fn from(inner: R) -> Self {
434        Self {
435            inner,
436            buf: Vec::new(),
437        }
438    }
439}
440
441impl<R> crate::alignment::io::Read<R> for Reader<R>
442where
443    R: BufRead,
444{
445    fn read_alignment_header(&mut self) -> io::Result<Header> {
446        self.read_header()
447    }
448
449    fn alignment_records<'a>(
450        &'a mut self,
451        _header: &'a Header,
452    ) -> Box<dyn Iterator<Item = io::Result<Box<dyn crate::alignment::Record>>> + 'a> {
453        Box::new(self.records().map(|result| {
454            result.map(|record| Box::new(record) as Box<dyn crate::alignment::Record>)
455        }))
456    }
457}
458
459fn read_line<R>(reader: &mut R, buf: &mut Vec<u8>) -> io::Result<usize>
460where
461    R: BufRead,
462{
463    const LINE_FEED: u8 = b'\n';
464    const CARRIAGE_RETURN: u8 = b'\r';
465
466    match reader.read_until(LINE_FEED, buf)? {
467        0 => Ok(0),
468        n => {
469            if buf.ends_with(&[LINE_FEED]) {
470                buf.pop();
471
472                if buf.ends_with(&[CARRIAGE_RETURN]) {
473                    buf.pop();
474                }
475            }
476
477            Ok(n)
478        }
479    }
480}
481
482pub(crate) fn resolve_region(
483    reference_sequences: &ReferenceSequences,
484    region: &Region,
485) -> io::Result<usize> {
486    reference_sequences
487        .get_index_of(region.name())
488        .ok_or_else(|| {
489            io::Error::new(
490                io::ErrorKind::InvalidInput,
491                format!(
492                    "region reference sequence does not exist in reference sequences: {region:?}"
493                ),
494            )
495        })
496}
497
498#[cfg(test)]
499mod tests {
500    use super::*;
501
502    #[test]
503    fn test_read_line() -> io::Result<()> {
504        fn t(buf: &mut Vec<u8>, mut reader: &[u8], expected: &[u8]) -> io::Result<()> {
505            buf.clear();
506            read_line(&mut reader, buf)?;
507            assert_eq!(buf, expected);
508            Ok(())
509        }
510
511        let mut buf = Vec::new();
512
513        t(&mut buf, b"noodles\n", b"noodles")?;
514        t(&mut buf, b"noodles\r\n", b"noodles")?;
515        t(&mut buf, b"noodles", b"noodles")?;
516
517        Ok(())
518    }
519}