noodles_sam/io/reader.rs
1//! SAM reader.
2
3mod builder;
4pub mod header;
5pub(crate) mod query;
6mod record;
7pub(crate) mod record_buf;
8mod record_bufs;
9
10use std::{
11 io::{self, BufRead},
12 iter,
13};
14
15use noodles_bgzf as bgzf;
16use noodles_core::Region;
17use noodles_csi::BinningIndex;
18
19pub(crate) use self::record::read_record;
20pub use self::{builder::Builder, record_bufs::RecordBufs};
21use self::{header::read_header, query::Query, record_buf::read_record_buf};
22use crate::{alignment::RecordBuf, header::ReferenceSequences, Header, Record};
23
24/// A SAM reader.
25///
26/// The SAM format is comprised of two parts: 1) a header and 2) a list of records.
27///
28/// Each header line is prefixed with an `@` (at sign). The header is optional and may be empty.
29///
30/// SAM records are line-based and follow directly after the header or the start of the file until
31/// EOF.
32///
33/// # Examples
34///
35/// ```no_run
36/// use noodles_sam as sam;
37///
38/// let mut reader = sam::io::reader::Builder::default().build_from_path("sample.sam")?;
39/// let header = reader.read_header()?;
40///
41/// for result in reader.records() {
42/// let record = result?;
43/// // ...
44/// }
45/// # Ok::<_, std::io::Error>(())
46/// ```
47#[derive(Debug)]
48pub struct Reader<R> {
49 inner: R,
50 buf: Vec<u8>,
51}
52
53impl<R> Reader<R> {
54 /// Returns a reference to the underlying reader.
55 ///
56 /// # Examples
57 ///
58 /// ```
59 /// use noodles_sam as sam;
60 /// let data = [];
61 /// let reader = sam::io::Reader::new(&data[..]);
62 /// assert!(reader.get_ref().is_empty());
63 /// ```
64 pub fn get_ref(&self) -> &R {
65 &self.inner
66 }
67
68 /// Returns a mutable reference to the underlying reader.
69 ///
70 /// # Examples
71 ///
72 /// ```
73 /// use noodles_sam as sam;
74 /// let data = [];
75 /// let mut reader = sam::io::Reader::new(&data[..]);
76 /// assert!(reader.get_mut().is_empty());
77 /// ```
78 pub fn get_mut(&mut self) -> &mut R {
79 &mut self.inner
80 }
81
82 /// Returns the underlying reader.
83 ///
84 /// # Examples
85 ///
86 /// ```
87 /// use noodles_sam as sam;
88 /// let data = [];
89 /// let reader = sam::io::Reader::new(&data[..]);
90 /// assert!(reader.into_inner().is_empty());
91 /// ```
92 pub fn into_inner(self) -> R {
93 self.inner
94 }
95}
96
97impl<R> Reader<R>
98where
99 R: BufRead,
100{
101 /// Creates a SAM reader.
102 ///
103 /// # Examples
104 ///
105 /// ```
106 /// # use std::io;
107 /// use noodles_sam as sam;
108 /// let reader = sam::io::Reader::new(io::empty());
109 /// ```
110 pub fn new(inner: R) -> Self {
111 Self::from(inner)
112 }
113
114 /// Returns a SAM header reader.
115 ///
116 /// This creates an adapter that reads at most the length of the header, i.e., all lines
117 /// prefixed with a `@` (at sign).
118 ///
119 /// It is more ergonomic to read and parse the header using [`Self::read_header`], but using
120 /// this adapter allows for control of how the header is read, e.g., to read the raw SAM
121 /// header.
122 ///
123 /// The position of the stream is expected to be at the start.
124 ///
125 /// # Examples
126 ///
127 /// ```
128 /// # use std::io::Read;
129 /// use noodles_sam as sam;
130 ///
131 /// let data = b"@HD\tVN:1.6
132 /// *\t4\t*\t0\t255\t*\t*\t0\t0\t*\t*
133 /// ";
134 ///
135 /// let mut reader = sam::io::Reader::new(&data[..]);
136 /// let mut header_reader = reader.header_reader();
137 ///
138 /// let mut raw_header = String::new();
139 /// header_reader.read_to_string(&mut raw_header)?;
140 ///
141 /// assert_eq!(raw_header, "@HD\tVN:1.6\n");
142 /// # Ok::<_, std::io::Error>(())
143 /// ```
144 pub fn header_reader(&mut self) -> header::Reader<&mut R> {
145 header::Reader::new(&mut self.inner)
146 }
147
148 /// Reads the SAM header.
149 ///
150 /// The position of the stream is expected to be at the start.
151 ///
152 /// The SAM header is optional, and if it is missing, an empty [`Header`] is returned.
153 ///
154 /// # Examples
155 ///
156 /// ```
157 /// # use std::io;
158 /// use noodles_sam as sam;
159 ///
160 /// let data = b"@HD\tVN:1.6
161 /// *\t4\t*\t0\t255\t*\t*\t0\t0\t*\t*
162 /// ";
163 ///
164 /// let mut reader = sam::io::Reader::new(&data[..]);
165 /// let actual = reader.read_header()?;
166 ///
167 /// let expected = sam::Header::builder()
168 /// .set_header(Default::default())
169 /// .build();
170 ///
171 /// assert_eq!(actual, expected);
172 /// # Ok::<(), io::Error>(())
173 /// ```
174 pub fn read_header(&mut self) -> io::Result<Header> {
175 read_header(&mut self.inner)
176 }
177
178 /// Reads a record into an alignment record buffer.
179 ///
180 /// This reads a line from the underlying stream until a newline is reached and parses that
181 /// line into the given record.
182 ///
183 /// The stream is expected to be directly after the header or at the start of another record.
184 ///
185 /// It is more ergonomic to read records using an iterator (see [`Self::records`] and
186 /// [`Self::query`]), but using this method directly allows reuse of a [`RecordBuf`].
187 ///
188 /// If successful, the number of bytes read is returned. If the number of bytes read is 0, the
189 /// stream reached EOF.
190 ///
191 /// # Examples
192 ///
193 /// ```
194 /// use noodles_sam::{self as sam, alignment::RecordBuf};
195 ///
196 /// let data = b"@HD\tVN:1.6
197 /// *\t4\t*\t0\t255\t*\t*\t0\t0\t*\t*
198 /// ";
199 ///
200 /// let mut reader = sam::io::Reader::new(&data[..]);
201 /// let header = reader.read_header()?;
202 ///
203 /// let mut record = RecordBuf::default();
204 /// reader.read_record_buf(&header, &mut record)?;
205 ///
206 /// assert_eq!(record, RecordBuf::default());
207 /// # Ok::<_, std::io::Error>(())
208 /// ```
209 pub fn read_record_buf(
210 &mut self,
211 header: &Header,
212 record: &mut RecordBuf,
213 ) -> io::Result<usize> {
214 read_record_buf(&mut self.inner, &mut self.buf, header, record)
215 }
216
217 /// Returns an iterator over alignment record buffers starting from the current stream
218 /// position.
219 ///
220 /// The stream is expected to be directly after the header or at the start of another record.
221 ///
222 /// # Examples
223 ///
224 /// ```
225 /// use noodles_sam as sam;
226 ///
227 /// let data = b"@HD\tVN:1.6
228 /// *\t4\t*\t0\t255\t*\t*\t0\t0\t*\t*
229 /// ";
230 ///
231 /// let mut reader = sam::io::Reader::new(&data[..]);
232 /// let header = reader.read_header()?;
233 ///
234 /// let mut records = reader.record_bufs(&header);
235 /// assert!(records.next().is_some());
236 /// assert!(records.next().is_none());
237 /// # Ok::<_, std::io::Error>(())
238 /// ```
239 pub fn record_bufs<'a>(&'a mut self, header: &'a Header) -> RecordBufs<'a, R> {
240 RecordBufs::new(self, header)
241 }
242
243 /// Reads a record.
244 ///
245 /// This reads SAM fields from the underlying stream into the given record's buffer until a
246 /// newline is reached. No fields are decoded, meaning the record is not necessarily valid.
247 /// However, the structure of the buffer is guaranteed to be record-like.
248 ///
249 /// The stream is expected to be directly after the header or at the start of another record.
250 ///
251 /// If successful, the number of bytes read is returned. If the number of bytes read is 0, the
252 /// stream reached EOF.
253 ///
254 /// # Examples
255 ///
256 /// ```
257 /// # use std::io;
258 /// use noodles_sam as sam;
259 ///
260 /// let data = b"@HD\tVN:1.6
261 /// *\t4\t*\t0\t255\t*\t*\t0\t0\t*\t*
262 /// ";
263 ///
264 /// let mut reader = sam::io::Reader::new(&data[..]);
265 /// reader.read_header()?;
266 ///
267 /// let mut record = sam::Record::default();
268 /// reader.read_record(&mut record)?;
269 /// # Ok::<(), io::Error>(())
270 /// ```
271 pub fn read_record(&mut self, record: &mut Record) -> io::Result<usize> {
272 read_record(&mut self.inner, record)
273 }
274
275 /// Returns an iterator over records.
276 ///
277 /// The stream is expected to be directly after the reference sequences or at the start of
278 /// another record.
279 ///
280 /// # Examples
281 ///
282 /// ```
283 /// use noodles_sam as sam;
284 ///
285 /// let data = b"@HD\tVN:1.6
286 /// *\t4\t*\t0\t255\t*\t*\t0\t0\t*\t*
287 /// ";
288 ///
289 /// let mut reader = sam::io::Reader::new(&data[..]);
290 /// reader.read_header()?;
291 ///
292 /// for result in reader.records() {
293 /// let record = result?;
294 /// // ...
295 /// }
296 /// # Ok::<_, std::io::Error>(())
297 /// ```
298 pub fn records(&mut self) -> impl Iterator<Item = io::Result<Record>> + '_ {
299 let mut record = Record::default();
300
301 iter::from_fn(move || match self.read_record(&mut record) {
302 Ok(0) => None,
303 Ok(_) => Some(Ok(record.clone())),
304 Err(e) => Some(Err(e)),
305 })
306 }
307}
308
309impl<R> Reader<R>
310where
311 R: bgzf::io::BufRead + bgzf::io::Seek,
312{
313 // Seeks to the first record by setting the cursor to the beginning of the stream and
314 // (re)reading the header.
315 fn seek_to_first_record(&mut self) -> io::Result<bgzf::VirtualPosition> {
316 self.get_mut()
317 .seek_to_virtual_position(bgzf::VirtualPosition::default())?;
318
319 self.read_header()?;
320
321 Ok(self.get_ref().virtual_position())
322 }
323
324 /// Returns an iterator over records that intersect the given region.
325 ///
326 /// To query for unmapped records, use [`Self::query_unmapped`].
327 ///
328 /// # Examples
329 ///
330 /// ```no_run
331 /// # use std::{fs::File, io};
332 /// use noodles_bgzf as bgzf;
333 /// use noodles_csi as csi;
334 /// use noodles_sam as sam;
335 ///
336 /// let mut reader = File::open("sample.sam.gz")
337 /// .map(bgzf::Reader::new)
338 /// .map(sam::io::Reader::new)?;
339 ///
340 /// let header = reader.read_header()?;
341 ///
342 /// let index = csi::fs::read("sample.sam.gz.csi")?;
343 /// let region = "sq0:8-13".parse()?;
344 /// let query = reader.query(&header, &index, ®ion)?;
345 ///
346 /// for result in query {
347 /// let record = result?;
348 /// // ...
349 /// }
350 /// # Ok::<(), Box<dyn std::error::Error>>(())
351 /// ```
352 pub fn query<'r, 'h: 'r, I>(
353 &'r mut self,
354 header: &'h Header,
355 index: &I,
356 region: &Region,
357 ) -> io::Result<impl Iterator<Item = io::Result<Record>> + 'r>
358 where
359 I: BinningIndex,
360 {
361 let reference_sequence_id = resolve_region(header.reference_sequences(), region)?;
362 let chunks = index.query(reference_sequence_id, region.interval())?;
363
364 Ok(Query::new(
365 self.get_mut(),
366 chunks,
367 header,
368 reference_sequence_id,
369 region.interval(),
370 ))
371 }
372
373 /// Returns an iterator of unmapped records after querying for the unmapped region.
374 ///
375 /// ```no_run
376 /// # use std::{fs::File, io};
377 /// use noodles_bgzf as bgzf;
378 /// use noodles_csi as csi;
379 /// use noodles_sam as sam;
380 ///
381 /// let mut reader = File::open("sample.sam.gz")
382 /// .map(bgzf::Reader::new)
383 /// .map(sam::io::Reader::new)?;
384 ///
385 /// reader.read_header()?;
386 ///
387 /// let index = csi::fs::read("sample.sam.gz.csi")?;
388 /// let query = reader.query_unmapped(&index)?;
389 ///
390 /// for result in query {
391 /// let record = result?;
392 /// // ...
393 /// }
394 /// # Ok::<_, io::Error>(())
395 /// ```
396 pub fn query_unmapped<I>(
397 &mut self,
398 index: &I,
399 ) -> io::Result<impl Iterator<Item = io::Result<Record>> + '_>
400 where
401 I: BinningIndex,
402 {
403 if let Some(pos) = index.last_first_record_start_position() {
404 self.get_mut().seek_to_virtual_position(pos)?;
405 } else {
406 self.seek_to_first_record()?;
407 }
408
409 let mut record = Record::default();
410
411 Ok(iter::from_fn(move || loop {
412 match self.read_record(&mut record) {
413 Ok(0) => return None,
414 Ok(_) => {
415 let result = record.flags().map(|flags| flags.is_unmapped());
416
417 match result {
418 Ok(true) => return Some(Ok(record.clone())),
419 Ok(false) => {}
420 Err(e) => return Some(Err(e)),
421 }
422 }
423 Err(e) => return Some(Err(e)),
424 }
425 }))
426 }
427}
428
429impl<R> From<R> for Reader<R>
430where
431 R: BufRead,
432{
433 fn from(inner: R) -> Self {
434 Self {
435 inner,
436 buf: Vec::new(),
437 }
438 }
439}
440
441impl<R> crate::alignment::io::Read<R> for Reader<R>
442where
443 R: BufRead,
444{
445 fn read_alignment_header(&mut self) -> io::Result<Header> {
446 self.read_header()
447 }
448
449 fn alignment_records<'a>(
450 &'a mut self,
451 _header: &'a Header,
452 ) -> Box<dyn Iterator<Item = io::Result<Box<dyn crate::alignment::Record>>> + 'a> {
453 Box::new(self.records().map(|result| {
454 result.map(|record| Box::new(record) as Box<dyn crate::alignment::Record>)
455 }))
456 }
457}
458
459fn read_line<R>(reader: &mut R, buf: &mut Vec<u8>) -> io::Result<usize>
460where
461 R: BufRead,
462{
463 const LINE_FEED: u8 = b'\n';
464 const CARRIAGE_RETURN: u8 = b'\r';
465
466 match reader.read_until(LINE_FEED, buf)? {
467 0 => Ok(0),
468 n => {
469 if buf.ends_with(&[LINE_FEED]) {
470 buf.pop();
471
472 if buf.ends_with(&[CARRIAGE_RETURN]) {
473 buf.pop();
474 }
475 }
476
477 Ok(n)
478 }
479 }
480}
481
482pub(crate) fn resolve_region(
483 reference_sequences: &ReferenceSequences,
484 region: &Region,
485) -> io::Result<usize> {
486 reference_sequences
487 .get_index_of(region.name())
488 .ok_or_else(|| {
489 io::Error::new(
490 io::ErrorKind::InvalidInput,
491 format!(
492 "region reference sequence does not exist in reference sequences: {region:?}"
493 ),
494 )
495 })
496}
497
498#[cfg(test)]
499mod tests {
500 use super::*;
501
502 #[test]
503 fn test_read_line() -> io::Result<()> {
504 fn t(buf: &mut Vec<u8>, mut reader: &[u8], expected: &[u8]) -> io::Result<()> {
505 buf.clear();
506 read_line(&mut reader, buf)?;
507 assert_eq!(buf, expected);
508 Ok(())
509 }
510
511 let mut buf = Vec::new();
512
513 t(&mut buf, b"noodles\n", b"noodles")?;
514 t(&mut buf, b"noodles\r\n", b"noodles")?;
515 t(&mut buf, b"noodles", b"noodles")?;
516
517 Ok(())
518 }
519}