noodles_fasta/io/
reader.rs

1//! FASTA reader and iterators.
2
3mod builder;
4mod definition;
5mod records;
6pub mod sequence;
7
8pub use self::{builder::Builder, records::Records};
9
10use std::io::{self, BufRead, Seek, SeekFrom};
11
12use noodles_core::{Position, Region};
13
14use self::definition::read_definition;
15use crate::{fai, Record};
16
/// The byte (`>`) at the start of a definition line.
17pub(crate) const DEFINITION_PREFIX: u8 = b'>';
18
/// A FASTA reader.
///
/// This wraps an underlying reader (`inner`); the reading methods are available when `R`
/// implements `BufRead`, and region queries additionally require `Seek`.
#[derive(Debug)]
pub struct Reader<R> {
    // The wrapped stream all reads are delegated to.
    inner: R,
}
23
24impl<R> Reader<R>
25where
26    R: BufRead,
27{
28    /// Creates a FASTA reader.
29    ///
30    /// # Examples
31    ///
32    /// ```
33    /// use noodles_fasta as fasta;
34    /// let data = b">sq0\nACGT\n>sq1\nNNNN\nNNNN\nNN\n";
35    /// let mut reader = fasta::io::Reader::new(&data[..]);
36    /// ```
37    pub fn new(inner: R) -> Self {
38        Self { inner }
39    }
40
41    /// Returns a reference to the underlying reader.
42    ///
43    /// # Examples
44    ///
45    /// ```
46    /// use noodles_fasta as fasta;
47    /// let reader = fasta::io::Reader::new(&[][..]);
48    /// assert!(reader.get_ref().is_empty());
49    /// ```
50    pub fn get_ref(&self) -> &R {
51        &self.inner
52    }
53
54    /// Returns a mutable reference to the underlying reader.
55    ///
56    /// # Examples
57    ///
58    /// ```
59    /// use noodles_fasta as fasta;
60    /// let mut reader = fasta::io::Reader::new(&[][..]);
61    /// assert!(reader.get_mut().is_empty());
62    /// ```
63    pub fn get_mut(&mut self) -> &mut R {
64        &mut self.inner
65    }
66
67    /// Returns the underlying reader.
68    ///
69    /// # Examples
70    ///
71    /// ```
72    /// use noodles_fasta as fasta;
73    /// let reader = fasta::io::Reader::new(&[][..]);
74    /// assert!(reader.into_inner().is_empty());
75    /// ```
76    pub fn into_inner(self) -> R {
77        self.inner
78    }
79
80    /// Reads a raw definition line.
81    ///
82    /// The given buffer will not include the trailing newline. It can subsequently be parsed as a
83    /// [`crate::record::Definition`].
84    ///
85    /// The position of the stream is expected to be at the start or at the start of another
86    /// definition.
87    ///
88    /// If successful, this returns the number of bytes read from the stream. If the number of
89    /// bytes read is 0, the stream reached EOF.
90    ///
91    /// # Examples
92    ///
93    /// ```
94    /// # use std::io;
95    /// use noodles_fasta as fasta;
96    ///
97    /// let data = b">sq0\nACGT\n>sq1\nNNNN\nNNNN\nNN\n";
98    /// let mut reader = fasta::io::Reader::new(&data[..]);
99    ///
100    /// let mut buf = String::new();
101    /// reader.read_definition(&mut buf)?;
102    ///
103    /// assert_eq!(buf, ">sq0");
104    /// # Ok::<(), io::Error>(())
105    /// ```
106    pub fn read_definition(&mut self, buf: &mut String) -> io::Result<usize> {
107        read_definition(&mut self.inner, buf)
108    }
109
110    /// Reads a sequence.
111    ///
112    /// The given buffer consumes a sequence without newlines until another definition or EOF is
113    /// reached.
114    ///
115    /// The position of the stream is expected to be at the start of a sequence, which is directly
116    /// after a definition.
117    ///
118    /// If successful, this returns the number of bases read from the stream. If the number of
119    /// bases read is 0, the stream reached EOF (though this case is likely an error).
120    ///
121    /// # Examples
122    ///
123    /// ```
124    /// # use std::io;
125    /// use noodles_fasta as fasta;
126    ///
127    /// let data = b">sq0\nACGT\n>sq1\nNNNN\nNNNN\nNN\n";
128    /// let mut reader = fasta::io::Reader::new(&data[..]);
129    /// reader.read_definition(&mut String::new())?;
130    ///
131    /// let mut buf = Vec::new();
132    /// reader.read_sequence(&mut buf)?;
133    ///
134    /// assert_eq!(buf, b"ACGT");
135    /// # Ok::<(), io::Error>(())
136    /// ```
137    pub fn read_sequence(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
138        use self::sequence::read_sequence;
139        read_sequence(&mut self.inner, buf)
140    }
141
142    /// Returns a sequence reader.
143    ///
144    /// A [`sequence::Reader`] can be used for lower-level reading of the raw sequence.
145    ///
146    /// The position of the stream is expected to be at the start of a sequence to read a full
147    /// sequence or within a sequence to read a partial one.
148    ///
149    /// # Examples
150    ///
151    /// ```
152    /// # use std::io::{self, Read};
153    /// use noodles_fasta as fasta;
154    ///
155    /// let data = b">sq0\nACGT\n>sq1\nNNNN\nNNNN\nNN\n";
156    /// let mut reader = fasta::io::Reader::new(&data[..]);
157    /// reader.read_definition(&mut String::new())?;
158    ///
159    /// let mut sequence_reader = reader.sequence_reader();
160    /// let mut buf = vec![0; 2];
161    /// sequence_reader.read_exact(&mut buf)?;
162    ///
163    /// assert_eq!(buf, b"AC");
164    /// # Ok::<(), io::Error>(())
165    /// ```
166    pub fn sequence_reader(&mut self) -> sequence::Reader<'_, R> {
167        sequence::Reader::new(self.get_mut())
168    }
169
170    /// Returns an iterator over records starting from the current stream position.
171    ///
172    /// The position of the stream is expected to be at the start or at the start of another
173    /// definition.
174    ///
175    /// ```
176    /// # use std::io;
177    /// use noodles_fasta::{self as fasta, record::{Definition, Sequence}};
178    ///
179    /// let data = b">sq0\nACGT\n>sq1\nNNNN\nNNNN\nNN\n";
180    /// let mut reader = fasta::io::Reader::new(&data[..]);
181    ///
182    /// let mut records = reader.records();
183    ///
184    /// assert_eq!(records.next().transpose()?, Some(fasta::Record::new(
185    ///     Definition::new("sq0", None),
186    ///     Sequence::from(b"ACGT".to_vec()),
187    /// )));
188    ///
189    /// assert_eq!(records.next().transpose()?, Some(fasta::Record::new(
190    ///     Definition::new("sq1", None),
191    ///     Sequence::from(b"NNNNNNNNNN".to_vec()),
192    /// )));
193    ///
194    /// assert!(records.next().is_none());
195    /// # Ok::<(), io::Error>(())
196    /// ```
197    pub fn records(&mut self) -> Records<'_, R> {
198        Records::new(self)
199    }
200}
201
202impl<R> Reader<R>
203where
204    R: BufRead + Seek,
205{
206    /// Returns a record of the given region.
207    ///
208    /// # Examples
209    ///
210    /// ```
211    /// # use std::io::Cursor;
212    /// use noodles_core::Region;
213    /// use noodles_fasta::{self as fasta, fai, record::{Definition, Sequence}};
214    ///
215    /// let data = b">sq0\nNNNN\n>sq1\nACGT\n>sq2\nNNNN\n";
216    /// let index = fai::Index::from(vec![
217    ///     fai::Record::new("sq0", 4, 5, 4, 5),
218    ///     fai::Record::new("sq1", 4, 15, 4, 5),
219    ///     fai::Record::new("sq2", 4, 25, 4, 5),
220    /// ]);
221    ///
222    /// let mut reader = fasta::io::Reader::new(Cursor::new(data));
223    ///
224    /// let region = Region::new("sq1", ..);
225    /// let record = reader.query(&index, &region)?;
226    /// assert_eq!(record, fasta::Record::new(
227    ///     Definition::new("sq1", None),
228    ///     Sequence::from(b"ACGT".to_vec()),
229    /// ));
230    ///
231    /// let region = "sq1:2-3".parse()?;
232    /// let record = reader.query(&index, &region)?;
233    /// assert_eq!(record, fasta::Record::new(
234    ///     Definition::new("sq1:2-3", None),
235    ///     Sequence::from(b"CG".to_vec()),
236    /// ));
237    /// # Ok::<(), Box<dyn std::error::Error>>(())
238    /// ```
239    pub fn query(&mut self, index: &fai::Index, region: &Region) -> io::Result<Record> {
240        use self::sequence::read_sequence_limit;
241        use crate::record::{Definition, Sequence};
242
243        let pos = index.query(region)?;
244        self.get_mut().seek(SeekFrom::Start(pos))?;
245
246        let definition = Definition::new(region.to_string(), None);
247
248        let interval = region.interval();
249        let start = usize::from(interval.start().unwrap_or(Position::MIN));
250        let end = usize::from(interval.end().unwrap_or(Position::MAX));
251        let len = end - start + 1;
252
253        let mut raw_sequence = Vec::new();
254        read_sequence_limit(&mut self.inner, len, &mut raw_sequence)?;
255
256        let sequence = Sequence::from(raw_sequence);
257
258        Ok(Record::new(definition, sequence))
259    }
260}
261
// Reads all bytes until a line feed ('\n') or EOF is reached.
//
// The line is appended to `buf` without its trailing newline ('\n' or '\r\n').
//
// Returns the number of bytes read from the stream, which still counts the stripped line
// terminator. A count of 0 means EOF; in that case `buf` is left untouched.
pub(crate) fn read_line<R>(reader: &mut R, buf: &mut String) -> io::Result<usize>
where
    R: BufRead,
{
    const LINE_FEED: char = '\n';
    const CARRIAGE_RETURN: char = '\r';

    let n = reader.read_line(buf)?;

    // Only strip when something was actually read: at EOF nothing was appended, so a newline
    // at the end of `buf` would belong to the caller's earlier content.
    if n > 0 && buf.ends_with(LINE_FEED) {
        buf.pop();

        if buf.ends_with(CARRIAGE_RETURN) {
            buf.pop();
        }
    }

    Ok(n)
}
288
#[cfg(test)]
mod tests {
    use super::*;

    // The raw definition line keeps its `>` prefix but not the newline.
    #[test]
    fn test_read_definition() -> io::Result<()> {
        let mut reader = Reader::new(&b">sq0\nACGT\n"[..]);

        let mut definition = String::new();
        reader.read_definition(&mut definition)?;
        assert_eq!(definition, ">sq0");

        Ok(())
    }

    // LF-terminated, CRLF-terminated, and unterminated lines all yield the bare text.
    #[test]
    fn test_read_line() -> io::Result<()> {
        const INPUTS: [&[u8]; 3] = [b"noodles\n", b"noodles\r\n", b"noodles"];

        let mut buf = String::new();

        for input in INPUTS.iter() {
            let mut reader: &[u8] = input;
            buf.clear();
            read_line(&mut reader, &mut buf)?;
            assert_eq!(buf, "noodles");
        }

        Ok(())
    }
}