noodles_fasta/io/
reader.rs

1//! FASTA reader and iterators.
2
3mod builder;
4mod definition;
5mod records;
6pub mod sequence;
7
8pub use self::{builder::Builder, records::Records};
9
10use std::io::{self, BufRead, Seek, SeekFrom};
11
12use noodles_core::{Position, Region};
13
14use self::definition::read_definition;
15use crate::{fai, Record};
16
// The byte that opens a definition line (`>`), as in the `>sq0` lines used throughout the
// examples in this module.
17pub(crate) const DEFINITION_PREFIX: u8 = b'>';
18
/// A FASTA reader.
///
/// This is a thin wrapper around an underlying reader `R`. Reading methods are available when
/// `R` implements [`BufRead`]; region queries additionally require [`Seek`].
#[derive(Debug)]
pub struct Reader<R> {
    // The wrapped source all FASTA data is read from.
    inner: R,
}
23
24impl<R> Reader<R> {
25    /// Returns a reference to the underlying reader.
26    ///
27    /// # Examples
28    ///
29    /// ```
30    /// # use std::io;
31    /// use noodles_fasta as fasta;
32    /// let reader = fasta::io::Reader::new(io::empty());
33    /// let _inner = reader.get_ref();
34    /// ```
35    pub fn get_ref(&self) -> &R {
36        &self.inner
37    }
38
39    /// Returns a mutable reference to the underlying reader.
40    ///
41    /// # Examples
42    ///
43    /// ```
44    /// # use std::io;
45    /// use noodles_fasta as fasta;
46    /// let mut reader = fasta::io::Reader::new(io::empty());
47    /// let _inner = reader.get_mut();
48    /// ```
49    pub fn get_mut(&mut self) -> &mut R {
50        &mut self.inner
51    }
52
53    /// Returns the underlying reader.
54    ///
55    /// # Examples
56    ///
57    /// ```
58    /// # use std::io;
59    /// use noodles_fasta as fasta;
60    /// let reader = fasta::io::Reader::new(io::empty());
61    /// let _inner = reader.into_inner();
62    /// ```
63    pub fn into_inner(self) -> R {
64        self.inner
65    }
66}
67
68impl<R> Reader<R>
69where
70    R: BufRead,
71{
72    /// Creates a FASTA reader.
73    ///
74    /// # Examples
75    ///
76    /// ```
77    /// use noodles_fasta as fasta;
78    /// let data = b">sq0\nACGT\n>sq1\nNNNN\nNNNN\nNN\n";
79    /// let mut reader = fasta::io::Reader::new(&data[..]);
80    /// ```
81    pub fn new(inner: R) -> Self {
82        Self { inner }
83    }
84
85    /// Reads a raw definition line.
86    ///
87    /// The given buffer will not include the trailing newline. It can subsequently be parsed as a
88    /// [`crate::record::Definition`].
89    ///
90    /// The position of the stream is expected to be at the start or at the start of another
91    /// definition.
92    ///
93    /// If successful, this returns the number of bytes read from the stream. If the number of
94    /// bytes read is 0, the stream reached EOF.
95    ///
96    /// # Examples
97    ///
98    /// ```
99    /// # use std::io;
100    /// use noodles_fasta as fasta;
101    ///
102    /// let data = b">sq0\nACGT\n>sq1\nNNNN\nNNNN\nNN\n";
103    /// let mut reader = fasta::io::Reader::new(&data[..]);
104    ///
105    /// let mut buf = String::new();
106    /// reader.read_definition(&mut buf)?;
107    ///
108    /// assert_eq!(buf, ">sq0");
109    /// # Ok::<(), io::Error>(())
110    /// ```
111    pub fn read_definition(&mut self, buf: &mut String) -> io::Result<usize> {
112        read_definition(&mut self.inner, buf)
113    }
114
115    /// Reads a sequence.
116    ///
117    /// The given buffer consumes a sequence without newlines until another definition or EOF is
118    /// reached.
119    ///
120    /// The position of the stream is expected to be at the start of a sequence, which is directly
121    /// after a definition.
122    ///
123    /// If successful, this returns the number of bases read from the stream. If the number of
124    /// bases read is 0, the stream reached EOF (though this case is likely an error).
125    ///
126    /// # Examples
127    ///
128    /// ```
129    /// # use std::io;
130    /// use noodles_fasta as fasta;
131    ///
132    /// let data = b">sq0\nACGT\n>sq1\nNNNN\nNNNN\nNN\n";
133    /// let mut reader = fasta::io::Reader::new(&data[..]);
134    /// reader.read_definition(&mut String::new())?;
135    ///
136    /// let mut buf = Vec::new();
137    /// reader.read_sequence(&mut buf)?;
138    ///
139    /// assert_eq!(buf, b"ACGT");
140    /// # Ok::<(), io::Error>(())
141    /// ```
142    pub fn read_sequence(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
143        use self::sequence::read_sequence;
144        read_sequence(&mut self.inner, buf)
145    }
146
147    /// Returns a sequence reader.
148    ///
149    /// A [`sequence::Reader`] can be used for lower-level reading of the raw sequence.
150    ///
151    /// The position of the stream is expected to be at the start of a sequence to read a full
152    /// sequence or within a sequence to read a partial one.
153    ///
154    /// # Examples
155    ///
156    /// ```
157    /// # use std::io::{self, Read};
158    /// use noodles_fasta as fasta;
159    ///
160    /// let data = b">sq0\nACGT\n>sq1\nNNNN\nNNNN\nNN\n";
161    /// let mut reader = fasta::io::Reader::new(&data[..]);
162    /// reader.read_definition(&mut String::new())?;
163    ///
164    /// let mut sequence_reader = reader.sequence_reader();
165    /// let mut buf = vec![0; 2];
166    /// sequence_reader.read_exact(&mut buf)?;
167    ///
168    /// assert_eq!(buf, b"AC");
169    /// # Ok::<(), io::Error>(())
170    /// ```
171    pub fn sequence_reader(&mut self) -> sequence::Reader<'_, R> {
172        sequence::Reader::new(self.get_mut())
173    }
174
175    /// Returns an iterator over records starting from the current stream position.
176    ///
177    /// The position of the stream is expected to be at the start or at the start of another
178    /// definition.
179    ///
180    /// ```
181    /// # use std::io;
182    /// use noodles_fasta::{self as fasta, record::{Definition, Sequence}};
183    ///
184    /// let data = b">sq0\nACGT\n>sq1\nNNNN\nNNNN\nNN\n";
185    /// let mut reader = fasta::io::Reader::new(&data[..]);
186    ///
187    /// let mut records = reader.records();
188    ///
189    /// assert_eq!(records.next().transpose()?, Some(fasta::Record::new(
190    ///     Definition::new("sq0", None),
191    ///     Sequence::from(b"ACGT".to_vec()),
192    /// )));
193    ///
194    /// assert_eq!(records.next().transpose()?, Some(fasta::Record::new(
195    ///     Definition::new("sq1", None),
196    ///     Sequence::from(b"NNNNNNNNNN".to_vec()),
197    /// )));
198    ///
199    /// assert!(records.next().is_none());
200    /// # Ok::<(), io::Error>(())
201    /// ```
202    pub fn records(&mut self) -> Records<'_, R> {
203        Records::new(self)
204    }
205}
206
207impl<R> Reader<R>
208where
209    R: BufRead + Seek,
210{
211    /// Returns a record of the given region.
212    ///
213    /// # Examples
214    ///
215    /// ```
216    /// # use std::io::Cursor;
217    /// use noodles_core::Region;
218    /// use noodles_fasta::{self as fasta, fai, record::{Definition, Sequence}};
219    ///
220    /// let data = b">sq0\nNNNN\n>sq1\nACGT\n>sq2\nNNNN\n";
221    /// let index = fai::Index::from(vec![
222    ///     fai::Record::new("sq0", 4, 5, 4, 5),
223    ///     fai::Record::new("sq1", 4, 15, 4, 5),
224    ///     fai::Record::new("sq2", 4, 25, 4, 5),
225    /// ]);
226    ///
227    /// let mut reader = fasta::io::Reader::new(Cursor::new(data));
228    ///
229    /// let region = Region::new("sq1", ..);
230    /// let record = reader.query(&index, &region)?;
231    /// assert_eq!(record, fasta::Record::new(
232    ///     Definition::new("sq1", None),
233    ///     Sequence::from(b"ACGT".to_vec()),
234    /// ));
235    ///
236    /// let region = "sq1:2-3".parse()?;
237    /// let record = reader.query(&index, &region)?;
238    /// assert_eq!(record, fasta::Record::new(
239    ///     Definition::new("sq1:2-3", None),
240    ///     Sequence::from(b"CG".to_vec()),
241    /// ));
242    /// # Ok::<(), Box<dyn std::error::Error>>(())
243    /// ```
244    pub fn query(&mut self, index: &fai::Index, region: &Region) -> io::Result<Record> {
245        use self::sequence::read_sequence_limit;
246        use crate::record::{Definition, Sequence};
247
248        let pos = index.query(region)?;
249        self.get_mut().seek(SeekFrom::Start(pos))?;
250
251        let definition = Definition::new(region.to_string(), None);
252
253        let interval = region.interval();
254        let start = usize::from(interval.start().unwrap_or(Position::MIN));
255        let end = usize::from(interval.end().unwrap_or(Position::MAX));
256        let len = end - start + 1;
257
258        let mut raw_sequence = Vec::new();
259        read_sequence_limit(&mut self.inner, len, &mut raw_sequence)?;
260
261        let sequence = Sequence::from(raw_sequence);
262
263        Ok(Record::new(definition, sequence))
264    }
265}
266
// Appends one line from `reader` to `buf`, reading up to and including the next line feed
// ('\n') or until EOF.
//
// The line terminator ('\n' or '\r\n') is removed from the end of the buffer. The returned
// count is the number of bytes consumed from the reader, terminator included; 0 means EOF, in
// which case the buffer is left as is.
pub(crate) fn read_line<R>(reader: &mut R, buf: &mut String) -> io::Result<usize>
where
    R: BufRead,
{
    let n = reader.read_line(buf)?;

    if n > 0 {
        if let Some(without_lf) = buf.strip_suffix('\n') {
            // Drop at most one carriage return that directly preceded the line feed.
            let kept = without_lf
                .strip_suffix('\r')
                .unwrap_or(without_lf)
                .len();

            buf.truncate(kept);
        }
    }

    Ok(n)
}
293
#[cfg(test)]
mod tests {
    use super::*;

    // A definition line is returned raw (prefix included) and without its newline.
    #[test]
    fn test_read_definition() -> io::Result<()> {
        let mut reader = Reader::new(&b">sq0\nACGT\n"[..]);

        let mut definition = String::new();
        reader.read_definition(&mut definition)?;
        assert_eq!(definition, ">sq0");

        Ok(())
    }

    // LF-terminated, CRLF-terminated, and unterminated input all yield the same line.
    #[test]
    fn test_read_line() -> io::Result<()> {
        let inputs: [&[u8]; 3] = [b"noodles\n", b"noodles\r\n", b"noodles"];
        let mut line = String::new();

        for mut src in inputs {
            line.clear();
            read_line(&mut src, &mut line)?;
            assert_eq!(line, "noodles");
        }

        Ok(())
    }
}