noodles_fasta/io/reader.rs
1//! FASTA reader and iterators.
2
3mod builder;
4mod definition;
5mod records;
6pub mod sequence;
7
8pub use self::{builder::Builder, records::Records};
9
10use std::io::{self, BufRead, Seek, SeekFrom};
11
12use noodles_core::{Position, Region};
13
14use self::definition::read_definition;
15use crate::{fai, Record};
16
/// The byte that begins a definition line (`>`), e.g. the leading `>` of `>sq0`.
pub(crate) const DEFINITION_PREFIX: u8 = b'>';
18
/// A FASTA reader.
///
/// This is a thin wrapper around an underlying reader `R`. The struct itself places no bounds on
/// `R`; the reading methods are only available when `R` implements [`BufRead`], and querying by
/// region additionally requires [`Seek`].
pub struct Reader<R> {
    // The wrapped reader that all FASTA data is read from.
    inner: R,
}
23
24impl<R> Reader<R> {
25 /// Returns a reference to the underlying reader.
26 ///
27 /// # Examples
28 ///
29 /// ```
30 /// # use std::io;
31 /// use noodles_fasta as fasta;
32 /// let reader = fasta::io::Reader::new(io::empty());
33 /// let _inner = reader.get_ref();
34 /// ```
35 pub fn get_ref(&self) -> &R {
36 &self.inner
37 }
38
39 /// Returns a mutable reference to the underlying reader.
40 ///
41 /// # Examples
42 ///
43 /// ```
44 /// # use std::io;
45 /// use noodles_fasta as fasta;
46 /// let mut reader = fasta::io::Reader::new(io::empty());
47 /// let _inner = reader.get_mut();
48 /// ```
49 pub fn get_mut(&mut self) -> &mut R {
50 &mut self.inner
51 }
52
53 /// Returns the underlying reader.
54 ///
55 /// # Examples
56 ///
57 /// ```
58 /// # use std::io;
59 /// use noodles_fasta as fasta;
60 /// let reader = fasta::io::Reader::new(io::empty());
61 /// let _inner = reader.into_inner();
62 /// ```
63 pub fn into_inner(self) -> R {
64 self.inner
65 }
66}
67
impl<R> Reader<R>
where
    R: BufRead,
{
    /// Creates a FASTA reader that wraps the given buffered reader.
    ///
    /// # Examples
    ///
    /// ```
    /// use noodles_fasta as fasta;
    /// let data = b">sq0\nACGT\n>sq1\nNNNN\nNNNN\nNN\n";
    /// let mut reader = fasta::io::Reader::new(&data[..]);
    /// ```
    pub fn new(inner: R) -> Self {
        Self { inner }
    }

    /// Reads a raw definition line.
    ///
    /// The given buffer will not include the trailing newline. It can subsequently be parsed as a
    /// [`crate::record::Definition`].
    ///
    /// The position of the stream is expected to be at the start of the stream or at the start
    /// of another definition.
    ///
    /// If successful, this returns the number of bytes read from the stream. If the number of
    /// bytes read is 0, the stream reached EOF.
    ///
    /// # Examples
    ///
    /// ```
    /// # use std::io;
    /// use noodles_fasta as fasta;
    ///
    /// let data = b">sq0\nACGT\n>sq1\nNNNN\nNNNN\nNN\n";
    /// let mut reader = fasta::io::Reader::new(&data[..]);
    ///
    /// let mut buf = String::new();
    /// reader.read_definition(&mut buf)?;
    ///
    /// assert_eq!(buf, ">sq0");
    /// # Ok::<(), io::Error>(())
    /// ```
    pub fn read_definition(&mut self, buf: &mut String) -> io::Result<usize> {
        read_definition(&mut self.inner, buf)
    }

    /// Reads a sequence.
    ///
    /// The sequence is read into the given buffer, without newlines, until another definition
    /// or EOF is reached.
    ///
    /// The position of the stream is expected to be at the start of a sequence, which is directly
    /// after a definition.
    ///
    /// If successful, this returns the number of bases read from the stream. If the number of
    /// bases read is 0, the stream reached EOF (though this case is likely an error).
    ///
    /// # Examples
    ///
    /// ```
    /// # use std::io;
    /// use noodles_fasta as fasta;
    ///
    /// let data = b">sq0\nACGT\n>sq1\nNNNN\nNNNN\nNN\n";
    /// let mut reader = fasta::io::Reader::new(&data[..]);
    /// reader.read_definition(&mut String::new())?;
    ///
    /// let mut buf = Vec::new();
    /// reader.read_sequence(&mut buf)?;
    ///
    /// assert_eq!(buf, b"ACGT");
    /// # Ok::<(), io::Error>(())
    /// ```
    pub fn read_sequence(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
        use self::sequence::read_sequence;
        read_sequence(&mut self.inner, buf)
    }

    /// Returns a sequence reader.
    ///
    /// A [`sequence::Reader`] can be used for lower-level reading of the raw sequence. It
    /// mutably borrows the underlying reader of this FASTA reader.
    ///
    /// The position of the stream is expected to be at the start of a sequence to read a full
    /// sequence or within a sequence to read a partial one.
    ///
    /// # Examples
    ///
    /// ```
    /// # use std::io::{self, Read};
    /// use noodles_fasta as fasta;
    ///
    /// let data = b">sq0\nACGT\n>sq1\nNNNN\nNNNN\nNN\n";
    /// let mut reader = fasta::io::Reader::new(&data[..]);
    /// reader.read_definition(&mut String::new())?;
    ///
    /// let mut sequence_reader = reader.sequence_reader();
    /// let mut buf = vec![0; 2];
    /// sequence_reader.read_exact(&mut buf)?;
    ///
    /// assert_eq!(buf, b"AC");
    /// # Ok::<(), io::Error>(())
    /// ```
    pub fn sequence_reader(&mut self) -> sequence::Reader<'_, R> {
        sequence::Reader::new(self.get_mut())
    }

    /// Returns an iterator over records starting from the current stream position.
    ///
    /// The position of the stream is expected to be at the start of the stream or at the start
    /// of another definition.
    ///
    /// # Examples
    ///
    /// ```
    /// # use std::io;
    /// use noodles_fasta::{self as fasta, record::{Definition, Sequence}};
    ///
    /// let data = b">sq0\nACGT\n>sq1\nNNNN\nNNNN\nNN\n";
    /// let mut reader = fasta::io::Reader::new(&data[..]);
    ///
    /// let mut records = reader.records();
    ///
    /// assert_eq!(records.next().transpose()?, Some(fasta::Record::new(
    ///     Definition::new("sq0", None),
    ///     Sequence::from(b"ACGT".to_vec()),
    /// )));
    ///
    /// assert_eq!(records.next().transpose()?, Some(fasta::Record::new(
    ///     Definition::new("sq1", None),
    ///     Sequence::from(b"NNNNNNNNNN".to_vec()),
    /// )));
    ///
    /// assert!(records.next().is_none());
    /// # Ok::<(), io::Error>(())
    /// ```
    pub fn records(&mut self) -> Records<'_, R> {
        Records::new(self)
    }
}
206
207impl<R> Reader<R>
208where
209 R: BufRead + Seek,
210{
211 /// Returns a record of the given region.
212 ///
213 /// # Examples
214 ///
215 /// ```
216 /// # use std::io::Cursor;
217 /// use noodles_core::Region;
218 /// use noodles_fasta::{self as fasta, fai, record::{Definition, Sequence}};
219 ///
220 /// let data = b">sq0\nNNNN\n>sq1\nACGT\n>sq2\nNNNN\n";
221 /// let index = fai::Index::from(vec![
222 /// fai::Record::new("sq0", 4, 5, 4, 5),
223 /// fai::Record::new("sq1", 4, 15, 4, 5),
224 /// fai::Record::new("sq2", 4, 25, 4, 5),
225 /// ]);
226 ///
227 /// let mut reader = fasta::io::Reader::new(Cursor::new(data));
228 ///
229 /// let region = Region::new("sq1", ..);
230 /// let record = reader.query(&index, ®ion)?;
231 /// assert_eq!(record, fasta::Record::new(
232 /// Definition::new("sq1", None),
233 /// Sequence::from(b"ACGT".to_vec()),
234 /// ));
235 ///
236 /// let region = "sq1:2-3".parse()?;
237 /// let record = reader.query(&index, ®ion)?;
238 /// assert_eq!(record, fasta::Record::new(
239 /// Definition::new("sq1:2-3", None),
240 /// Sequence::from(b"CG".to_vec()),
241 /// ));
242 /// # Ok::<(), Box<dyn std::error::Error>>(())
243 /// ```
244 pub fn query(&mut self, index: &fai::Index, region: &Region) -> io::Result<Record> {
245 use self::sequence::read_sequence_limit;
246 use crate::record::{Definition, Sequence};
247
248 let pos = index.query(region)?;
249 self.get_mut().seek(SeekFrom::Start(pos))?;
250
251 let definition = Definition::new(region.to_string(), None);
252
253 let interval = region.interval();
254 let start = usize::from(interval.start().unwrap_or(Position::MIN));
255 let end = usize::from(interval.end().unwrap_or(Position::MAX));
256 let len = end - start + 1;
257
258 let mut raw_sequence = Vec::new();
259 read_sequence_limit(&mut self.inner, len, &mut raw_sequence)?;
260
261 let sequence = Sequence::from(raw_sequence);
262
263 Ok(Record::new(definition, sequence))
264 }
265}
266
// Appends the next line of `reader` to `buf`, reading up to and including a line feed ('\n')
// or until EOF.
//
// The line terminator ('\n' or '\r\n') is removed from the end of the buffer. The returned
// count is the number of bytes read from the reader, terminator included; 0 means EOF, in which
// case the buffer is left exactly as it was.
pub(crate) fn read_line<R>(reader: &mut R, buf: &mut String) -> io::Result<usize>
where
    R: BufRead,
{
    let bytes_read = reader.read_line(buf)?;

    // Nothing was appended at EOF, so any trailing newline belongs to the caller's own data.
    if bytes_read == 0 {
        return Ok(0);
    }

    if let Some(without_lf) = buf.strip_suffix('\n') {
        // A carriage return directly before the line feed is part of the terminator.
        let line = without_lf.strip_suffix('\r').unwrap_or(without_lf);
        let trimmed_len = line.len();
        buf.truncate(trimmed_len);
    }

    Ok(bytes_read)
}
293
#[cfg(test)]
mod tests {
    use super::*;

    // The definition line is returned verbatim, minus its trailing newline.
    #[test]
    fn test_read_definition() -> io::Result<()> {
        let src = b">sq0\nACGT\n";
        let mut reader = Reader::new(&src[..]);

        let mut line = String::new();
        reader.read_definition(&mut line)?;
        assert_eq!(line, ">sq0");

        Ok(())
    }

    // The same line terminated by LF, by CRLF, and by EOF must yield the same content.
    #[test]
    fn test_read_line() -> io::Result<()> {
        let inputs: [&[u8]; 3] = [b"noodles\n", b"noodles\r\n", b"noodles"];

        let mut line = String::new();

        for input in inputs.iter() {
            let mut src: &[u8] = *input;
            line.clear();
            read_line(&mut src, &mut line)?;
            assert_eq!(line, "noodles");
        }

        Ok(())
    }
}
335}