noodles_fasta/io/reader/
sequence.rs

1//! Sequence reader.
2
3use std::io::{self, BufRead, Read};
4
5use super::DEFINITION_PREFIX;
6
/// Line feed (`\n`): the byte that ends a sequence line.
const LINE_FEED: u8 = b'\n';
/// Carriage return (`\r`): skipped at the start of a line and trimmed from the end of a line so
/// that CRLF-terminated input yields the same bases as LF-terminated input.
const CARRIAGE_RETURN: u8 = b'\r';
9
/// A sequence reader.
///
/// This is used for lower-level reading of the sequence. It implements [`Read`] and [`BufRead`] to
/// return raw bases without line terminators (`\n` or `\r\n`). Once the next record definition
/// or EOF is reached, it yields no further bytes.
///
/// This is created by calling [`super::Reader::sequence_reader`].
pub struct Reader<'r, R> {
    // Mutable borrow of the underlying reader that the bases are pulled from.
    inner: &'r mut R,
}
19
20impl<'r, R> Reader<'r, R>
21where
22    R: BufRead,
23{
24    pub(super) fn new(inner: &'r mut R) -> Self {
25        Self { inner }
26    }
27}
28
29impl<R> Read for Reader<'_, R>
30where
31    R: BufRead,
32{
33    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
34        let mut src = self.fill_buf()?;
35        let amt = src.read(buf)?;
36        self.consume(amt);
37        Ok(amt)
38    }
39}
40
41impl<R> BufRead for Reader<'_, R>
42where
43    R: BufRead,
44{
45    fn fill_buf(&mut self) -> io::Result<&[u8]> {
46        use memchr::memchr;
47
48        consume_empty_lines(&mut self.inner)?;
49
50        let src = self.inner.fill_buf()?;
51
52        let is_eof = src.is_empty();
53        let is_end_of_sequence = || src[0] == DEFINITION_PREFIX;
54
55        if is_eof || is_end_of_sequence() {
56            return Ok(&[]);
57        }
58
59        let line = match memchr(LINE_FEED, src) {
60            Some(i) => &src[..i],
61            None => src,
62        };
63
64        if line.ends_with(&[CARRIAGE_RETURN]) {
65            let end = line.len() - 1;
66            Ok(&line[..end])
67        } else {
68            Ok(line)
69        }
70    }
71
72    fn consume(&mut self, amt: usize) {
73        self.inner.consume(amt);
74    }
75}
76
77fn consume_empty_lines<R>(reader: &mut R) -> io::Result<()>
78where
79    R: BufRead,
80{
81    loop {
82        let mut is_newline = false;
83
84        if reader.fill_buf()?.starts_with(&[CARRIAGE_RETURN]) {
85            is_newline = true;
86            reader.consume(1);
87        }
88
89        if reader.fill_buf()?.starts_with(&[LINE_FEED]) {
90            is_newline = true;
91            reader.consume(1);
92        }
93
94        if !is_newline {
95            break;
96        }
97    }
98
99    Ok(())
100}
101
102pub(super) fn read_sequence<R>(reader: &mut R, buf: &mut Vec<u8>) -> io::Result<usize>
103where
104    R: BufRead,
105{
106    let mut reader = Reader::new(reader);
107    reader.read_to_end(buf)
108}
109
110pub(super) fn read_sequence_limit<R>(
111    reader: &mut R,
112    max_bases: usize,
113    buf: &mut Vec<u8>,
114) -> io::Result<usize>
115where
116    R: BufRead,
117{
118    let mut reader = Reader::new(reader);
119    let mut len = 0;
120
121    while buf.len() < max_bases {
122        let src = reader.fill_buf()?;
123
124        if src.is_empty() {
125            break;
126        }
127
128        let remaining_bases = max_bases - buf.len();
129        let i = remaining_bases.min(src.len());
130
131        let bases = &src[..i];
132        buf.extend(bases);
133
134        reader.consume(i);
135
136        len += i;
137    }
138
139    Ok(len)
140}
141
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks that `read_sequence` strips line terminators and blank lines and stops at the
    /// next definition, for both LF and CRLF input.
    #[test]
    fn test_read_sequence() -> io::Result<()> {
        fn t(buf: &mut Vec<u8>, mut reader: &[u8], expected: &[u8]) -> io::Result<()> {
            buf.clear();
            read_sequence(&mut reader, buf)?;
            assert_eq!(buf, expected);
            Ok(())
        }

        let mut buf = Vec::new();

        // LF line endings.
        t(&mut buf, b"ACGT\n", b"ACGT")?;
        t(&mut buf, b"ACGT\n>sq1\n", b"ACGT")?;
        t(&mut buf, b"ACGT\n\nACGT\nAC\n\n", b"ACGTACGTAC")?;

        // CRLF line endings.
        t(&mut buf, b"ACGT\r\n", b"ACGT")?;
        t(&mut buf, b"ACGT\r\n>sq1\r\n", b"ACGT")?;
        t(&mut buf, b"ACGT\r\n\r\nACGT\r\nAC\r\n\r\n", b"ACGTACGTAC")?;

        Ok(())
    }

    /// Checks that `read_sequence_limit` honors the base cap for limits that are equal to,
    /// below, and above the line length, for both LF and CRLF input.
    #[test]
    fn test_read_sequence_limit() -> io::Result<()> {
        fn t(
            buf: &mut Vec<u8>,
            mut reader: &[u8],
            max_bases: usize,
            expected: &[u8],
        ) -> io::Result<()> {
            buf.clear();
            read_sequence_limit(&mut reader, max_bases, buf)?;
            assert_eq!(buf, expected);
            Ok(())
        }

        let mut buf = Vec::new();

        // Limit equal to the number of available bases.
        t(&mut buf, b"ACGT\n", 4, b"ACGT")?;
        t(&mut buf, b"ACGT\n>sq0\n", 4, b"ACGT")?;
        t(&mut buf, b"ACGT\nACGT\nAC\n", 10, b"ACGTACGTAC")?;

        // Limit shorter than the first line.
        t(&mut buf, b"ACGT\n", 2, b"AC")?;
        t(&mut buf, b"ACGT\n>sq0\n", 2, b"AC")?;
        t(&mut buf, b"ACGT\nACGT\nAC", 2, b"AC")?;

        // Limit longer than the first line.
        t(&mut buf, b"ACGT\n", 5, b"ACGT")?;
        t(&mut buf, b"ACGT\n>sq0\n", 5, b"ACGT")?;
        t(&mut buf, b"ACGT\nACGT\nAC", 5, b"ACGTA")?;

        // CRLF line endings. The first case previously repeated the LF-only input from the
        // group above, leaving a lone CRLF-terminated line untested.
        t(&mut buf, b"ACGT\r\n", 5, b"ACGT")?;
        t(&mut buf, b"ACGT\r\n>sq0\r\n", 5, b"ACGT")?;
        t(&mut buf, b"ACGT\r\nACGT\r\nAC\r\n", 5, b"ACGTA")?;

        Ok(())
    }
}