noodles_fasta/io/reader/
sequence.rs1use std::io::{self, BufRead, Read};
4
5use super::DEFINITION_PREFIX;
6
7const LINE_FEED: u8 = b'\n';
8const CARRIAGE_RETURN: u8 = b'\r';
9
10pub struct Reader<'r, R> {
17 inner: &'r mut R,
18}
19
20impl<'r, R> Reader<'r, R>
21where
22 R: BufRead,
23{
24 pub(super) fn new(inner: &'r mut R) -> Self {
25 Self { inner }
26 }
27}
28
29impl<R> Read for Reader<'_, R>
30where
31 R: BufRead,
32{
33 fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
34 let mut src = self.fill_buf()?;
35 let amt = src.read(buf)?;
36 self.consume(amt);
37 Ok(amt)
38 }
39}
40
41impl<R> BufRead for Reader<'_, R>
42where
43 R: BufRead,
44{
45 fn fill_buf(&mut self) -> io::Result<&[u8]> {
46 use memchr::memchr;
47
48 consume_empty_lines(&mut self.inner)?;
49
50 let src = self.inner.fill_buf()?;
51
52 let is_eof = src.is_empty();
53 let is_end_of_sequence = || src[0] == DEFINITION_PREFIX;
54
55 if is_eof || is_end_of_sequence() {
56 return Ok(&[]);
57 }
58
59 let line = match memchr(LINE_FEED, src) {
60 Some(i) => &src[..i],
61 None => src,
62 };
63
64 if line.ends_with(&[CARRIAGE_RETURN]) {
65 let end = line.len() - 1;
66 Ok(&line[..end])
67 } else {
68 Ok(line)
69 }
70 }
71
72 fn consume(&mut self, amt: usize) {
73 self.inner.consume(amt);
74 }
75}
76
77fn consume_empty_lines<R>(reader: &mut R) -> io::Result<()>
78where
79 R: BufRead,
80{
81 loop {
82 let mut is_newline = false;
83
84 if reader.fill_buf()?.starts_with(&[CARRIAGE_RETURN]) {
85 is_newline = true;
86 reader.consume(1);
87 }
88
89 if reader.fill_buf()?.starts_with(&[LINE_FEED]) {
90 is_newline = true;
91 reader.consume(1);
92 }
93
94 if !is_newline {
95 break;
96 }
97 }
98
99 Ok(())
100}
101
102pub(super) fn read_sequence<R>(reader: &mut R, buf: &mut Vec<u8>) -> io::Result<usize>
103where
104 R: BufRead,
105{
106 let mut reader = Reader::new(reader);
107 reader.read_to_end(buf)
108}
109
110pub(super) fn read_sequence_limit<R>(
111 reader: &mut R,
112 max_bases: usize,
113 buf: &mut Vec<u8>,
114) -> io::Result<usize>
115where
116 R: BufRead,
117{
118 let mut reader = Reader::new(reader);
119 let mut len = 0;
120
121 while buf.len() < max_bases {
122 let src = reader.fill_buf()?;
123
124 if src.is_empty() {
125 break;
126 }
127
128 let remaining_bases = max_bases - buf.len();
129 let i = remaining_bases.min(src.len());
130
131 let bases = &src[..i];
132 buf.extend(bases);
133
134 reader.consume(i);
135
136 len += i;
137 }
138
139 Ok(len)
140}
141
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks that `read_sequence` strips line terminators, skips empty lines, stops at the
    /// next definition, and reports the number of bytes it appended.
    #[test]
    fn test_read_sequence() -> io::Result<()> {
        fn t(buf: &mut Vec<u8>, mut reader: &[u8], expected: &[u8]) -> io::Result<()> {
            buf.clear();
            let len = read_sequence(&mut reader, buf)?;
            assert_eq!(buf, expected);
            // The buffer was cleared, so the returned count must match what was appended.
            assert_eq!(len, expected.len());
            Ok(())
        }

        let mut buf = Vec::new();

        // Degenerate inputs: nothing to read, or a definition right away.
        t(&mut buf, b"", b"")?;
        t(&mut buf, b">sq1\n", b"")?;

        // LF line endings.
        t(&mut buf, b"ACGT\n", b"ACGT")?;
        t(&mut buf, b"ACGT\n>sq1\n", b"ACGT")?;
        t(&mut buf, b"ACGT\n\nACGT\nAC\n\n", b"ACGTACGTAC")?;

        // CRLF line endings.
        t(&mut buf, b"ACGT\r\n", b"ACGT")?;
        t(&mut buf, b"ACGT\r\n>sq1\r\n", b"ACGT")?;
        t(&mut buf, b"ACGT\r\n\r\nACGT\r\nAC\r\n\r\n", b"ACGTACGTAC")?;

        Ok(())
    }

    /// Checks that `read_sequence_limit` honors `max_bases` when it is equal to, below, or
    /// above the available sequence length, for both LF and CRLF inputs.
    #[test]
    fn test_read_sequence_limit() -> io::Result<()> {
        fn t(
            buf: &mut Vec<u8>,
            mut reader: &[u8],
            max_bases: usize,
            expected: &[u8],
        ) -> io::Result<()> {
            buf.clear();
            let len = read_sequence_limit(&mut reader, max_bases, buf)?;
            assert_eq!(buf, expected);
            // The buffer was cleared, so the returned count must match what was appended.
            assert_eq!(len, expected.len());
            Ok(())
        }

        let mut buf = Vec::new();

        // Degenerate inputs: a zero limit, or nothing to read.
        t(&mut buf, b"ACGT\n", 0, b"")?;
        t(&mut buf, b"", 4, b"")?;

        // Limit equal to the sequence length.
        t(&mut buf, b"ACGT\n", 4, b"ACGT")?;
        t(&mut buf, b"ACGT\n>sq0\n", 4, b"ACGT")?;
        t(&mut buf, b"ACGT\nACGT\nAC\n", 10, b"ACGTACGTAC")?;

        // Limit inside the first line.
        t(&mut buf, b"ACGT\n", 2, b"AC")?;
        t(&mut buf, b"ACGT\n>sq0\n", 2, b"AC")?;
        t(&mut buf, b"ACGT\nACGT\nAC", 2, b"AC")?;

        // Limit past the first line.
        t(&mut buf, b"ACGT\n", 5, b"ACGT")?;
        t(&mut buf, b"ACGT\n>sq0\n", 5, b"ACGT")?;
        t(&mut buf, b"ACGT\nACGT\nAC", 5, b"ACGTA")?;

        // CRLF line endings. The first case previously repeated the LF input from the group
        // above, leaving the single-line CRLF input untested.
        t(&mut buf, b"ACGT\r\n", 5, b"ACGT")?;
        t(&mut buf, b"ACGT\r\n>sq0\r\n", 5, b"ACGT")?;
        t(&mut buf, b"ACGT\r\nACGT\r\nAC\r\n", 5, b"ACGTA")?;

        Ok(())
    }
}