1use std::{
4 error::Error,
5 fmt,
6 io::{self, BufRead},
7};
8
9use memchr::memchr;
10
11use super::reader::{read_line, DEFINITION_PREFIX};
12use crate::{
13 fai::Record,
14 record::definition::{Definition, ParseError},
15};
16
/// An indexer that scans FASTA-formatted input and builds one index [`Record`] per sequence.
///
/// The indexer keeps track of how many bytes it has consumed so that each record can report
/// the byte offset at which its sequence starts.
pub struct Indexer<R> {
    /// The underlying reader the definitions and sequence lines are read from.
    inner: R,
    /// Running byte position in `inner`, advanced as definition and sequence lines are consumed.
    offset: u64,
}
22
23impl<R> Indexer<R>
24where
25 R: BufRead,
26{
27 pub fn new(inner: R) -> Self {
29 Self { inner, offset: 0 }
30 }
31
32 fn consume_sequence_line(&mut self) -> io::Result<(usize, usize)> {
38 consume_sequence_line(&mut self.inner)
39 }
40
41 pub fn index_record(&mut self) -> Result<Option<Record>, IndexError> {
55 let definition = match self.read_definition() {
56 Ok(None) => return Ok(None),
57 Ok(Some(d)) => d,
58 Err(e) => return Err(e.into()),
59 };
60
61 let offset = self.offset;
62 let mut length = 0;
63
64 let (line_width, line_bases) = self.consume_sequence_line()?;
65 let (mut prev_line_width, mut prev_line_bases) = (line_width, line_bases);
66
67 loop {
68 self.offset += prev_line_width as u64;
69 length += prev_line_bases;
70
71 match self.consume_sequence_line() {
72 Ok((0, _)) => break,
73 Ok((bytes_read, base_count)) => {
74 if line_bases != prev_line_bases {
75 return Err(IndexError::InvalidLineBases(line_bases, prev_line_bases));
76 } else if line_width != prev_line_width {
77 return Err(IndexError::InvalidLineWidth(line_width, prev_line_width));
78 }
79
80 prev_line_width = bytes_read;
81 prev_line_bases = base_count;
82 }
83 Err(e) => return Err(IndexError::IoError(e)),
84 }
85 }
86
87 if length == 0 {
88 return Err(IndexError::EmptySequence(self.offset));
89 }
90
91 let record = Record::new(
92 definition.name(),
93 length as u64,
94 offset,
95 line_bases as u64,
96 line_width as u64,
97 );
98
99 Ok(Some(record))
100 }
101
102 fn read_definition(&mut self) -> io::Result<Option<Definition>> {
103 let mut buf = String::new();
104
105 match read_line(&mut self.inner, &mut buf) {
106 Ok(0) => return Ok(None),
107 Ok(n) => self.offset += n as u64,
108 Err(e) => return Err(e),
109 }
110
111 buf.parse()
112 .map(Some)
113 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
114 }
115}
116
117fn consume_sequence_line<R>(reader: &mut R) -> io::Result<(usize, usize)>
118where
119 R: BufRead,
120{
121 const LINE_FEED: u8 = b'\n';
122 const CARRIAGE_RETURN: u8 = b'\r';
123
124 fn count_bases(buf: &[u8]) -> usize {
125 if buf.ends_with(&[CARRIAGE_RETURN]) {
126 buf.len() - 1
127 } else {
128 buf.len()
129 }
130 }
131
132 let mut bytes_read = 0;
133 let mut base_count = 0;
134 let mut is_eol = false;
135
136 loop {
137 let src = reader.fill_buf()?;
138
139 if is_eol || src.is_empty() || src[0] == DEFINITION_PREFIX {
140 break;
141 }
142
143 let (chunk_len, chunk_base_count) = match memchr(LINE_FEED, src) {
144 Some(i) => {
145 is_eol = true;
146 (i + 1, count_bases(&src[..i]))
147 }
148 None => (src.len(), count_bases(src)),
149 };
150
151 reader.consume(chunk_len);
152
153 bytes_read += chunk_len;
154 base_count += chunk_base_count;
155 }
156
157 Ok((bytes_read, base_count))
158}
159
/// An error returned when a record fails to be indexed.
#[derive(Debug)]
pub enum IndexError {
    /// The record has no bases; carries the indexer's byte offset at the time of detection.
    EmptySequence(u64),
    /// The definition line could not be parsed.
    InvalidDefinition(ParseError),
    /// A line has a different number of bases than the first line: `(expected, actual)`.
    InvalidLineBases(usize, usize),
    /// A line has a different byte width than the first line: `(expected, actual)`.
    InvalidLineWidth(usize, usize),
    /// An I/O error occurred while reading the input.
    IoError(io::Error),
}
168
169impl Error for IndexError {
170 fn source(&self) -> Option<&(dyn Error + 'static)> {
171 match self {
172 Self::EmptySequence(_) => None,
173 Self::InvalidDefinition(e) => Some(e),
174 Self::InvalidLineBases(..) => None,
175 Self::InvalidLineWidth(..) => None,
176 Self::IoError(e) => Some(e),
177 }
178 }
179}
180
181impl fmt::Display for IndexError {
182 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
183 match self {
184 Self::EmptySequence(offset) => write!(f, "empty sequence at offset {offset}"),
185 Self::InvalidDefinition(e) => e.fmt(f),
186 Self::InvalidLineBases(expected, actual) => {
187 write!(f, "invalid line bases: expected {expected}, got {actual}")
188 }
189 Self::InvalidLineWidth(expected, actual) => {
190 write!(f, "invalid line width: expected {expected}, got {actual}")
191 }
192 Self::IoError(e) => e.fmt(f),
193 }
194 }
195}
196
197impl From<io::Error> for IndexError {
198 fn from(error: io::Error) -> Self {
199 Self::IoError(error)
200 }
201}
202
203impl From<ParseError> for IndexError {
204 fn from(error: ParseError) -> Self {
205 Self::InvalidDefinition(error)
206 }
207}
208
209impl From<IndexError> for io::Error {
210 fn from(error: IndexError) -> Self {
211 match error {
212 IndexError::IoError(e) => e,
213 _ => Self::new(io::ErrorKind::InvalidInput, error),
214 }
215 }
216}
217
#[cfg(test)]
mod tests {
    use super::*;

    // Two records: a single-line sequence and a multi-line sequence whose last line is shorter.
    #[test]
    fn test_index_record() -> Result<(), IndexError> {
        let data = b">sq0\nACGT\n>sq1\nNNNN\nNNNN\nNN\n";
        let mut indexer = Indexer::new(&data[..]);

        // The sequence starts right after the 5-byte definition line.
        let record = indexer.index_record()?;
        assert_eq!(record, Some(Record::new("sq0", 4, 5, 4, 5)));

        // 5 (">sq0\n") + 5 ("ACGT\n") + 5 (">sq1\n") = 15.
        let record = indexer.index_record()?;
        assert_eq!(record, Some(Record::new("sq1", 10, 15, 4, 5)));

        // End of input.
        assert!(indexer.index_record()?.is_none());

        Ok(())
    }

    // The second line has 3 bases and is followed by another line, so it is not the last line
    // and must match the first line's 4 bases.
    #[test]
    fn test_index_record_with_invalid_line_bases() {
        let data = b">sq0\nACGT\nACG\nACGT\nAC\n";
        let mut indexer = Indexer::new(&data[..]);

        assert!(matches!(
            indexer.index_record(),
            Err(IndexError::InvalidLineBases(4, 3))
        ));
    }

    // The second line ends with CRLF (width 6) while the first ends with LF (width 5).
    #[test]
    fn test_index_record_with_invalid_line_width() {
        let data = b">sq0\nACGT\nACGT\r\nACGT\nAC\n";
        let mut indexer = Indexer::new(&data[..]);

        assert!(matches!(
            indexer.index_record(),
            Err(IndexError::InvalidLineWidth(5, 6))
        ));
    }

    // A definition with no sequence lines; the reported offset is just past the definition.
    #[test]
    fn test_index_record_with_empty_sequence() {
        let data = b">sq0\n";
        let mut indexer = Indexer::new(&data[..]);

        assert!(matches!(
            indexer.index_record(),
            Err(IndexError::EmptySequence(5))
        ));
    }

    #[test]
    fn test_consume_sequence_line() -> io::Result<()> {
        use std::io::BufReader;

        // LF line ending.
        let data = b"ACGT\nNNNN\n";
        let mut reader = &data[..];
        let (len, base_count) = consume_sequence_line(&mut reader)?;
        assert_eq!(len, 5);
        assert_eq!(base_count, 4);

        // CRLF line ending: the carriage return counts towards the width but not the bases.
        let data = b"ACGT\r\nNNNN\r\n";
        let mut reader = &data[..];
        let (len, base_count) = consume_sequence_line(&mut reader)?;
        assert_eq!(len, 6);
        assert_eq!(base_count, 4);

        // A 3-byte buffer forces the line to be assembled from several chunks.
        let data = b"ACGT\r\nNNNN\r\n";
        let mut reader = BufReader::with_capacity(3, &data[..]);
        let (len, base_count) = consume_sequence_line(&mut reader)?;
        assert_eq!(len, 6);
        assert_eq!(base_count, 4);

        Ok(())
    }
}
295}