noodles_fasta/io/
indexer.rs

1//! FASTA indexer.
2
3use std::{
4    error::Error,
5    fmt,
6    io::{self, BufRead},
7};
8
9use memchr::memchr;
10
11use super::reader::{read_line, DEFINITION_PREFIX};
12use crate::{
13    fai::Record,
14    record::definition::{Definition, ParseError},
15};
16
/// A FASTA indexer.
///
/// Wraps a reader and keeps a running byte offset so that each indexed record can report
/// where its sequence starts in the stream.
pub struct Indexer<R> {
    /// The underlying reader the records are read from.
    inner: R,
    /// The number of bytes consumed from `inner` so far. It starts at 0 and is advanced by the
    /// length of every definition line and sequence line that is read.
    offset: u64,
}
22
23impl<R> Indexer<R>
24where
25    R: BufRead,
26{
27    /// Creates a FASTA indexer.
28    pub fn new(inner: R) -> Self {
29        Self { inner, offset: 0 }
30    }
31
32    /// Consumes a single sequence line.
33    ///
34    /// If successful, this returns the number of bytes read from the stream (i.e., the line width)
35    /// and the number of bases in the line. If the number of bytes read is 0, the entire sequence
36    /// of the current record was read.
37    fn consume_sequence_line(&mut self) -> io::Result<(usize, usize)> {
38        consume_sequence_line(&mut self.inner)
39    }
40
41    /// Indexes a raw FASTA record.
42    ///
43    /// The position of the stream is expected to be at the start or at the start of another
44    /// definition.
45    ///
46    /// # Errors
47    ///
48    /// An error is returned if the record fails to be completely read. This includes when
49    ///
50    ///   * the stream is not at the start of a definition;
51    ///   * the record is missing a sequence;
52    ///   * the sequence lines have a different number of bases, excluding the last line;
53    ///   * or the sequence lines are not the same length, excluding the last line.
54    pub fn index_record(&mut self) -> Result<Option<Record>, IndexError> {
55        let definition = match self.read_definition() {
56            Ok(None) => return Ok(None),
57            Ok(Some(d)) => d,
58            Err(e) => return Err(e.into()),
59        };
60
61        let offset = self.offset;
62        let mut length = 0;
63
64        let (line_width, line_bases) = self.consume_sequence_line()?;
65        let (mut prev_line_width, mut prev_line_bases) = (line_width, line_bases);
66
67        loop {
68            self.offset += prev_line_width as u64;
69            length += prev_line_bases;
70
71            match self.consume_sequence_line() {
72                Ok((0, _)) => break,
73                Ok((bytes_read, base_count)) => {
74                    if line_bases != prev_line_bases {
75                        return Err(IndexError::InvalidLineBases(line_bases, prev_line_bases));
76                    } else if line_width != prev_line_width {
77                        return Err(IndexError::InvalidLineWidth(line_width, prev_line_width));
78                    }
79
80                    prev_line_width = bytes_read;
81                    prev_line_bases = base_count;
82                }
83                Err(e) => return Err(IndexError::IoError(e)),
84            }
85        }
86
87        if length == 0 {
88            return Err(IndexError::EmptySequence(self.offset));
89        }
90
91        let record = Record::new(
92            definition.name(),
93            length as u64,
94            offset,
95            line_bases as u64,
96            line_width as u64,
97        );
98
99        Ok(Some(record))
100    }
101
102    fn read_definition(&mut self) -> io::Result<Option<Definition>> {
103        let mut buf = String::new();
104
105        match read_line(&mut self.inner, &mut buf) {
106            Ok(0) => return Ok(None),
107            Ok(n) => self.offset += n as u64,
108            Err(e) => return Err(e),
109        }
110
111        buf.parse()
112            .map(Some)
113            .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
114    }
115}
116
117fn consume_sequence_line<R>(reader: &mut R) -> io::Result<(usize, usize)>
118where
119    R: BufRead,
120{
121    const LINE_FEED: u8 = b'\n';
122    const CARRIAGE_RETURN: u8 = b'\r';
123
124    fn count_bases(buf: &[u8]) -> usize {
125        if buf.ends_with(&[CARRIAGE_RETURN]) {
126            buf.len() - 1
127        } else {
128            buf.len()
129        }
130    }
131
132    let mut bytes_read = 0;
133    let mut base_count = 0;
134    let mut is_eol = false;
135
136    loop {
137        let src = reader.fill_buf()?;
138
139        if is_eol || src.is_empty() || src[0] == DEFINITION_PREFIX {
140            break;
141        }
142
143        let (chunk_len, chunk_base_count) = match memchr(LINE_FEED, src) {
144            Some(i) => {
145                is_eol = true;
146                (i + 1, count_bases(&src[..i]))
147            }
148            None => (src.len(), count_bases(src)),
149        };
150
151        reader.consume(chunk_len);
152
153        bytes_read += chunk_len;
154        base_count += chunk_base_count;
155    }
156
157    Ok((bytes_read, base_count))
158}
159
/// An error returned when a FASTA record fails to be indexed.
#[derive(Debug)]
pub enum IndexError {
    /// The record has no bases. The value is the stream offset at which this was detected.
    EmptySequence(u64),
    /// The definition failed to parse.
    // NOTE(review): `Indexer::index_record` reports definition parse failures as
    // `IoError` (kind `InvalidData`); this variant is only built through `From<ParseError>`.
    InvalidDefinition(ParseError),
    /// A sequence line, other than the last, has a different number of bases than the first
    /// line. The values are the expected and the actual number of bases.
    InvalidLineBases(usize, usize),
    /// A sequence line, other than the last, has a different byte width than the first line.
    /// The values are the expected and the actual line width.
    InvalidLineWidth(usize, usize),
    /// An I/O error occurred while reading the stream.
    IoError(io::Error),
}
168
169impl Error for IndexError {
170    fn source(&self) -> Option<&(dyn Error + 'static)> {
171        match self {
172            Self::EmptySequence(_) => None,
173            Self::InvalidDefinition(e) => Some(e),
174            Self::InvalidLineBases(..) => None,
175            Self::InvalidLineWidth(..) => None,
176            Self::IoError(e) => Some(e),
177        }
178    }
179}
180
181impl fmt::Display for IndexError {
182    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
183        match self {
184            Self::EmptySequence(offset) => write!(f, "empty sequence at offset {offset}"),
185            Self::InvalidDefinition(e) => e.fmt(f),
186            Self::InvalidLineBases(expected, actual) => {
187                write!(f, "invalid line bases: expected {expected}, got {actual}")
188            }
189            Self::InvalidLineWidth(expected, actual) => {
190                write!(f, "invalid line width: expected {expected}, got {actual}")
191            }
192            Self::IoError(e) => e.fmt(f),
193        }
194    }
195}
196
197impl From<io::Error> for IndexError {
198    fn from(error: io::Error) -> Self {
199        Self::IoError(error)
200    }
201}
202
203impl From<ParseError> for IndexError {
204    fn from(error: ParseError) -> Self {
205        Self::InvalidDefinition(error)
206    }
207}
208
209impl From<IndexError> for io::Error {
210    fn from(error: IndexError) -> Self {
211        match error {
212            IndexError::IoError(e) => e,
213            _ => Self::new(io::ErrorKind::InvalidInput, error),
214        }
215    }
216}
217
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_index_record() -> Result<(), IndexError> {
        let src = b">sq0\nACGT\n>sq1\nNNNN\nNNNN\nNN\n";
        let mut indexer = Indexer::new(&src[..]);

        // `Record::new` arguments: name, length, offset, line bases, line width.
        assert_eq!(
            indexer.index_record()?,
            Some(Record::new("sq0", 4, 5, 4, 5))
        );
        assert_eq!(
            indexer.index_record()?,
            Some(Record::new("sq1", 10, 15, 4, 5))
        );

        // The stream is exhausted after the second record.
        assert_eq!(indexer.index_record()?, None);

        Ok(())
    }

    #[test]
    fn test_index_record_with_invalid_line_bases() {
        // The second line has 3 bases, but it is not the last line of the record.
        let mut indexer = Indexer::new(&b">sq0\nACGT\nACG\nACGT\nAC\n"[..]);
        let result = indexer.index_record();

        assert!(matches!(result, Err(IndexError::InvalidLineBases(4, 3))));
    }

    #[test]
    fn test_index_record_with_invalid_line_width() {
        // The second line has the right number of bases but a CRLF terminator.
        let mut indexer = Indexer::new(&b">sq0\nACGT\nACGT\r\nACGT\nAC\n"[..]);
        let result = indexer.index_record();

        assert!(matches!(result, Err(IndexError::InvalidLineWidth(5, 6))));
    }

    #[test]
    fn test_index_record_with_empty_sequence() {
        // Only a definition line: the reported offset is the length of that line.
        let mut indexer = Indexer::new(&b">sq0\n"[..]);
        let result = indexer.index_record();

        assert!(matches!(result, Err(IndexError::EmptySequence(5))));
    }

    #[test]
    fn test_consume_sequence_line() -> io::Result<()> {
        use std::io::BufReader;

        // LF-terminated line: 4 bases plus the line feed.
        let mut lf = &b"ACGT\nNNNN\n"[..];
        assert_eq!(consume_sequence_line(&mut lf)?, (5, 4));

        // CRLF-terminated line: the carriage return adds to the width, not to the bases.
        let mut crlf = &b"ACGT\r\nNNNN\r\n"[..];
        assert_eq!(consume_sequence_line(&mut crlf)?, (6, 4));

        // Same line, read through a buffer smaller than the line.
        let mut chunked = BufReader::with_capacity(3, &b"ACGT\r\nNNNN\r\n"[..]);
        assert_eq!(consume_sequence_line(&mut chunked)?, (6, 4));

        Ok(())
    }
}