noodles_bgzf/
reader.rs

1//! BGZF reader.
2
3mod builder;
4pub(crate) mod frame;
5
6pub use self::builder::Builder;
7
8use std::io::{self, BufRead, Read, Seek, SeekFrom};
9
10use super::{gzi, Block, VirtualPosition, BGZF_MAX_ISIZE};
11
12/// A BGZF reader.
13///
14/// The reader implements both [`std::io::Read`] and [`std::io::BufRead`], consuming compressed
15/// data and emitting uncompressed data. It is internally buffered by a single block, and to
16/// correctly track (virtual) positions, the reader _cannot_ be double buffered (e.g., using
17/// [`std::io::BufReader`]).
18///
19/// # Examples
20///
21/// ```no_run
22/// # use std::{fs::File, io::{self, Read}};
23/// use noodles_bgzf as bgzf;
24/// let mut reader = File::open("data.gz").map(bgzf::Reader::new)?;
25/// let mut data = Vec::new();
26/// reader.read_to_end(&mut data)?;
27/// # Ok::<(), io::Error>(())
28/// ```
29pub struct Reader<R> {
30    inner: R,
31    buf: Vec<u8>,
32    position: u64,
33    block: Block,
34}
35
36impl<R> Reader<R> {
37    /// Returns a reference to the underlying reader.
38    ///
39    /// # Examples
40    ///
41    /// ```
42    /// # use std::io;
43    /// use noodles_bgzf as bgzf;
44    /// let reader = bgzf::Reader::new(io::empty());
45    /// let _inner = reader.get_ref();
46    /// ```
47    pub fn get_ref(&self) -> &R {
48        &self.inner
49    }
50
51    /// Returns a mutable reference to the underlying reader.
52    ///
53    /// # Examples
54    ///
55    /// ```
56    /// # use std::io;
57    /// use noodles_bgzf as bgzf;
58    /// let mut reader = bgzf::Reader::new(io::empty());
59    /// let _inner = reader.get_mut();
60    /// ```
61    pub fn get_mut(&mut self) -> &mut R {
62        &mut self.inner
63    }
64
65    /// Unwraps and returns the underlying writer.
66    ///
67    /// # Examples
68    ///
69    /// ```
70    /// # use std::io;
71    /// use noodles_bgzf as bgzf;
72    /// let reader = bgzf::Reader::new(io::empty());
73    /// let _inner = reader.into_inner();
74    /// ```
75    pub fn into_inner(self) -> R {
76        self.inner
77    }
78}
79
80impl<R> Reader<R>
81where
82    R: Read,
83{
84    /// Creates a BGZF reader.
85    ///
86    /// # Examples
87    ///
88    /// ```
89    /// # use std::io;
90    /// use noodles_bgzf as bgzf;
91    /// let reader = bgzf::Reader::new(io::empty());
92    /// ```
93    pub fn new(inner: R) -> Self {
94        Builder.build_from_reader(inner)
95    }
96
97    /// Returns the current position of the stream.
98    ///
99    /// # Examples
100    ///
101    /// ```
102    /// # use std::io;
103    /// use noodles_bgzf as bgzf;
104    /// let reader = bgzf::Reader::new(io::empty());
105    /// assert_eq!(reader.position(), 0);
106    /// ```
107    pub fn position(&self) -> u64 {
108        self.position
109    }
110
111    /// Returns the current virtual position of the stream.
112    ///
113    /// # Examples
114    ///
115    /// ```
116    /// # use std::io;
117    /// use noodles_bgzf as bgzf;
118    /// let reader = bgzf::Reader::new(io::empty());
119    /// assert_eq!(reader.virtual_position(), bgzf::VirtualPosition::from(0));
120    /// ```
121    pub fn virtual_position(&self) -> VirtualPosition {
122        self.block.virtual_position()
123    }
124
125    fn read_nonempty_block_with<F>(&mut self, mut f: F) -> io::Result<usize>
126    where
127        F: FnMut(&[u8], &mut Block) -> io::Result<()>,
128    {
129        use self::frame::read_frame_into;
130
131        while read_frame_into(&mut self.inner, &mut self.buf)?.is_some() {
132            f(&self.buf, &mut self.block)?;
133
134            self.block.set_position(self.position);
135            self.position += self.block.size();
136
137            if self.block.data().len() > 0 {
138                break;
139            }
140        }
141
142        Ok(self.block.data().len())
143    }
144
145    fn read_block(&mut self) -> io::Result<usize> {
146        use self::frame::parse_block;
147        self.read_nonempty_block_with(parse_block)
148    }
149
150    fn read_block_into_buf(&mut self, buf: &mut [u8]) -> io::Result<usize> {
151        use self::frame::parse_block_into_buf;
152        self.read_nonempty_block_with(|src, block| parse_block_into_buf(src, block, buf))
153    }
154}
155
156impl<R> Reader<R>
157where
158    R: Read + Seek,
159{
160    /// Seeks the stream to the given virtual position.
161    ///
162    /// The underlying stream's cursor is first moved the the compressed position. A block is read,
163    /// decompressed, and has its own cursor moved to the uncompressed position.
164    ///
165    /// # Examples
166    ///
167    /// ```
168    /// # use std::io;
169    /// use noodles_bgzf as bgzf;
170    /// let mut reader = bgzf::Reader::new(io::empty());
171    /// reader.seek(bgzf::VirtualPosition::MIN)?;
172    /// # Ok::<(), io::Error>(())
173    /// ```
174    pub fn seek(&mut self, pos: VirtualPosition) -> io::Result<VirtualPosition> {
175        let (cpos, upos) = pos.into();
176
177        self.inner.seek(SeekFrom::Start(cpos))?;
178        self.position = cpos;
179
180        self.read_block()?;
181
182        self.block.data_mut().set_position(usize::from(upos));
183
184        Ok(pos)
185    }
186
187    /// Seeks the stream to the given uncompressed position.
188    ///
189    /// # Examples
190    ///
191    /// ```
192    /// # use std::io;
193    /// use noodles_bgzf::{self as bgzf, gzi};
194    ///
195    /// let mut reader = bgzf::Reader::new(io::empty());
196    ///
197    /// let index = gzi::Index::default();
198    /// reader.seek_by_uncompressed_position(&index, 0)?;
199    /// # Ok::<_, io::Error>(())
200    /// ```
201    pub fn seek_by_uncompressed_position(
202        &mut self,
203        index: &gzi::Index,
204        pos: u64,
205    ) -> io::Result<u64> {
206        let virtual_position = index.query(pos)?;
207        self.seek(virtual_position)?;
208        Ok(pos)
209    }
210}
211
212impl<R> Read for Reader<R>
213where
214    R: Read,
215{
216    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
217        // If a new block is about to be read and the given buffer is guaranteed to be larger than
218        // the next block, reading to the block buffer can be skipped. The uncompressed data is
219        // decoded into the given buffer to avoid having to subsequently recopy it from the block.
220        if !self.block.data().has_remaining() && buf.len() >= BGZF_MAX_ISIZE {
221            self.read_block_into_buf(buf)
222        } else {
223            let mut src = self.fill_buf()?;
224            let amt = src.read(buf)?;
225            self.consume(amt);
226            Ok(amt)
227        }
228    }
229
230    fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<()> {
231        if let Some(src) = self.block.data().as_ref().get(..buf.len()) {
232            buf.copy_from_slice(src);
233            self.consume(src.len());
234            Ok(())
235        } else {
236            default_read_exact(self, buf)
237        }
238    }
239}
240
241impl<R> BufRead for Reader<R>
242where
243    R: Read,
244{
245    fn consume(&mut self, amt: usize) {
246        self.block.data_mut().consume(amt);
247    }
248
249    fn fill_buf(&mut self) -> io::Result<&[u8]> {
250        if !self.block.data().has_remaining() {
251            self.read_block()?;
252        }
253
254        Ok(self.block.data().as_ref())
255    }
256}
257
258impl<R> crate::io::Read for Reader<R>
259where
260    R: Read,
261{
262    fn virtual_position(&self) -> VirtualPosition {
263        self.block.virtual_position()
264    }
265}
266
267impl<R> crate::io::BufRead for Reader<R> where R: Read {}
268
269impl<R> crate::io::Seek for Reader<R>
270where
271    R: Read + Seek,
272{
273    fn seek_to_virtual_position(&mut self, pos: VirtualPosition) -> io::Result<VirtualPosition> {
274        self.seek(pos)
275    }
276
277    fn seek_with_index(&mut self, index: &gzi::Index, pos: SeekFrom) -> io::Result<u64> {
278        match pos {
279            SeekFrom::Start(pos) => self.seek_by_uncompressed_position(index, pos),
280            _ => unimplemented!(),
281        }
282    }
283}
284
/// Reads from `reader` until `buf` is completely filled.
///
/// Reads that fail with [`io::ErrorKind::Interrupted`] are retried. Any other read error is
/// returned as is.
///
/// # Errors
///
/// Returns an error of kind [`io::ErrorKind::UnexpectedEof`] if the reader reports end of stream
/// (a read of 0 bytes) before `buf` is full.
pub(crate) fn default_read_exact<R>(reader: &mut R, mut buf: &mut [u8]) -> io::Result<()>
where
    R: Read,
{
    loop {
        if buf.is_empty() {
            return Ok(());
        }

        let n = match reader.read(buf) {
            Ok(n) => n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };

        if n == 0 {
            return Err(io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "failed to fill whole buffer",
            ));
        }

        // Only the part of the destination that is still unfilled remains to be read.
        buf = &mut buf[n..];
    }
}
307
#[cfg(test)]
mod tests {
    use std::io::Cursor;

    use super::*;

    // A block holding b"noodles".
    #[rustfmt::skip]
    const NOODLES_BLOCK: [u8; 35] = [
        0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x06, 0x00, 0x42, 0x43,
        0x02, 0x00, 0x22, 0x00, 0xcb, 0xcb, 0xcf, 0x4f, 0xc9, 0x49, 0x2d, 0x06, 0x00, 0xa1,
        0x58, 0x2a, 0x80, 0x07, 0x00, 0x00, 0x00,
    ];

    // A block holding b"bgzf".
    #[rustfmt::skip]
    const BGZF_BLOCK: [u8; 32] = [
        0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x06, 0x00, 0x42, 0x43,
        0x02, 0x00, 0x1f, 0x00, 0x4b, 0x4a, 0xaf, 0x4a, 0x03, 0x00, 0x20, 0x68, 0xf2, 0x8c,
        0x04, 0x00, 0x00, 0x00,
    ];

    // A block holding b""; the same bytes are used as the EOF block.
    #[rustfmt::skip]
    const EMPTY_BLOCK: [u8; 28] = [
        0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x06, 0x00, 0x42, 0x43,
        0x02, 0x00, 0x1b, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    ];

    #[test]
    fn test_read_with_empty_block() -> io::Result<()> {
        // block 0 (b"noodles"), block 1 (b""), block 2 (b"bgzf"), EOF block
        let data = [
            &NOODLES_BLOCK[..],
            &EMPTY_BLOCK[..],
            &BGZF_BLOCK[..],
            &EMPTY_BLOCK[..],
        ]
        .concat();

        let mut reader = Reader::new(data.as_slice());
        let mut actual = Vec::new();
        reader.read_to_end(&mut actual)?;

        assert_eq!(actual, b"noodlesbgzf");

        Ok(())
    }

    #[test]
    fn test_seek() -> Result<(), Box<dyn std::error::Error>> {
        // block 0 (b"noodles"), EOF block
        let data = [&NOODLES_BLOCK[..], &EMPTY_BLOCK[..]].concat();

        let eof = VirtualPosition::try_from((63, 0))?;

        let mut reader = Reader::new(Cursor::new(data.as_slice()));

        let mut actual = Vec::new();
        reader.read_to_end(&mut actual)?;

        assert_eq!(reader.virtual_position(), eof);

        let target = VirtualPosition::try_from((0, 3))?;
        reader.seek(target)?;

        actual.clear();
        reader.read_to_end(&mut actual)?;

        assert_eq!(actual, b"dles");
        assert_eq!(reader.virtual_position(), eof);

        Ok(())
    }

    #[test]
    fn test_seek_by_uncompressed_position() -> io::Result<()> {
        // block 0 (b"noodles"), block 1 (b"bgzf"), EOF block
        let data = [&NOODLES_BLOCK[..], &BGZF_BLOCK[..], &EMPTY_BLOCK[..]].concat();

        let index = gzi::Index::from(vec![(35, 7)]);

        let mut reader = Reader::new(Cursor::new(data.as_slice()));

        reader.seek_by_uncompressed_position(&index, 3)?;
        let mut first = [0; 4];
        reader.read_exact(&mut first)?;
        assert_eq!(&first, b"dles");

        reader.seek_by_uncompressed_position(&index, 8)?;
        let mut second = [0; 2];
        reader.read_exact(&mut second)?;
        assert_eq!(&second, b"gz");

        Ok(())
    }
}