noodles_bgzf/
lib.rs

1#![warn(missing_docs)]
2
3//! **noodles-bgzf** handles the reading and writing of the blocked gzip format (BGZF).
4//!
5//! While the gzip format is typically a single stream, a BGZF is the concatenation of many gzip
6//! streams. Each stream is called a block, with its uncompressed data size being constrained to
7//! at most 64 KiB. This multistream gzip allows random access using [`virtual positions`].
8//!
9//! noodles-bgzf abstracts away the concept of blocks, implementing [`std::io::Read`] for the
10//! reader and [`std::io::Write`] for the writer.
11//!
12//! [`virtual positions`]: VirtualPosition
13//!
14//! # Examples
15//!
16//! ## Read an entire BGZF file
17//!
18//! ```no_run
19//! # use std::{fs::File, io::{self, Read}};
20//! use noodles_bgzf as bgzf;
21//! let mut reader = File::open("data.gz").map(bgzf::Reader::new)?;
22//! let mut data = Vec::new();
23//! reader.read_to_end(&mut data)?;
24//! # Ok::<(), io::Error>(())
25//! ```
26//!
27//! ## Write a BGZF file
28//!
29//! ```no_run
30//! # use std::{fs::File, io::{self, Write}};
31//! use noodles_bgzf as bgzf;
32//! let mut writer = File::create("data.gz").map(bgzf::Writer::new)?;
33//! writer.write_all(b"noodles-bgzf")?;
34//! # Ok::<(), io::Error>(())
35//! ```
36
37#[cfg(feature = "async")]
38pub mod r#async;
39
40mod block;
41pub(crate) mod deflate;
42mod gz;
43pub mod gzi;
44pub mod indexed_reader;
45pub mod io;
46mod multithreaded_reader;
47pub mod multithreaded_writer;
48pub mod reader;
49pub mod virtual_position;
50pub mod writer;
51
52pub use self::{
53    indexed_reader::IndexedReader, multithreaded_reader::MultithreadedReader,
54    multithreaded_writer::MultithreadedWriter, reader::Reader, virtual_position::VirtualPosition,
55    writer::Writer,
56};
57
58#[cfg(feature = "async")]
59#[deprecated(since = "0.35.0", note = "Use `bgzf::r#async::Reader` instead.")]
60pub use self::r#async::Reader as AsyncReader;
61
62#[cfg(feature = "async")]
63#[deprecated(since = "0.35.0", note = "Use `bgzf::r#async::Writer` instead.")]
64pub use self::r#async::Writer as AsyncWriter;
65
66use self::block::Block;
67
68// XLEN (2)
69const GZIP_XLEN_SIZE: usize = 2;
70
71// SI1 (1) + SI2 (1) + SLEN (2) + BSIZE (2)
72const BGZF_XLEN: usize = 6;
73
74// ยง 4.1 The BGZF compression format (2021-06-03): "Thus while `ISIZE` is stored as a `uint32_t` as
75// per the gzip format, in BGZF it is limited to the range [0, 65536]."
76const BGZF_MAX_ISIZE: usize = 1 << 16;
77
78pub(crate) const BGZF_HEADER_SIZE: usize = gz::HEADER_SIZE + GZIP_XLEN_SIZE + BGZF_XLEN;
79
#[cfg(test)]
mod tests {
    use std::io::{self, BufRead, Cursor, Read, Write};

    use super::*;

    // Pieces written one at a time by the round-trip tests. Concatenated, they spell
    // `noodles-bgzf`.
    const PIECES: [&[u8]; 3] = [b"noodles", b"-", b"bgzf"];

    /// Writes the payload piece by piece with a flush between consecutive pieces, then checks
    /// that reading the finished stream back yields the concatenated bytes.
    #[test]
    fn test_self() -> io::Result<()> {
        let mut writer = Writer::new(Vec::new());

        for (i, &piece) in PIECES.iter().enumerate() {
            // Flush before every piece except the first; the final piece is left for `finish`.
            // NOTE(review): presumably each flush makes the writer emit a separate block —
            // confirm against `Writer::flush`.
            if i > 0 {
                writer.flush()?;
            }

            writer.write_all(piece)?;
        }

        let data = writer.finish()?;

        let mut buf = Vec::new();
        Reader::new(data.as_slice()).read_to_end(&mut buf)?;

        assert_eq!(buf, b"noodles-bgzf");

        Ok(())
    }

    /// Reads a multi-line payload back line by line and checks both the lines and the virtual
    /// position observed just before each line was read.
    #[test]
    fn test_self_buffered() -> io::Result<()> {
        let mut writer = Writer::new(Vec::new());
        writer.write_all(b"noodles\n-\nbgzf\nbuffered")?;
        let data = writer.finish()?;

        let mut reader = Reader::new(data.as_slice());

        let mut lines = Vec::new();
        let mut virtual_positions = Vec::new();

        loop {
            // Sample the position before reading so that it marks where the line starts.
            let line_start = reader.virtual_position();

            let mut line = String::new();

            // A zero-length read means the input is exhausted; nothing is recorded for it.
            if reader.read_line(&mut line)? == 0 {
                break;
            }

            virtual_positions.push(line_start);
            lines.push(line);
        }

        assert_eq!(lines, ["noodles\n", "-\n", "bgzf\n", "buffered"]);

        // Each expected position pairs a first component of 0 with the offset at which the
        // corresponding line starts in the written payload.
        let expected_virtual_positions: Vec<VirtualPosition> = [0, 8, 10, 15]
            .iter()
            .map(|&upos| VirtualPosition::try_from((0, upos)).unwrap())
            .collect();

        assert_eq!(virtual_positions, expected_virtual_positions);

        Ok(())
    }

    /// Same round trip as `test_self`, but through the multithreaded writer and reader.
    #[test]
    fn test_self_multithreaded() -> io::Result<()> {
        let mut writer = MultithreadedWriter::new(Vec::new());

        for (i, &piece) in PIECES.iter().enumerate() {
            // Flush before every piece except the first; the final piece is left for `finish`.
            if i > 0 {
                writer.flush()?;
            }

            writer.write_all(piece)?;
        }

        let data = writer.finish()?;
        let mut reader = MultithreadedReader::new(Cursor::new(data));

        let mut buf = Vec::new();
        reader.read_to_end(&mut buf)?;

        assert_eq!(buf, b"noodles-bgzf");

        Ok(())
    }
}
168}