rust_htslib/bgzf/
mod.rs

1// Copyright 2020 Manuel Landesfeind, Evotec International GmbH
2// Licensed under the MIT license (http://opensource.org/licenses/MIT)
3// This file may not be copied, modified, or distributed
4// except according to those terms.
5
6//!
//! Module for working with bgzipped files.
8//!
9
10use std::ffi;
11use std::path::Path;
12use url::Url;
13
14use crate::htslib;
15use crate::tpool::ThreadPool;
16
17use crate::errors::{Error, Result};
18
19fn path_as_bytes<'a, P: 'a + AsRef<Path>>(path: P, must_exist: bool) -> Result<Vec<u8>> {
20    if path.as_ref().exists() || !must_exist {
21        Ok(path
22            .as_ref()
23            .to_str()
24            .ok_or(Error::NonUnicodePath)?
25            .as_bytes()
26            .to_owned())
27    } else {
28        Err(Error::FileNotFound {
29            path: path.as_ref().to_owned(),
30        })
31    }
32}
33
34/// Test if a file is a Bgzip compressed file
35///
36/// # Arguments
37///
38/// * `path` - the path to test.
39///
40/// # Returns:
41/// Will return `Ok(true)` or `Ok(false)` if the file at `path` is BGZIP compressed. Will return an `Err` in
42/// cases where no testing is possible.
43pub fn is_bgzip<P: AsRef<Path>>(path: P) -> Result<bool, Error> {
44    let byte_path = path_as_bytes(path, true)?;
45    let cpath = ffi::CString::new(byte_path).unwrap();
46    let is_bgzf = unsafe { htslib::bgzf_is_bgzf(cpath.as_ptr()) == 1 };
47    Ok(is_bgzf)
48}
49
50/// A reader that transparently reads uncompressed, gzip, and bgzip files.
51#[derive(Debug)]
52pub struct Reader {
53    inner: *mut htslib::BGZF,
54}
55
56impl Reader {
57    /// Create a new Reader to read from stdin.
58    pub fn from_stdin() -> Result<Self, Error> {
59        Self::new(b"-")
60    }
61
62    /// Create a new Reader from a path.
63    ///
64    /// # Arguments
65    ///
66    /// * `path` - the path to open.
67    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self, Error> {
68        Self::new(&path_as_bytes(path, true)?)
69    }
70
71    /// Create a new Reader from an URL.
72    ///
73    /// # Arguments
74    ///
75    /// * `url` - the url to open
76    pub fn from_url(url: &Url) -> Result<Self, Error> {
77        Self::new(url.as_str().as_bytes())
78    }
79
80    /// Internal function to create a Reader from some sort of path (could be file path but also URL).
81    /// The path or URL will be handled by the c-implementation transparently.
82    ///
83    /// # Arguments
84    ///
85    /// * `path` - the path or URL to open
86    fn new(path: &[u8]) -> Result<Self, Error> {
87        let mode = ffi::CString::new("r").unwrap();
88        let cpath = ffi::CString::new(path).unwrap();
89        let inner = unsafe { htslib::bgzf_open(cpath.as_ptr(), mode.as_ptr()) };
90        if inner != std::ptr::null_mut() {
91            Ok(Self { inner })
92        } else {
93            Err(Error::FileOpen {
94                path: String::from_utf8(path.to_vec()).unwrap(),
95            })
96        }
97    }
98
99    /// Set the thread pool to use for parallel decompression.
100    ///
101    /// # Arguments
102    ///
103    /// * `tpool` - the thread-pool to use
104    pub fn set_thread_pool(&mut self, tpool: &ThreadPool) -> Result<()> {
105        let b = tpool.handle.borrow_mut();
106        let r = unsafe {
107            htslib::bgzf_thread_pool(self.inner, b.inner.pool as *mut _, 0) // let htslib decide on the queue-size
108        };
109
110        if r != 0 {
111            Err(Error::ThreadPool)
112        } else {
113            Ok(())
114        }
115    }
116}
117
118impl std::io::Read for Reader {
119    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
120        let nbytes = unsafe {
121            htslib::bgzf_read(self.inner, buf.as_mut_ptr() as *mut libc::c_void, buf.len())
122        };
123        if nbytes < 0 {
124            Err(std::io::Error::new(
125                std::io::ErrorKind::Other,
126                "Can not read",
127            ))
128        } else {
129            Ok(nbytes as usize)
130        }
131    }
132}
133
134/// The CompressionLevel used by the underlying GZIP writer
135/// Note that the special level NoCompression will not use the GZIP writer.
136/// Compression levels in BGZF files
137///
138/// * Uncompressed: No compression, zlib level 0
139/// * Fastest: Lowest compression level, zlib level 1
140/// * Maximum: Highest compression level, zlib level 9
141/// * Default: Default compression level, zlib level 6
142/// * Level(i): Custom compression level in the range [0, 9]
143/// * NoCompression: No compression, zlib not used. Output will be identical to input
144#[derive(Debug, Clone, Copy)]
145pub enum CompressionLevel {
146    Default,
147    NoCompression,
148    Uncompressed,
149    Fastest,
150    Maximum,
151    Level(i8),
152}
153impl CompressionLevel {
154    // Convert and check the variants of the `CompressionLevel` enum to a numeric level
155    fn convert(self) -> Result<i8> {
156        match self {
157            CompressionLevel::NoCompression => Ok(-2),
158            CompressionLevel::Default => Ok(-1),
159            CompressionLevel::Uncompressed => Ok(0),
160            CompressionLevel::Fastest => Ok(1),
161            CompressionLevel::Maximum => Ok(9),
162            CompressionLevel::Level(i @ -2..=9) => Ok(i),
163            CompressionLevel::Level(i) => Err(Error::BgzfInvalidCompressionLevel { level: i }),
164        }
165    }
166}
167
/// A writer that writes uncompressed, gzip, and bgzip files.
///
/// The handle is opened through `bgzf_open` with a write mode derived from the
/// requested `CompressionLevel`, and closed when the writer is dropped.
#[derive(Debug)]
pub struct Writer {
    // Raw htslib handle returned by `bgzf_open`.
    inner: *mut htslib::BGZF,
    // Clone of the pool passed to `set_thread_pool`; `None` until a pool is set.
    // NOTE(review): presumably stored so the pool outlives `inner` — confirm.
    tpool: Option<ThreadPool>,
}
174
175impl Writer {
176    /// Create a new Writer to write to stdout with default compression.
177    pub fn from_stdout() -> Result<Self, Error> {
178        Self::from_stdout_with_compression(CompressionLevel::Default)
179    }
180
181    /// Create a new Writer to write to stdout with specific compression
182    ///
183    /// # Arguments
184    ///
185    /// * `level` the compression level to use
186    pub fn from_stdout_with_compression(level: CompressionLevel) -> Result<Self, Error> {
187        Self::new(b"-", level)
188    }
189
190    /// Create a new Writer from a path with default compression.
191    ///
192    /// # Arguments
193    ///
194    /// * `path` - the path to open.
195    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self, Error> {
196        Self::from_path_with_level(path, CompressionLevel::Default)
197    }
198
199    /// Create a new Writer from a path with a specific compression level.
200    ///
201    /// # Arguments
202    ///
203    /// * `path` - the path to open.
204    pub fn from_path_with_level<P: AsRef<Path>>(
205        path: P,
206        level: CompressionLevel,
207    ) -> Result<Self, Error> {
208        Self::new(&path_as_bytes(path, false)?, level)
209    }
210
211    /// Internal function to create a Writer from a file path
212    ///
213    /// # Arguments
214    ///
215    /// * `path` - the path or URL to open
216    fn new(path: &[u8], level: CompressionLevel) -> Result<Self, Error> {
217        let mode = Self::get_open_mode(level)?;
218        let cpath = ffi::CString::new(path).unwrap();
219        let inner = unsafe { htslib::bgzf_open(cpath.as_ptr(), mode.as_ptr()) };
220        if inner != std::ptr::null_mut() {
221            Ok(Self { inner, tpool: None })
222        } else {
223            Err(Error::FileOpen {
224                path: String::from_utf8(path.to_vec()).unwrap(),
225            })
226        }
227    }
228
229    /// Internal function to convert compression level to "mode"
230    /// bgzf.c expects mode for writers to be one of: 'w', 'wu', 'w#', where # is 0-9.
231    /// # Arguments
232    ///
233    /// * `level` - the level of compression to use
234    fn get_open_mode(level: CompressionLevel) -> Result<ffi::CString, Error> {
235        let write_string = match level.convert() {
236            Ok(-2) => "wu".to_string(),
237            Ok(-1) => "w".to_string(),
238            Ok(n @ 0..=9) => format!("w{}", n),
239            Err(e) => return Err(e),
240            // This should be unreachable
241            Ok(i) => return Err(Error::BgzfInvalidCompressionLevel { level: i }),
242        };
243        return Ok(ffi::CString::new(write_string).unwrap());
244    }
245
246    /// Set the thread pool to use for parallel compression.
247    ///
248    /// # Arguments
249    ///
250    /// * `tpool` - the thread-pool to use
251    pub fn set_thread_pool(&mut self, tpool: &ThreadPool) -> Result<()> {
252        self.tpool = Some(tpool.clone());
253        let b = tpool.handle.borrow_mut();
254        let r = unsafe {
255            htslib::bgzf_thread_pool(self.inner, b.inner.pool as *mut _, 0) // let htslib decide on the queue-size
256        };
257
258        if r != 0 {
259            Err(Error::ThreadPool)
260        } else {
261            Ok(())
262        }
263    }
264}
265
266impl std::io::Write for Writer {
267    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
268        let nbytes =
269            unsafe { htslib::bgzf_write(self.inner, buf.as_ptr() as *mut libc::c_void, buf.len()) };
270        if nbytes < 0 {
271            Err(std::io::Error::new(
272                std::io::ErrorKind::Other,
273                "Can not write",
274            ))
275        } else {
276            Ok(nbytes as usize)
277        }
278    }
279
280    fn flush(&mut self) -> std::io::Result<()> {
281        let exit_code: i32 = unsafe { htslib::bgzf_flush(self.inner) };
282        if exit_code == 0 {
283            Ok(())
284        } else {
285            Err(std::io::Error::new(
286                std::io::ErrorKind::Other,
287                "Can not flush",
288            ))
289        }
290    }
291}
292
293impl std::ops::Drop for Writer {
294    fn drop(&mut self) {
295        unsafe {
296            htslib::bgzf_close(self.inner);
297        }
298    }
299}
300
301#[cfg(test)]
302mod tests {
303    use super::*;
304    use std::io::Read;
305    use std::io::Write;
306
    // Define paths to the test files. All of them live in `test/bgzip/` below
    // the crate root (`CARGO_MANIFEST_DIR`): a plain-text VCF, a gzip
    // compressed one, and a bgzip compressed one.
    const FN_PLAIN: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/test/bgzip/plain.vcf");
    const FN_GZIP: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/test/bgzip/gzip.vcf.gz");
    const FN_BGZIP: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/test/bgzip/bgzip.vcf.gz");

    // Text of `plain.vcf`, embedded at compile time; the tests compare what
    // they read or write against it.
    const CONTENT: &str = include_str!("../../test/bgzip/plain.vcf");
313
314    #[test]
315    fn test_is_bgzip_plain() {
316        assert!(
317            !is_bgzip(FN_PLAIN).unwrap(),
318            "Plain file not detected as BGZIP"
319        );
320        assert!(
321            !is_bgzip(FN_GZIP).unwrap(),
322            "Zip file not detected as BGZIP"
323        );
324        assert!(is_bgzip(FN_BGZIP).unwrap(), "Bgzip file detected as BGZIP");
325    }
326
327    #[test]
328    fn test_open_plain() {
329        let r_result = Reader::from_path(FN_PLAIN);
330        assert!(r_result.is_ok(), "Open plain file with Bgzip reader");
331
332        let mut my_content = String::new();
333        let reading_result = r_result.unwrap().read_to_string(&mut my_content);
334        assert!(
335            reading_result.is_ok(),
336            "Reading plain file into buffer is ok"
337        );
338        assert_eq!(
339            reading_result.unwrap(),
340            190,
341            "Reading plain file into buffer is correct size"
342        );
343        assert_eq!(
344            my_content, CONTENT,
345            "Reading plain file with correct content"
346        );
347    }
348
349    #[test]
350    fn test_open_gzip() {
351        let r_result = Reader::from_path(FN_GZIP);
352        assert!(r_result.is_ok(), "Open gzip file with Bgzip reader");
353
354        let mut my_content = String::new();
355        let reading_result = r_result.unwrap().read_to_string(&mut my_content);
356        assert!(
357            reading_result.is_ok(),
358            "Reading gzip file into buffer is ok"
359        );
360        assert_eq!(
361            reading_result.unwrap(),
362            190,
363            "Reading gzip file into buffer is correct size"
364        );
365        assert_eq!(
366            my_content, CONTENT,
367            "Reading gzip file with correct content"
368        );
369    }
370
371    #[test]
372    fn test_open_bgzip() {
373        let r_result = Reader::from_path(FN_BGZIP);
374        assert!(r_result.is_ok(), "Open bgzip file with Bgzip reader");
375
376        let mut my_content = String::new();
377        let reading_result = r_result.unwrap().read_to_string(&mut my_content);
378        assert!(
379            reading_result.is_ok(),
380            "Reading bgzip file into buffer is ok"
381        );
382        assert_eq!(
383            reading_result.unwrap(),
384            190,
385            "Reading bgzip file into buffer is correct size"
386        );
387        assert_eq!(
388            my_content, CONTENT,
389            "Reading bgzip file with correct content"
390        );
391    }
392    #[test]
393    fn test_set_threadpool() {
394        let r_result = Reader::from_path(FN_BGZIP);
395        assert!(r_result.is_ok(), "Open bgzip file with Bgzip reader");
396        let mut r = r_result.unwrap();
397
398        let tpool_result = ThreadPool::new(5);
399        assert!(tpool_result.is_ok(), "Creating thread pool");
400        let tpool = tpool_result.unwrap();
401
402        let set_result = r.set_thread_pool(&tpool);
403        assert_eq!(set_result, Ok(()), "Setting thread pool okay");
404
405        let mut my_content = String::new();
406        let reading_result = r.read_to_string(&mut my_content);
407        assert!(
408            reading_result.is_ok(),
409            "Reading bgzip file into buffer is ok - using a threadpool"
410        );
411        assert_eq!(
412            reading_result.unwrap(),
413            190,
414            "Reading bgzip file into buffer is correct size using a threadpool"
415        );
416        assert_eq!(
417            my_content, CONTENT,
418            "Reading bgzip file with correct content using a threadpool"
419        );
420    }
421
422    #[test]
423    fn test_write_plain() {
424        let tmp = tempfile::Builder::new()
425            .prefix("rust-htslib")
426            .tempdir()
427            .expect("Cannot create temp dir");
428        let out_path = tmp.path().join("test.vcf");
429        println!("{:?}", out_path);
430
431        {
432            let w_result = Writer::from_path_with_level(&out_path, CompressionLevel::NoCompression);
433            if let Err(ref e) = w_result {
434                println!("w_result is {}", e);
435            }
436            assert!(w_result.is_ok(), "Create plain file with Bgzip writer");
437            assert!(out_path.exists(), "Plain file is created with Bgzip writer");
438            let mut w = w_result.unwrap();
439            let write_result = w.write_all(CONTENT.as_bytes());
440            assert!(
441                write_result.is_ok(),
442                "Plain file can write with Bgzip writer"
443            );
444        } // let Writer fall out of scope and implicitly close
445        assert!(
446            !is_bgzip(&out_path).unwrap(),
447            "NoCompression file should not be detected as BGZIP"
448        );
449        let my_content = std::fs::read_to_string(&out_path).unwrap();
450        assert_eq!(
451            my_content, CONTENT,
452            "Writing bgzip file with no compression"
453        );
454
455        tmp.close().expect("Failed to delete temp dir");
456    }
457
458    #[test]
459    fn test_write_default() {
460        let tmp = tempfile::Builder::new()
461            .prefix("rust-htslib")
462            .tempdir()
463            .expect("Cannot create temp dir");
464        let out_path = tmp.path().join("test.vcf.bgzf");
465        println!("{:?}", out_path);
466        {
467            let w_result = Writer::from_path(&out_path);
468            if let Err(ref e) = w_result {
469                println!("w_result is {}", e);
470            }
471            assert!(w_result.is_ok(), "Create bgzip file with Bgzip writer");
472            assert!(
473                std::path::Path::new(&out_path).exists(),
474                "Bgzip file is created with Bgzip writer"
475            );
476            let mut w = w_result.unwrap();
477            let write_result = w.write_all(CONTENT.as_bytes());
478            assert!(
479                write_result.is_ok(),
480                "Bgzip file can write with Bgzip writer"
481            );
482        } // let Writer fall out of scope and implicitly close
483
484        // Read in with bgzip reader
485        let mut my_content = String::new();
486        Reader::from_path(&out_path)
487            .unwrap()
488            .read_to_string(&mut my_content)
489            .unwrap();
490        assert_eq!(
491            my_content, CONTENT,
492            "Writing bgzip file with default compression"
493        );
494
495        assert!(
496            is_bgzip(&out_path).unwrap(),
497            "Default BGZIP file detected as BGZIP"
498        );
499        tmp.close().expect("Failed to delete temp dir");
500    }
501
502    #[test]
503    fn test_write_compression_levels() {
504        let tmp = tempfile::Builder::new()
505            .prefix("rust-htslib")
506            .tempdir()
507            .expect("Cannot create temp dir");
508        let out_path = tmp.path().join("test.vcf.bgzf");
509
510        // Test all levels except NoCompression
511        let compression_levels = vec![
512            CompressionLevel::Fastest,
513            CompressionLevel::Maximum,
514            CompressionLevel::Uncompressed,
515        ]
516        .into_iter()
517        .chain((-1..=9_i8).map(|n| CompressionLevel::Level(n)));
518
519        for level in compression_levels {
520            {
521                let w_result = Writer::from_path_with_level(&out_path, level);
522                if let Err(ref e) = w_result {
523                    println!("w_result is {}", e);
524                }
525                assert!(w_result.is_ok(), "Create bgzip file with Bgzip writer");
526                assert!(
527                    std::path::Path::new(&out_path).exists(),
528                    "Bgzip file is created with Bgzip writer"
529                );
530                let mut w = w_result.unwrap();
531                let write_result = w.write_all(CONTENT.as_bytes());
532                assert!(
533                    write_result.is_ok(),
534                    "Bgzip file can write with Bgzip writer"
535                );
536            } // let Writer fall out of scope and implicitly close
537
538            // Read in with bgzip reader
539            let mut my_content = String::new();
540            Reader::from_path(&out_path)
541                .unwrap()
542                .read_to_string(&mut my_content)
543                .unwrap();
544            assert_eq!(
545                my_content, CONTENT,
546                "Writing bgzip file with {:?} compression",
547                level
548            );
549
550            assert!(
551                is_bgzip(&out_path).unwrap(),
552                "Writing BGZIP file with {:?} compression detected as BGZIP",
553                level
554            );
555        }
556        tmp.close().expect("Failed to delete temp dir");
557    }
558
559    #[test]
560    fn test_write_with_threadpool() {
561        let tmp = tempfile::Builder::new()
562            .prefix("rust-htslib")
563            .tempdir()
564            .expect("Cannot create temp dir");
565        let out_path = tmp.path().join("test.vcf.bgzf");
566
567        let content = CONTENT.as_bytes();
568        println!("{:?}", out_path);
569        {
570            let w_result = Writer::from_path(&out_path);
571            if let Err(ref e) = w_result {
572                println!("w_result is {}", e);
573            }
574            assert!(w_result.is_ok(), "Create bgzip file with Bgzip threadpool");
575            assert!(
576                std::path::Path::new(&out_path).exists(),
577                "Bgzip file is created with Bgzip threadpool"
578            );
579
580            let mut w = w_result.unwrap();
581            let tpool_result = ThreadPool::new(5);
582            assert!(tpool_result.is_ok(), "Creating thread pool");
583            let tpool = tpool_result.unwrap();
584
585            let set_tpool_result = w.set_thread_pool(&tpool);
586            assert!(set_tpool_result.is_ok(), "Setting thread pool");
587
588            let write_result = w.write_all(content);
589            assert!(
590                write_result.is_ok(),
591                "Bgzip file can write with Bgzip threadpool"
592            );
593        } // let Writer fall out of scope and implicitly close
594
595        // Read in with bgzip reader
596        let mut my_content = String::new();
597        Reader::from_path(&out_path)
598            .unwrap()
599            .read_to_string(&mut my_content)
600            .unwrap();
601        assert_eq!(my_content, CONTENT, "Writing bgzip file with threadpool");
602
603        assert!(
604            is_bgzip(&out_path).unwrap(),
605            "Threadpool BGZIP file detected as BGZIP"
606        );
607
608        tmp.close().expect("Failed to delete temp dir");
609    }
610}