rust_htslib/tbx/
mod.rs

1// Copyright 2018 Manuel Holtgrewe, Berlin Institute of Health.
2// Licensed under the MIT license (http://opensource.org/licenses/MIT)
3// This file may not be copied, modified, or distributed
4// except according to those terms.
5
6//! Module for working with tabix-indexed text files.
7//!
8//! This module allows to read tabix-indexed text files (such as BED) in a convenient but in a
9//! line-based (and thus format-agnostic way). For accessing tabix-inxed VCF files, using the
10//! `bcf` module is probably a better choice as this module gives you lines from the text files
11//! which you then have to take care of parsing.
12//!
13//! In general, for reading tabix-indexed files, first to open the file by creating a `tbx::Reader`
14//! objects, possibly translate the chromosome name to its numeric ID in the file, fetch the region
15//! of interest using `fetch()`, and finally iterate over the records using `records()`.
16//!
17//! # Examples
18//!
19//! ```rust,no_run
20//! use rust_htslib::tbx::{self, Read};
21//!
22//! // Create a tabix reader for reading a tabix-indexed BED file.
23//! let path_bed = "file.bed.gz";
24//! let mut tbx_reader = tbx::Reader::from_path(&path_bed)
25//!     .expect(&format!("Could not open {}", path_bed));
26//!
27//! // Resolve chromosome name to numeric ID.
28//! let tid = match tbx_reader.tid("chr1") {
29//!     Ok(tid) => tid,
30//!     Err(_) => panic!("Could not resolve 'chr1' to contig ID"),
31//! };
32//!
33//! // Set region to fetch.
34//! tbx_reader
35//!     .fetch(tid, 0, 100_000)
36//!     .expect("Could not seek to chr1:1-100,000");
37//!
38//! // Read through all records in region.
39//! for record in tbx_reader.records() {
40//!     // ... actually do some work
41//! }
42//! ```
43
44use std::ffi;
45use std::path::Path;
46use std::ptr;
47use url::Url;
48
49use crate::errors::{Error, Result};
50use crate::htslib;
51use crate::utils::path_as_bytes;
52
53/// A trait for a Tabix reader with a read method.
54pub trait Read: Sized {
55    /// Read next line into the given `Vec<u8>` (i.e., ASCII string).
56    ///
57    /// Use this method in combination with a single allocated record to avoid the reallocations
58    /// occurring with the iterator.
59    ///
60    /// # Arguments
61    ///
62    /// * `record` - the `Vec<u8>` to be filled
63    ///
64    /// # Returns
65    /// Ok(true) if record was read, Ok(false) if no more record in file
66    fn read(&mut self, record: &mut Vec<u8>) -> Result<bool>;
67
68    /// Iterator over the lines/records of the seeked region.
69    ///
70    /// Note that, while being convenient, this is less efficient than pre-allocating a
71    /// `Vec<u8>` and reading into it with the `read()` method, since every iteration involves
72    /// the allocation of a new `Vec<u8>`.
73    fn records(&mut self) -> Records<'_, Self>;
74
75    /// Return the text headers, split by line.
76    fn header(&self) -> &Vec<String>;
77}
78
79/// A Tabix file reader.
80///
81/// This struct and its associated functions are meant for reading plain-text tabix indexed
82/// by `tabix`.
83///
84/// Note that the `tabix` command from `htslib` can actually several more things, including
85/// building indices and converting BCF to VCF text output.  Both is out of scope here.
86#[derive(Debug)]
87pub struct Reader {
88    /// The header lines (if any).
89    header: Vec<String>,
90
91    /// The file to read from.
92    hts_file: *mut htslib::htsFile,
93    /// The file format information.
94    hts_format: htslib::htsExactFormat,
95    /// The tbx_t structure to read from.
96    tbx: *mut htslib::tbx_t,
97    /// The current buffer.
98    buf: htslib::kstring_t,
99    /// Iterator over the buffer.
100    itr: Option<*mut htslib::hts_itr_t>,
101
102    /// The currently fetch region's tid.
103    tid: i64,
104    /// The currently fetch region's 0-based begin pos.
105    start: i64,
106    /// The currently fetch region's 0-based end pos.
107    end: i64,
108}
109
110unsafe impl Send for Reader {}
111
112/// Redefinition of `KS_SEP_LINE` from `htslib/kseq.h`.
113const KS_SEP_LINE: i32 = 2;
114
115impl Reader {
116    /// Create a new Reader from path.
117    ///
118    /// # Arguments
119    ///
120    /// * `path` - the path to open.
121    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
122        Self::new(&path_as_bytes(path, true)?)
123    }
124
125    pub fn from_url(url: &Url) -> Result<Self> {
126        Self::new(url.as_str().as_bytes())
127    }
128
129    /// Create a new Reader.
130    ///
131    /// # Arguments
132    ///
133    /// * `path` - the path.
134    fn new(path: &[u8]) -> Result<Self> {
135        let path = ffi::CString::new(path).unwrap();
136        let c_str = ffi::CString::new("r").unwrap();
137        let hts_file = unsafe { htslib::hts_open(path.as_ptr(), c_str.as_ptr()) };
138        let hts_format: u32 = unsafe {
139            let file_format: *const hts_sys::htsFormat = htslib::hts_get_format(hts_file);
140            (*file_format).format
141        };
142
143        let tbx = unsafe { htslib::tbx_index_load(path.as_ptr()) };
144        if tbx.is_null() {
145            return Err(Error::TabixInvalidIndex);
146        }
147        let mut header = Vec::new();
148        let mut buf = htslib::kstring_t {
149            l: 0,
150            m: 0,
151            s: ptr::null_mut(),
152        };
153        unsafe {
154            while htslib::hts_getline(hts_file, KS_SEP_LINE, &mut buf) >= 0 {
155                if buf.l > 0 && i32::from(*buf.s) == (*tbx).conf.meta_char {
156                    header.push(String::from(ffi::CStr::from_ptr(buf.s).to_str().unwrap()));
157                } else {
158                    break;
159                }
160            }
161        }
162
163        Ok(Reader {
164            header,
165            hts_file,
166            hts_format,
167            tbx,
168            buf,
169            itr: None,
170            tid: -1,
171            start: -1,
172            end: -1,
173        })
174    }
175
176    /// Get sequence/target ID from sequence name.
177    pub fn tid(&self, name: &str) -> Result<u64> {
178        let name_cstr = ffi::CString::new(name.as_bytes()).unwrap();
179        let res = unsafe { htslib::tbx_name2id(self.tbx, name_cstr.as_ptr()) };
180        if res < 0 {
181            Err(Error::UnknownSequence {
182                sequence: name.to_owned(),
183            })
184        } else {
185            Ok(res as u64)
186        }
187    }
188
189    /// Fetch region given by numeric sequence number and 0-based begin and end position.
190    pub fn fetch(&mut self, tid: u64, start: u64, end: u64) -> Result<()> {
191        self.tid = tid as i64;
192        self.start = start as i64;
193        self.end = end as i64;
194
195        if let Some(itr) = self.itr {
196            unsafe {
197                htslib::hts_itr_destroy(itr);
198            }
199        }
200        let itr = unsafe {
201            htslib::hts_itr_query(
202                (*self.tbx).idx,
203                tid as i32,
204                start as i64,
205                end as i64,
206                Some(htslib::tbx_readrec),
207            )
208        };
209        if itr.is_null() {
210            self.itr = None;
211            Err(Error::Fetch)
212        } else {
213            self.itr = Some(itr);
214            Ok(())
215        }
216    }
217
218    /// Return the sequence contig names.
219    pub fn seqnames(&self) -> Vec<String> {
220        let mut result = Vec::new();
221
222        let mut nseq: i32 = 0;
223        let seqs = unsafe { htslib::tbx_seqnames(self.tbx, &mut nseq) };
224        for i in 0..nseq {
225            unsafe {
226                result.push(String::from(
227                    ffi::CStr::from_ptr(*seqs.offset(i as isize))
228                        .to_str()
229                        .unwrap(),
230                ));
231            }
232        }
233        unsafe {
234            libc::free(seqs as *mut libc::c_void);
235        };
236
237        result
238    }
239
240    /// Activate multi-threaded BGZF read support in htslib. This should permit faster
241    /// reading of large BGZF files.
242    ///
243    /// # Arguments
244    ///
245    /// * `n_threads` - number of extra background reader threads to use
246    pub fn set_threads(&mut self, n_threads: usize) -> Result<()> {
247        assert!(n_threads > 0, "n_threads must be > 0");
248
249        let r = unsafe { htslib::hts_set_threads(self.hts_file, n_threads as i32) };
250        if r != 0 {
251            Err(Error::SetThreads)
252        } else {
253            Ok(())
254        }
255    }
256}
257
258/// Return whether the two given genomic intervals overlap.
259fn overlap(tid1: i64, begin1: i64, end1: i64, tid2: i64, begin2: i64, end2: i64) -> bool {
260    (tid1 == tid2) && (begin1 < end2) && (begin2 < end1)
261}
262
263impl Read for Reader {
264    fn read(&mut self, record: &mut Vec<u8>) -> Result<bool> {
265        match self.itr {
266            Some(itr) => {
267                loop {
268                    // Try to read next line.
269                    let ret = unsafe {
270                        htslib::hts_itr_next(
271                            htslib::hts_get_bgzfp(self.hts_file),
272                            itr,
273                            //mem::transmute(&mut self.buf),
274                            &mut self.buf as *mut htslib::kstring_t as *mut libc::c_void,
275                            //mem::transmute(self.tbx),
276                            self.tbx as *mut libc::c_void,
277                        )
278                    };
279                    // Handle errors first.
280                    if ret == -1 {
281                        return Ok(false);
282                    } else if ret == -2 {
283                        return Err(Error::TabixTruncatedRecord);
284                    } else if ret < 0 {
285                        panic!("Return value should not be <0 but was: {}", ret);
286                    }
287                    // Return first overlapping record (loop will stop when `hts_itr_next(...)`
288                    // returns `< 0`).
289                    let (tid, start, end) =
290                        unsafe { ((*itr).curr_tid, (*itr).curr_beg, (*itr).curr_end) };
291                    // XXX: Careful with this tid conversion!!!
292                    if overlap(self.tid, self.start, self.end, tid as i64, start, end) {
293                        *record =
294                            unsafe { Vec::from(ffi::CStr::from_ptr(self.buf.s).to_str().unwrap()) };
295                        return Ok(true);
296                    }
297                }
298            }
299            _ => Err(Error::TabixNoIter),
300        }
301    }
302
303    fn records(&mut self) -> Records<'_, Self> {
304        Records { reader: self }
305    }
306
307    fn header(&self) -> &Vec<String> {
308        &self.header
309    }
310}
311
312impl Drop for Reader {
313    fn drop(&mut self) {
314        unsafe {
315            if self.itr.is_some() {
316                htslib::hts_itr_destroy(self.itr.unwrap());
317            }
318            htslib::tbx_destroy(self.tbx);
319            htslib::hts_close(self.hts_file);
320        }
321    }
322}
323
324/// Iterator over the lines of a tabix file.
325#[derive(Debug)]
326pub struct Records<'a, R: Read> {
327    reader: &'a mut R,
328}
329
330impl<'a, R: Read> Iterator for Records<'a, R> {
331    type Item = Result<Vec<u8>>;
332
333    #[allow(clippy::read_zero_byte_vec)]
334    fn next(&mut self) -> Option<Result<Vec<u8>>> {
335        let mut record = Vec::new();
336        match self.reader.read(&mut record) {
337            Ok(false) => None,
338            Ok(true) => Some(Ok(record)),
339            Err(err) => Some(Err(err)),
340        }
341    }
342}
343
344#[cfg(test)]
345mod tests {
346    use super::*;
347
348    #[test]
349    fn bed_basic() {
350        let reader =
351            Reader::from_path("test/tabix_reader/test_bed3.bed.gz").expect("Error opening file.");
352
353        // Check sequence name vector.
354        assert_eq!(
355            reader.seqnames(),
356            vec![String::from("chr1"), String::from("chr2")]
357        );
358
359        // Check mapping between name and idx.
360        assert_eq!(reader.tid("chr1").unwrap(), 0);
361        assert_eq!(reader.tid("chr2").unwrap(), 1);
362        assert!(reader.tid("chr3").is_err());
363    }
364
365    #[test]
366    fn bed_fetch_from_chr1_read_api() {
367        let mut reader =
368            Reader::from_path("test/tabix_reader/test_bed3.bed.gz").expect("Error opening file.");
369
370        let chr1_id = reader.tid("chr1").unwrap();
371        assert!(reader.fetch(chr1_id, 1000, 1003).is_ok());
372
373        let mut record = Vec::new();
374        assert!(reader.read(&mut record).is_ok());
375        assert_eq!(record, Vec::from("chr1\t1001\t1002"));
376        assert_eq!(reader.read(&mut record), Ok(false)); // EOF
377    }
378
379    #[test]
380    fn bed_fetch_from_chr1_iterator_api() {
381        let mut reader =
382            Reader::from_path("test/tabix_reader/test_bed3.bed.gz").expect("Error opening file.");
383
384        let chr1_id = reader.tid("chr1").unwrap();
385        assert!(reader.fetch(chr1_id, 1000, 1003).is_ok());
386
387        let records: Vec<Vec<u8>> = reader.records().map(|r| r.unwrap()).collect();
388        assert_eq!(records, vec![Vec::from("chr1\t1001\t1002")]);
389    }
390
391    #[test]
392    fn test_fails_on_bam() {
393        let reader = Reader::from_path("test/test.bam");
394        assert!(reader.is_err());
395    }
396
397    #[test]
398    fn test_fails_on_non_existiant() {
399        let reader = Reader::from_path("test/no_such_file");
400        assert!(reader.is_err());
401    }
402
403    #[test]
404    fn test_fails_on_vcf() {
405        let reader = Reader::from_path("test/test_left.vcf");
406        assert!(reader.is_err());
407    }
408
409    #[test]
410    fn test_text_header_regions() {
411        // This file has chromosome, start, and end positions with a header line.
412        Reader::from_path("test/tabix_reader/genomic_regions_header.txt.gz")
413            .expect("Error opening file.");
414    }
415
416    #[test]
417    fn test_text_header_positions() {
418        // This file has chromosome and position with a header line, indexed with
419        // `tabix -b2 -e2 <file>`.
420        Reader::from_path("test/tabix_reader/genomic_positions_header.txt.gz")
421            .expect("Error opening file.");
422    }
423
424    #[test]
425    fn test_text_bad_header() {
426        // This is a duplicate of the above file but the index file is nonsense text.
427        Reader::from_path("test/tabix_reader/bad_header.txt.gz")
428            .expect_err("Invalid index file should fail.");
429    }
430}