rust_htslib/faidx/
mod.rs

1// Copyright 2020 Manuel Landesfeind, Evotec International GmbH
2// Licensed under the MIT license (http://opensource.org/licenses/MIT)
3// This file may not be copied, modified, or distributed
4// except according to those terms.
5
6//!
7//! Module for working with faidx-indexed FASTA files.
8//!
9
10use std::ffi;
11use std::path::Path;
12use url::Url;
13
14use crate::htslib;
15
16use crate::errors::{Error, Result};
17use crate::utils::path_as_bytes;
18
19/// A Fasta reader.
20#[derive(Debug)]
21pub struct Reader {
22    inner: *mut htslib::faidx_t,
23}
24
25///
26/// Build a faidx for input path.
27///
28/// # Errors
29/// If indexing fails. Could be malformatted or file could not be accessible.
30///
31///```
32/// use rust_htslib::faidx::build;
33/// let path = std::path::PathBuf::from(concat!(env!("CARGO_MANIFEST_DIR"),"/test/test_cram.fa"));
34/// build(&path).expect("Failed to build fasta index");
35///```
36///
37pub fn build(
38    path: impl Into<std::path::PathBuf>,
39) -> Result<(), std::boxed::Box<dyn std::error::Error>> {
40    let path = path.into();
41    let os_path = std::ffi::CString::new(path.display().to_string())?;
42    let rc = unsafe { htslib::fai_build(os_path.as_ptr()) };
43    if rc < 0 {
44        Err(Error::FaidxBuildFailed { path })?
45    } else {
46        Ok(())
47    }
48}
49
50impl Reader {
51    /// Create a new Reader from a path.
52    ///
53    /// # Arguments
54    ///
55    /// * `path` - the path to open.
56    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self, Error> {
57        Self::new(&path_as_bytes(path, true)?)
58    }
59
60    /// Create a new Reader from an URL.
61    ///
62    /// # Arguments
63    ///
64    /// * `url` - the url to open
65    pub fn from_url(url: &Url) -> Result<Self, Error> {
66        Self::new(url.as_str().as_bytes())
67    }
68
69    /// Internal function to create a Reader from some sort of path (could be file path but also URL).
70    /// The path or URL will be handled by the c-implementation transparently.
71    ///
72    /// # Arguments
73    ///
74    /// * `path` - the path or URL to open
75    fn new(path: &[u8]) -> Result<Self, Error> {
76        let cpath = ffi::CString::new(path).unwrap();
77        let inner = unsafe { htslib::fai_load(cpath.as_ptr()) };
78        Ok(Self { inner })
79    }
80
81    /// Fetch the sequence as a byte array.
82    ///
83    /// # Arguments
84    ///
85    /// * `name` - the name of the template sequence (e.g., "chr1")
86    /// * `begin` - the offset within the template sequence (starting with 0)
87    /// * `end` - the end position to return (if smaller than `begin`, the behavior is undefined).
88    pub fn fetch_seq<N: AsRef<str>>(&self, name: N, begin: usize, end: usize) -> Result<Vec<u8>> {
89        if begin > std::i64::MAX as usize {
90            return Err(Error::FaidxPositionTooLarge);
91        }
92        if end > std::i64::MAX as usize {
93            return Err(Error::FaidxPositionTooLarge);
94        }
95        let cname = ffi::CString::new(name.as_ref().as_bytes()).unwrap();
96        let mut len_out: htslib::hts_pos_t = 0;
97        let ptr = unsafe {
98            htslib::faidx_fetch_seq64(
99                self.inner,                 //*const faidx_t,
100                cname.as_ptr(),             // c_name
101                begin as htslib::hts_pos_t, // p_beg_i
102                end as htslib::hts_pos_t,   // p_end_i
103                &mut len_out,               //len
104            )
105        };
106        let vec =
107            unsafe { Vec::from_raw_parts(ptr as *mut u8, len_out as usize, len_out as usize) };
108        Ok(vec)
109    }
110
111    /// Fetches the sequence and returns it as string.
112    ///
113    /// # Arguments
114    ///
115    /// * `name` - the name of the template sequence (e.g., "chr1")
116    /// * `begin` - the offset within the template sequence (starting with 0)
117    /// * `end` - the end position to return (if smaller than `begin`, the behavior is undefined).
118    pub fn fetch_seq_string<N: AsRef<str>>(
119        &self,
120        name: N,
121        begin: usize,
122        end: usize,
123    ) -> Result<String> {
124        let bytes = self.fetch_seq(name, begin, end)?;
125        Ok(std::str::from_utf8(&bytes).unwrap().to_owned())
126    }
127
128    /// Fetches the number of sequences in the fai index
129    pub fn n_seqs(&self) -> u64 {
130        let n = unsafe { htslib::faidx_nseq(self.inner) };
131        n as u64
132    }
133
134    /// Fetches the i-th sequence name
135    ///
136    /// # Arguments
137    ///
138    /// * `i` - index to query
139    pub fn seq_name(&self, i: i32) -> Result<String> {
140        let cname = unsafe {
141            let ptr = htslib::faidx_iseq(self.inner, i);
142            ffi::CStr::from_ptr(ptr)
143        };
144
145        let out = match cname.to_str() {
146            Ok(s) => s.to_string(),
147            Err(_) => {
148                return Err(Error::FaidxBadSeqName);
149            }
150        };
151
152        Ok(out)
153    }
154
155    /// Fetches the length of the given sequence name.
156    ///
157    /// # Arguments
158    ///
159    /// * `name` - the name of the template sequence (e.g., "chr1")
160    pub fn fetch_seq_len<N: AsRef<str>>(&self, name: N) -> u64 {
161        let cname = ffi::CString::new(name.as_ref().as_bytes()).unwrap();
162        let seq_len = unsafe { htslib::faidx_seq_len(self.inner, cname.as_ptr()) };
163        seq_len as u64
164    }
165
166    /// Returns a Result<Vector<String>> for all seq names.
167    /// # Errors
168    ///
169    /// * `errors::Error::FaidxBadSeqName` - missing sequence name for sequence id.
170    ///
171    /// If thrown, the index is malformed, and the number of sequences in the index does not match the number of sequence names available.
172    ///```
173    /// use rust_htslib::faidx::build;
174    /// let path = std::path::PathBuf::from(concat!(env!("CARGO_MANIFEST_DIR"),"/test/test_cram.fa"));
175    /// build(&path).expect("Failed to build fasta index");
176    /// let reader = rust_htslib::faidx::Reader::from_path(path).expect("Failed to open faidx");
177    /// assert_eq!(reader.seq_names(), Ok(vec!["chr1".to_string(), "chr2".to_string(), "chr3".to_string()]));
178    ///```
179    ///
180    pub fn seq_names(&self) -> Result<Vec<String>> {
181        let num_seq = self.n_seqs();
182        let mut ret = Vec::with_capacity(num_seq as usize);
183        for seq_id in 0..num_seq {
184            ret.push(self.seq_name(seq_id as i32)?);
185        }
186        Ok(ret)
187    }
188}
189
190impl Drop for Reader {
191    fn drop(&mut self) {
192        unsafe {
193            htslib::fai_destroy(self.inner);
194        }
195    }
196}
197
198#[cfg(test)]
199mod tests {
200    use super::*;
201
202    fn open_reader() -> Reader {
203        Reader::from_path(format!("{}/test/test_cram.fa", env!("CARGO_MANIFEST_DIR")))
204            .ok()
205            .unwrap()
206    }
207    #[test]
208    fn faidx_open() {
209        open_reader();
210    }
211
212    #[test]
213    fn faidx_read_chr_first_base() {
214        let r = open_reader();
215
216        let bseq = r.fetch_seq("chr1", 0, 0).unwrap();
217        assert_eq!(bseq.len(), 1);
218        assert_eq!(bseq, b"G");
219
220        let seq = r.fetch_seq_string("chr1", 0, 0).unwrap();
221        assert_eq!(seq.len(), 1);
222        assert_eq!(seq, "G");
223    }
224
225    #[test]
226    fn faidx_read_chr_start() {
227        let r = open_reader();
228
229        //for _i in 0..100_000_000 { // loop to check for memory leaks
230        let bseq = r.fetch_seq("chr1", 0, 9).unwrap();
231        assert_eq!(bseq.len(), 10);
232        assert_eq!(bseq, b"GGGCACAGCC");
233
234        let seq = r.fetch_seq_string("chr1", 0, 9).unwrap();
235        assert_eq!(seq.len(), 10);
236        assert_eq!(seq, "GGGCACAGCC");
237        //}
238    }
239
240    #[test]
241    fn faidx_read_chr_between() {
242        let r = open_reader();
243
244        let bseq = r.fetch_seq("chr1", 4, 14).unwrap();
245        assert_eq!(bseq.len(), 11);
246        assert_eq!(bseq, b"ACAGCCTCACC");
247
248        let seq = r.fetch_seq_string("chr1", 4, 14).unwrap();
249        assert_eq!(seq.len(), 11);
250        assert_eq!(seq, "ACAGCCTCACC");
251    }
252
253    #[test]
254    fn faidx_read_chr_end() {
255        let r = open_reader();
256
257        let bseq = r.fetch_seq("chr1", 110, 120).unwrap();
258        assert_eq!(bseq.len(), 10);
259        assert_eq!(bseq, b"CCCCTCCGTG");
260
261        let seq = r.fetch_seq_string("chr1", 110, 120).unwrap();
262        assert_eq!(seq.len(), 10);
263        assert_eq!(seq, "CCCCTCCGTG");
264    }
265
266    #[test]
267    fn faidx_read_twice_string() {
268        let r = open_reader();
269        let seq = r.fetch_seq_string("chr1", 110, 120).unwrap();
270        assert_eq!(seq.len(), 10);
271        assert_eq!(seq, "CCCCTCCGTG");
272
273        let seq = r.fetch_seq_string("chr1", 5, 9).unwrap();
274        assert_eq!(seq.len(), 5);
275        assert_eq!(seq, "CAGCC");
276    }
277
278    #[test]
279    fn faidx_read_twice_bytes() {
280        let r = open_reader();
281        let seq = r.fetch_seq("chr1", 110, 120).unwrap();
282        assert_eq!(seq.len(), 10);
283        assert_eq!(seq, b"CCCCTCCGTG");
284
285        let seq = r.fetch_seq("chr1", 5, 9).unwrap();
286        assert_eq!(seq.len(), 5);
287        assert_eq!(seq, b"CAGCC");
288    }
289
290    #[test]
291    fn faidx_position_too_large() {
292        let r = open_reader();
293        let position_too_large = i64::MAX as usize;
294        let res = r.fetch_seq("chr1", position_too_large, position_too_large + 1);
295        assert_eq!(res, Err(Error::FaidxPositionTooLarge));
296    }
297
298    #[test]
299    fn faidx_n_seqs() {
300        let r = open_reader();
301        assert_eq!(r.n_seqs(), 3);
302    }
303
304    #[test]
305    fn faidx_seq_name() {
306        let r = open_reader();
307        let n = r.seq_name(1).unwrap();
308        assert_eq!(n, "chr2");
309    }
310
311    #[test]
312    fn faidx_get_seq_len() {
313        let r = open_reader();
314        let chr1_len = r.fetch_seq_len("chr1");
315        let chr2_len = r.fetch_seq_len("chr2");
316        assert_eq!(chr1_len, 120u64);
317        assert_eq!(chr2_len, 120u64);
318    }
319
320    #[test]
321    fn open_many_readers() {
322        for _ in 0..500_000 {
323            let reader = open_reader();
324            drop(reader);
325        }
326    }
327}