rust_htslib/bcf/
header.rs

1// Copyright 2014 Johannes Köster.
2// Licensed under the MIT license (http://opensource.org/licenses/MIT)
3// This file may not be copied, modified, or distributed
4// except according to those terms.
5//! Module for working with VCF or BCF headers.
6//!
7//! # Examples
//! From the header of a VCF file we can obtain:
//!   - the number of samples,
//!   - the names of the samples,
//!   - the index of a sample given its name,
//!   - the number of contigs and the declared type of `INFO`/`FORMAT` tags.
12//! ```
13//! use crate::rust_htslib::bcf::{Reader, Read};
14//! use std::io::Read as IoRead;
15//!
16//! let path = &"test/test_string.vcf";
17//! let mut bcf = Reader::from_path(path).expect("Error opening file.");
18//! let header = bcf.header();
19//! assert_eq!(header.sample_count(), 2);  // Sample count
20//! let mut s = String::new();
21//! for (i, mut x) in header.samples().into_iter().enumerate() {
22//!     x.read_to_string(&mut s);  // Read sample name in to `s`
23//!     println!("{}", s);  // output sample name
24//! }
25//! assert_eq!(header.sample_id(b"one").unwrap(), 0);  // Sample index wrapped in Option<usize>
26//! assert_eq!(header.sample_id(b"two").unwrap(), 1);  // Sample index wrapped in Option<usize>
27//! assert!(header.sample_id(b"non existent sample").is_none());  // Return none if not found
28//!
//! assert_eq!(header.contig_count(), 1); // Number of contigs in the header.
30//! // obtain the data type of an INFO field
31//! let (tag_type, tag_length) = header.info_type(b"S1").unwrap();
32//! let (fmt_type, fmt_length) = header.format_type(b"GT").unwrap();
33//! ```
34
35use std::ffi;
36use std::os::raw::c_char;
37use std::rc::Rc;
38use std::slice;
39use std::str;
40
41use crate::htslib;
42
43use linear_map::LinearMap;
44
45use crate::errors::{Error, Result};
46
/// Sample index map stored in [`Header::subset`].
///
/// [`Header::from_template_subset`] allocates one `i32` slot per requested sample and hands the
/// buffer to htslib's `bcf_hdr_subset` as its `imap` argument.
pub type SampleSubset = Vec<i32>;
48
custom_derive! {
    /// A newtype for IDs from BCF headers.
    ///
    /// Wraps the numeric (`u32`) identifier produced by header lookups such as
    /// `HeaderView::name_to_id` and `HeaderView::sample_to_id`.
    #[derive(
        NewtypeFrom,
        NewtypeDeref,
        PartialEq,
        PartialOrd,
        Eq,
        Ord,
        Copy,
        Clone,
        Debug
    )]
    pub struct Id(pub u32);
}
64
/// A BCF header.
///
/// Wraps a raw htslib `bcf_hdr_t` pointer; the pointer is passed to `bcf_hdr_destroy` when the
/// `Header` is dropped.
#[derive(Debug)]
pub struct Header {
    /// Raw pointer to the underlying htslib header structure.
    pub inner: *mut htslib::bcf_hdr_t,
    /// Sample index map; `Some` only for headers built by `Header::from_template_subset`.
    pub subset: Option<SampleSubset>,
}
71
72impl Default for Header {
73    fn default() -> Self {
74        Self::new()
75    }
76}
77
78impl Header {
79    /// Create a new (empty) `Header`.
80    pub fn new() -> Self {
81        let c_str = ffi::CString::new(&b"w"[..]).unwrap();
82        Header {
83            inner: unsafe { htslib::bcf_hdr_init(c_str.as_ptr()) },
84            subset: None,
85        }
86    }
87
88    /// Create a new `Header` using the given `HeaderView` as the template.
89    ///
90    /// After construction, you can modify the header independently from the template `header`.
91    ///
92    /// # Arguments
93    ///
94    /// - `header` - The `HeaderView` to use as the template.
95    pub fn from_template(header: &HeaderView) -> Self {
96        Header {
97            inner: unsafe { htslib::bcf_hdr_dup(header.inner) },
98            subset: None,
99        }
100    }
101
102    /// Create a new `Header` using the given `HeaderView` as as template, but subsetting to the
103    /// given `samples`.
104    ///
105    /// # Arguments
106    ///
107    /// - `header` - The `HeaderView` to use for the template.
108    /// - `samples` - A slice of byte-encoded (`[u8]`) sample names.
109    pub fn from_template_subset(header: &HeaderView, samples: &[&[u8]]) -> Result<Self> {
110        let mut imap = vec![0; samples.len()];
111        let names: Vec<_> = samples
112            .iter()
113            .map(|&s| ffi::CString::new(s).unwrap())
114            .collect();
115        let name_pointers: Vec<_> = names.iter().map(|s| s.as_ptr() as *mut i8).collect();
116        let inner = unsafe {
117            htslib::bcf_hdr_subset(
118                header.inner,
119                samples.len() as i32,
120                name_pointers.as_ptr() as *const *mut c_char,
121                imap.as_mut_ptr() as *mut i32,
122            )
123        };
124        if inner.is_null() {
125            Err(Error::BcfDuplicateSampleNames)
126        } else {
127            Ok(Header {
128                inner,
129                subset: Some(imap),
130            })
131        }
132    }
133
134    /// Add a `sample` to the header.
135    ///
136    /// # Arguments
137    ///
138    /// - `sample` - Name of the sample to add (to the end of the sample list).
139    pub fn push_sample(&mut self, sample: &[u8]) -> &mut Self {
140        let c_str = ffi::CString::new(sample).unwrap();
141        unsafe { htslib::bcf_hdr_add_sample(self.inner, c_str.as_ptr()) };
142        self
143    }
144
145    /// Add a record to the header.
146    ///
147    /// # Arguments
148    ///
149    /// - `record` - String representation of the header line
150    ///
151    /// # Example
152    ///
153    /// ```rust,ignore
154    /// header.push_record(format!("##contig=<ID={},length={}>", "chrX", 155270560).as_bytes());
155    /// ```
156    pub fn push_record(&mut self, record: &[u8]) -> &mut Self {
157        let c_str = ffi::CString::new(record).unwrap();
158        unsafe { htslib::bcf_hdr_append(self.inner, c_str.as_ptr()) };
159        self
160    }
161
162    /// Remove a `FILTER` entry from the header.
163    ///
164    /// # Arguments
165    ///
166    /// - `tag` - Name of the `FLT` tag to remove.
167    pub fn remove_filter(&mut self, tag: &[u8]) -> &mut Self {
168        self.remove_impl(tag, htslib::BCF_HL_FLT)
169    }
170
171    /// Remove an `INFO` entry from the header.
172    ///
173    /// # Arguments
174    ///
175    /// - `tag` - Name of the `INFO` tag to remove.
176    pub fn remove_info(&mut self, tag: &[u8]) -> &mut Self {
177        self.remove_impl(tag, htslib::BCF_HL_INFO)
178    }
179
180    /// Remove a `FORMAT` entry from the header.
181    ///
182    /// # Arguments
183    ///
184    /// - `tag` - Name of the `FORMAT` tag to remove.
185    pub fn remove_format(&mut self, tag: &[u8]) -> &mut Self {
186        self.remove_impl(tag, htslib::BCF_HL_FMT)
187    }
188
189    /// Remove a contig entry from the header.
190    ///
191    /// # Arguments
192    ///
193    /// - `tag` - Name of the `FORMAT` tag to remove.
194    pub fn remove_contig(&mut self, tag: &[u8]) -> &mut Self {
195        self.remove_impl(tag, htslib::BCF_HL_CTG)
196    }
197
198    /// Remove a structured entry from the header.
199    ///
200    /// # Arguments
201    ///
202    /// - `tag` - Name of the structured tag to remove.
203    pub fn remove_structured(&mut self, tag: &[u8]) -> &mut Self {
204        self.remove_impl(tag, htslib::BCF_HL_STR)
205    }
206
207    /// Remove a generic entry from the header.
208    ///
209    /// # Arguments
210    ///
211    /// - `tag` - Name of the generic tag to remove.
212    pub fn remove_generic(&mut self, tag: &[u8]) -> &mut Self {
213        self.remove_impl(tag, htslib::BCF_HL_GEN)
214    }
215
216    /// Implementation of removing header tags.
217    fn remove_impl(&mut self, tag: &[u8], type_: u32) -> &mut Self {
218        unsafe {
219            let v = tag.to_vec();
220            let c_str = ffi::CString::new(v).unwrap();
221            htslib::bcf_hdr_remove(self.inner, type_ as i32, c_str.as_ptr());
222        }
223        self
224    }
225}
226
227impl Drop for Header {
228    fn drop(&mut self) {
229        unsafe { htslib::bcf_hdr_destroy(self.inner) };
230    }
231}
232
/// A header record.
///
/// Produced by `HeaderView::header_records`. Every variant carries the record's `key`; all
/// variants except `Generic` additionally carry the record's key/value pairs in insertion
/// order, while `Generic` carries a single `value` string.
#[derive(Debug)]
pub enum HeaderRecord {
    /// A `FILTER` header record.
    Filter {
        /// Key of the header line.
        key: String,
        /// Key/value pairs of the record.
        values: LinearMap<String, String>,
    },
    /// An `INFO` header record.
    Info {
        /// Key of the header line.
        key: String,
        /// Key/value pairs of the record.
        values: LinearMap<String, String>,
    },
    /// A `FORMAT` header record.
    Format {
        /// Key of the header line.
        key: String,
        /// Key/value pairs of the record.
        values: LinearMap<String, String>,
    },
    /// A `contig` header record.
    Contig {
        /// Key of the header line.
        key: String,
        /// Key/value pairs of the record.
        values: LinearMap<String, String>,
    },
    /// A structured header record.
    Structured {
        /// Key of the header line.
        key: String,
        /// Key/value pairs of the record.
        values: LinearMap<String, String>,
    },
    /// A generic, unstructured header record.
    Generic { key: String, value: String },
}
264
/// A view onto a BCF/VCF header used for lookups (samples, contigs, tags, header records).
///
/// Wraps a raw htslib `bcf_hdr_t` pointer. Cloning duplicates the header with `bcf_hdr_dup`;
/// dropping passes the pointer to `bcf_hdr_destroy`.
#[derive(Debug)]
pub struct HeaderView {
    /// Raw pointer to the underlying htslib header structure.
    pub inner: *mut htslib::bcf_hdr_t,
}
269
270impl HeaderView {
271    pub fn new(inner: *mut htslib::bcf_hdr_t) -> Self {
272        HeaderView { inner }
273    }
274
275    #[inline]
276    fn inner(&self) -> htslib::bcf_hdr_t {
277        unsafe { *self.inner }
278    }
279
280    /// Get the number of samples defined in the header.
281    pub fn sample_count(&self) -> u32 {
282        self.inner().n[htslib::BCF_DT_SAMPLE as usize] as u32
283    }
284
285    /// Get vector of sample names defined in the header.
286    pub fn samples(&self) -> Vec<&[u8]> {
287        let names =
288            unsafe { slice::from_raw_parts(self.inner().samples, self.sample_count() as usize) };
289        names
290            .iter()
291            .map(|name| unsafe { ffi::CStr::from_ptr(*name).to_bytes() })
292            .collect()
293    }
294
295    /// Obtain id (column index) of given sample.
296    /// Returns `None` if sample is not present in header.
297    pub fn sample_id(&self, sample: &[u8]) -> Option<usize> {
298        self.samples().iter().position(|s| *s == sample)
299    }
300
301    /// Get the number of contigs defined in the header.
302    pub fn contig_count(&self) -> u32 {
303        self.inner().n[htslib::BCF_DT_CTG as usize] as u32
304    }
305
306    pub fn rid2name(&self, rid: u32) -> Result<&[u8]> {
307        if rid <= self.contig_count() {
308            unsafe {
309                let dict = self.inner().id[htslib::BCF_DT_CTG as usize];
310                let ptr = (*dict.offset(rid as isize)).key;
311                Ok(ffi::CStr::from_ptr(ptr).to_bytes())
312            }
313        } else {
314            Err(Error::BcfUnknownRID { rid })
315        }
316    }
317
318    /// Retrieve the (internal) chromosome identifier
319    /// # Examples
320    /// ```rust
321    /// use rust_htslib::bcf::header::Header;
322    /// use rust_htslib::bcf::{Format, Writer};
323    ///
324    /// let mut header = Header::new();
325    /// let contig_field = br#"##contig=<ID=foo,length=10>"#;
326    /// header.push_record(contig_field);
327    /// let mut vcf = Writer::from_stdout(&header, true, Format::Vcf).unwrap();
328    /// let header_view = vcf.header();
329    /// let rid = header_view.name2rid(b"foo").unwrap();
330    /// assert_eq!(rid, 0);
331    /// // try and retrieve a contig not in the header
332    /// let result = header_view.name2rid(b"bar");
333    /// assert!(result.is_err())
334    /// ```
335    /// # Errors
336    /// If `name` does not match a chromosome currently in the VCF header, returns [`Error::BcfUnknownContig`]
337    pub fn name2rid(&self, name: &[u8]) -> Result<u32> {
338        let c_str = ffi::CString::new(name).unwrap();
339        unsafe {
340            match htslib::bcf_hdr_id2int(
341                self.inner,
342                htslib::BCF_DT_CTG as i32,
343                c_str.as_ptr() as *mut c_char,
344            ) {
345                -1 => Err(Error::BcfUnknownContig {
346                    contig: str::from_utf8(name).unwrap().to_owned(),
347                }),
348                i => Ok(i as u32),
349            }
350        }
351    }
352
353    pub fn info_type(&self, tag: &[u8]) -> Result<(TagType, TagLength)> {
354        self.tag_type(tag, htslib::BCF_HL_INFO)
355    }
356
357    pub fn format_type(&self, tag: &[u8]) -> Result<(TagType, TagLength)> {
358        self.tag_type(tag, htslib::BCF_HL_FMT)
359    }
360
361    fn tag_type(&self, tag: &[u8], hdr_type: ::libc::c_uint) -> Result<(TagType, TagLength)> {
362        let tag_desc = || str::from_utf8(tag).unwrap().to_owned();
363        let c_str_tag = ffi::CString::new(tag).unwrap();
364        let (_type, length, num_values) = unsafe {
365            let id = htslib::bcf_hdr_id2int(
366                self.inner,
367                htslib::BCF_DT_ID as i32,
368                c_str_tag.as_ptr() as *mut c_char,
369            );
370            if id < 0 {
371                return Err(Error::BcfUndefinedTag { tag: tag_desc() });
372            }
373            let n = (*self.inner).n[htslib::BCF_DT_ID as usize] as usize;
374            let entry = slice::from_raw_parts((*self.inner).id[htslib::BCF_DT_ID as usize], n);
375            let d = (*entry[id as usize].val).info[hdr_type as usize];
376            (d >> 4 & 0xf, d >> 8 & 0xf, d >> 12)
377        };
378        let _type = match _type as ::libc::c_uint {
379            htslib::BCF_HT_FLAG => TagType::Flag,
380            htslib::BCF_HT_INT => TagType::Integer,
381            htslib::BCF_HT_REAL => TagType::Float,
382            htslib::BCF_HT_STR => TagType::String,
383            _ => return Err(Error::BcfUnexpectedType { tag: tag_desc() }),
384        };
385        let length = match length as ::libc::c_uint {
386            // XXX: Hacky "as u32" cast. Trace back through unsafe{} towards BCF struct and rollback to proper type
387            htslib::BCF_VL_FIXED => TagLength::Fixed(num_values as u32),
388            htslib::BCF_VL_VAR => TagLength::Variable,
389            htslib::BCF_VL_A => TagLength::AltAlleles,
390            htslib::BCF_VL_R => TagLength::Alleles,
391            htslib::BCF_VL_G => TagLength::Genotypes,
392            _ => return Err(Error::BcfUnexpectedType { tag: tag_desc() }),
393        };
394
395        Ok((_type, length))
396    }
397
398    /// Convert string ID (e.g., for a `FILTER` value) to its numeric identifier.
399    pub fn name_to_id(&self, id: &[u8]) -> Result<Id> {
400        let c_str = ffi::CString::new(id).unwrap();
401        unsafe {
402            match htslib::bcf_hdr_id2int(
403                self.inner,
404                htslib::BCF_DT_ID as i32,
405                c_str.as_ptr() as *const c_char,
406            ) {
407                -1 => Err(Error::BcfUnknownID {
408                    id: str::from_utf8(id).unwrap().to_owned(),
409                }),
410                i => Ok(Id(i as u32)),
411            }
412        }
413    }
414
415    /// Convert integer representing an identifier (e.g., a `FILTER` value) to its string
416    /// name.bam.
417    pub fn id_to_name(&self, id: Id) -> Vec<u8> {
418        let key = unsafe {
419            ffi::CStr::from_ptr(
420                (*(*self.inner).id[htslib::BCF_DT_ID as usize].offset(*id as isize)).key,
421            )
422        };
423        key.to_bytes().to_vec()
424    }
425
426    /// Convert string sample name to its numeric identifier.
427    pub fn sample_to_id(&self, id: &[u8]) -> Result<Id> {
428        let c_str = ffi::CString::new(id).unwrap();
429        unsafe {
430            match htslib::bcf_hdr_id2int(
431                self.inner,
432                htslib::BCF_DT_SAMPLE as i32,
433                c_str.as_ptr() as *const c_char,
434            ) {
435                -1 => Err(Error::BcfUnknownSample {
436                    name: str::from_utf8(id).unwrap().to_owned(),
437                }),
438                i => Ok(Id(i as u32)),
439            }
440        }
441    }
442
443    /// Convert integer representing an contig to its name.
444    pub fn id_to_sample(&self, id: Id) -> Vec<u8> {
445        let key = unsafe {
446            ffi::CStr::from_ptr(
447                (*(*self.inner).id[htslib::BCF_DT_SAMPLE as usize].offset(*id as isize)).key,
448            )
449        };
450        key.to_bytes().to_vec()
451    }
452
453    /// Return structured `HeaderRecord`s.
454    pub fn header_records(&self) -> Vec<HeaderRecord> {
455        fn parse_kv(rec: &htslib::bcf_hrec_t) -> LinearMap<String, String> {
456            let mut result: LinearMap<String, String> = LinearMap::new();
457            for i in 0_i32..(rec.nkeys) {
458                let key = unsafe {
459                    ffi::CStr::from_ptr(*rec.keys.offset(i as isize))
460                        .to_str()
461                        .unwrap()
462                        .to_string()
463                };
464                let value = unsafe {
465                    ffi::CStr::from_ptr(*rec.vals.offset(i as isize))
466                        .to_str()
467                        .unwrap()
468                        .to_string()
469                };
470                result.insert(key, value);
471            }
472            result
473        }
474
475        let mut result: Vec<HeaderRecord> = Vec::new();
476        for i in 0_i32..unsafe { (*self.inner).nhrec } {
477            let rec = unsafe { &(**(*self.inner).hrec.offset(i as isize)) };
478            let key = unsafe { ffi::CStr::from_ptr(rec.key).to_str().unwrap().to_string() };
479            let record = match rec.type_ {
480                0 => HeaderRecord::Filter {
481                    key,
482                    values: parse_kv(rec),
483                },
484                1 => HeaderRecord::Info {
485                    key,
486                    values: parse_kv(rec),
487                },
488                2 => HeaderRecord::Format {
489                    key,
490                    values: parse_kv(rec),
491                },
492                3 => HeaderRecord::Contig {
493                    key,
494                    values: parse_kv(rec),
495                },
496                4 => HeaderRecord::Structured {
497                    key,
498                    values: parse_kv(rec),
499                },
500                5 => HeaderRecord::Generic {
501                    key,
502                    value: unsafe { ffi::CStr::from_ptr(rec.value).to_str().unwrap().to_string() },
503                },
504                _ => panic!("Unknown type: {}", rec.type_),
505            };
506            result.push(record);
507        }
508        result
509    }
510
511    /// Create an empty record using this header view.
512    ///
513    /// The record can be reused multiple times.
514    pub fn empty_record(&self) -> crate::bcf::Record {
515        crate::bcf::Record::new(Rc::new(self.clone()))
516    }
517}
518
519impl Clone for HeaderView {
520    fn clone(&self) -> Self {
521        HeaderView {
522            inner: unsafe { htslib::bcf_hdr_dup(self.inner) },
523        }
524    }
525}
526
527impl Drop for HeaderView {
528    fn drop(&mut self) {
529        unsafe {
530            htslib::bcf_hdr_destroy(self.inner);
531        }
532    }
533}
534
/// Value type declared for an `INFO` or `FORMAT` tag, as reported by
/// `HeaderView::info_type` / `HeaderView::format_type`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum TagType {
    /// Corresponds to htslib's `BCF_HT_FLAG`.
    Flag,
    /// Corresponds to htslib's `BCF_HT_INT`.
    Integer,
    /// Corresponds to htslib's `BCF_HT_REAL`.
    Float,
    /// Corresponds to htslib's `BCF_HT_STR`.
    String,
}
542
/// Number of values declared for an `INFO` or `FORMAT` tag, as reported by
/// `HeaderView::info_type` / `HeaderView::format_type`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum TagLength {
    /// A fixed number of values (htslib's `BCF_VL_FIXED`); the payload is that number.
    Fixed(u32),
    /// Corresponds to htslib's `BCF_VL_A`.
    AltAlleles,
    /// Corresponds to htslib's `BCF_VL_R`.
    Alleles,
    /// Corresponds to htslib's `BCF_VL_G`.
    Genotypes,
    /// Corresponds to htslib's `BCF_VL_VAR`.
    Variable,
}
551
#[cfg(test)]
mod tests {
    use super::*;
    use crate::bcf::Reader;

    /// A record created from a header view starts out with default field values.
    #[test]
    fn test_header_view_empty_record() {
        // Obtain a header view from a VCF file on disk.
        let reader = Reader::from_path("test/test_string.vcf").expect("Error opening file");
        let view = reader.header.clone();

        // Build a fresh record bound to that view.
        let empty = view.empty_record();
        eprintln!("{:?}", empty.rid());

        // Expected values of a freshly created record.
        assert_eq!(empty.rid(), Some(0)); // rid of a fresh record is reported as Some(0)
        assert_eq!(empty.pos(), 0); // no position set
        assert_eq!(empty.qual(), 0.0); // no quality score set
    }
}