noodles_vcf/io/
reader.rs

1//! VCF reader and iterators.
2
3mod builder;
4pub mod header;
5pub(crate) mod query;
6pub(crate) mod record;
7pub mod record_buf;
8mod record_bufs;
9
10use self::record::read_record;
11pub(crate) use self::record_buf::parse_record_buf;
12pub use self::{builder::Builder, query::Query, record_bufs::RecordBufs};
13
14use std::{
15    io::{self, BufRead},
16    iter,
17};
18
19use noodles_bgzf as bgzf;
20use noodles_core::Region;
21use noodles_csi::BinningIndex;
22
23use self::header::read_header;
24use crate::{variant::RecordBuf, Header, Record};
25
/// A VCF reader.
///
/// A VCF stream is made up of two parts: a header, followed by a list of records.
///
/// Every header line starts with a `#` (number sign). The header ends with the "header
/// header", i.e., the column header line (`#CHROM`...), which is itself part of the header.
///
/// Records are line-based and follow directly after the header until EOF.
///
/// # Examples
///
/// ```no_run
/// use noodles_vcf as vcf;
///
/// let mut reader = vcf::io::reader::Builder::default().build_from_path("sample.vcf")?;
/// let header = reader.read_header()?;
///
/// for result in reader.records() {
///     let record = result?;
///     // ...
/// }
/// # Ok::<_, std::io::Error>(())
/// ```
#[derive(Debug)]
pub struct Reader<R> {
    // The wrapped input stream.
    inner: R,
    // Line buffer owned by the reader so it can be reused between reads.
    buf: String,
}

impl<R> Reader<R> {
    /// Borrows the wrapped reader.
    ///
    /// # Examples
    ///
    /// ```
    /// use noodles_vcf as vcf;
    /// let data = [];
    /// let reader = vcf::io::Reader::new(&data[..]);
    /// assert!(reader.get_ref().is_empty());
    /// ```
    pub fn get_ref(&self) -> &R {
        let Self { inner, .. } = self;
        inner
    }

    /// Mutably borrows the wrapped reader.
    ///
    /// # Examples
    ///
    /// ```
    /// use noodles_vcf as vcf;
    /// let data = [];
    /// let mut reader = vcf::io::Reader::new(&data[..]);
    /// assert!(reader.get_mut().is_empty());
    /// ```
    pub fn get_mut(&mut self) -> &mut R {
        let Self { inner, .. } = self;
        inner
    }

    /// Consumes this VCF reader and returns the wrapped reader.
    ///
    /// The internal line buffer is dropped.
    ///
    /// # Examples
    ///
    /// ```
    /// use noodles_vcf as vcf;
    /// let data = [];
    /// let reader = vcf::io::Reader::new(&data[..]);
    /// assert!(reader.into_inner().is_empty());
    /// ```
    pub fn into_inner(self) -> R {
        let Self { inner, .. } = self;
        inner
    }
}
98
99impl<R> Reader<R>
100where
101    R: BufRead,
102{
103    /// Creates a VCF reader.
104    ///
105    /// # Examples
106    ///
107    /// ```
108    /// use noodles_vcf as vcf;
109    ///
110    /// let data = b"##fileformat=VCFv4.3
111    /// #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
112    /// sq0\t1\t.\tA\t.\t.\tPASS\t.
113    /// ";
114    ///
115    /// let reader = vcf::io::Reader::new(&data[..]);
116    /// ```
117    pub fn new(inner: R) -> Self {
118        Self {
119            inner,
120            buf: String::new(),
121        }
122    }
123
124    /// Returns a VCF header reader.
125    ///
126    /// This creates an adapter that reads at most the length of the header, i.e., all lines
127    /// prefixed with a `#` (number sign).
128    ///
129    /// It is more ergonomic to read and parse the header using [`Self::read_header`], but using
130    /// this adapter allows for control of how the header is read, e.g., to read the raw VCF
131    /// header.
132    ///
133    /// The position of the stream is expected to be at the start.
134    ///
135    /// # Examples
136    ///
137    /// ```
138    /// # use std::io::Read;
139    /// use noodles_vcf as vcf;
140    ///
141    /// let data = b"##fileformat=VCFv4.3
142    /// #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
143    /// sq0\t1\t.\tA\t.\t.\tPASS\t.
144    /// ";
145    ///
146    /// let mut reader = vcf::io::Reader::new(&data[..]);
147    /// let mut header_reader = reader.header_reader();
148    ///
149    /// let mut raw_header = String::new();
150    /// header_reader.read_to_string(&mut raw_header)?;
151    ///
152    /// assert_eq!(raw_header, "##fileformat=VCFv4.3\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n");
153    /// # Ok::<_, std::io::Error>(())
154    /// ```
155    pub fn header_reader(&mut self) -> header::Reader<&mut R> {
156        header::Reader::new(&mut self.inner)
157    }
158
159    /// Reads the VCF header.
160    ///
161    /// This reads all header lines prefixed with a `#` (number sign), which includes the header
162    /// header (`#CHROM`...), and parses it as a [`crate::Header`].
163    ///
164    /// The position of the stream is expected to be at the start.
165    ///
166    /// # Examples
167    ///
168    /// ```
169    /// # use std::io;
170    /// use noodles_vcf as vcf;
171    ///
172    /// let data = b"##fileformat=VCFv4.3
173    /// #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
174    /// sq0\t1\t.\tA\t.\t.\tPASS\t.
175    /// ";
176    ///
177    /// let mut reader = vcf::io::Reader::new(&data[..]);
178    /// let header = reader.read_header()?;
179    /// # Ok::<(), io::Error>(())
180    /// ```
181    pub fn read_header(&mut self) -> io::Result<Header> {
182        read_header(&mut self.inner)
183    }
184
185    /// Reads a single VCF record.
186    ///
187    /// This reads a line from the underlying stream until a newline is reached and parses that
188    /// line into the given record.
189    ///
190    /// The stream is expected to be directly after the header or at the start of another record.
191    ///
192    /// It is more ergonomic to read records using an iterator (see [`Self::records`]), but using
193    /// this method allows control of the record buffer.
194    ///
195    /// If successful, the number of bytes read is returned. If the number of bytes read is 0, the
196    /// stream reached EOF.
197    ///
198    /// # Examples
199    ///
200    /// ```
201    /// use noodles_vcf as vcf;
202    ///
203    /// let data = b"##fileformat=VCFv4.3
204    /// #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
205    /// sq0\t1\t.\tA\t.\t.\tPASS\t.
206    /// ";
207    ///
208    /// let mut reader = vcf::io::Reader::new(&data[..]);
209    /// let header = reader.read_header()?;
210    ///
211    /// let mut record = vcf::variant::RecordBuf::default();
212    /// reader.read_record_buf(&header, &mut record)?;
213    /// # Ok::<_, std::io::Error>(())
214    /// ```
215    pub fn read_record_buf(
216        &mut self,
217        header: &Header,
218        record: &mut RecordBuf,
219    ) -> io::Result<usize> {
220        self.buf.clear();
221
222        match read_line(&mut self.inner, &mut self.buf)? {
223            0 => Ok(0),
224            n => {
225                parse_record_buf(&self.buf, header, record)
226                    .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
227
228                Ok(n)
229            }
230        }
231    }
232
233    /// Returns an iterator over records starting from the current stream position.
234    ///
235    /// The stream is expected to be directly after the header or at the start of another record.
236    ///
237    /// # Examples
238    ///
239    /// ```
240    /// use noodles_vcf as vcf;
241    ///
242    /// let data = b"##fileformat=VCFv4.3
243    /// #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
244    /// sq0\t1\t.\tA\t.\t.\tPASS\t.
245    /// ";
246    ///
247    /// let mut reader = vcf::io::Reader::new(&data[..]);
248    /// let header = reader.read_header()?;
249    ///
250    /// let mut records = reader.record_bufs(&header);
251    /// assert!(records.next().is_some());
252    /// assert!(records.next().is_none());
253    /// # Ok::<_, std::io::Error>(())
254    /// ```
255    pub fn record_bufs<'r, 'h: 'r>(&'r mut self, header: &'h Header) -> RecordBufs<'r, 'h, R> {
256        RecordBufs::new(self, header)
257    }
258
259    /// Reads a single record without eagerly parsing its fields.
260    ///
261    /// The reads VCF record fields from the underlying stream into the given record's buffer until
262    /// a newline is reached. No fields are parsed, meaning the record is not necessarily valid.
263    /// However, the structure of the line is guaranteed to be record-like.
264    ///
265    /// The stream is expected to be directly after the header or at the start of another record.
266    ///
267    /// If successful, the number of bytes read is returned. If the number of bytes read is 0, the
268    /// stream reached EOF.
269    ///
270    /// # Examples
271    ///
272    /// ```
273    /// use noodles_vcf as vcf;
274    ///
275    /// let data = b"##fileformat=VCFv4.3
276    /// #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
277    /// sq0\t1\t.\tA\t.\t.\tPASS\t.
278    /// ";
279    ///
280    /// let mut reader = vcf::io::Reader::new(&data[..]);
281    /// reader.read_header()?;
282    ///
283    /// let mut record = vcf::Record::default();
284    /// reader.read_record(&mut record)?;
285    /// # Ok::<_, std::io::Error>(())
286    /// ```
287    pub fn read_record(&mut self, record: &mut Record) -> io::Result<usize> {
288        read_record(&mut self.inner, record)
289    }
290
291    /// Returns an iterator over records.
292    ///
293    /// The stream is expected to be directly after the header or at the start of another record.
294    ///
295    /// # Examples
296    ///
297    /// ```
298    /// use noodles_vcf as vcf;
299    ///
300    /// const DATA: &[u8] = b"##fileformat=VCFv4.3
301    /// #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
302    /// sq0\t1\t.\tA\t.\t.\tPASS\t.
303    /// ";
304    ///
305    /// let mut reader = vcf::io::Reader::new(DATA);
306    /// reader.read_header()?;
307    ///
308    /// for result in reader.records() {
309    ///     let record = result?;
310    ///     // ...
311    /// }
312    /// # Ok::<_, std::io::Error>(())
313    /// ```
314    pub fn records(&mut self) -> impl Iterator<Item = io::Result<Record>> + '_ {
315        let mut record = Record::default();
316
317        iter::from_fn(move || match self.read_record(&mut record) {
318            Ok(0) => None,
319            Ok(_) => Some(Ok(record.clone())),
320            Err(e) => Some(Err(e)),
321        })
322    }
323}
324
325impl<R> Reader<R>
326where
327    R: bgzf::io::BufRead + bgzf::io::Seek,
328{
329    /// Returns an iterator over records that intersects the given region.
330    ///
331    /// # Examples
332    ///
333    /// ```no_run
334    /// # use std::fs::File;
335    /// use noodles_bgzf as bgzf;;
336    /// use noodles_core::Region;
337    /// use noodles_tabix as tabix;
338    /// use noodles_vcf as vcf;
339    ///
340    /// let mut reader = File::open("sample.vcf.gz")
341    ///     .map(bgzf::Reader::new)
342    ///     .map(vcf::io::Reader::new)?;
343    ///
344    /// let header = reader.read_header()?;
345    ///
346    /// let index = tabix::fs::read("sample.vcf.gz.tbi")?;
347    /// let region = "sq0:8-13".parse()?;
348    /// let query = reader.query(&header, &index, &region)?;
349    ///
350    /// for result in query {
351    ///     let record = result?;
352    ///     println!("{:?}", record);
353    /// }
354    /// Ok::<_, Box<dyn std::error::Error>>(())
355    /// ```
356    pub fn query<'r, 'h, I>(
357        &'r mut self,
358        header: &'h Header,
359        index: &I,
360        region: &Region,
361    ) -> io::Result<Query<'r, 'h, R>>
362    where
363        I: BinningIndex,
364    {
365        let (reference_sequence_id, reference_sequence_name) = resolve_region(index, region)?;
366        let chunks = index.query(reference_sequence_id, region.interval())?;
367
368        Ok(Query::new(
369            self.get_mut(),
370            chunks,
371            reference_sequence_name,
372            region.interval(),
373            header,
374        ))
375    }
376}
377
378impl<R> crate::variant::io::Read<R> for Reader<R>
379where
380    R: BufRead,
381{
382    fn read_variant_header(&mut self) -> io::Result<Header> {
383        self.read_header()
384    }
385
386    fn variant_records<'r, 'h: 'r>(
387        &'r mut self,
388        _: &'h Header,
389    ) -> Box<dyn Iterator<Item = io::Result<Box<dyn crate::variant::Record>>> + 'r> {
390        Box::new(
391            self.records().map(|result| {
392                result.map(|record| Box::new(record) as Box<dyn crate::variant::Record>)
393            }),
394        )
395    }
396}
397
// Reads all bytes until a line feed ('\n') or EOF is reached and appends them to `buf`.
//
// The trailing newline ('\n' or '\r\n') is removed from the buffer. A carriage return that is
// not followed by a line feed is kept.
//
// Returns the number of bytes consumed from the reader, which includes the newline bytes that
// were stripped from the buffer. A return value of 0 means EOF; the buffer is left untouched in
// that case.
//
// I/O errors from the reader, including invalid UTF-8, are propagated unchanged.
fn read_line<R>(reader: &mut R, buf: &mut String) -> io::Result<usize>
where
    R: BufRead,
{
    const LINE_FEED: char = '\n';
    const CARRIAGE_RETURN: char = '\r';

    let n = reader.read_line(buf)?;

    // Only strip when something was read, so contents already in `buf` are not modified at EOF.
    if n > 0 && buf.ends_with(LINE_FEED) {
        buf.pop();

        if buf.ends_with(CARRIAGE_RETURN) {
            buf.pop();
        }
    }

    Ok(n)
}
424
425pub(crate) fn resolve_region<I>(index: &I, region: &Region) -> io::Result<(usize, Vec<u8>)>
426where
427    I: BinningIndex,
428{
429    let header = index
430        .header()
431        .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "missing tabix header"))?;
432
433    let i = header
434        .reference_sequence_names()
435        .get_index_of(region.name())
436        .ok_or_else(|| {
437            io::Error::new(
438                io::ErrorKind::InvalidInput,
439                format!(
440                    "region reference sequence does not exist in reference sequences: {region:?}"
441                ),
442            )
443        })?;
444
445    Ok((i, region.name().to_vec()))
446}
447
#[cfg(test)]
mod tests {
    use super::*;

    // Reads the single record that follows the header, then expects EOF.
    #[test]
    fn test_read_record_buf() -> io::Result<()> {
        static DATA: &[u8] = b"\
##fileformat=VCFv4.3
##fileDate=20200501
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
sq0\t1\t.\tA\t.\t.\tPASS\t.
";

        let mut reader = Reader::new(DATA);
        let header = reader.read_header()?;

        let mut record = RecordBuf::default();

        // The record line is 21 bytes long, including the trailing line feed.
        let bytes_read = reader.read_record_buf(&header, &mut record)?;
        assert_eq!(bytes_read, 21);

        let bytes_read = reader.read_record_buf(&header, &mut record)?;
        assert_eq!(bytes_read, 0);

        Ok(())
    }

    // Checks both the stripped buffer contents and the raw number of bytes consumed.
    #[test]
    fn test_read_line() -> io::Result<()> {
        fn t(
            buf: &mut String,
            mut data: &[u8],
            expected_len: usize,
            expected: &str,
        ) -> io::Result<()> {
            buf.clear();
            let bytes_read = read_line(&mut data, buf)?;
            assert_eq!(bytes_read, expected_len);
            assert_eq!(buf.as_str(), expected);
            Ok(())
        }

        let mut buf = String::new();

        t(&mut buf, b"noodles\n", 8, "noodles")?;
        t(&mut buf, b"noodles\r\n", 9, "noodles")?;
        t(&mut buf, b"noodles", 7, "noodles")?;

        // EOF: nothing is read and the buffer stays empty.
        t(&mut buf, b"", 0, "")?;

        Ok(())
    }
}