noodles_vcf/io/reader.rs
1//! VCF reader and iterators.
2
3mod builder;
4pub mod header;
5pub(crate) mod query;
6pub(crate) mod record;
7pub mod record_buf;
8mod record_bufs;
9
10use self::record::read_record;
11pub(crate) use self::record_buf::parse_record_buf;
12pub use self::{builder::Builder, query::Query, record_bufs::RecordBufs};
13
14use std::{
15 io::{self, BufRead},
16 iter,
17};
18
19use noodles_bgzf as bgzf;
20use noodles_core::Region;
21use noodles_csi::BinningIndex;
22
23use self::header::read_header;
24use crate::{variant::RecordBuf, Header, Record};
25
/// Reads VCF data from an underlying stream.
///
/// A VCF stream is made of two parts, in order: a header and a list of VCF records.
///
/// Every header line starts with a `#` (number sign). The header ends with, and includes, the
/// header header (the `#CHROM`... line).
///
/// Records are line-based. They begin directly after the header and continue until EOF.
///
/// # Examples
///
/// ```no_run
/// use noodles_vcf as vcf;
///
/// let mut reader = vcf::io::reader::Builder::default().build_from_path("sample.vcf")?;
/// let header = reader.read_header()?;
///
/// for result in reader.records() {
///     let record = result?;
///     // ...
/// }
/// # Ok::<_, std::io::Error>(())
/// ```
#[derive(Debug)]
pub struct Reader<R> {
    /// The wrapped input stream.
    inner: R,
    /// Line buffer that [`Self::read_record_buf`] clears and refills on every call.
    buf: String,
}
54
55impl<R> Reader<R> {
56 /// Returns a reference to the underlying reader.
57 ///
58 /// # Examples
59 ///
60 /// ```
61 /// use noodles_vcf as vcf;
62 /// let data = [];
63 /// let reader = vcf::io::Reader::new(&data[..]);
64 /// assert!(reader.get_ref().is_empty());
65 /// ```
66 pub fn get_ref(&self) -> &R {
67 &self.inner
68 }
69
70 /// Returns a mutable reference to the underlying reader.
71 ///
72 /// # Examples
73 ///
74 /// ```
75 /// use noodles_vcf as vcf;
76 /// let data = [];
77 /// let mut reader = vcf::io::Reader::new(&data[..]);
78 /// assert!(reader.get_mut().is_empty());
79 /// ```
80 pub fn get_mut(&mut self) -> &mut R {
81 &mut self.inner
82 }
83
84 /// Unwraps and returns the underlying writer.
85 ///
86 /// # Examples
87 ///
88 /// ```
89 /// use noodles_vcf as vcf;
90 /// let data = [];
91 /// let reader = vcf::io::Reader::new(&data[..]);
92 /// assert!(reader.into_inner().is_empty());
93 /// ```
94 pub fn into_inner(self) -> R {
95 self.inner
96 }
97}
98
99impl<R> Reader<R>
100where
101 R: BufRead,
102{
103 /// Creates a VCF reader.
104 ///
105 /// # Examples
106 ///
107 /// ```
108 /// use noodles_vcf as vcf;
109 ///
110 /// let data = b"##fileformat=VCFv4.3
111 /// #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
112 /// sq0\t1\t.\tA\t.\t.\tPASS\t.
113 /// ";
114 ///
115 /// let reader = vcf::io::Reader::new(&data[..]);
116 /// ```
117 pub fn new(inner: R) -> Self {
118 Self {
119 inner,
120 buf: String::new(),
121 }
122 }
123
124 /// Returns a VCF header reader.
125 ///
126 /// This creates an adapter that reads at most the length of the header, i.e., all lines
127 /// prefixed with a `#` (number sign).
128 ///
129 /// It is more ergonomic to read and parse the header using [`Self::read_header`], but using
130 /// this adapter allows for control of how the header is read, e.g., to read the raw VCF
131 /// header.
132 ///
133 /// The position of the stream is expected to be at the start.
134 ///
135 /// # Examples
136 ///
137 /// ```
138 /// # use std::io::Read;
139 /// use noodles_vcf as vcf;
140 ///
141 /// let data = b"##fileformat=VCFv4.3
142 /// #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
143 /// sq0\t1\t.\tA\t.\t.\tPASS\t.
144 /// ";
145 ///
146 /// let mut reader = vcf::io::Reader::new(&data[..]);
147 /// let mut header_reader = reader.header_reader();
148 ///
149 /// let mut raw_header = String::new();
150 /// header_reader.read_to_string(&mut raw_header)?;
151 ///
152 /// assert_eq!(raw_header, "##fileformat=VCFv4.3\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n");
153 /// # Ok::<_, std::io::Error>(())
154 /// ```
155 pub fn header_reader(&mut self) -> header::Reader<&mut R> {
156 header::Reader::new(&mut self.inner)
157 }
158
159 /// Reads the VCF header.
160 ///
161 /// This reads all header lines prefixed with a `#` (number sign), which includes the header
162 /// header (`#CHROM`...), and parses it as a [`crate::Header`].
163 ///
164 /// The position of the stream is expected to be at the start.
165 ///
166 /// # Examples
167 ///
168 /// ```
169 /// # use std::io;
170 /// use noodles_vcf as vcf;
171 ///
172 /// let data = b"##fileformat=VCFv4.3
173 /// #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
174 /// sq0\t1\t.\tA\t.\t.\tPASS\t.
175 /// ";
176 ///
177 /// let mut reader = vcf::io::Reader::new(&data[..]);
178 /// let header = reader.read_header()?;
179 /// # Ok::<(), io::Error>(())
180 /// ```
181 pub fn read_header(&mut self) -> io::Result<Header> {
182 read_header(&mut self.inner)
183 }
184
185 /// Reads a single VCF record.
186 ///
187 /// This reads a line from the underlying stream until a newline is reached and parses that
188 /// line into the given record.
189 ///
190 /// The stream is expected to be directly after the header or at the start of another record.
191 ///
192 /// It is more ergonomic to read records using an iterator (see [`Self::records`]), but using
193 /// this method allows control of the record buffer.
194 ///
195 /// If successful, the number of bytes read is returned. If the number of bytes read is 0, the
196 /// stream reached EOF.
197 ///
198 /// # Examples
199 ///
200 /// ```
201 /// use noodles_vcf as vcf;
202 ///
203 /// let data = b"##fileformat=VCFv4.3
204 /// #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
205 /// sq0\t1\t.\tA\t.\t.\tPASS\t.
206 /// ";
207 ///
208 /// let mut reader = vcf::io::Reader::new(&data[..]);
209 /// let header = reader.read_header()?;
210 ///
211 /// let mut record = vcf::variant::RecordBuf::default();
212 /// reader.read_record_buf(&header, &mut record)?;
213 /// # Ok::<_, std::io::Error>(())
214 /// ```
215 pub fn read_record_buf(
216 &mut self,
217 header: &Header,
218 record: &mut RecordBuf,
219 ) -> io::Result<usize> {
220 self.buf.clear();
221
222 match read_line(&mut self.inner, &mut self.buf)? {
223 0 => Ok(0),
224 n => {
225 parse_record_buf(&self.buf, header, record)
226 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
227
228 Ok(n)
229 }
230 }
231 }
232
233 /// Returns an iterator over records starting from the current stream position.
234 ///
235 /// The stream is expected to be directly after the header or at the start of another record.
236 ///
237 /// # Examples
238 ///
239 /// ```
240 /// use noodles_vcf as vcf;
241 ///
242 /// let data = b"##fileformat=VCFv4.3
243 /// #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
244 /// sq0\t1\t.\tA\t.\t.\tPASS\t.
245 /// ";
246 ///
247 /// let mut reader = vcf::io::Reader::new(&data[..]);
248 /// let header = reader.read_header()?;
249 ///
250 /// let mut records = reader.record_bufs(&header);
251 /// assert!(records.next().is_some());
252 /// assert!(records.next().is_none());
253 /// # Ok::<_, std::io::Error>(())
254 /// ```
255 pub fn record_bufs<'r, 'h: 'r>(&'r mut self, header: &'h Header) -> RecordBufs<'r, 'h, R> {
256 RecordBufs::new(self, header)
257 }
258
259 /// Reads a single record without eagerly parsing its fields.
260 ///
261 /// The reads VCF record fields from the underlying stream into the given record's buffer until
262 /// a newline is reached. No fields are parsed, meaning the record is not necessarily valid.
263 /// However, the structure of the line is guaranteed to be record-like.
264 ///
265 /// The stream is expected to be directly after the header or at the start of another record.
266 ///
267 /// If successful, the number of bytes read is returned. If the number of bytes read is 0, the
268 /// stream reached EOF.
269 ///
270 /// # Examples
271 ///
272 /// ```
273 /// use noodles_vcf as vcf;
274 ///
275 /// let data = b"##fileformat=VCFv4.3
276 /// #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
277 /// sq0\t1\t.\tA\t.\t.\tPASS\t.
278 /// ";
279 ///
280 /// let mut reader = vcf::io::Reader::new(&data[..]);
281 /// reader.read_header()?;
282 ///
283 /// let mut record = vcf::Record::default();
284 /// reader.read_record(&mut record)?;
285 /// # Ok::<_, std::io::Error>(())
286 /// ```
287 pub fn read_record(&mut self, record: &mut Record) -> io::Result<usize> {
288 read_record(&mut self.inner, record)
289 }
290
291 /// Returns an iterator over records.
292 ///
293 /// The stream is expected to be directly after the header or at the start of another record.
294 ///
295 /// # Examples
296 ///
297 /// ```
298 /// use noodles_vcf as vcf;
299 ///
300 /// const DATA: &[u8] = b"##fileformat=VCFv4.3
301 /// #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
302 /// sq0\t1\t.\tA\t.\t.\tPASS\t.
303 /// ";
304 ///
305 /// let mut reader = vcf::io::Reader::new(DATA);
306 /// reader.read_header()?;
307 ///
308 /// for result in reader.records() {
309 /// let record = result?;
310 /// // ...
311 /// }
312 /// # Ok::<_, std::io::Error>(())
313 /// ```
314 pub fn records(&mut self) -> impl Iterator<Item = io::Result<Record>> + '_ {
315 let mut record = Record::default();
316
317 iter::from_fn(move || match self.read_record(&mut record) {
318 Ok(0) => None,
319 Ok(_) => Some(Ok(record.clone())),
320 Err(e) => Some(Err(e)),
321 })
322 }
323}
324
325impl<R> Reader<R>
326where
327 R: bgzf::io::BufRead + bgzf::io::Seek,
328{
329 /// Returns an iterator over records that intersects the given region.
330 ///
331 /// # Examples
332 ///
333 /// ```no_run
334 /// # use std::fs::File;
335 /// use noodles_bgzf as bgzf;;
336 /// use noodles_core::Region;
337 /// use noodles_tabix as tabix;
338 /// use noodles_vcf as vcf;
339 ///
340 /// let mut reader = File::open("sample.vcf.gz")
341 /// .map(bgzf::Reader::new)
342 /// .map(vcf::io::Reader::new)?;
343 ///
344 /// let header = reader.read_header()?;
345 ///
346 /// let index = tabix::fs::read("sample.vcf.gz.tbi")?;
347 /// let region = "sq0:8-13".parse()?;
348 /// let query = reader.query(&header, &index, ®ion)?;
349 ///
350 /// for result in query {
351 /// let record = result?;
352 /// println!("{:?}", record);
353 /// }
354 /// Ok::<_, Box<dyn std::error::Error>>(())
355 /// ```
356 pub fn query<'r, 'h, I>(
357 &'r mut self,
358 header: &'h Header,
359 index: &I,
360 region: &Region,
361 ) -> io::Result<Query<'r, 'h, R>>
362 where
363 I: BinningIndex,
364 {
365 let (reference_sequence_id, reference_sequence_name) = resolve_region(index, region)?;
366 let chunks = index.query(reference_sequence_id, region.interval())?;
367
368 Ok(Query::new(
369 self.get_mut(),
370 chunks,
371 reference_sequence_name,
372 region.interval(),
373 header,
374 ))
375 }
376}
377
378impl<R> crate::variant::io::Read<R> for Reader<R>
379where
380 R: BufRead,
381{
382 fn read_variant_header(&mut self) -> io::Result<Header> {
383 self.read_header()
384 }
385
386 fn variant_records<'r, 'h: 'r>(
387 &'r mut self,
388 _: &'h Header,
389 ) -> Box<dyn Iterator<Item = io::Result<Box<dyn crate::variant::Record>>> + 'r> {
390 Box::new(
391 self.records().map(|result| {
392 result.map(|record| Box::new(record) as Box<dyn crate::variant::Record>)
393 }),
394 )
395 }
396}
397
// Appends one line from `reader` to `buf`, reading until a line feed ('\n') or EOF.
//
// The trailing line terminator ('\n' or '\r\n') is removed from `buf`. The returned count is
// the number of bytes consumed from `reader`, terminator included; 0 means EOF, in which case
// `buf` is left untouched.
fn read_line<R>(reader: &mut R, buf: &mut String) -> io::Result<usize>
where
    R: BufRead,
{
    let bytes_read = reader.read_line(buf)?;

    // Only strip a terminator when something was actually read in this call.
    if bytes_read != 0 && buf.ends_with('\n') {
        buf.pop();

        // A carriage return is only part of the terminator when it directly precedes the
        // line feed.
        if buf.ends_with('\r') {
            buf.pop();
        }
    }

    Ok(bytes_read)
}
424
425pub(crate) fn resolve_region<I>(index: &I, region: &Region) -> io::Result<(usize, Vec<u8>)>
426where
427 I: BinningIndex,
428{
429 let header = index
430 .header()
431 .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "missing tabix header"))?;
432
433 let i = header
434 .reference_sequence_names()
435 .get_index_of(region.name())
436 .ok_or_else(|| {
437 io::Error::new(
438 io::ErrorKind::InvalidInput,
439 format!(
440 "region reference sequence does not exist in reference sequences: {region:?}"
441 ),
442 )
443 })?;
444
445 Ok((i, region.name().to_vec()))
446}
447
#[cfg(test)]
mod tests {
    use super::*;

    // Reading the only record reports its length in bytes; the following read reports EOF.
    #[test]
    fn test_read_record() -> io::Result<()> {
        static DATA: &[u8] = b"\
##fileformat=VCFv4.3
##fileDate=20200501
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
sq0\t1\t.\tA\t.\t.\tPASS\t.
";

        let mut reader = Reader::new(DATA);
        let header = reader.read_header()?;
        let mut record = RecordBuf::default();

        // The record line is 21 bytes long, line feed included.
        assert_eq!(reader.read_record_buf(&header, &mut record)?, 21);

        // Nothing follows the single record.
        assert_eq!(reader.read_record_buf(&header, &mut record)?, 0);

        Ok(())
    }

    // Every supported line ending ('\n', '\r\n', and none at EOF) yields the bare line.
    #[test]
    fn test_read_line() -> io::Result<()> {
        fn line_of(mut src: &[u8]) -> io::Result<String> {
            let mut buf = String::new();
            read_line(&mut src, &mut buf)?;
            Ok(buf)
        }

        assert_eq!(line_of(b"noodles\n")?, "noodles");
        assert_eq!(line_of(b"noodles\r\n")?, "noodles");
        assert_eq!(line_of(b"noodles")?, "noodles");

        Ok(())
    }
}
499}