noodles_sam/
fs.rs

1//! SAM filesystem operations.
2
3use std::{fs::File, io, path::Path};
4
5use noodles_bgzf as bgzf;
6use noodles_core::Position;
7use noodles_csi::{
8    self as csi,
9    binning_index::{index::reference_sequence::bin::Chunk, Indexer},
10};
11
12use super::{
13    alignment::Record as _,
14    header::record::value::map::header::{sort_order::COORDINATE, tag::SORT_ORDER},
15    io::Reader,
16    Header, Record,
17};
18
19/// Indexes a bgzipped-compressed SAM file.
20///
21/// The input must be coordinate-sorted and marked as such in the SAM header, i.e.,
22/// `SO:coordinate`.
23///
24/// See also [`csi::fs::write`] to write the resulting [`csi::Index`] to a file.
25///
26/// # Examples
27///
28/// ```no_run
29/// use noodles_sam as sam;
30/// let index = sam::fs::index("sample.sam.gz")?;
31/// # Ok::<(), std::io::Error>(())
32/// ````
33pub fn index<P>(src: P) -> io::Result<csi::Index>
34where
35    P: AsRef<Path>,
36{
37    let mut reader = File::open(src).map(bgzf::Reader::new).map(Reader::new)?;
38    index_inner(&mut reader)
39}
40
41fn index_inner<R>(reader: &mut Reader<R>) -> io::Result<csi::Index>
42where
43    R: bgzf::io::BufRead,
44{
45    let header = reader.read_header()?;
46
47    if !is_coordinate_sorted(&header) {
48        return Err(io::Error::new(
49            io::ErrorKind::InvalidData,
50            format!(
51                "invalid sort order: expected {:?}, got {:?}",
52                Some(COORDINATE),
53                header
54                    .header()
55                    .and_then(|hdr| hdr.other_fields().get(&SORT_ORDER))
56            ),
57        ));
58    }
59
60    let mut indexer = Indexer::default();
61
62    let mut record = Record::default();
63    let mut start_position = reader.get_ref().virtual_position();
64
65    while reader.read_record(&mut record)? != 0 {
66        let end_position = reader.get_ref().virtual_position();
67        let chunk = Chunk::new(start_position, end_position);
68
69        let alignment_context = match alignment_context(&header, &record)? {
70            (Some(id), Some(start), Some(end)) => {
71                let is_mapped = !record.flags()?.is_unmapped();
72                Some((id, start, end, is_mapped))
73            }
74            _ => None,
75        };
76
77        indexer.add_record(alignment_context, chunk)?;
78
79        start_position = end_position;
80    }
81
82    Ok(indexer.build(header.reference_sequences().len()))
83}
84
85fn is_coordinate_sorted(header: &Header) -> bool {
86    header
87        .header()
88        .and_then(|hdr| hdr.other_fields().get(&SORT_ORDER))
89        .map(|sort_order| sort_order == COORDINATE)
90        .unwrap_or_default()
91}
92
93fn alignment_context(
94    header: &Header,
95    record: &Record,
96) -> io::Result<(Option<usize>, Option<Position>, Option<Position>)> {
97    Ok((
98        record.reference_sequence_id(header).transpose()?,
99        record.alignment_start().transpose()?,
100        record.alignment_end().transpose()?,
101    ))
102}
103
104#[cfg(test)]
105mod tests {
106    use super::*;
107    use crate::header::record::value::{
108        map::{self, builder::BuildError},
109        Map,
110    };
111
112    #[test]
113    fn test_is_coordinate_sorted() -> Result<(), BuildError> {
114        let header = Header::default();
115        assert!(!is_coordinate_sorted(&header));
116
117        let header = Header::builder()
118            .set_header(
119                Map::<map::Header>::builder()
120                    .insert(SORT_ORDER, COORDINATE)
121                    .build()?,
122            )
123            .build();
124
125        assert!(is_coordinate_sorted(&header));
126
127        Ok(())
128    }
129}