noodles_cram/
fs.rs

1//! CRAM filesystem operations.
2
3use std::{cmp, collections::HashMap, fs::File, io, path::Path};
4
5use noodles_core::Position;
6use noodles_fasta as fasta;
7use noodles_sam as sam;
8
9use super::{
10    container::{slice, CompressionHeader},
11    crai,
12    io::{reader::container::Slice, Reader},
13};
14use crate::io::reader::Container;
15
16/// Indexes a CRAM file.
17///
18/// # Examples
19///
20/// ```no_run
21/// use noodles_cram as cram;
22/// let index = cram::fs::index("sample.cram")?;
23/// # Ok::<(), std::io::Error>(())
24/// ```
25pub fn index<P>(src: P) -> io::Result<crai::Index>
26where
27    P: AsRef<Path>,
28{
29    let mut reader = File::open(src).map(Reader::new)?;
30    let header = reader.read_header()?;
31
32    let mut index = Vec::new();
33
34    let mut container = Container::default();
35    let mut container_position = reader.position()?;
36
37    loop {
38        let container_len = match reader.read_container(&mut container)? {
39            0 => break,
40            n => n,
41        };
42
43        let compression_header = container.compression_header()?;
44
45        let landmarks = container.header().landmarks();
46        let slice_count = landmarks.len();
47
48        for (i, result) in container.slices().enumerate() {
49            let slice = result?;
50            let landmark = landmarks[i];
51
52            let slice_length = if i < slice_count - 1 {
53                landmarks[i + 1] - landmark
54            } else {
55                container_len - landmark
56            };
57
58            push_index_records(
59                &mut index,
60                &header,
61                &compression_header,
62                &slice,
63                container_position,
64                landmark as u64,
65                slice_length as u64,
66            )?;
67        }
68
69        container_position = reader.position()?;
70    }
71
72    Ok(index)
73}
74
75fn push_index_records(
76    index: &mut crai::Index,
77    header: &sam::Header,
78    compression_header: &CompressionHeader,
79    slice: &Slice,
80    container_position: u64,
81    landmark: u64,
82    slice_length: u64,
83) -> io::Result<()> {
84    if slice.header().reference_sequence_context().is_many() {
85        push_index_records_for_multi_reference_slice(
86            index,
87            header,
88            compression_header,
89            slice,
90            container_position,
91            landmark,
92            slice_length,
93        )
94    } else {
95        push_index_record_for_single_reference_slice(
96            index,
97            slice.header(),
98            container_position,
99            landmark,
100            slice_length,
101        )
102    }
103}
104
105#[derive(Debug)]
106struct SliceReferenceSequenceAlignmentRangeInclusive {
107    start: Option<Position>,
108    end: Option<Position>,
109}
110
111impl Default for SliceReferenceSequenceAlignmentRangeInclusive {
112    fn default() -> Self {
113        Self {
114            start: Position::new(usize::MAX),
115            end: None,
116        }
117    }
118}
119
120fn push_index_records_for_multi_reference_slice(
121    index: &mut crai::Index,
122    header: &sam::Header,
123    compression_header: &CompressionHeader,
124    slice: &Slice,
125    container_position: u64,
126    landmark: u64,
127    slice_length: u64,
128) -> io::Result<()> {
129    let mut reference_sequence_ids: HashMap<
130        Option<usize>,
131        SliceReferenceSequenceAlignmentRangeInclusive,
132    > = HashMap::new();
133
134    let (core_data_src, external_data_srcs) = slice.decode_blocks()?;
135
136    for record in slice.records(
137        fasta::Repository::default(), // TODO
138        header,
139        compression_header,
140        &core_data_src,
141        &external_data_srcs,
142    )? {
143        let range = reference_sequence_ids
144            .entry(record.reference_sequence_id)
145            .or_default();
146
147        range.start = cmp::min(range.start, record.alignment_start);
148
149        let alignment_end = record.alignment_end();
150        range.end = cmp::max(range.end, alignment_end);
151    }
152
153    let mut sorted_reference_sequence_ids: Vec<_> =
154        reference_sequence_ids.keys().copied().collect();
155    sorted_reference_sequence_ids.sort_unstable();
156
157    for reference_sequence_id in sorted_reference_sequence_ids {
158        let (alignment_start, alignment_span) = if reference_sequence_id.is_some() {
159            let range = &reference_sequence_ids[&reference_sequence_id];
160
161            if let (Some(start), Some(end)) = (range.start, range.end) {
162                let span = usize::from(end) - usize::from(start) + 1;
163                (Some(start), span)
164            } else {
165                todo!("unhandled interval: {:?}", range);
166            }
167        } else {
168            (None, 0)
169        };
170
171        let record = crai::Record::new(
172            reference_sequence_id,
173            alignment_start,
174            alignment_span,
175            container_position,
176            landmark,
177            slice_length,
178        );
179
180        index.push(record);
181    }
182
183    Ok(())
184}
185
186fn push_index_record_for_single_reference_slice(
187    index: &mut crai::Index,
188    slice_header: &slice::Header,
189    container_position: u64,
190    landmark: u64,
191    slice_length: u64,
192) -> io::Result<()> {
193    use crate::container::ReferenceSequenceContext;
194
195    let (reference_sequence_id, alignment_start, alignment_span) =
196        match slice_header.reference_sequence_context() {
197            ReferenceSequenceContext::Some(context) => {
198                let reference_sequence_id = Some(context.reference_sequence_id());
199                let alignment_start = Some(context.alignment_start());
200                let alignment_span = context.alignment_span();
201                (reference_sequence_id, alignment_start, alignment_span)
202            }
203            ReferenceSequenceContext::None => (None, None, 0),
204            ReferenceSequenceContext::Many => unreachable!(),
205        };
206
207    let record = crai::Record::new(
208        reference_sequence_id,
209        alignment_start,
210        alignment_span,
211        container_position,
212        landmark,
213        slice_length,
214    );
215
216    index.push(record);
217
218    Ok(())
219}