noodles_cram/
lib.rs

1//! **noodles-cram** handles the reading and writing of the CRAM format.
2
3#[cfg(feature = "async")]
4pub mod r#async;
5
6pub mod codecs;
7pub mod container;
8pub mod crai;
9pub mod file_definition;
10pub mod fs;
11mod huffman;
12pub mod io;
13pub mod record;
14
15use md5::{Digest, Md5};
16
17pub use self::{file_definition::FileDefinition, record::Record};
18
19#[deprecated(since = "0.78.0", note = "Use `cram::container` instead.")]
20pub use self::container as data_container;
21
22#[deprecated(since = "0.78.0", note = "Use `cram::io::reader::Container` instead.")]
23pub use self::io::reader::Container;
24
25#[deprecated(since = "0.78.0", note = "Use `cram::io::reader::Container` instead.")]
26pub use self::io::reader::Container as DataContainer;
27
28#[deprecated(since = "0.76.0", note = "Use `cram::fs::index` instead.")]
29pub use self::fs::index;
30
31#[cfg(feature = "async")]
32#[deprecated(since = "0.69.0", note = "Use `cram::r#async::io::Reader` instead.")]
33pub use self::r#async::io::Reader as AsyncReader;
34
35#[cfg(feature = "async")]
36#[deprecated(since = "0.69.0", note = "Use `cram::r#async::io::Writer` instead.")]
37pub use self::r#async::io::Writer as AsyncWriter;
38
39const MAGIC_NUMBER: [u8; 4] = *b"CRAM";
40const MD5_OUTPUT_SIZE: usize = 16;
41
42// _Sequence Alignment/Map Format Specification_ (2021-06-03) § 1.3.2 "Reference MD5 calculation"
43fn calculate_normalized_sequence_digest(mut sequence: &[u8]) -> [u8; MD5_OUTPUT_SIZE] {
44    const MD5_BLOCK_SIZE: usize = 64;
45    const CHUNK_SIZE: usize = 8 * MD5_BLOCK_SIZE;
46
47    let mut hasher = Md5::new();
48    let mut buf = [0; CHUNK_SIZE];
49
50    while !sequence.is_empty() {
51        let n = sequence
52            .iter()
53            .position(|b| !b.is_ascii_graphic() || b.is_ascii_lowercase())
54            .unwrap_or(sequence.len());
55
56        hasher.update(&sequence[..n]);
57        sequence = &sequence[n..];
58
59        // "All lowercase characters are converted to uppercase."
60        loop {
61            let mut n = 0;
62
63            for (src, dst) in sequence
64                .iter()
65                .take_while(|b| b.is_ascii_lowercase())
66                .zip(&mut buf)
67            {
68                *dst = src.to_ascii_uppercase();
69                n += 1;
70            }
71
72            if n == 0 {
73                break;
74            }
75
76            hasher.update(&buf[..n]);
77            sequence = &sequence[n..];
78        }
79
80        // "All characters outside of the inclusive range 33 ('!') to 126 ('~') are stripped out."
81        let n = sequence
82            .iter()
83            .position(|b| b.is_ascii_graphic())
84            .unwrap_or(sequence.len());
85
86        sequence = &sequence[n..];
87    }
88
89    hasher.finalize().into()
90}
91
92#[cfg(test)]
93mod tests {
94    use super::*;
95
96    #[test]
97    fn test_calculate_normalized_sequence_digest() {
98        assert_eq!(
99            calculate_normalized_sequence_digest(b"ACGT"),
100            [
101                0xf1, 0xf8, 0xf4, 0xbf, 0x41, 0x3b, 0x16, 0xad, 0x13, 0x57, 0x22, 0xaa, 0x45, 0x91,
102                0x04, 0x3e
103            ]
104        );
105
106        assert_eq!(
107            calculate_normalized_sequence_digest(b" AC\tgt\n"),
108            [
109                0xf1, 0xf8, 0xf4, 0xbf, 0x41, 0x3b, 0x16, 0xad, 0x13, 0x57, 0x22, 0xaa, 0x45, 0x91,
110                0x04, 0x3e
111            ]
112        );
113
114        // _Sequence Alignment/Map Format Specification_ (2024-11-06) § 1.3.2 "Reference MD5
115        // calculation"
116        assert_eq!(
117            calculate_normalized_sequence_digest(b"ACGT ACGT ACGT\nacgt acgt acgt\n... 12345 !!!"),
118            [
119                0xdf, 0xab, 0xdb, 0xb3, 0x6e, 0x23, 0x9a, 0x6d, 0xa8, 0x89, 0x57, 0x84, 0x1f, 0x32,
120                0xb8, 0xe4
121            ]
122        );
123    }
124}