noodles_csi/io/indexed_records/
record.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
mod position;

use std::{error, fmt, ops::Range};

use noodles_core::Position;

use self::position::parse_start_position;
use crate::{
    binning_index::index::header::format::coordinate_system::CoordinateSystem, io::IndexedRecord,
};

/// A CSI or Tabix record.
///
/// A record keeps the full text it was parsed from (available through `AsRef<str>`) together
/// with the values used for indexing: the reference sequence name, the start position, and the
/// end position.
pub struct Record {
    /// The raw text this record was parsed from.
    buf: String,
    /// The byte range of the reference sequence name within `buf`.
    reference_sequence_name_bounds: Range<usize>,
    /// The start position, as parsed from the start position field.
    start_position: Position,
    /// The end position: parsed from the end position field when one is configured; otherwise,
    /// the start position plus one.
    end_position: Position,
}

impl IndexedRecord for Record {
    /// Returns the reference sequence name as a slice of the raw record text.
    fn indexed_reference_sequence_name(&self) -> &str {
        let bounds = &self.reference_sequence_name_bounds;
        &self.buf[bounds.start..bounds.end]
    }

    /// Returns the start position computed when the record was parsed.
    fn indexed_start_position(&self) -> Position {
        let Self { start_position, .. } = self;
        *start_position
    }

    /// Returns the end position computed when the record was parsed.
    fn indexed_end_position(&self) -> Position {
        let Self { end_position, .. } = self;
        *end_position
    }
}

impl AsRef<str> for Record {
    /// Returns the raw text the record was parsed from.
    fn as_ref(&self) -> &str {
        self.buf.as_str()
    }
}

/// An error returned when a raw record fails to parse.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ParseError {
    /// The reference sequence name is missing.
    MissingReferenceSequenceName,
    /// The start position is missing.
    MissingStartPosition,
    /// The start position is invalid.
    InvalidStartPosition(position::ParseError),
    /// The end position is missing.
    MissingEndPosition,
    /// The end position is invalid.
    InvalidEndPosition(position::ParseError),
}

impl error::Error for ParseError {
    /// Returns the underlying position parse error for the "invalid position" variants; the
    /// "missing field" variants have no underlying cause.
    fn source(&self) -> Option<&(dyn error::Error + 'static)> {
        match self {
            Self::InvalidStartPosition(e) | Self::InvalidEndPosition(e) => Some(e),
            Self::MissingReferenceSequenceName
            | Self::MissingStartPosition
            | Self::MissingEndPosition => None,
        }
    }
}

impl fmt::Display for ParseError {
    /// Writes a short, lowercase description of the error; any wrapped position error is not
    /// included in the message.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message = match self {
            Self::MissingReferenceSequenceName => "missing reference sequence name",
            Self::MissingStartPosition => "missing start position",
            Self::InvalidStartPosition(_) => "invalid start position",
            Self::MissingEndPosition => "missing end position",
            Self::InvalidEndPosition(_) => "invalid end position",
        };

        f.write_str(message)
    }
}

/// Parses a tab-delimited string into a [`Record`].
///
/// `s` is split on tab characters, and the reference sequence name, start position, and
/// (optionally) end position are read from the fields at the given 0-based indices. The start
/// position field is interpreted by `parse_start_position` using `coordinate_system`. When
/// `end_position_index` is `None`, the end position is the start position plus one.
///
/// # Errors
///
/// Returns a [`ParseError`] if a field at a requested index does not exist or if a position
/// field fails to parse. Fields are checked in the order: reference sequence name, start
/// position, end position.
///
/// # Panics
///
/// Panics if `end_position_index` is `None` and adding one to the start position overflows.
pub(super) fn parse_record(
    s: String,
    reference_sequence_name_index: usize,
    start_position_index: usize,
    end_position_index: Option<usize>,
    coordinate_system: CoordinateSystem,
) -> Result<Record, ParseError> {
    const DELIMITER: char = '\t';

    let columns: Vec<&str> = s.split(DELIMITER).collect();

    let reference_sequence_name_bounds =
        calculate_reference_sequence_name_bounds(&columns, reference_sequence_name_index)?;

    let start_position = match columns.get(start_position_index) {
        Some(raw) => parse_start_position(raw, coordinate_system)
            .map_err(ParseError::InvalidStartPosition)?,
        None => return Err(ParseError::MissingStartPosition),
    };

    let end_position = match end_position_index {
        Some(i) => {
            let raw = columns.get(i).ok_or(ParseError::MissingEndPosition)?;

            raw.parse()
                .map_err(position::ParseError::Parse)
                .map_err(ParseError::InvalidEndPosition)?
        }
        // _The Tabix index file format_: "Field `col_beg` may equal `col_end`, and in this case,
        // the end of a region is `end=beg+1`."
        None => start_position
            .checked_add(1)
            .expect("attempt to add with overflow"),
    };

    Ok(Record {
        buf: s,
        reference_sequence_name_bounds,
        start_position,
        end_position,
    })
}

/// Computes the byte range of the `i`-th field within the string the fields were split from.
///
/// `fields` are the pieces of a record split on a single-byte delimiter, so the range starts
/// after the lengths of all preceding fields plus one delimiter byte each.
///
/// # Errors
///
/// Returns [`ParseError::MissingReferenceSequenceName`] if there is no field at index `i`.
fn calculate_reference_sequence_name_bounds(
    fields: &[&str],
    i: usize,
) -> Result<Range<usize>, ParseError> {
    match fields.get(i) {
        Some(name) => {
            // `i` is in bounds here, so slicing the preceding fields cannot panic.
            let start: usize = fields[..i].iter().map(|field| field.len() + 1).sum();
            Ok(start..start + name.len())
        }
        None => Err(ParseError::MissingReferenceSequenceName),
    }
}