bio_types/annot/
contig.rs

1// Copyright 2017 Nicholas Ingolia
2// Licensed under the MIT license (http://opensource.org/licenses/MIT)
3// This file may not be copied, modified, or distributed
4// except according to those terms.
5
6//! Contiguous region on a named sequence, e.g., chromosome XI
7//! 334,915-334,412.
8
9use std::cmp::{max, min};
10use std::convert::Into;
11use std::fmt::{self, Display, Formatter};
12use std::ops::Neg;
13use std::str::FromStr;
14
15use regex::Regex;
16
17use crate::annot::loc::Loc;
18use crate::annot::pos::Pos;
19use crate::annot::*;
20use crate::strand::*;
21
/// Contiguous sequence region on a particular, named sequence (e.g. a
/// chromosome)
///
/// Parameterized over the type of the reference sequence identifier
/// and over the strandedness of the position.
///
/// The display format for a `Contig` is _chr:start-end(+/-/.)_. The
/// boundaries are given as a half-open 0-based interval, like the
/// Rust `Range` and BED format. When the strand converts to an
/// unknown `Strand`, the `Display` implementation omits the
/// parenthesized strand suffix altogether.
///
/// ```
/// # use bio_types::annot::ParseAnnotError;
/// # fn try_main() -> Result<(), Box<ParseAnnotError>> {
/// use bio_types::annot::contig::Contig;
/// use bio_types::strand::ReqStrand;
/// let tma19 = Contig::new("chrXI".to_owned(), 334412, (334916 - 334412), ReqStrand::Reverse);
/// let tma19_str = tma19.to_string();
/// assert_eq!(tma19_str, "chrXI:334412-334916(-)");
/// let tma19_str_loc = tma19_str.parse()?;
/// assert_eq!(tma19, tma19_str_loc);
/// # Ok(())
/// # }
/// # fn main() { try_main().unwrap(); }
/// ```
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
pub struct Contig<R, S> {
    /// Identifier of the reference sequence the region lies on.
    refid: R,
    /// Lowest coordinate covered by the region (inclusive), regardless
    /// of strand.
    start: isize,
    /// Number of positions covered; the exclusive end of the region
    /// is `start + length`.
    length: usize,
    /// Strand annotation of the region.
    strand: S,
}
53
54impl<R, S> Contig<R, S> {
55    /// Construct a new sequence contig location
56    ///
57    /// ```
58    /// use std::rc::Rc;
59    /// use bio_types::annot::contig::Contig;
60    /// use bio_types::strand::ReqStrand;
61    /// let chr = Rc::new("chrX".to_owned());
62    /// let tma22 = Contig::new(chr, 461829, 462426 - 461829, ReqStrand::Forward);
63    /// ```
64    pub fn new(refid: R, start: isize, length: usize, strand: S) -> Self {
65        Contig {
66            refid,
67            start,
68            length,
69            strand,
70        }
71    }
72
73    /// Construct a new sequence contig location from a starting
74    /// position and length.
75    ///
76    /// In general, the starting position must have a "strandedness",
77    /// and reverse-strand starting positions will extend towards
78    /// lower coordinates from the starting position.
79    ///
80    ///
81    ///
82    /// ```
83    /// # use bio_types::annot::AnnotError;
84    /// # fn try_main() -> Result<(), Box<AnnotError>> {
85    /// use bio_types::annot::contig::Contig;
86    /// use bio_types::annot::pos::Pos;
87    /// use bio_types::strand::ReqStrand;
88    ///
89    /// let tma22_first = Pos::new("chrX".to_string(), 461829, ReqStrand::Forward);
90    /// let tma22 = Contig::with_first_length(&tma22_first, 462426 - 461829)?;
91    /// assert_eq!(tma22.to_string(), "chrX:461829-462426(+)");
92    ///
93    /// let tma19_first = Pos::new("chrXI".to_string(), 335015, ReqStrand::Reverse);
94    /// let tma19 = Contig::with_first_length(&tma19_first, 335016 - 334412)?;
95    /// assert_eq!(tma19.to_string(), "chrXI:334412-335016(-)");
96    /// # Ok(())
97    /// # }
98    /// # fn main() { try_main().unwrap(); }
99    /// ```
100    pub fn with_first_length(pos: &Pos<R, S>, length: usize) -> Result<Self, AnnotError>
101    where
102        R: Clone,
103        S: Into<Option<ReqStrand>> + Copy,
104    {
105        if length < 2 {
106            Ok(Contig {
107                refid: pos.refid().clone(),
108                start: pos.start(),
109                length,
110                strand: pos.strand(),
111            })
112        } else {
113            let start = match pos.strand().into() {
114                None => Err(AnnotError::NoStrand),
115                Some(ReqStrand::Forward) => Ok(pos.start()),
116                Some(ReqStrand::Reverse) => Ok(1 + pos.start() - length as isize),
117            }?;
118
119            Ok(Contig {
120                refid: pos.refid().clone(),
121                start,
122                length,
123                strand: pos.strand(),
124            })
125        }
126    }
127
128    /// Convert into a stranded sequence location on the specified strand
129    pub fn into_stranded(self, strand: ReqStrand) -> Contig<R, ReqStrand> {
130        Contig {
131            refid: self.refid,
132            start: self.start,
133            length: self.length,
134            strand,
135        }
136    }
137}
138
139impl<R> Contig<R, ReqStrand> {
140    /// Extend the annotation by `dist` in the upstream direction on the
141    /// annotated strand.
142    ///
143    /// # Arguments
144    ///
145    /// * `dist` specifies the offset for sliding the position. The
146    /// left, 5'-most end of the contig will expand for forward-strand
147    /// annotations and the right, 3'-most end will expand for
148    /// reverse-strand annotations.
149    ///
150    /// ```
151    /// use bio_types::annot::contig::Contig;
152    /// use bio_types::strand::ReqStrand;
153    /// let mut tma22 = Contig::new("chrX".to_owned(), 461829, 462426 - 461829, ReqStrand::Forward);
154    /// tma22.extend_upstream(100);
155    /// assert_eq!(tma22.to_string(), "chrX:461729-462426(+)");
156    /// let mut tma19 = Contig::new("chrXI".to_owned(), 334412, 334916 - 334412, ReqStrand::Reverse);
157    /// tma19.extend_upstream(100);
158    /// assert_eq!(tma19.to_string(), "chrXI:334412-335016(-)");
159    /// ```
160    pub fn extend_upstream(&mut self, dist: usize) {
161        self.length += dist;
162        if self.strand == ReqStrand::Forward {
163            self.start -= dist as isize;
164        }
165    }
166
167    /// Extend the annotation by `dist` in the downstream direction on the
168    /// annotated strand.
169    ///
170    /// # Arguments
171    ///
172    /// * `dist` specifies the offset for sliding the position. The
173    /// right, 3'-most end of the contig will expand for
174    /// forward-strand annotations and the left, 5'-most end will
175    /// expand for reverse-strand annotations.
176    ///
177    /// ```
178    /// use bio_types::annot::contig::Contig;
179    /// use bio_types::strand::ReqStrand;
180    /// let mut tma22 = Contig::new("chrX".to_owned(), 461829, 462426 - 461829, ReqStrand::Forward);
181    /// tma22.extend_downstream(100);
182    /// assert_eq!(tma22.to_string(), "chrX:461829-462526(+)");
183    /// let mut tma19 = Contig::new("chrXI".to_owned(), 334412, 334916 - 334412, ReqStrand::Reverse);
184    /// tma19.extend_downstream(100);
185    /// assert_eq!(tma19.to_string(), "chrXI:334312-334916(-)");
186    /// ```
187    pub fn extend_downstream(&mut self, dist: usize) {
188        self.length += dist;
189        if self.strand == ReqStrand::Reverse {
190            self.start -= dist as isize;
191        }
192    }
193}
194
195impl<R, S> Loc for Contig<R, S> {
196    type RefID = R;
197    type Strand = S;
198    fn refid(&self) -> &R {
199        &self.refid
200    }
201    fn start(&self) -> isize {
202        self.start
203    }
204    fn length(&self) -> usize {
205        self.length
206    }
207    fn strand(&self) -> S
208    where
209        S: Copy,
210    {
211        self.strand
212    }
213
214    fn pos_into<T>(&self, pos: &Pos<Self::RefID, T>) -> Option<Pos<(), T>>
215    where
216        Self::RefID: Eq,
217        Self::Strand: Into<ReqStrand> + Copy,
218        T: Neg<Output = T> + Copy,
219    {
220        if self.refid != *pos.refid() {
221            None
222        } else {
223            let offset = pos.pos() - self.start;
224            if offset < 0 || offset >= self.length as isize {
225                None
226            } else {
227                Some(match self.strand().into() {
228                    ReqStrand::Forward => Pos::new((), offset, pos.strand()),
229                    ReqStrand::Reverse => {
230                        Pos::new((), self.length as isize - (offset + 1), -pos.strand())
231                    }
232                })
233            }
234        }
235    }
236
237    fn pos_outof<Q, T>(&self, pos: &Pos<Q, T>) -> Option<Pos<Self::RefID, T>>
238    where
239        Self::RefID: Clone,
240        Self::Strand: Into<ReqStrand> + Copy,
241        T: Neg<Output = T> + Copy,
242    {
243        let offset = match self.strand().into() {
244            ReqStrand::Forward => pos.pos(),
245            ReqStrand::Reverse => self.length as isize - (pos.pos() + 1),
246        };
247
248        if offset >= 0 && offset < self.length as isize {
249            Some(Pos::new(
250                self.refid.clone(),
251                self.start + offset,
252                self.strand().into().on_strand(pos.strand()),
253            ))
254        } else {
255            None
256        }
257    }
258
259    fn contig_intersection<T>(&self, contig: &Contig<Self::RefID, T>) -> Option<Self>
260    where
261        Self::RefID: PartialEq + Clone,
262        Self::Strand: Copy,
263    {
264        if self.refid() != contig.refid() {
265            return None;
266        }
267
268        let start = max(self.start, contig.start);
269        let end = min(
270            self.start + self.length as isize,
271            contig.start + contig.length as isize,
272        );
273
274        if start <= end {
275            Some(Self::new(
276                self.refid.clone(),
277                start,
278                (end - start) as usize,
279                self.strand,
280            ))
281        } else {
282            None
283        }
284    }
285}
286
287impl<R, S> Display for Contig<R, S>
288where
289    R: Display,
290    S: Display + Clone + Into<Strand>,
291{
292    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
293        write!(
294            f,
295            "{}:{}-{}",
296            self.refid,
297            self.start,
298            self.start + self.length as isize
299        )?;
300        let strand: Strand = self.strand.clone().into();
301        if !strand.is_unknown() {
302            write!(f, "({})", strand)?;
303        }
304        Ok(())
305    }
306}
307
308impl<R, S> FromStr for Contig<R, S>
309where
310    R: From<String>,
311    S: FromStr<Err = StrandError>,
312{
313    type Err = ParseAnnotError;
314
315    fn from_str(s: &str) -> Result<Self, Self::Err> {
316        lazy_static! {
317            static ref CONTIG_RE: Regex = Regex::new(r"^(.*):(\d+)-(\d+)(\([+-]\))?$").unwrap();
318        }
319
320        let cap = CONTIG_RE.captures(s).ok_or(ParseAnnotError::BadAnnot)?;
321
322        let start = cap[2].parse::<isize>().map_err(ParseAnnotError::ParseInt)?;
323        let end = cap[3].parse::<isize>().map_err(ParseAnnotError::ParseInt)?;
324        let strand = cap
325            .get(4)
326            .map_or("", |m| m.as_str())
327            .parse::<S>()
328            .map_err(ParseAnnotError::ParseStrand)?;
329
330        if start <= end {
331            Ok(Contig::new(
332                R::from(cap[1].to_owned()),
333                start,
334                (end - start) as usize,
335                strand,
336            ))
337        } else {
338            Err(ParseAnnotError::EndBeforeStart)
339        }
340    }
341}
342
343impl<R> From<Contig<R, ReqStrand>> for Contig<R, Strand> {
344    fn from(x: Contig<R, ReqStrand>) -> Self {
345        Contig {
346            refid: x.refid,
347            start: x.start,
348            length: x.length,
349            strand: match x.strand {
350                ReqStrand::Forward => Strand::Forward,
351                ReqStrand::Reverse => Strand::Reverse,
352            },
353        }
354    }
355}
356
357impl<R> From<Contig<R, NoStrand>> for Contig<R, Strand> {
358    fn from(x: Contig<R, NoStrand>) -> Self {
359        Contig {
360            refid: x.refid,
361            start: x.start,
362            length: x.length,
363            strand: Strand::Unknown,
364        }
365    }
366}
367
368impl<R> From<Contig<R, Strand>> for Contig<R, NoStrand> {
369    fn from(x: Contig<R, Strand>) -> Self {
370        Contig {
371            refid: x.refid,
372            start: x.start,
373            length: x.length,
374            strand: NoStrand::Unknown,
375        }
376    }
377}
378
379impl<R> From<Contig<R, ReqStrand>> for Contig<R, NoStrand> {
380    fn from(x: Contig<R, ReqStrand>) -> Self {
381        Contig {
382            refid: x.refid,
383            start: x.start,
384            length: x.length,
385            strand: NoStrand::Unknown,
386        }
387    }
388}
389
/// Default stranded sequence contig on a reference sequence named
/// by a `String`.
pub type SeqContigStranded = Contig<String, ReqStrand>;
393
/// Default unstranded sequence contig on a reference sequence named
/// by a `String`.
pub type SeqContigUnstranded = Contig<String, NoStrand>;
397
#[cfg(test)]
mod tests {
    use super::*;

    /// First and last positions follow the strand orientation, and
    /// `with_first_length` rebuilds the contig from its first position.
    #[test]
    fn first_and_last() {
        let tma22 = "chrX:461829-462426(+)"
            .parse::<SeqContigStranded>()
            .unwrap();
        let first = tma22.first_pos();
        assert_eq!(first.to_string(), "chrX:461829(+)");
        let last = tma22.last_pos();
        assert_eq!(last.to_string(), "chrX:462425(+)");

        let tma19 = "chrXI:334412-334916(-)"
            .parse::<SeqContigStranded>()
            .unwrap();
        let first = tma19.first_pos();
        assert_eq!(first.to_string(), "chrXI:334915(-)");
        let last = tma19.last_pos();
        assert_eq!(last.to_string(), "chrXI:334412(-)");

        let tma22_first = Pos::new("chrX".to_string(), 461829, ReqStrand::Forward);
        let tma22 = Contig::with_first_length(&tma22_first, 462426 - 461829).unwrap();
        assert_eq!(tma22.to_string(), "chrX:461829-462426(+)");

        let tma19_first = Pos::new("chrXI".to_string(), 335015, ReqStrand::Reverse);
        let tma19 = Contig::with_first_length(&tma19_first, 335016 - 334412).unwrap();
        assert_eq!(tma19.to_string(), "chrXI:334412-335016(-)");
    }

    /// `pos_into` and `pos_outof` round-trip positions inside the
    /// contig and reject positions outside it.
    #[test]
    fn into_outof() {
        let tma22 = "chrX:461829-462426(+)"
            .parse::<SeqContigStranded>()
            .unwrap();
        let p0 = "chrX:461829(+)".parse::<Pos<String, ReqStrand>>().unwrap();
        let p0_into = tma22.pos_into(&p0);
        assert!(Some(Pos::new((), 0, ReqStrand::Forward)).same(&p0_into));
        let p0_outof = tma22.pos_outof(&p0_into.unwrap());
        assert!(Some(p0).same(&p0_outof));

        let p0 = "chrX:461839(-)".parse::<Pos<String, ReqStrand>>().unwrap();
        let p0_into = tma22.pos_into(&p0);
        assert!(Some(Pos::new((), 10, ReqStrand::Reverse)).same(&p0_into));
        let p0_outof = tma22.pos_outof(&p0_into.unwrap());
        assert!(Some(p0).same(&p0_outof));

        let p0 = "chrX:462425(+)".parse::<Pos<String, ReqStrand>>().unwrap();
        let p0_into = tma22.pos_into(&p0);
        assert!(Some(Pos::new((), 596, ReqStrand::Forward)).same(&p0_into));
        let p0_outof = tma22.pos_outof(&p0_into.unwrap());
        assert!(Some(p0).same(&p0_outof));

        // One position before the start is outside the contig.
        let p0 = "chrX:461828(+)".parse::<Pos<String, ReqStrand>>().unwrap();
        let p0_into = tma22.pos_into(&p0);
        assert!(None.same(&p0_into));

        // The end coordinate is exclusive, so it is outside as well.
        let p0 = "chrX:462426(+)".parse::<Pos<String, ReqStrand>>().unwrap();
        let p0_into = tma22.pos_into(&p0);
        assert!(None.same(&p0_into));

        // Positions on another reference sequence never map in,
        // whatever their coordinate.
        let p0 = "chrV:461829(+)".parse::<Pos<String, ReqStrand>>().unwrap();
        let p0_into = tma22.pos_into(&p0);
        assert!(None.same(&p0_into));

        let p0 = "chrV:462426(+)".parse::<Pos<String, ReqStrand>>().unwrap();
        let p0_into = tma22.pos_into(&p0);
        assert!(None.same(&p0_into));
    }

    /// Parse two contigs, intersect the first with the second, and
    /// compare the displayed result with `cab_str` (`None` when no
    /// intersection is expected).
    fn test_contig_ixn(ca_str: &str, cb_str: &str, cab_str: Option<String>) {
        let ca = ca_str.parse::<SeqContigStranded>().unwrap();
        let cb = cb_str.parse::<SeqContigStranded>().unwrap();
        match ca.contig_intersection(&cb) {
            None => assert_eq!(None, cab_str),
            Some(cab) => assert_eq!(Some(cab.to_string()), cab_str),
        };
    }

    #[test]
    fn test_display_fmt() {
        let tma19 = Contig::new(
            "chrXI".to_owned(),
            334412,
            334916 - 334412,
            ReqStrand::Reverse,
        );
        assert_eq!(format!("{}", tma19), "chrXI:334412-334916(-)");
    }

    #[test]
    fn intersection() {
        // The result keeps the strand of the first contig.
        test_contig_ixn(
            "chrX:461829-462426(+)",
            "chrX:461800-461900(+)",
            Some("chrX:461829-461900(+)".to_owned()),
        );
        test_contig_ixn(
            "chrX:461829-462426(-)",
            "chrX:461800-461900(+)",
            Some("chrX:461829-461900(-)".to_owned()),
        );
        test_contig_ixn(
            "chrX:461829-462426(+)",
            "chrX:461800-461900(-)",
            Some("chrX:461829-461900(+)".to_owned()),
        );

        test_contig_ixn(
            "chrX:461829-462426(+)",
            "chrX:462000-463000(+)",
            Some("chrX:462000-462426(+)".to_owned()),
        );
        test_contig_ixn(
            "chrX:461829-462426(+)",
            "chrX:461000-463000(+)",
            Some("chrX:461829-462426(+)".to_owned()),
        );
        test_contig_ixn(
            "chrX:461829-462426(+)",
            "chrX:462000-462100(+)",
            Some("chrX:462000-462100(+)".to_owned()),
        );

        // Disjoint regions and different reference sequences.
        test_contig_ixn("chrX:461829-462426(+)", "chrX:461000-461500(+)", None);
        test_contig_ixn("chrX:461829-462426(+)", "chrX:463000-463500(+)", None);
        test_contig_ixn("chrX:461829-462426(+)", "chrV:461000-463000(+)", None);
    }
}