bio_types/
alignment.rs

1// Copyright 2014-2015 Johannes Köster, Vadim Nazarov, Patrick Marks
2// Licensed under the MIT license (http://opensource.org/licenses/MIT)
3// This file may not be copied, modified, or distributed
4// except according to those terms.
5
6//! Types for representing pairwise sequence alignments
7
8#[cfg(feature = "clap")]
9use clap::ValueEnum;
10#[cfg(feature = "serde")]
11use serde::{Deserialize, Serialize};
12
13pub type TextSlice<'a> = &'a [u8];
14
15/// Alignment operations supported are match, substitution, insertion, deletion
16/// and clipping. Clipping is a special boundary condition where you are allowed
17/// to clip off the beginning/end of the sequence for a fixed clip penalty. The
18/// clip penalty could be different for the two sequences x and y, and the
19/// clipping operations on both are distinguishable (Xclip and Yclip). The usize
20/// value associated with the clipping operations are the lengths clipped. In case
21/// of standard modes like Global, Semi-Global and Local alignment, the clip operations
22/// are filtered out
23#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
24#[derive(Eq, PartialEq, Debug, Copy, Clone, Hash)]
25pub enum AlignmentOperation {
26    Match,
27    Subst,
28    Del,
29    Ins,
30    Xclip(usize),
31    Yclip(usize),
32}
33
34/// The modes of alignment supported by the aligner include standard modes such as
35/// Global, Semi-Global and Local alignment. In addition to this, user can also invoke
36/// the custom mode. In the custom mode, users can explicitly specify the clipping penalties
37/// for prefix and suffix of strings 'x' and 'y' independently. Under the hood the standard
38/// modes are implemented as special cases of the custom mode with the clipping penalties
39/// appropriately set.
40///
41/// The default alignment mode is Global.
42#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
43#[cfg_attr(feature = "clap", derive(ValueEnum))]
44#[derive(Debug, PartialEq, Eq, Copy, Clone)]
45pub enum AlignmentMode {
46    Local,
47    Semiglobal,
48    Global,
49    Custom,
50}
51
52impl Default for AlignmentMode {
53    fn default() -> Self {
54        AlignmentMode::Global
55    }
56}
57
58/// We consider alignment between two sequences x and  y. x is the query or read sequence
59/// and y is the reference or template sequence. An alignment, consisting of a score,
60/// the start and end position of the alignment on sequence x and sequence y, the
61/// lengths of sequences x and y, and the alignment edit operations. The start position
62/// and end position of the alignment does not include the clipped regions. The length
63/// of clipped regions are already encapsulated in the Alignment Operation.
64#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
65#[derive(Debug, Eq, PartialEq, Clone, Default)]
66pub struct Alignment {
67    /// Smith-Waterman alignment score
68    pub score: i32,
69
70    /// Start position of alignment in reference
71    pub ystart: usize,
72
73    /// Start position of alignment in query
74    pub xstart: usize,
75
76    /// End position of alignment in reference
77    pub yend: usize,
78
79    /// End position of alignment in query
80    pub xend: usize,
81
82    /// Length of the reference sequence
83    pub ylen: usize,
84
85    /// Length of the query sequence
86    pub xlen: usize,
87
88    /// Vector of alignment operations
89    pub operations: Vec<AlignmentOperation>,
90    pub mode: AlignmentMode,
91}
92
93impl Alignment {
94    /// Calculate the cigar string from the alignment struct. x is the target string
95    ///
96    /// # Example
97    ///
98    /// ```
99    /// use bio_types::alignment::{Alignment,AlignmentMode};
100    /// use bio_types::alignment::AlignmentOperation::{Match, Subst, Ins, Del};
101    /// let alignment = Alignment {
102    ///     score: 5,
103    ///     xstart: 3,
104    ///     ystart: 0,
105    ///     xend: 9,
106    ///     yend: 10,
107    ///     ylen: 10,
108    ///     xlen: 10,
109    ///     operations: vec![Match, Match, Match, Subst, Ins, Ins, Del, Del],
110    ///     mode: AlignmentMode::Semiglobal
111    /// };
112    /// assert_eq!(alignment.cigar(false), "3S3=1X2I2D1S");
113    /// ```
114    pub fn cigar(&self, hard_clip: bool) -> String {
115        match self.mode {
116            AlignmentMode::Global => panic!(" Cigar fn not supported for Global Alignment mode"),
117            AlignmentMode::Local => panic!(" Cigar fn not supported for Local Alignment mode"),
118            _ => {}
119        }
120
121        let clip_str = if hard_clip { "H" } else { "S" };
122
123        let add_op = |op: AlignmentOperation, k, cigar: &mut String| match op {
124            AlignmentOperation::Match => cigar.push_str(&format!("{}{}", k, "=")),
125            AlignmentOperation::Subst => cigar.push_str(&format!("{}{}", k, "X")),
126            AlignmentOperation::Del => cigar.push_str(&format!("{}{}", k, "D")),
127            AlignmentOperation::Ins => cigar.push_str(&format!("{}{}", k, "I")),
128            _ => {}
129        };
130
131        let mut cigar = "".to_owned();
132        if self.operations.is_empty() {
133            return cigar;
134        }
135
136        let mut last = self.operations[0];
137        if self.xstart > 0 {
138            cigar.push_str(&format!("{}{}", self.xstart, clip_str))
139        }
140        let mut k = 1;
141        for &op in self.operations[1..].iter() {
142            if op == last {
143                k += 1;
144            } else {
145                add_op(last, k, &mut cigar);
146                k = 1;
147            }
148            last = op;
149        }
150        add_op(last, k, &mut cigar);
151        if self.xlen > self.xend {
152            cigar.push_str(&format!("{}{}", self.xlen - self.xend, clip_str))
153        }
154
155        cigar
156    }
157
158    /// Return the pretty formatted alignment as a String. The string
159    /// contains sets of 3 lines of length 100. First line is for the
160    /// sequence x, second line is for the alignment operation and the
161    /// the third line is for the sequence y. A '-' in the sequence
162    /// indicates a blank (insertion/deletion). The operations follow
163    /// the following convention: '|' for a match, '\\' (a single backslash) for a mismatch,
164    /// '+' for an insertion, 'x' for a deletion and ' ' for clipping
165    ///
166    /// # Example
167    ///
168    /// If we align the strings "CCGTCCGGCAAGGG" and "AAAAACCGTTGACGGCCAA"
169    /// in various modes, we will get the following output:
170    ///
171    /// Semiglobal:
172    /// ```c
173    ///         CCGTCCGGCAAGGG
174    ///         ||||++++\\|\||
175    ///    AAAAACCGT----TGACGGCCAA
176    /// ```
177    ///
178    /// Local:
179    /// ```c
180    ///         CCGTCCGGCAAGGG
181    ///         ||||
182    ///    AAAAACCGT          TGACGGCCAA
183    /// ```
184    ///
185    /// Global:
186    /// ```c
187    ///    -----CCGT--CCGGCAAGGG
188    ///    xxxxx||||xx\||||\|++\
189    ///    AAAAACCGTTGACGGCCA--A
190    /// ```
191    ///
192    pub fn pretty(&self, x: TextSlice, y: TextSlice, ncol: usize) -> String {
193        let mut x_pretty = String::new();
194        let mut y_pretty = String::new();
195        let mut inb_pretty = String::new();
196
197        if !self.operations.is_empty() {
198            let mut x_i: usize;
199            let mut y_i: usize;
200
201            // If the alignment mode is one of the standard ones, the prefix clipping is
202            // implicit so we need to process it here
203            match self.mode {
204                AlignmentMode::Custom => {
205                    x_i = 0;
206                    y_i = 0;
207                }
208                _ => {
209                    x_i = self.xstart;
210                    y_i = self.ystart;
211                    for k in x.iter().take(self.xstart) {
212                        x_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[*k])));
213                        inb_pretty.push(' ');
214                        y_pretty.push(' ')
215                    }
216                    for k in y.iter().take(self.ystart) {
217                        y_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[*k])));
218                        inb_pretty.push(' ');
219                        x_pretty.push(' ')
220                    }
221                }
222            }
223
224            // Process the alignment.
225            for i in 0..self.operations.len() {
226                match self.operations[i] {
227                    AlignmentOperation::Match => {
228                        x_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[x[x_i]])));
229                        x_i += 1;
230
231                        inb_pretty.push('|');
232
233                        y_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[y[y_i]])));
234                        y_i += 1;
235                    }
236                    AlignmentOperation::Subst => {
237                        x_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[x[x_i]])));
238                        x_i += 1;
239
240                        inb_pretty.push('\\');
241
242                        y_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[y[y_i]])));
243                        y_i += 1;
244                    }
245                    AlignmentOperation::Del => {
246                        x_pretty.push('-');
247
248                        inb_pretty.push('x');
249
250                        y_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[y[y_i]])));
251                        y_i += 1;
252                    }
253                    AlignmentOperation::Ins => {
254                        x_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[x[x_i]])));
255                        x_i += 1;
256
257                        inb_pretty.push('+');
258
259                        y_pretty.push('-');
260                    }
261                    AlignmentOperation::Xclip(len) => {
262                        for k in x.iter().take(len) {
263                            x_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[*k])));
264                            x_i += 1;
265
266                            inb_pretty.push(' ');
267
268                            y_pretty.push(' ')
269                        }
270                    }
271                    AlignmentOperation::Yclip(len) => {
272                        for k in y.iter().take(len) {
273                            y_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[*k])));
274                            y_i += 1;
275
276                            inb_pretty.push(' ');
277
278                            x_pretty.push(' ')
279                        }
280                    }
281                }
282            }
283
284            // If the alignment mode is one of the standard ones, the suffix clipping is
285            // implicit so we need to process it here
286            match self.mode {
287                AlignmentMode::Custom => {}
288                _ => {
289                    for k in x.iter().take(self.xlen).skip(x_i) {
290                        x_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[*k])));
291                        inb_pretty.push(' ');
292                        y_pretty.push(' ')
293                    }
294                    for k in y.iter().take(self.ylen).skip(y_i) {
295                        y_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[*k])));
296                        inb_pretty.push(' ');
297                        x_pretty.push(' ')
298                    }
299                }
300            }
301        }
302
303        let mut s = String::new();
304        let mut idx = 0;
305        use std::cmp::min;
306
307        assert_eq!(x_pretty.len(), inb_pretty.len());
308        assert_eq!(y_pretty.len(), inb_pretty.len());
309
310        let ml = x_pretty.len();
311
312        while idx < ml {
313            let rng = idx..min(idx + ncol, ml);
314            s.push_str(&x_pretty[rng.clone()]);
315            s.push('\n');
316
317            s.push_str(&inb_pretty[rng.clone()]);
318            s.push('\n');
319
320            s.push_str(&y_pretty[rng]);
321            s.push('\n');
322
323            s.push_str("\n\n");
324            idx += ncol;
325        }
326
327        s
328    }
329
330    /// Returns the optimal path in the alignment matrix
331    ///
332    /// # Example
333    ///
334    /// ```
335    /// use bio_types::alignment::{Alignment,AlignmentMode};
336    /// use bio_types::alignment::AlignmentOperation::*;
337    /// let alignment = Alignment {
338    ///     score: 5,
339    ///     xstart: 3,
340    ///     ystart: 0,
341    ///     xend: 9,
342    ///     yend: 10,
343    ///     ylen: 10,
344    ///     xlen: 10,
345    ///     operations: vec![Match, Match, Match, Subst, Ins, Ins, Del, Del],
346    ///     mode: AlignmentMode::Semiglobal,
347    /// };
348    /// assert_eq!(alignment.path(),[
349    ///     (4, 5, Match),
350    ///     (5, 6, Match),
351    ///     (6, 7, Match),
352    ///     (7, 8, Subst),
353    ///     (8, 8, Ins),
354    ///     (9, 8, Ins),
355    ///     (9, 9, Del),
356    ///     (9, 10, Del)])
357    /// ```
358    pub fn path(&self) -> Vec<(usize, usize, AlignmentOperation)> {
359        let mut path = Vec::new();
360
361        if !self.operations.is_empty() {
362            let last = match self.mode {
363                AlignmentMode::Custom => (self.xlen, self.ylen),
364                _ => (self.xend, self.yend),
365            };
366            let mut x_i = last.0;
367            let mut y_i = last.1;
368
369            let mut ops = self.operations.clone();
370            ops.reverse();
371
372            // Process the alignment.
373            for i in ops {
374                path.push((x_i, y_i, i));
375                match i {
376                    AlignmentOperation::Match => {
377                        x_i -= 1;
378                        y_i -= 1;
379                    }
380                    AlignmentOperation::Subst => {
381                        x_i -= 1;
382                        y_i -= 1;
383                    }
384                    AlignmentOperation::Del => {
385                        y_i -= 1;
386                    }
387                    AlignmentOperation::Ins => {
388                        x_i -= 1;
389                    }
390                    AlignmentOperation::Xclip(len) => {
391                        x_i -= len;
392                    }
393                    AlignmentOperation::Yclip(len) => {
394                        y_i -= len;
395                    }
396                }
397            }
398        }
399        path.reverse();
400        path
401    }
402
403    /// Filter out Xclip and Yclip operations from the list of operations. Useful
404    /// when invoking the standard modes.
405    pub fn filter_clip_operations(&mut self) {
406        use self::AlignmentOperation::{Del, Ins, Match, Subst};
407        self.operations
408            .retain(|x| (*x == Match || *x == Subst || *x == Ins || *x == Del));
409    }
410
411    /// Number of bases in reference sequence that are aligned
412    pub fn y_aln_len(&self) -> usize {
413        self.yend - self.ystart
414    }
415
416    /// Number of bases in query sequence that are aigned
417    pub fn x_aln_len(&self) -> usize {
418        self.xend - self.xstart
419    }
420}
421
422#[cfg(test)]
423mod tests {
424    use super::AlignmentOperation::*;
425    use super::*;
426
427    #[test]
428    fn test_cigar() {
429        let alignment = Alignment {
430            score: 5,
431            xstart: 3,
432            ystart: 0,
433            xend: 9,
434            yend: 10,
435            ylen: 10,
436            xlen: 10,
437            operations: vec![Match, Match, Match, Subst, Ins, Ins, Del, Del],
438            mode: AlignmentMode::Semiglobal,
439        };
440        assert_eq!(alignment.cigar(false), "3S3=1X2I2D1S");
441
442        let alignment = Alignment {
443            score: 5,
444            xstart: 0,
445            ystart: 5,
446            xend: 4,
447            yend: 10,
448            ylen: 10,
449            xlen: 5,
450            operations: vec![Yclip(5), Match, Subst, Subst, Ins, Del, Del, Xclip(1)],
451            mode: AlignmentMode::Custom,
452        };
453        assert_eq!(alignment.cigar(false), "1=2X1I2D1S");
454        assert_eq!(alignment.cigar(true), "1=2X1I2D1H");
455
456        let alignment = Alignment {
457            score: 5,
458            xstart: 0,
459            ystart: 5,
460            xend: 3,
461            yend: 8,
462            ylen: 10,
463            xlen: 3,
464            operations: vec![Yclip(5), Subst, Match, Subst, Yclip(2)],
465            mode: AlignmentMode::Custom,
466        };
467        assert_eq!(alignment.cigar(false), "1X1=1X");
468
469        let alignment = Alignment {
470            score: 5,
471            xstart: 0,
472            ystart: 5,
473            xend: 3,
474            yend: 8,
475            ylen: 10,
476            xlen: 3,
477            operations: vec![Subst, Match, Subst],
478            mode: AlignmentMode::Semiglobal,
479        };
480        assert_eq!(alignment.cigar(false), "1X1=1X");
481    }
482
483    #[test]
484    fn test_pretty() {
485        let alignment = Alignment {
486            score: 1,
487            xstart: 0,
488            ystart: 2,
489            xend: 3,
490            yend: 5,
491            ylen: 7,
492            xlen: 2,
493            operations: vec![Subst, Match, Match],
494            mode: AlignmentMode::Semiglobal,
495        };
496        let pretty = concat!("  GAT  \n", "  \\||  \n", "CTAATCC\n", "\n\n");
497        assert_eq!(alignment.pretty(b"GAT", b"CTAATCC", 100), pretty);
498        let alignment = Alignment {
499            score: 5,
500            xstart: 0,
501            ystart: 5,
502            xend: 4,
503            yend: 10,
504            ylen: 10,
505            xlen: 5,
506            operations: vec![Yclip(5), Match, Subst, Subst, Ins, Del, Del, Xclip(1)],
507            mode: AlignmentMode::Custom,
508        };
509        let pretty = concat!("     AAAA--A\n     |\\\\+xx \nTTTTTTTT-TT \n\n\n");
510        assert_eq!(alignment.pretty(b"AAAAA", b"TTTTTTTTTT", 100), pretty);
511    }
512
513    #[test]
514    fn test_path() {
515        let alignment = Alignment {
516            score: 5,
517            xstart: 3,
518            ystart: 0,
519            xend: 9,
520            yend: 10,
521            ylen: 10,
522            xlen: 10,
523            operations: vec![Match, Match, Match, Subst, Ins, Ins, Del, Del],
524            mode: AlignmentMode::Semiglobal,
525        };
526        assert_eq!(
527            alignment.path(),
528            [
529                (4, 5, Match),
530                (5, 6, Match),
531                (6, 7, Match),
532                (7, 8, Subst),
533                (8, 8, Ins),
534                (9, 8, Ins),
535                (9, 9, Del),
536                (9, 10, Del)
537            ]
538        )
539    }
540}