bio_types/alignment.rs
1// Copyright 2014-2015 Johannes Köster, Vadim Nazarov, Patrick Marks
2// Licensed under the MIT license (http://opensource.org/licenses/MIT)
3// This file may not be copied, modified, or distributed
4// except according to those terms.
5
6//! Types for representing pairwise sequence alignments
7
8#[cfg(feature = "clap")]
9use clap::ValueEnum;
10#[cfg(feature = "serde")]
11use serde::{Deserialize, Serialize};
12
13pub type TextSlice<'a> = &'a [u8];
14
15/// Alignment operations supported are match, substitution, insertion, deletion
16/// and clipping. Clipping is a special boundary condition where you are allowed
17/// to clip off the beginning/end of the sequence for a fixed clip penalty. The
18/// clip penalty could be different for the two sequences x and y, and the
19/// clipping operations on both are distinguishable (Xclip and Yclip). The usize
20/// value associated with the clipping operations are the lengths clipped. In case
21/// of standard modes like Global, Semi-Global and Local alignment, the clip operations
22/// are filtered out
23#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
24#[derive(Eq, PartialEq, Debug, Copy, Clone, Hash)]
25pub enum AlignmentOperation {
26 Match,
27 Subst,
28 Del,
29 Ins,
30 Xclip(usize),
31 Yclip(usize),
32}
33
34/// The modes of alignment supported by the aligner include standard modes such as
35/// Global, Semi-Global and Local alignment. In addition to this, user can also invoke
36/// the custom mode. In the custom mode, users can explicitly specify the clipping penalties
37/// for prefix and suffix of strings 'x' and 'y' independently. Under the hood the standard
38/// modes are implemented as special cases of the custom mode with the clipping penalties
39/// appropriately set.
40///
41/// The default alignment mode is Global.
42#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
43#[cfg_attr(feature = "clap", derive(ValueEnum))]
44#[derive(Debug, PartialEq, Eq, Copy, Clone)]
45pub enum AlignmentMode {
46 Local,
47 Semiglobal,
48 Global,
49 Custom,
50}
51
52impl Default for AlignmentMode {
53 fn default() -> Self {
54 AlignmentMode::Global
55 }
56}
57
58/// We consider alignment between two sequences x and y. x is the query or read sequence
59/// and y is the reference or template sequence. An alignment, consisting of a score,
60/// the start and end position of the alignment on sequence x and sequence y, the
61/// lengths of sequences x and y, and the alignment edit operations. The start position
62/// and end position of the alignment does not include the clipped regions. The length
63/// of clipped regions are already encapsulated in the Alignment Operation.
64#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
65#[derive(Debug, Eq, PartialEq, Clone, Default)]
66pub struct Alignment {
67 /// Smith-Waterman alignment score
68 pub score: i32,
69
70 /// Start position of alignment in reference
71 pub ystart: usize,
72
73 /// Start position of alignment in query
74 pub xstart: usize,
75
76 /// End position of alignment in reference
77 pub yend: usize,
78
79 /// End position of alignment in query
80 pub xend: usize,
81
82 /// Length of the reference sequence
83 pub ylen: usize,
84
85 /// Length of the query sequence
86 pub xlen: usize,
87
88 /// Vector of alignment operations
89 pub operations: Vec<AlignmentOperation>,
90 pub mode: AlignmentMode,
91}
92
93impl Alignment {
94 /// Calculate the cigar string from the alignment struct. x is the target string
95 ///
96 /// # Example
97 ///
98 /// ```
99 /// use bio_types::alignment::{Alignment,AlignmentMode};
100 /// use bio_types::alignment::AlignmentOperation::{Match, Subst, Ins, Del};
101 /// let alignment = Alignment {
102 /// score: 5,
103 /// xstart: 3,
104 /// ystart: 0,
105 /// xend: 9,
106 /// yend: 10,
107 /// ylen: 10,
108 /// xlen: 10,
109 /// operations: vec![Match, Match, Match, Subst, Ins, Ins, Del, Del],
110 /// mode: AlignmentMode::Semiglobal
111 /// };
112 /// assert_eq!(alignment.cigar(false), "3S3=1X2I2D1S");
113 /// ```
114 pub fn cigar(&self, hard_clip: bool) -> String {
115 match self.mode {
116 AlignmentMode::Global => panic!(" Cigar fn not supported for Global Alignment mode"),
117 AlignmentMode::Local => panic!(" Cigar fn not supported for Local Alignment mode"),
118 _ => {}
119 }
120
121 let clip_str = if hard_clip { "H" } else { "S" };
122
123 let add_op = |op: AlignmentOperation, k, cigar: &mut String| match op {
124 AlignmentOperation::Match => cigar.push_str(&format!("{}{}", k, "=")),
125 AlignmentOperation::Subst => cigar.push_str(&format!("{}{}", k, "X")),
126 AlignmentOperation::Del => cigar.push_str(&format!("{}{}", k, "D")),
127 AlignmentOperation::Ins => cigar.push_str(&format!("{}{}", k, "I")),
128 _ => {}
129 };
130
131 let mut cigar = "".to_owned();
132 if self.operations.is_empty() {
133 return cigar;
134 }
135
136 let mut last = self.operations[0];
137 if self.xstart > 0 {
138 cigar.push_str(&format!("{}{}", self.xstart, clip_str))
139 }
140 let mut k = 1;
141 for &op in self.operations[1..].iter() {
142 if op == last {
143 k += 1;
144 } else {
145 add_op(last, k, &mut cigar);
146 k = 1;
147 }
148 last = op;
149 }
150 add_op(last, k, &mut cigar);
151 if self.xlen > self.xend {
152 cigar.push_str(&format!("{}{}", self.xlen - self.xend, clip_str))
153 }
154
155 cigar
156 }
157
158 /// Return the pretty formatted alignment as a String. The string
159 /// contains sets of 3 lines of length 100. First line is for the
160 /// sequence x, second line is for the alignment operation and the
161 /// the third line is for the sequence y. A '-' in the sequence
162 /// indicates a blank (insertion/deletion). The operations follow
163 /// the following convention: '|' for a match, '\\' (a single backslash) for a mismatch,
164 /// '+' for an insertion, 'x' for a deletion and ' ' for clipping
165 ///
166 /// # Example
167 ///
168 /// If we align the strings "CCGTCCGGCAAGGG" and "AAAAACCGTTGACGGCCAA"
169 /// in various modes, we will get the following output:
170 ///
171 /// Semiglobal:
172 /// ```c
173 /// CCGTCCGGCAAGGG
174 /// ||||++++\\|\||
175 /// AAAAACCGT----TGACGGCCAA
176 /// ```
177 ///
178 /// Local:
179 /// ```c
180 /// CCGTCCGGCAAGGG
181 /// ||||
182 /// AAAAACCGT TGACGGCCAA
183 /// ```
184 ///
185 /// Global:
186 /// ```c
187 /// -----CCGT--CCGGCAAGGG
188 /// xxxxx||||xx\||||\|++\
189 /// AAAAACCGTTGACGGCCA--A
190 /// ```
191 ///
192 pub fn pretty(&self, x: TextSlice, y: TextSlice, ncol: usize) -> String {
193 let mut x_pretty = String::new();
194 let mut y_pretty = String::new();
195 let mut inb_pretty = String::new();
196
197 if !self.operations.is_empty() {
198 let mut x_i: usize;
199 let mut y_i: usize;
200
201 // If the alignment mode is one of the standard ones, the prefix clipping is
202 // implicit so we need to process it here
203 match self.mode {
204 AlignmentMode::Custom => {
205 x_i = 0;
206 y_i = 0;
207 }
208 _ => {
209 x_i = self.xstart;
210 y_i = self.ystart;
211 for k in x.iter().take(self.xstart) {
212 x_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[*k])));
213 inb_pretty.push(' ');
214 y_pretty.push(' ')
215 }
216 for k in y.iter().take(self.ystart) {
217 y_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[*k])));
218 inb_pretty.push(' ');
219 x_pretty.push(' ')
220 }
221 }
222 }
223
224 // Process the alignment.
225 for i in 0..self.operations.len() {
226 match self.operations[i] {
227 AlignmentOperation::Match => {
228 x_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[x[x_i]])));
229 x_i += 1;
230
231 inb_pretty.push('|');
232
233 y_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[y[y_i]])));
234 y_i += 1;
235 }
236 AlignmentOperation::Subst => {
237 x_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[x[x_i]])));
238 x_i += 1;
239
240 inb_pretty.push('\\');
241
242 y_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[y[y_i]])));
243 y_i += 1;
244 }
245 AlignmentOperation::Del => {
246 x_pretty.push('-');
247
248 inb_pretty.push('x');
249
250 y_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[y[y_i]])));
251 y_i += 1;
252 }
253 AlignmentOperation::Ins => {
254 x_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[x[x_i]])));
255 x_i += 1;
256
257 inb_pretty.push('+');
258
259 y_pretty.push('-');
260 }
261 AlignmentOperation::Xclip(len) => {
262 for k in x.iter().take(len) {
263 x_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[*k])));
264 x_i += 1;
265
266 inb_pretty.push(' ');
267
268 y_pretty.push(' ')
269 }
270 }
271 AlignmentOperation::Yclip(len) => {
272 for k in y.iter().take(len) {
273 y_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[*k])));
274 y_i += 1;
275
276 inb_pretty.push(' ');
277
278 x_pretty.push(' ')
279 }
280 }
281 }
282 }
283
284 // If the alignment mode is one of the standard ones, the suffix clipping is
285 // implicit so we need to process it here
286 match self.mode {
287 AlignmentMode::Custom => {}
288 _ => {
289 for k in x.iter().take(self.xlen).skip(x_i) {
290 x_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[*k])));
291 inb_pretty.push(' ');
292 y_pretty.push(' ')
293 }
294 for k in y.iter().take(self.ylen).skip(y_i) {
295 y_pretty.push_str(&format!("{}", String::from_utf8_lossy(&[*k])));
296 inb_pretty.push(' ');
297 x_pretty.push(' ')
298 }
299 }
300 }
301 }
302
303 let mut s = String::new();
304 let mut idx = 0;
305 use std::cmp::min;
306
307 assert_eq!(x_pretty.len(), inb_pretty.len());
308 assert_eq!(y_pretty.len(), inb_pretty.len());
309
310 let ml = x_pretty.len();
311
312 while idx < ml {
313 let rng = idx..min(idx + ncol, ml);
314 s.push_str(&x_pretty[rng.clone()]);
315 s.push('\n');
316
317 s.push_str(&inb_pretty[rng.clone()]);
318 s.push('\n');
319
320 s.push_str(&y_pretty[rng]);
321 s.push('\n');
322
323 s.push_str("\n\n");
324 idx += ncol;
325 }
326
327 s
328 }
329
330 /// Returns the optimal path in the alignment matrix
331 ///
332 /// # Example
333 ///
334 /// ```
335 /// use bio_types::alignment::{Alignment,AlignmentMode};
336 /// use bio_types::alignment::AlignmentOperation::*;
337 /// let alignment = Alignment {
338 /// score: 5,
339 /// xstart: 3,
340 /// ystart: 0,
341 /// xend: 9,
342 /// yend: 10,
343 /// ylen: 10,
344 /// xlen: 10,
345 /// operations: vec![Match, Match, Match, Subst, Ins, Ins, Del, Del],
346 /// mode: AlignmentMode::Semiglobal,
347 /// };
348 /// assert_eq!(alignment.path(),[
349 /// (4, 5, Match),
350 /// (5, 6, Match),
351 /// (6, 7, Match),
352 /// (7, 8, Subst),
353 /// (8, 8, Ins),
354 /// (9, 8, Ins),
355 /// (9, 9, Del),
356 /// (9, 10, Del)])
357 /// ```
358 pub fn path(&self) -> Vec<(usize, usize, AlignmentOperation)> {
359 let mut path = Vec::new();
360
361 if !self.operations.is_empty() {
362 let last = match self.mode {
363 AlignmentMode::Custom => (self.xlen, self.ylen),
364 _ => (self.xend, self.yend),
365 };
366 let mut x_i = last.0;
367 let mut y_i = last.1;
368
369 let mut ops = self.operations.clone();
370 ops.reverse();
371
372 // Process the alignment.
373 for i in ops {
374 path.push((x_i, y_i, i));
375 match i {
376 AlignmentOperation::Match => {
377 x_i -= 1;
378 y_i -= 1;
379 }
380 AlignmentOperation::Subst => {
381 x_i -= 1;
382 y_i -= 1;
383 }
384 AlignmentOperation::Del => {
385 y_i -= 1;
386 }
387 AlignmentOperation::Ins => {
388 x_i -= 1;
389 }
390 AlignmentOperation::Xclip(len) => {
391 x_i -= len;
392 }
393 AlignmentOperation::Yclip(len) => {
394 y_i -= len;
395 }
396 }
397 }
398 }
399 path.reverse();
400 path
401 }
402
403 /// Filter out Xclip and Yclip operations from the list of operations. Useful
404 /// when invoking the standard modes.
405 pub fn filter_clip_operations(&mut self) {
406 use self::AlignmentOperation::{Del, Ins, Match, Subst};
407 self.operations
408 .retain(|x| (*x == Match || *x == Subst || *x == Ins || *x == Del));
409 }
410
411 /// Number of bases in reference sequence that are aligned
412 pub fn y_aln_len(&self) -> usize {
413 self.yend - self.ystart
414 }
415
416 /// Number of bases in query sequence that are aigned
417 pub fn x_aln_len(&self) -> usize {
418 self.xend - self.xstart
419 }
420}
421
422#[cfg(test)]
423mod tests {
424 use super::AlignmentOperation::*;
425 use super::*;
426
427 #[test]
428 fn test_cigar() {
429 let alignment = Alignment {
430 score: 5,
431 xstart: 3,
432 ystart: 0,
433 xend: 9,
434 yend: 10,
435 ylen: 10,
436 xlen: 10,
437 operations: vec![Match, Match, Match, Subst, Ins, Ins, Del, Del],
438 mode: AlignmentMode::Semiglobal,
439 };
440 assert_eq!(alignment.cigar(false), "3S3=1X2I2D1S");
441
442 let alignment = Alignment {
443 score: 5,
444 xstart: 0,
445 ystart: 5,
446 xend: 4,
447 yend: 10,
448 ylen: 10,
449 xlen: 5,
450 operations: vec![Yclip(5), Match, Subst, Subst, Ins, Del, Del, Xclip(1)],
451 mode: AlignmentMode::Custom,
452 };
453 assert_eq!(alignment.cigar(false), "1=2X1I2D1S");
454 assert_eq!(alignment.cigar(true), "1=2X1I2D1H");
455
456 let alignment = Alignment {
457 score: 5,
458 xstart: 0,
459 ystart: 5,
460 xend: 3,
461 yend: 8,
462 ylen: 10,
463 xlen: 3,
464 operations: vec![Yclip(5), Subst, Match, Subst, Yclip(2)],
465 mode: AlignmentMode::Custom,
466 };
467 assert_eq!(alignment.cigar(false), "1X1=1X");
468
469 let alignment = Alignment {
470 score: 5,
471 xstart: 0,
472 ystart: 5,
473 xend: 3,
474 yend: 8,
475 ylen: 10,
476 xlen: 3,
477 operations: vec![Subst, Match, Subst],
478 mode: AlignmentMode::Semiglobal,
479 };
480 assert_eq!(alignment.cigar(false), "1X1=1X");
481 }
482
483 #[test]
484 fn test_pretty() {
485 let alignment = Alignment {
486 score: 1,
487 xstart: 0,
488 ystart: 2,
489 xend: 3,
490 yend: 5,
491 ylen: 7,
492 xlen: 2,
493 operations: vec![Subst, Match, Match],
494 mode: AlignmentMode::Semiglobal,
495 };
496 let pretty = concat!(" GAT \n", " \\|| \n", "CTAATCC\n", "\n\n");
497 assert_eq!(alignment.pretty(b"GAT", b"CTAATCC", 100), pretty);
498 let alignment = Alignment {
499 score: 5,
500 xstart: 0,
501 ystart: 5,
502 xend: 4,
503 yend: 10,
504 ylen: 10,
505 xlen: 5,
506 operations: vec![Yclip(5), Match, Subst, Subst, Ins, Del, Del, Xclip(1)],
507 mode: AlignmentMode::Custom,
508 };
509 let pretty = concat!(" AAAA--A\n |\\\\+xx \nTTTTTTTT-TT \n\n\n");
510 assert_eq!(alignment.pretty(b"AAAAA", b"TTTTTTTTTT", 100), pretty);
511 }
512
513 #[test]
514 fn test_path() {
515 let alignment = Alignment {
516 score: 5,
517 xstart: 3,
518 ystart: 0,
519 xend: 9,
520 yend: 10,
521 ylen: 10,
522 xlen: 10,
523 operations: vec![Match, Match, Match, Subst, Ins, Ins, Del, Del],
524 mode: AlignmentMode::Semiglobal,
525 };
526 assert_eq!(
527 alignment.path(),
528 [
529 (4, 5, Match),
530 (5, 6, Match),
531 (6, 7, Match),
532 (7, 8, Subst),
533 (8, 8, Ins),
534 (9, 8, Ins),
535 (9, 9, Del),
536 (9, 10, Del)
537 ]
538 )
539 }
540}