imara_diff/lib.rs
1//! Imara-diff is a solid (imara in swahili) diff library for rust.
2//! Solid refers to the fact that imara-diff provides very good runtime performance even
3//! in pathologic cases so that your application never appears to freeze while waiting on a diff.
4//! The performance improvements are achieved using battle tested heuristics used in gnu-diff and git
5//! that are known to yield fast runtime and performance.
6//!
7//! Imara-diff is also designed to be flexible so that it can be used with arbitrary collections and
8//! not just lists and strings and even allows reusing large parts of the computation when
9//! comparing the same file to multiple different files.
10//!
11//! Imara-diff provides two diff algorithms:
12//!
13//! * The linear-space variant of the well known [**myer** algorithm](http://www.xmailserver.org/diff2.pdf)
14//! * The **Histogram** algorithm which variant of the patience diff algorithm.
15//!
16//! Myers algorithm has been enhanced with preprocessing and multiple heuristics to ensure fast runtime in pathological
17//! cases to avoid quadratic time complexity and closely matches the behaviour of gnu-diff and git.
18//! The Histogram algorithm was originally ported from git but has been heavily optimized.
19//! The **Histogram algorithm outperforms Myers diff** by 10% - 100% across a **wide variety of workloads**.
20//!
21//! Imara-diffs algorithms have been benchmarked over a wide variety of real-world code.
22//! For example while comparing multiple different linux kernel it performs up to 30 times better than the `similar` crate:
23#![cfg_attr(doc, doc=concat!("<img width=\"600\" class=\"figure\" src=\"data:image/svg+xml;base64,", include_str!("../plots/linux_comparison.svg.base64"), "\"></img>"))]
24//!
25//! # Api Overview
26//!
27//! Imara-diff provides the [`UnifiedDiffBuilder`](crate::UnifiedDiffBuilder) for building
28//! a human-readable diff similar to the output of `git diff` or `diff -u`.
29//! This makes building a tool similar to gnu diff easy:
30//!
31//! ```
32//! use imara_diff::intern::InternedInput;
33//! use imara_diff::{diff, Algorithm, UnifiedDiffBuilder};
34//!
35//! let before = r#"fn foo() -> Bar {
36//! let mut foo = 2;
37//! foo *= 50;
38//! println!("hello world")
39//! }"#;
40//!
41//! let after = r#"// lorem ipsum
42//! fn foo() -> Bar {
43//! let mut foo = 2;
44//! foo *= 50;
45//! println!("hello world");
46//! println!("{foo}");
47//! }
48//! // foo
49//! "#;
50//!
51//! let input = InternedInput::new(before, after);
52//! let diff = diff(Algorithm::Histogram, &input, UnifiedDiffBuilder::new(&input));
53//! assert_eq!(
54//! diff,
55//! r#"@@ -1,5 +1,8 @@
56//! +// lorem ipsum
57//! fn foo() -> Bar {
58//! let mut foo = 2;
59//! foo *= 50;
60//! - println!("hello world")
61//! + println!("hello world");
62//! + println!("{foo}");
63//! }
64//! +// foo
65//! "#
66//! );
67//! ```
68//!
69//! If you want to process the diff in some way you can provide your own implementation of [`Sink`](crate::sink::Sink).
70//! For closures [`Sink`](crate::sink::Sink) is already implemented, so simple [`Sink`]s can be easily added:
71//!
72//! ```
73//! use std::ops::Range;
74//!
75//! use imara_diff::intern::InternedInput;
76//! use imara_diff::{diff, Algorithm, UnifiedDiffBuilder};
77//!
78//! let before = r#"fn foo() -> Bar {
79//! let mut foo = 2;
80//! foo *= 50;
81//! println!("hello world")
82//! }"#;
83//!
84//! let after = r#"// lorem ipsum
85//! fn foo() -> Bar {
86//! let mut foo = 2;
87//! foo *= 50;
88//! println!("hello world");
89//! println!("{foo}");
90//! }
91//! // foo
92//! "#;
93//!
94//! let mut insertions = Vec::new();
95//! let mut removals = Vec::new();
96//! let mut replacements = Vec::new();
97//!
98//! let input = InternedInput::new(before, after);
99//! let sink = |before: Range<u32>, after: Range<u32>| {
100//! let hunk_before: Vec<_> = input.before[before.start as usize..before.end as usize]
101//! .iter()
102//! .map(|&line| input.interner[line])
103//! .collect();
104//! let hunk_after: Vec<_> = input.after[after.start as usize..after.end as usize]
105//! .iter()
106//! .map(|&line| input.interner[line])
107//! .collect();
108//! if hunk_after.is_empty() {
109//! removals.push(hunk_before)
110//! } else if hunk_before.is_empty() {
111//! insertions.push(hunk_after)
112//! } else {
113//! replacements.push((hunk_before, hunk_after))
114//! }
115//! };
116//! let diff = diff(Algorithm::Histogram, &input, sink);
117//! assert_eq!(&insertions, &[vec!["// lorem ipsum"], vec!["// foo"]]);
118//! assert!(removals.is_empty());
119//! assert_eq!(
120//! &replacements,
121//! &[(
122//! vec![" println!(\"hello world\")"],
123//! vec![" println!(\"hello world\");", " println!(\"{foo}\");"]
124//! )]
125//! );
126//! ```
127//!
128//! For `&str` and `&[u8]` imara-diff will compute a line diff by default.
129//! To perform diffs of different tokenizations and collections you can implement the [`TokenSource`](crate::intern::TokenSource) trait.
130//! For example the imara-diff provides an alternative tokenziser for line-diffs that includes the line terminator in the line:
131//!
132//! ```
133//! use imara_diff::intern::InternedInput;
134//! use imara_diff::sink::Counter;
135//! use imara_diff::sources::lines_with_terminator;
136//! use imara_diff::{diff, Algorithm, UnifiedDiffBuilder};
137//!
138//! let before = "foo";
139//! let after = "foo\n";
140//!
141//! let input = InternedInput::new(before, after);
142//! let changes = diff(Algorithm::Histogram, &input, Counter::default());
143//! assert_eq!(changes.insertions, 0);
144//! assert_eq!(changes.removals, 0);
145//!
146//! let input = InternedInput::new(lines_with_terminator(before), lines_with_terminator(after));
147//! let changes = diff(Algorithm::Histogram, &input, Counter::default());
148//! assert_eq!(changes.insertions, 1);
149//! assert_eq!(changes.removals, 1);
150//! ```
151
152#[cfg(feature = "unified_diff")]
153pub use unified_diff::UnifiedDiffBuilder;
154
155use crate::intern::{InternedInput, Token, TokenSource};
156pub use crate::sink::Sink;
157mod histogram;
158pub mod intern;
159mod myers;
160pub mod sink;
161pub mod sources;
162#[cfg(feature = "unified_diff")]
163mod unified_diff;
164mod util;
165
166#[cfg(test)]
167mod tests;
168
169/// `imara-diff` supports multiple different algorithms
170/// for computing an edit sequence.
171/// These algorithms have different performance and all produce different output.
172#[derive(Debug, PartialEq, Eq, Clone, Copy, Default)]
173pub enum Algorithm {
174 /// A variation of the [`patience` diff algorithm described by Bram Cohen's blog post](https://bramcohen.livejournal.com/73318.html)
175 /// that uses a histogram to find the least common LCS.
176 /// Just like the `patience` diff algorithm, this algorithm usually produces
177 /// more human readable output then myers algorithm.
178 /// However compared to the `patience` diff algorithm (which is slower then myers algorithm),
179 /// the Histogram algorithm performs much better.
180 ///
181 /// The implementation here was originally ported from `git` but has been significantly
182 /// modified to improve performance.
183 /// As a result it consistently **performs better then myers algorithm** (5%-100%) over
184 /// a wide variety of test data. For example a benchmark of diffing linux kernel commits is shown below:
185 #[cfg_attr(doc, doc=concat!("<img width=\"600\" class=\"figure\" src=\"data:image/svg+xml;base64,", include_str!("../plots/linux_speedup.svg.base64"), "\"></img>"))]
186 ///
187 /// For pathological subsequences that only contain highly repeating tokens (64+ occurrences)
188 /// the algorithm falls back on Myers algorithm (with heuristics) to avoid quadratic behavior.
189 ///
190 /// Compared to Myers algorithm, the Histogram diff algorithm is more focused on providing
191 /// human readable diffs instead of minimal diffs. In practice this means that the edit-sequences
192 /// produced by the histogram diff are often longer then those produced by Myers algorithm.
193 ///
194 /// The heuristic used by the histogram diff does not work well for inputs with small (often repeated)
195 /// tokens. For example **character diffs do not work well** as most (english) text is madeup of
196 /// a fairly small set of characters. The `Histogram` algorithm will automatically these cases and
197 /// fallback to Myers algorithm. However this detection has a nontrivial overhead, so
198 /// if its known upfront that the sort of tokens is very small `Myers` algorithm should
199 /// be used instead.
200 #[default]
201 Histogram,
202 /// An implementation of the linear space variant of
203 /// [Myers `O((N+M)D)` algorithm](http://www.xmailserver.org/diff2.pdf).
204 /// The algorithm is enhanced with preprocessing that removes
205 /// tokens that don't occur in the other file at all.
206 /// Furthermore two heuristics to the middle snake search are implemented
207 /// that ensure reasonable runtime (mostly linear time complexity) even for large files.
208 ///
209 /// Due to the divide and conquer nature of the algorithm
210 /// the edit sequenced produced are still fairly small even when the middle snake
211 /// search is aborted by a heuristic.
212 /// However, the produced edit sequences are not guaranteed to be fully minimal.
213 /// If that property is vital to you, use the `MyersMinimal` algorithm instead.
214 ///
215 /// The implementation (including the preprocessing) are mostly
216 /// ported from `git` and `gnu-diff` where Myers algorithm is used
217 /// as the default diff algorithm.
218 /// Therefore the used heuristics have been heavily battle tested and
219 /// are known to behave well over a large variety of inputs
220 Myers,
221 /// Same as `Myers` but the early abort heuristics are disabled to guarantee
222 /// a minimal edit sequence.
223 /// This can mean significant slowdown in pathological cases.
224 MyersMinimal,
225}
226
227impl Algorithm {
228 #[cfg(test)]
229 const ALL: [Self; 2] = [Algorithm::Histogram, Algorithm::Myers];
230}
231
232/// Computes an edit-script that transforms `input.before` into `input.after` using
233/// the specified `algorithm`
234/// The edit-script is passed to `sink.process_change` while it is produced.
235pub fn diff<S: Sink, T>(algorithm: Algorithm, input: &InternedInput<T>, sink: S) -> S::Out {
236 diff_with_tokens(
237 algorithm,
238 &input.before,
239 &input.after,
240 input.interner.num_tokens(),
241 sink,
242 )
243}
244
245/// Computes an edit-script that transforms `before` into `after` using
246/// the specified `algorithm`
247/// The edit-script is passed to `sink.process_change` while it is produced.
248pub fn diff_with_tokens<S: Sink>(
249 algorithm: Algorithm,
250 before: &[Token],
251 after: &[Token],
252 num_tokens: u32,
253 sink: S,
254) -> S::Out {
255 assert!(
256 before.len() < i32::MAX as usize,
257 "imara-diff only supports up to {} tokens",
258 i32::MAX
259 );
260 assert!(
261 after.len() < i32::MAX as usize,
262 "imara-diff only supports up to {} tokens",
263 i32::MAX
264 );
265 match algorithm {
266 Algorithm::Histogram => histogram::diff(before, after, num_tokens, sink),
267 Algorithm::Myers => myers::diff(before, after, num_tokens, sink, false),
268 Algorithm::MyersMinimal => myers::diff(before, after, num_tokens, sink, true),
269 }
270}