gix_diff/blob/
unified_diff.rs

1//! Facilities to produce the unified diff format.
2//!
3//! Originally based on <https://github.com/pascalkuthe/imara-diff/pull/14>.
4
/// Defines the size of the context printed before and after each change.
///
/// Similar to the `-U` option in git diff or gnu-diff. If the context overlaps
/// with previous or next change, the context gets reduced accordingly.
///
/// Use [`ContextSize::symmetrical()`] to create an instance; the [`Default`] is a
/// symmetrical context of 3 lines.
#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq, Ord, PartialOrd)]
pub struct ContextSize {
    /// The number of context lines printed both before and after each change.
    symmetrical: u32,
}
14
15impl Default for ContextSize {
16    fn default() -> Self {
17        ContextSize::symmetrical(3)
18    }
19}
20
21/// Instantiation
22impl ContextSize {
23    /// Create a symmetrical context with `n` lines before and after a changed hunk.
24    pub fn symmetrical(n: u32) -> Self {
25        ContextSize { symmetrical: n }
26    }
27}
28
/// Specify where to put a newline.
///
/// In both variants the contained string is the separator to insert, and in both variants
/// it is placed after each patch header.
#[derive(Debug, Copy, Clone)]
pub enum NewlineSeparator<'a> {
    /// Place the given newline separator, like `\n`, after each patch header as well as after each line.
    /// This is the right choice if tokens don't include newlines.
    AfterHeaderAndLine(&'a str),
    /// Place the given newline separator, like `\n`, only after each patch header or if a line doesn't
    /// already end with that separator.
    /// This is the right choice if tokens do include newlines.
    /// Note that diff-tokens *with* newlines may diff strangely at the end of files when lines have been appended,
    /// as it will make the last line look like it changed just because the whitespace at the end 'changed'.
    AfterHeaderAndWhenNeeded(&'a str),
}
41
/// A utility trait for use in [`UnifiedDiff`](super::UnifiedDiff).
///
/// An implementation receives each hunk through [`consume_hunk()`](ConsumeHunk::consume_hunk)
/// and produces its final output in [`finish()`](ConsumeHunk::finish).
pub trait ConsumeHunk {
    /// The item this instance produces after consuming all hunks.
    type Out;

    /// Consume a single `hunk` in unified diff format, that would be prefixed with `header`.
    /// Note that all newlines are added.
    ///
    /// `header` is the complete `@@ -<start>,<len> +<start>,<len> @@` line including its trailing
    /// newline separator, and `hunk` holds the lines of the hunk, each starting with a space, `-` or `+`.
    ///
    /// Note that the [`UnifiedDiff`](super::UnifiedDiff) sink will wrap its output in an [`std::io::Result`].
    /// After this method returned its first error, it will not be called anymore.
    ///
    /// The following is hunk-related information and the same that is used in the `header`.
    /// * `before_hunk_start` is the 1-based first line of this hunk in the old file.
    /// * `before_hunk_len` the amount of lines of this hunk in the old file.
    /// * `after_hunk_start` is the 1-based first line of this hunk in the new file.
    /// * `after_hunk_len` the amount of lines of this hunk in the new file.
    fn consume_hunk(
        &mut self,
        before_hunk_start: u32,
        before_hunk_len: u32,
        after_hunk_start: u32,
        after_hunk_len: u32,
        header: &str,
        hunk: &[u8],
    ) -> std::io::Result<()>;
    /// Called after the last hunk is consumed to produce an output.
    fn finish(self) -> Self::Out;
}
70
71pub(super) mod _impl {
72    use super::{ConsumeHunk, ContextSize, NewlineSeparator};
73    use bstr::{ByteSlice, ByteVec};
74    use imara_diff::{intern, Sink};
75    use intern::{InternedInput, Interner, Token};
76    use std::hash::Hash;
77    use std::io::ErrorKind;
78    use std::ops::Range;
79
    /// A [`Sink`] that creates a textual diff in the format typically output by git or `gnu-diff` if the `-u` option is used,
    /// and passes it in full to a consumer.
    pub struct UnifiedDiff<'a, T, D>
    where
        T: Hash + Eq + AsRef<[u8]>,
        D: ConsumeHunk,
    {
        /// The tokens of the old version of the input.
        before: &'a [Token],
        /// The tokens of the new version of the input.
        after: &'a [Token],
        /// Resolves tokens of `before` and `after` to the bytes they stand for.
        interner: &'a Interner<T>,

        /// The 0-based index into `before` up to which tokens were already handled.
        pos: u32,
        /// The 0-based index of the first line of the current hunk in `before`.
        before_hunk_start: u32,
        /// The 0-based index of the first line of the current hunk in `after`.
        after_hunk_start: u32,
        /// The amount of lines of the current hunk that belong to the old version.
        before_hunk_len: u32,
        /// The amount of lines of the current hunk that belong to the new version.
        after_hunk_len: u32,
        /// Symmetrical context before and after the changed hunk.
        ctx_size: u32,

        /// The lines of the current hunk as accumulated so far, cleared after each hunk was passed to `delegate`.
        buffer: Vec<u8>,
        /// A reusable buffer to format the header of each hunk into.
        header_buf: String,
        /// The consumer that receives each completed hunk.
        delegate: D,
        /// Where to place newline separators, and which separator to use.
        newline: NewlineSeparator<'a>,

        /// The error produced while flushing a hunk, if any. Once set, further changes are ignored.
        err: Option<std::io::Error>,
    }
106
107    impl<'a, T, D> UnifiedDiff<'a, T, D>
108    where
109        T: Hash + Eq + AsRef<[u8]>,
110        D: ConsumeHunk,
111    {
112        /// Create a new instance to create unified diff using the lines in `input`,
113        /// which also must be used when running the diff algorithm.
114        /// `context_size` is the amount of lines around each hunk which will be passed
115        ///to `consume_hunk`.
116        ///
117        /// `consume_hunk` is called for each hunk in unified-diff format, as created from each line separated by `newline_separator`.
118        pub fn new(
119            input: &'a InternedInput<T>,
120            consume_hunk: D,
121            newline_separator: NewlineSeparator<'a>,
122            context_size: ContextSize,
123        ) -> Self {
124            Self {
125                before_hunk_start: 0,
126                after_hunk_start: 0,
127                before_hunk_len: 0,
128                after_hunk_len: 0,
129                buffer: Vec::with_capacity(8),
130                header_buf: String::new(),
131                delegate: consume_hunk,
132                interner: &input.interner,
133                before: &input.before,
134                after: &input.after,
135                pos: 0,
136                ctx_size: context_size.symmetrical,
137                newline: newline_separator,
138
139                err: None,
140            }
141        }
142
143        fn print_tokens(&mut self, tokens: &[Token], prefix: char) {
144            for &token in tokens {
145                self.buffer.push_char(prefix);
146                let line = &self.interner[token];
147                self.buffer.push_str(line);
148                match self.newline {
149                    NewlineSeparator::AfterHeaderAndLine(nl) => {
150                        self.buffer.push_str(nl);
151                    }
152                    NewlineSeparator::AfterHeaderAndWhenNeeded(nl) => {
153                        if !line.as_ref().ends_with_str(nl) {
154                            self.buffer.push_str(nl);
155                        }
156                    }
157                }
158            }
159        }
160
161        fn flush(&mut self) -> std::io::Result<()> {
162            if self.before_hunk_len == 0 && self.after_hunk_len == 0 {
163                return Ok(());
164            }
165
166            let end = (self.pos + self.ctx_size).min(self.before.len() as u32);
167            self.update_pos(end, end);
168
169            self.header_buf.clear();
170
171            std::fmt::Write::write_fmt(
172                &mut self.header_buf,
173                format_args!(
174                    "@@ -{},{} +{},{} @@{nl}",
175                    self.before_hunk_start + 1,
176                    self.before_hunk_len,
177                    self.after_hunk_start + 1,
178                    self.after_hunk_len,
179                    nl = match self.newline {
180                        NewlineSeparator::AfterHeaderAndLine(nl) | NewlineSeparator::AfterHeaderAndWhenNeeded(nl) => {
181                            nl
182                        }
183                    }
184                ),
185            )
186            .map_err(|err| std::io::Error::new(ErrorKind::Other, err))?;
187            self.delegate.consume_hunk(
188                self.before_hunk_start + 1,
189                self.before_hunk_len,
190                self.after_hunk_start + 1,
191                self.after_hunk_len,
192                &self.header_buf,
193                &self.buffer,
194            )?;
195            self.buffer.clear();
196            self.before_hunk_len = 0;
197            self.after_hunk_len = 0;
198            Ok(())
199        }
200
201        fn update_pos(&mut self, print_to: u32, move_to: u32) {
202            self.print_tokens(&self.before[self.pos as usize..print_to as usize], ' ');
203            let len = print_to - self.pos;
204            self.pos = move_to;
205            self.before_hunk_len += len;
206            self.after_hunk_len += len;
207        }
208    }
209
210    impl<T, D> Sink for UnifiedDiff<'_, T, D>
211    where
212        T: Hash + Eq + AsRef<[u8]>,
213        D: ConsumeHunk,
214    {
215        type Out = std::io::Result<D::Out>;
216
217        fn process_change(&mut self, before: Range<u32>, after: Range<u32>) {
218            if self.err.is_some() {
219                return;
220            }
221            if ((self.pos == 0) && (before.start - self.pos > self.ctx_size))
222                || (before.start - self.pos > 2 * self.ctx_size)
223            {
224                if let Err(err) = self.flush() {
225                    self.err = Some(err);
226                    return;
227                }
228                self.pos = before.start - self.ctx_size;
229                self.before_hunk_start = self.pos;
230                self.after_hunk_start = after.start - self.ctx_size;
231            }
232            self.update_pos(before.start, before.end);
233            self.before_hunk_len += before.end - before.start;
234            self.after_hunk_len += after.end - after.start;
235            self.print_tokens(&self.before[before.start as usize..before.end as usize], '-');
236            self.print_tokens(&self.after[after.start as usize..after.end as usize], '+');
237        }
238
239        fn finish(mut self) -> Self::Out {
240            if let Err(err) = self.flush() {
241                self.err = Some(err);
242            }
243            if let Some(err) = self.err {
244                return Err(err);
245            }
246            Ok(self.delegate.finish())
247        }
248    }
249
250    /// An implementation that fails if the input isn't UTF-8.
251    impl ConsumeHunk for String {
252        type Out = Self;
253
254        fn consume_hunk(&mut self, _: u32, _: u32, _: u32, _: u32, header: &str, hunk: &[u8]) -> std::io::Result<()> {
255            self.push_str(header);
256            self.push_str(
257                hunk.to_str()
258                    .map_err(|err| std::io::Error::new(ErrorKind::Other, err))?,
259            );
260            Ok(())
261        }
262
263        fn finish(self) -> Self::Out {
264            self
265        }
266    }
267
268    /// An implementation that writes hunks into a byte buffer.
269    impl ConsumeHunk for Vec<u8> {
270        type Out = Self;
271
272        fn consume_hunk(&mut self, _: u32, _: u32, _: u32, _: u32, header: &str, hunk: &[u8]) -> std::io::Result<()> {
273            self.push_str(header);
274            self.push_str(hunk);
275            Ok(())
276        }
277
278        fn finish(self) -> Self::Out {
279            self
280        }
281    }
282}