gix_diff/blob/
pipeline.rs

1use std::{
2    io::{Read, Write},
3    path::{Path, PathBuf},
4    process::{Command, Stdio},
5};
6
7use bstr::{BStr, ByteSlice};
8use gix_filter::{
9    driver::apply::{Delay, MaybeDelayed},
10    pipeline::convert::{ToGitOutcome, ToWorktreeOutcome},
11};
12use gix_object::tree::EntryKind;
13
14use crate::blob::{Driver, Pipeline, ResourceKind};
15
/// Optional worktree directories from which the two sides of a diff can be read directly.
#[derive(Default, Debug, Clone)]
pub struct WorktreeRoots {
    /// The directory holding the previous version of resources, e.g. the source of a rewrite, rename or copy.
    pub old_root: Option<PathBuf>,
    /// The directory holding the new version of resources, e.g. the destination of a rewrite, rename or copy.
    pub new_root: Option<PathBuf>,
}
24
25/// Access
26impl WorktreeRoots {
27    /// Return the root path for the given `kind`
28    pub fn by_kind(&self, kind: ResourceKind) -> Option<&Path> {
29        match kind {
30            ResourceKind::OldOrSource => self.old_root.as_deref(),
31            ResourceKind::NewOrDestination => self.new_root.as_deref(),
32        }
33    }
34
35    /// Return `true` if all worktree roots are unset.
36    pub fn is_unset(&self) -> bool {
37        self.new_root.is_none() && self.old_root.is_none()
38    }
39}
40
/// Describes where the diffable data of an [Outcome] can be found, or why there is none.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Data {
    /// The diffable data was placed into the buffer handed to [`Pipeline::convert_to_diffable()`].
    Buffer,
    /// The resource is considered binary, or exceeds the big-file threshold, so only its size is provided.
    ///
    /// A resource in this state cannot be diffed.
    Binary {
        /// The size of the object before any filtering, or the size of the file as found on disk.
        ///
        /// Note that the size does not always describe the same 'state' of the content: sometimes
        /// it is the size of the blob as stored in git, sometimes the size of the file in the worktree.
        size: u64,
    },
}
59
60/// The outcome returned by [Pipeline::convert_to_diffable()](super::Pipeline::convert_to_diffable()).
61#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug)]
62pub struct Outcome {
63    /// If available, an index into the `drivers` field to access more diff-related information of the driver for items
64    /// at the given path, as previously determined by git-attributes.
65    ///
66    /// Note that drivers are queried even if there is no object available.
67    pub driver_index: Option<usize>,
68    /// The data itself, suitable for diffing, and if the object or worktree item is present at all.
69    pub data: Option<Data>,
70}
71
72/// Options for use in a [`Pipeline`].
73#[derive(Default, Clone, Copy, PartialEq, Eq, Debug, Hash, Ord, PartialOrd)]
74pub struct Options {
75    /// The amount of bytes that an object has to reach before being treated as binary.
76    /// These objects will not be queried, nor will their data be processed in any way.
77    /// If `0`, no file is ever considered binary due to their size.
78    ///
79    /// Note that for files stored in `git`, what counts is their stored, decompressed size,
80    /// thus `git-lfs` files would typically not be considered binary unless one explicitly sets
81    /// them
82    pub large_file_threshold_bytes: u64,
83    /// Capabilities of the file system which affect how we read worktree files.
84    pub fs: gix_fs::Capabilities,
85}
86
/// Selects the representation a resource is converted to before diffing.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Mode {
    /// Always produce the resource as it would appear in the work-tree, and
    /// run binary-to-text filters if there are any.
    ///
    /// This is typically free for resources in the worktree, while resources in the
    /// object database have filters applied to them.
    #[default]
    ToWorktreeAndBinaryToText,
    /// Produce the work-tree version of the resource (with binary-to-text filters applied)
    /// if such filters are present, and use the version stored in `git` otherwise.
    ToGitUnlessBinaryToTextIsPresent,
    /// Always produce resources as they are stored in `git`.
    ///
    /// This is usually fastest, even though resources read from the worktree need to be converted.
    ToGit,
}
105
106impl Mode {
107    fn to_worktree(self) -> bool {
108        matches!(
109            self,
110            Mode::ToGitUnlessBinaryToTextIsPresent | Mode::ToWorktreeAndBinaryToText
111        )
112    }
113
114    fn to_git(self) -> bool {
115        matches!(self, Mode::ToGitUnlessBinaryToTextIsPresent | Mode::ToGit)
116    }
117}
118
119///
120pub mod convert_to_diffable {
121    use std::collections::TryReserveError;
122
123    use bstr::BString;
124    use gix_object::tree::EntryKind;
125
126    /// The error returned by [Pipeline::convert_to_diffable()](super::Pipeline::convert_to_diffable()).
127    #[derive(Debug, thiserror::Error)]
128    #[allow(missing_docs)]
129    pub enum Error {
130        #[error("Entry at '{rela_path}' must be regular file or symlink, but was {actual:?}")]
131        InvalidEntryKind { rela_path: BString, actual: EntryKind },
132        #[error("Entry at '{rela_path}' could not be read as symbolic link")]
133        ReadLink { rela_path: BString, source: std::io::Error },
134        #[error("Entry at '{rela_path}' could not be opened for reading or read from")]
135        OpenOrRead { rela_path: BString, source: std::io::Error },
136        #[error("Entry at '{rela_path}' could not be copied from a filter process to a memory buffer")]
137        StreamCopy { rela_path: BString, source: std::io::Error },
138        #[error("Failed to run '{cmd}' for binary-to-text conversion of entry at {rela_path}")]
139        RunTextConvFilter {
140            rela_path: BString,
141            cmd: String,
142            source: std::io::Error,
143        },
144        #[error("Tempfile for binary-to-text conversion for entry at {rela_path} could not be created")]
145        CreateTempfile { rela_path: BString, source: std::io::Error },
146        #[error("Binary-to-text conversion '{cmd}' for entry at {rela_path} failed with: {stderr}")]
147        TextConvFilterFailed {
148            rela_path: BString,
149            cmd: String,
150            stderr: BString,
151        },
152        #[error(transparent)]
153        FindObject(#[from] gix_object::find::existing_object::Error),
154        #[error(transparent)]
155        ConvertToWorktree(#[from] gix_filter::pipeline::convert::to_worktree::Error),
156        #[error(transparent)]
157        ConvertToGit(#[from] gix_filter::pipeline::convert::to_git::Error),
158        #[error("Memory allocation failed")]
159        OutOfMemory(#[from] TryReserveError),
160    }
161}
162
163/// Lifecycle
164impl Pipeline {
165    /// Create a new instance of a pipeline which produces blobs suitable for diffing. `roots` allow to read worktree files directly, otherwise
166    /// `worktree_filter` is used to transform object database data directly. `drivers` further configure individual paths.
167    /// `options` are used to further configure the way we act..
168    pub fn new(
169        roots: WorktreeRoots,
170        worktree_filter: gix_filter::Pipeline,
171        mut drivers: Vec<super::Driver>,
172        options: Options,
173    ) -> Self {
174        drivers.sort_by(|a, b| a.name.cmp(&b.name));
175        Pipeline {
176            roots,
177            worktree_filter,
178            drivers,
179            options,
180            attrs: {
181                let mut out = gix_filter::attributes::search::Outcome::default();
182                out.initialize_with_selection(&Default::default(), Some("diff"));
183                out
184            },
185            path: Default::default(),
186        }
187    }
188}
189
190/// Access
191impl Pipeline {
192    /// Return all drivers that this instance was initialized with.
193    ///
194    /// They are sorted by [`name`](Driver::name) to support binary searches.
195    pub fn drivers(&self) -> &[super::Driver] {
196        &self.drivers
197    }
198}
199
200/// Conversion
201impl Pipeline {
202    /// Convert the object at `id`, `mode`, `rela_path` and `kind`, providing access to `attributes` and `objects`.
203    /// The resulting diff-able data is written into `out`, assuming it's not too large. The returned [`Outcome`]
204    /// contains information on how to use `out`, or if it's filled at all.
205    ///
206    /// `attributes` must be returning the attributes at `rela_path`, and `objects` must be usable if `kind` is
207    /// a resource in the object database, i.e. has no worktree root available.
208    ///
209    /// If `id` [is null](gix_hash::ObjectId::is_null()) or the file in question doesn't exist in the worktree in case
210    /// [a root](WorktreeRoots) is present, then `out` will be left cleared and [Outcome::data] will be `None`.
211    ///
212    /// Note that `mode` is trusted, and we will not re-validate that the entry in the worktree actually is of that mode.
213    ///
214    /// Use `convert` to control what kind of the resource will be produced.
215    ///
216    /// ### About Tempfiles
217    ///
218    /// When querying from the object database and a binary and a [binary-to-text](Driver::binary_to_text_command) is set,
219    /// a temporary file will be created to serve as input for the converter program, containing the worktree-data that
220    /// exactly as it would be present in the worktree if checked out.
221    ///
222    /// As these files are ultimately named tempfiles, they will be leaked unless the [gix_tempfile] is configured with
223    /// a signal handler. If they leak, they would remain in the system's `$TMP` directory.
224    #[allow(clippy::too_many_arguments)]
225    pub fn convert_to_diffable(
226        &mut self,
227        id: &gix_hash::oid,
228        mode: EntryKind,
229        rela_path: &BStr,
230        kind: ResourceKind,
231        attributes: &mut dyn FnMut(&BStr, &mut gix_filter::attributes::search::Outcome),
232        objects: &dyn gix_object::FindObjectOrHeader,
233        convert: Mode,
234        out: &mut Vec<u8>,
235    ) -> Result<Outcome, convert_to_diffable::Error> {
236        let is_symlink = match mode {
237            EntryKind::Link if self.options.fs.symlink => true,
238            EntryKind::Blob | EntryKind::BlobExecutable => false,
239            _ => {
240                return Err(convert_to_diffable::Error::InvalidEntryKind {
241                    rela_path: rela_path.to_owned(),
242                    actual: mode,
243                })
244            }
245        };
246
247        out.clear();
248        attributes(rela_path, &mut self.attrs);
249        let attr = self.attrs.iter_selected().next().expect("pre-initialized with 'diff'");
250        let driver_index = attr
251            .assignment
252            .state
253            .as_bstr()
254            .and_then(|name| self.drivers.binary_search_by(|d| d.name.as_bstr().cmp(name)).ok());
255        let driver = driver_index.map(|idx| &self.drivers[idx]);
256        let mut is_binary = if let Some(driver) = driver {
257            driver
258                .is_binary
259                .map(|is_binary| is_binary && driver.binary_to_text_command.is_none())
260        } else {
261            attr.assignment.state.is_unset().then_some(true)
262        };
263        match self.roots.by_kind(kind) {
264            Some(root) => {
265                self.path.clear();
266                self.path.push(root);
267                self.path.push(gix_path::from_bstr(rela_path));
268                let data = if is_symlink {
269                    let target = none_if_missing(std::fs::read_link(&self.path)).map_err(|err| {
270                        convert_to_diffable::Error::ReadLink {
271                            rela_path: rela_path.to_owned(),
272                            source: err,
273                        }
274                    })?;
275                    target.map(|target| {
276                        out.extend_from_slice(gix_path::into_bstr(target).as_ref());
277                        Data::Buffer
278                    })
279                } else {
280                    let need_size_only = is_binary == Some(true);
281                    let size_in_bytes = (need_size_only
282                        || (is_binary != Some(false) && self.options.large_file_threshold_bytes > 0))
283                        .then(|| {
284                            none_if_missing(self.path.metadata().map(|md| md.len())).map_err(|err| {
285                                convert_to_diffable::Error::OpenOrRead {
286                                    rela_path: rela_path.to_owned(),
287                                    source: err,
288                                }
289                            })
290                        })
291                        .transpose()?;
292                    match size_in_bytes {
293                        Some(None) => None, // missing as identified by the size check
294                        Some(Some(size)) if size > self.options.large_file_threshold_bytes || need_size_only => {
295                            Some(Data::Binary { size })
296                        }
297                        _ => {
298                            match driver
299                                .filter(|_| convert.to_worktree())
300                                .and_then(|d| d.prepare_binary_to_text_cmd(&self.path))
301                            {
302                                Some(cmd) => {
303                                    // Avoid letting the driver program fail if it doesn't exist.
304                                    if self.options.large_file_threshold_bytes == 0
305                                        && none_if_missing(std::fs::symlink_metadata(&self.path))
306                                            .map_err(|err| convert_to_diffable::Error::OpenOrRead {
307                                                rela_path: rela_path.to_owned(),
308                                                source: err,
309                                            })?
310                                            .is_none()
311                                    {
312                                        None
313                                    } else {
314                                        run_cmd(rela_path, cmd, out)?;
315                                        Some(Data::Buffer)
316                                    }
317                                }
318                                None => {
319                                    let file = none_if_missing(std::fs::File::open(&self.path)).map_err(|err| {
320                                        convert_to_diffable::Error::OpenOrRead {
321                                            rela_path: rela_path.to_owned(),
322                                            source: err,
323                                        }
324                                    })?;
325
326                                    match file {
327                                        Some(mut file) => {
328                                            if convert.to_git() {
329                                                let res = self.worktree_filter.convert_to_git(
330                                                    file,
331                                                    gix_path::from_bstr(rela_path).as_ref(),
332                                                    attributes,
333                                                    &mut |buf| objects.try_find(id, buf).map(|obj| obj.map(|_| ())),
334                                                )?;
335
336                                                match res {
337                                                    ToGitOutcome::Unchanged(mut file) => {
338                                                        file.read_to_end(out).map_err(|err| {
339                                                            convert_to_diffable::Error::OpenOrRead {
340                                                                rela_path: rela_path.to_owned(),
341                                                                source: err,
342                                                            }
343                                                        })?;
344                                                    }
345                                                    ToGitOutcome::Process(mut stream) => {
346                                                        stream.read_to_end(out).map_err(|err| {
347                                                            convert_to_diffable::Error::OpenOrRead {
348                                                                rela_path: rela_path.to_owned(),
349                                                                source: err,
350                                                            }
351                                                        })?;
352                                                    }
353                                                    ToGitOutcome::Buffer(buf) => {
354                                                        out.clear();
355                                                        out.try_reserve(buf.len())?;
356                                                        out.extend_from_slice(buf);
357                                                    }
358                                                }
359                                            } else {
360                                                file.read_to_end(out).map_err(|err| {
361                                                    convert_to_diffable::Error::OpenOrRead {
362                                                        rela_path: rela_path.to_owned(),
363                                                        source: err,
364                                                    }
365                                                })?;
366                                            }
367
368                                            Some(if is_binary.unwrap_or_else(|| is_binary_buf(out)) {
369                                                let size = out.len() as u64;
370                                                out.clear();
371                                                Data::Binary { size }
372                                            } else {
373                                                Data::Buffer
374                                            })
375                                        }
376                                        None => None,
377                                    }
378                                }
379                            }
380                        }
381                    }
382                };
383                Ok(Outcome { driver_index, data })
384            }
385            None => {
386                let data = if id.is_null() {
387                    None
388                } else {
389                    let header = objects
390                        .try_header(id)
391                        .map_err(gix_object::find::existing_object::Error::Find)?
392                        .ok_or_else(|| gix_object::find::existing_object::Error::NotFound { oid: id.to_owned() })?;
393                    if is_binary.is_none()
394                        && self.options.large_file_threshold_bytes > 0
395                        && header.size > self.options.large_file_threshold_bytes
396                    {
397                        is_binary = Some(true);
398                    };
399                    let data = if is_binary == Some(true) {
400                        Data::Binary { size: header.size }
401                    } else {
402                        objects
403                            .try_find(id, out)
404                            .map_err(gix_object::find::existing_object::Error::Find)?
405                            .ok_or_else(|| gix_object::find::existing_object::Error::NotFound { oid: id.to_owned() })?;
406                        if matches!(mode, EntryKind::Blob | EntryKind::BlobExecutable)
407                            && convert == Mode::ToWorktreeAndBinaryToText
408                            || (convert == Mode::ToGitUnlessBinaryToTextIsPresent
409                                && driver.is_some_and(|d| d.binary_to_text_command.is_some()))
410                        {
411                            let res =
412                                self.worktree_filter
413                                    .convert_to_worktree(out, rela_path, attributes, Delay::Forbid)?;
414
415                            let cmd_and_file = driver
416                                .and_then(|d| {
417                                    d.binary_to_text_command.is_some().then(|| {
418                                        gix_tempfile::new(
419                                            std::env::temp_dir(),
420                                            gix_tempfile::ContainingDirectory::Exists,
421                                            gix_tempfile::AutoRemove::Tempfile,
422                                        )
423                                        .and_then(|mut tmp_file| {
424                                            self.path.clear();
425                                            tmp_file.with_mut(|tmp| self.path.push(tmp.path()))?;
426                                            Ok(tmp_file)
427                                        })
428                                        .map(|tmp_file| {
429                                            (
430                                                d.prepare_binary_to_text_cmd(&self.path)
431                                                    .expect("always get cmd if command is set"),
432                                                tmp_file,
433                                            )
434                                        })
435                                    })
436                                })
437                                .transpose()
438                                .map_err(|err| convert_to_diffable::Error::CreateTempfile {
439                                    source: err,
440                                    rela_path: rela_path.to_owned(),
441                                })?;
442                            match cmd_and_file {
443                                Some((cmd, mut tmp_file)) => {
444                                    match res {
445                                        ToWorktreeOutcome::Unchanged(buf) | ToWorktreeOutcome::Buffer(buf) => {
446                                            tmp_file.write_all(buf)
447                                        }
448                                        ToWorktreeOutcome::Process(MaybeDelayed::Immediate(mut stream)) => {
449                                            std::io::copy(&mut stream, &mut tmp_file).map(|_| ())
450                                        }
451                                        ToWorktreeOutcome::Process(MaybeDelayed::Delayed(_)) => {
452                                            unreachable!("we prohibit this")
453                                        }
454                                    }
455                                    .map_err(|err| {
456                                        convert_to_diffable::Error::StreamCopy {
457                                            source: err,
458                                            rela_path: rela_path.to_owned(),
459                                        }
460                                    })?;
461                                    out.clear();
462                                    run_cmd(rela_path, cmd, out)?;
463                                }
464                                None => {
465                                    match res {
466                                        ToWorktreeOutcome::Unchanged(_) => {}
467                                        ToWorktreeOutcome::Buffer(src) => {
468                                            out.clear();
469                                            out.try_reserve(src.len())?;
470                                            out.extend_from_slice(src);
471                                        }
472                                        ToWorktreeOutcome::Process(MaybeDelayed::Immediate(mut stream)) => {
473                                            std::io::copy(&mut stream, out).map_err(|err| {
474                                                convert_to_diffable::Error::StreamCopy {
475                                                    rela_path: rela_path.to_owned(),
476                                                    source: err,
477                                                }
478                                            })?;
479                                        }
480                                        ToWorktreeOutcome::Process(MaybeDelayed::Delayed(_)) => {
481                                            unreachable!("we prohibit this")
482                                        }
483                                    };
484                                }
485                            }
486                        }
487
488                        if driver.map_or(true, |d| d.binary_to_text_command.is_none())
489                            && is_binary.unwrap_or_else(|| is_binary_buf(out))
490                        {
491                            let size = out.len() as u64;
492                            out.clear();
493                            Data::Binary { size }
494                        } else {
495                            Data::Buffer
496                        }
497                    };
498                    Some(data)
499                };
500                Ok(Outcome { driver_index, data })
501            }
502        }
503    }
504}
505
/// Return `true` if a NUL byte is found within the first 8000 bytes of `buf`.
fn is_binary_buf(buf: &[u8]) -> bool {
    buf.iter().take(8000).any(|byte| *byte == 0)
}
510
/// Turn a `NotFound` error into `Ok(None)`, wrap successes in `Some`, and pass every other error through.
fn none_if_missing<T>(res: std::io::Result<T>) -> std::io::Result<Option<T>> {
    res.map(Some).or_else(|err| {
        if err.kind() == std::io::ErrorKind::NotFound {
            Ok(None)
        } else {
            Err(err)
        }
    })
}
518
519fn run_cmd(rela_path: &BStr, mut cmd: Command, out: &mut Vec<u8>) -> Result<(), convert_to_diffable::Error> {
520    gix_trace::debug!(cmd = ?cmd, "Running binary-to-text command");
521    let mut res = cmd
522        .output()
523        .map_err(|err| convert_to_diffable::Error::RunTextConvFilter {
524            rela_path: rela_path.to_owned(),
525            cmd: format!("{cmd:?}"),
526            source: err,
527        })?;
528    if !res.status.success() {
529        return Err(convert_to_diffable::Error::TextConvFilterFailed {
530            rela_path: rela_path.to_owned(),
531            cmd: format!("{cmd:?}"),
532            stderr: res.stderr.into(),
533        });
534    }
535    out.append(&mut res.stdout);
536    Ok(())
537}
538
539impl Driver {
540    /// Produce an invocable command pre-configured to produce the filtered output on stdout after reading `path`.
541    pub fn prepare_binary_to_text_cmd(&self, path: &Path) -> Option<std::process::Command> {
542        let command: &BStr = self.binary_to_text_command.as_ref()?.as_ref();
543        let cmd = gix_command::prepare(gix_path::from_bstr(command).into_owned())
544            // TODO: Add support for an actual Context, validate it *can* match Git
545            .with_context(Default::default())
546            .with_shell()
547            .stdin(Stdio::null())
548            .stdout(Stdio::piped())
549            .stderr(Stdio::piped())
550            .arg(path)
551            .into();
552        Some(cmd)
553    }
554}