gix_diff/blob/pipeline.rs
1use std::{
2 io::{Read, Write},
3 path::{Path, PathBuf},
4 process::{Command, Stdio},
5};
6
7use bstr::{BStr, ByteSlice};
8use gix_filter::{
9 driver::apply::{Delay, MaybeDelayed},
10 pipeline::convert::{ToGitOutcome, ToWorktreeOutcome},
11};
12use gix_object::tree::EntryKind;
13
14use crate::blob::{Driver, Pipeline, ResourceKind};
15
16/// A way to access roots for different kinds of resources that are possibly located and accessible in a worktree.
17#[derive(Clone, Debug, Default)]
18pub struct WorktreeRoots {
19 /// A place where the source of a rewrite, rename or copy, or generally the previous version of resources, are located.
20 pub old_root: Option<PathBuf>,
21 /// A place where the destination of a rewrite, rename or copy, or generally the new version of resources, are located.
22 pub new_root: Option<PathBuf>,
23}
24
25/// Access
26impl WorktreeRoots {
27 /// Return the root path for the given `kind`
28 pub fn by_kind(&self, kind: ResourceKind) -> Option<&Path> {
29 match kind {
30 ResourceKind::OldOrSource => self.old_root.as_deref(),
31 ResourceKind::NewOrDestination => self.new_root.as_deref(),
32 }
33 }
34
35 /// Return `true` if all worktree roots are unset.
36 pub fn is_unset(&self) -> bool {
37 self.new_root.is_none() && self.old_root.is_none()
38 }
39}
40
/// Describes where the diffable data of an [Outcome] can be found, or why there is none.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum Data {
    /// The data to diff was placed into the buffer that was passed to [`Pipeline::convert_to_diffable()`].
    Buffer,
    /// The resource is considered binary, or it is larger than the big-file threshold.
    ///
    /// A resource in this state cannot be diffed, and the output buffer is left empty.
    Binary {
        /// The size of the resource in bytes.
        ///
        /// Note that the size doesn't always describe the same 'state' of the content: depending on the
        /// source it is the object size reported by the object database, the size of the file on disk,
        /// or the amount of bytes that were read before the content was identified as binary.
        size: u64,
    },
}
59
60/// The outcome returned by [Pipeline::convert_to_diffable()](super::Pipeline::convert_to_diffable()).
61#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug)]
62pub struct Outcome {
63 /// If available, an index into the `drivers` field to access more diff-related information of the driver for items
64 /// at the given path, as previously determined by git-attributes.
65 ///
66 /// Note that drivers are queried even if there is no object available.
67 pub driver_index: Option<usize>,
68 /// The data itself, suitable for diffing, and if the object or worktree item is present at all.
69 pub data: Option<Data>,
70}
71
72/// Options for use in a [`Pipeline`].
73#[derive(Default, Clone, Copy, PartialEq, Eq, Debug, Hash, Ord, PartialOrd)]
74pub struct Options {
75 /// The amount of bytes that an object has to reach before being treated as binary.
76 /// These objects will not be queried, nor will their data be processed in any way.
77 /// If `0`, no file is ever considered binary due to their size.
78 ///
79 /// Note that for files stored in `git`, what counts is their stored, decompressed size,
80 /// thus `git-lfs` files would typically not be considered binary unless one explicitly sets
81 /// them
82 pub large_file_threshold_bytes: u64,
83 /// Capabilities of the file system which affect how we read worktree files.
84 pub fs: gix_fs::Capabilities,
85}
86
/// Determines the form into which a resource is converted before diffing.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Mode {
    /// Always produce the resource as it would be in the worktree, and apply binary-to-text
    /// filters if they are configured.
    ///
    /// This is typically free for resources in the worktree, while resources in the object database
    /// need to have filters applied.
    #[default]
    ToWorktreeAndBinaryToText,
    /// Produce the resource as it would be in the worktree only if a binary-to-text filter is
    /// configured (and apply it), and use the version stored in `git` otherwise.
    ToGitUnlessBinaryToTextIsPresent,
    /// Always produce the resource as it is stored in `git`.
    ///
    /// This is usually fastest, even though files in the worktree need to be converted.
    ToGit,
}

impl Mode {
    /// Return `true` if this mode may produce the worktree version of a resource.
    fn to_worktree(self) -> bool {
        match self {
            Mode::ToWorktreeAndBinaryToText | Mode::ToGitUnlessBinaryToTextIsPresent => true,
            Mode::ToGit => false,
        }
    }

    /// Return `true` if this mode may produce the version of a resource as stored in `git`.
    fn to_git(self) -> bool {
        match self {
            Mode::ToGit | Mode::ToGitUnlessBinaryToTextIsPresent => true,
            Mode::ToWorktreeAndBinaryToText => false,
        }
    }
}
118
119///
120pub mod convert_to_diffable {
121 use std::collections::TryReserveError;
122
123 use bstr::BString;
124 use gix_object::tree::EntryKind;
125
126 /// The error returned by [Pipeline::convert_to_diffable()](super::Pipeline::convert_to_diffable()).
127 #[derive(Debug, thiserror::Error)]
128 #[allow(missing_docs)]
129 pub enum Error {
130 #[error("Entry at '{rela_path}' must be regular file or symlink, but was {actual:?}")]
131 InvalidEntryKind { rela_path: BString, actual: EntryKind },
132 #[error("Entry at '{rela_path}' could not be read as symbolic link")]
133 ReadLink { rela_path: BString, source: std::io::Error },
134 #[error("Entry at '{rela_path}' could not be opened for reading or read from")]
135 OpenOrRead { rela_path: BString, source: std::io::Error },
136 #[error("Entry at '{rela_path}' could not be copied from a filter process to a memory buffer")]
137 StreamCopy { rela_path: BString, source: std::io::Error },
138 #[error("Failed to run '{cmd}' for binary-to-text conversion of entry at {rela_path}")]
139 RunTextConvFilter {
140 rela_path: BString,
141 cmd: String,
142 source: std::io::Error,
143 },
144 #[error("Tempfile for binary-to-text conversion for entry at {rela_path} could not be created")]
145 CreateTempfile { rela_path: BString, source: std::io::Error },
146 #[error("Binary-to-text conversion '{cmd}' for entry at {rela_path} failed with: {stderr}")]
147 TextConvFilterFailed {
148 rela_path: BString,
149 cmd: String,
150 stderr: BString,
151 },
152 #[error(transparent)]
153 FindObject(#[from] gix_object::find::existing_object::Error),
154 #[error(transparent)]
155 ConvertToWorktree(#[from] gix_filter::pipeline::convert::to_worktree::Error),
156 #[error(transparent)]
157 ConvertToGit(#[from] gix_filter::pipeline::convert::to_git::Error),
158 #[error("Memory allocation failed")]
159 OutOfMemory(#[from] TryReserveError),
160 }
161}
162
163/// Lifecycle
164impl Pipeline {
165 /// Create a new instance of a pipeline which produces blobs suitable for diffing. `roots` allow to read worktree files directly, otherwise
166 /// `worktree_filter` is used to transform object database data directly. `drivers` further configure individual paths.
167 /// `options` are used to further configure the way we act..
168 pub fn new(
169 roots: WorktreeRoots,
170 worktree_filter: gix_filter::Pipeline,
171 mut drivers: Vec<super::Driver>,
172 options: Options,
173 ) -> Self {
174 drivers.sort_by(|a, b| a.name.cmp(&b.name));
175 Pipeline {
176 roots,
177 worktree_filter,
178 drivers,
179 options,
180 attrs: {
181 let mut out = gix_filter::attributes::search::Outcome::default();
182 out.initialize_with_selection(&Default::default(), Some("diff"));
183 out
184 },
185 path: Default::default(),
186 }
187 }
188}
189
190/// Access
191impl Pipeline {
192 /// Return all drivers that this instance was initialized with.
193 ///
194 /// They are sorted by [`name`](Driver::name) to support binary searches.
195 pub fn drivers(&self) -> &[super::Driver] {
196 &self.drivers
197 }
198}
199
200/// Conversion
201impl Pipeline {
202 /// Convert the object at `id`, `mode`, `rela_path` and `kind`, providing access to `attributes` and `objects`.
203 /// The resulting diff-able data is written into `out`, assuming it's not too large. The returned [`Outcome`]
204 /// contains information on how to use `out`, or if it's filled at all.
205 ///
206 /// `attributes` must be returning the attributes at `rela_path`, and `objects` must be usable if `kind` is
207 /// a resource in the object database, i.e. has no worktree root available.
208 ///
209 /// If `id` [is null](gix_hash::ObjectId::is_null()) or the file in question doesn't exist in the worktree in case
210 /// [a root](WorktreeRoots) is present, then `out` will be left cleared and [Outcome::data] will be `None`.
211 ///
212 /// Note that `mode` is trusted, and we will not re-validate that the entry in the worktree actually is of that mode.
213 ///
214 /// Use `convert` to control what kind of the resource will be produced.
215 ///
216 /// ### About Tempfiles
217 ///
218 /// When querying from the object database and a binary and a [binary-to-text](Driver::binary_to_text_command) is set,
219 /// a temporary file will be created to serve as input for the converter program, containing the worktree-data that
220 /// exactly as it would be present in the worktree if checked out.
221 ///
222 /// As these files are ultimately named tempfiles, they will be leaked unless the [gix_tempfile] is configured with
223 /// a signal handler. If they leak, they would remain in the system's `$TMP` directory.
224 #[allow(clippy::too_many_arguments)]
225 pub fn convert_to_diffable(
226 &mut self,
227 id: &gix_hash::oid,
228 mode: EntryKind,
229 rela_path: &BStr,
230 kind: ResourceKind,
231 attributes: &mut dyn FnMut(&BStr, &mut gix_filter::attributes::search::Outcome),
232 objects: &dyn gix_object::FindObjectOrHeader,
233 convert: Mode,
234 out: &mut Vec<u8>,
235 ) -> Result<Outcome, convert_to_diffable::Error> {
236 let is_symlink = match mode {
237 EntryKind::Link if self.options.fs.symlink => true,
238 EntryKind::Blob | EntryKind::BlobExecutable => false,
239 _ => {
240 return Err(convert_to_diffable::Error::InvalidEntryKind {
241 rela_path: rela_path.to_owned(),
242 actual: mode,
243 })
244 }
245 };
246
247 out.clear();
248 attributes(rela_path, &mut self.attrs);
249 let attr = self.attrs.iter_selected().next().expect("pre-initialized with 'diff'");
250 let driver_index = attr
251 .assignment
252 .state
253 .as_bstr()
254 .and_then(|name| self.drivers.binary_search_by(|d| d.name.as_bstr().cmp(name)).ok());
255 let driver = driver_index.map(|idx| &self.drivers[idx]);
256 let mut is_binary = if let Some(driver) = driver {
257 driver
258 .is_binary
259 .map(|is_binary| is_binary && driver.binary_to_text_command.is_none())
260 } else {
261 attr.assignment.state.is_unset().then_some(true)
262 };
263 match self.roots.by_kind(kind) {
264 Some(root) => {
265 self.path.clear();
266 self.path.push(root);
267 self.path.push(gix_path::from_bstr(rela_path));
268 let data = if is_symlink {
269 let target = none_if_missing(std::fs::read_link(&self.path)).map_err(|err| {
270 convert_to_diffable::Error::ReadLink {
271 rela_path: rela_path.to_owned(),
272 source: err,
273 }
274 })?;
275 target.map(|target| {
276 out.extend_from_slice(gix_path::into_bstr(target).as_ref());
277 Data::Buffer
278 })
279 } else {
280 let need_size_only = is_binary == Some(true);
281 let size_in_bytes = (need_size_only
282 || (is_binary != Some(false) && self.options.large_file_threshold_bytes > 0))
283 .then(|| {
284 none_if_missing(self.path.metadata().map(|md| md.len())).map_err(|err| {
285 convert_to_diffable::Error::OpenOrRead {
286 rela_path: rela_path.to_owned(),
287 source: err,
288 }
289 })
290 })
291 .transpose()?;
292 match size_in_bytes {
293 Some(None) => None, // missing as identified by the size check
294 Some(Some(size)) if size > self.options.large_file_threshold_bytes || need_size_only => {
295 Some(Data::Binary { size })
296 }
297 _ => {
298 match driver
299 .filter(|_| convert.to_worktree())
300 .and_then(|d| d.prepare_binary_to_text_cmd(&self.path))
301 {
302 Some(cmd) => {
303 // Avoid letting the driver program fail if it doesn't exist.
304 if self.options.large_file_threshold_bytes == 0
305 && none_if_missing(std::fs::symlink_metadata(&self.path))
306 .map_err(|err| convert_to_diffable::Error::OpenOrRead {
307 rela_path: rela_path.to_owned(),
308 source: err,
309 })?
310 .is_none()
311 {
312 None
313 } else {
314 run_cmd(rela_path, cmd, out)?;
315 Some(Data::Buffer)
316 }
317 }
318 None => {
319 let file = none_if_missing(std::fs::File::open(&self.path)).map_err(|err| {
320 convert_to_diffable::Error::OpenOrRead {
321 rela_path: rela_path.to_owned(),
322 source: err,
323 }
324 })?;
325
326 match file {
327 Some(mut file) => {
328 if convert.to_git() {
329 let res = self.worktree_filter.convert_to_git(
330 file,
331 gix_path::from_bstr(rela_path).as_ref(),
332 attributes,
333 &mut |buf| objects.try_find(id, buf).map(|obj| obj.map(|_| ())),
334 )?;
335
336 match res {
337 ToGitOutcome::Unchanged(mut file) => {
338 file.read_to_end(out).map_err(|err| {
339 convert_to_diffable::Error::OpenOrRead {
340 rela_path: rela_path.to_owned(),
341 source: err,
342 }
343 })?;
344 }
345 ToGitOutcome::Process(mut stream) => {
346 stream.read_to_end(out).map_err(|err| {
347 convert_to_diffable::Error::OpenOrRead {
348 rela_path: rela_path.to_owned(),
349 source: err,
350 }
351 })?;
352 }
353 ToGitOutcome::Buffer(buf) => {
354 out.clear();
355 out.try_reserve(buf.len())?;
356 out.extend_from_slice(buf);
357 }
358 }
359 } else {
360 file.read_to_end(out).map_err(|err| {
361 convert_to_diffable::Error::OpenOrRead {
362 rela_path: rela_path.to_owned(),
363 source: err,
364 }
365 })?;
366 }
367
368 Some(if is_binary.unwrap_or_else(|| is_binary_buf(out)) {
369 let size = out.len() as u64;
370 out.clear();
371 Data::Binary { size }
372 } else {
373 Data::Buffer
374 })
375 }
376 None => None,
377 }
378 }
379 }
380 }
381 }
382 };
383 Ok(Outcome { driver_index, data })
384 }
385 None => {
386 let data = if id.is_null() {
387 None
388 } else {
389 let header = objects
390 .try_header(id)
391 .map_err(gix_object::find::existing_object::Error::Find)?
392 .ok_or_else(|| gix_object::find::existing_object::Error::NotFound { oid: id.to_owned() })?;
393 if is_binary.is_none()
394 && self.options.large_file_threshold_bytes > 0
395 && header.size > self.options.large_file_threshold_bytes
396 {
397 is_binary = Some(true);
398 };
399 let data = if is_binary == Some(true) {
400 Data::Binary { size: header.size }
401 } else {
402 objects
403 .try_find(id, out)
404 .map_err(gix_object::find::existing_object::Error::Find)?
405 .ok_or_else(|| gix_object::find::existing_object::Error::NotFound { oid: id.to_owned() })?;
406 if matches!(mode, EntryKind::Blob | EntryKind::BlobExecutable)
407 && convert == Mode::ToWorktreeAndBinaryToText
408 || (convert == Mode::ToGitUnlessBinaryToTextIsPresent
409 && driver.is_some_and(|d| d.binary_to_text_command.is_some()))
410 {
411 let res =
412 self.worktree_filter
413 .convert_to_worktree(out, rela_path, attributes, Delay::Forbid)?;
414
415 let cmd_and_file = driver
416 .and_then(|d| {
417 d.binary_to_text_command.is_some().then(|| {
418 gix_tempfile::new(
419 std::env::temp_dir(),
420 gix_tempfile::ContainingDirectory::Exists,
421 gix_tempfile::AutoRemove::Tempfile,
422 )
423 .and_then(|mut tmp_file| {
424 self.path.clear();
425 tmp_file.with_mut(|tmp| self.path.push(tmp.path()))?;
426 Ok(tmp_file)
427 })
428 .map(|tmp_file| {
429 (
430 d.prepare_binary_to_text_cmd(&self.path)
431 .expect("always get cmd if command is set"),
432 tmp_file,
433 )
434 })
435 })
436 })
437 .transpose()
438 .map_err(|err| convert_to_diffable::Error::CreateTempfile {
439 source: err,
440 rela_path: rela_path.to_owned(),
441 })?;
442 match cmd_and_file {
443 Some((cmd, mut tmp_file)) => {
444 match res {
445 ToWorktreeOutcome::Unchanged(buf) | ToWorktreeOutcome::Buffer(buf) => {
446 tmp_file.write_all(buf)
447 }
448 ToWorktreeOutcome::Process(MaybeDelayed::Immediate(mut stream)) => {
449 std::io::copy(&mut stream, &mut tmp_file).map(|_| ())
450 }
451 ToWorktreeOutcome::Process(MaybeDelayed::Delayed(_)) => {
452 unreachable!("we prohibit this")
453 }
454 }
455 .map_err(|err| {
456 convert_to_diffable::Error::StreamCopy {
457 source: err,
458 rela_path: rela_path.to_owned(),
459 }
460 })?;
461 out.clear();
462 run_cmd(rela_path, cmd, out)?;
463 }
464 None => {
465 match res {
466 ToWorktreeOutcome::Unchanged(_) => {}
467 ToWorktreeOutcome::Buffer(src) => {
468 out.clear();
469 out.try_reserve(src.len())?;
470 out.extend_from_slice(src);
471 }
472 ToWorktreeOutcome::Process(MaybeDelayed::Immediate(mut stream)) => {
473 std::io::copy(&mut stream, out).map_err(|err| {
474 convert_to_diffable::Error::StreamCopy {
475 rela_path: rela_path.to_owned(),
476 source: err,
477 }
478 })?;
479 }
480 ToWorktreeOutcome::Process(MaybeDelayed::Delayed(_)) => {
481 unreachable!("we prohibit this")
482 }
483 };
484 }
485 }
486 }
487
488 if driver.map_or(true, |d| d.binary_to_text_command.is_none())
489 && is_binary.unwrap_or_else(|| is_binary_buf(out))
490 {
491 let size = out.len() as u64;
492 out.clear();
493 Data::Binary { size }
494 } else {
495 Data::Buffer
496 }
497 };
498 Some(data)
499 };
500 Ok(Outcome { driver_index, data })
501 }
502 }
503 }
504}
505
/// Return `true` if `buf` is considered binary, which is the case if there is a NUL byte
/// within its first 8000 bytes.
fn is_binary_buf(buf: &[u8]) -> bool {
    const PROBE_LEN: usize = 8000;
    buf.iter().take(PROBE_LEN).any(|&byte| byte == 0)
}
510
/// Turn a [`NotFound`](std::io::ErrorKind::NotFound) error in `res` into `Ok(None)`, wrap a successful
/// value into `Some`, and pass all other errors through unchanged.
fn none_if_missing<T>(res: std::io::Result<T>) -> std::io::Result<Option<T>> {
    match res {
        Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(None),
        other => other.map(Some),
    }
}
518
519fn run_cmd(rela_path: &BStr, mut cmd: Command, out: &mut Vec<u8>) -> Result<(), convert_to_diffable::Error> {
520 gix_trace::debug!(cmd = ?cmd, "Running binary-to-text command");
521 let mut res = cmd
522 .output()
523 .map_err(|err| convert_to_diffable::Error::RunTextConvFilter {
524 rela_path: rela_path.to_owned(),
525 cmd: format!("{cmd:?}"),
526 source: err,
527 })?;
528 if !res.status.success() {
529 return Err(convert_to_diffable::Error::TextConvFilterFailed {
530 rela_path: rela_path.to_owned(),
531 cmd: format!("{cmd:?}"),
532 stderr: res.stderr.into(),
533 });
534 }
535 out.append(&mut res.stdout);
536 Ok(())
537}
538
539impl Driver {
540 /// Produce an invocable command pre-configured to produce the filtered output on stdout after reading `path`.
541 pub fn prepare_binary_to_text_cmd(&self, path: &Path) -> Option<std::process::Command> {
542 let command: &BStr = self.binary_to_text_command.as_ref()?.as_ref();
543 let cmd = gix_command::prepare(gix_path::from_bstr(command).into_owned())
544 // TODO: Add support for an actual Context, validate it *can* match Git
545 .with_context(Default::default())
546 .with_shell()
547 .stdin(Stdio::null())
548 .stdout(Stdio::piped())
549 .stderr(Stdio::piped())
550 .arg(path)
551 .into();
552 Some(cmd)
553 }
554}