gix_diff/rewrites/
tracker.rs

1//! ### Deviation
2//!
3//! Note that the algorithm implemented here is in many ways different from what `git` does.
4//!
5//! - it's less sophisticated and doesn't use any ranking of candidates. Instead, it picks the first possible match.
6//! - the set used for copy-detection is probably smaller by default.
7
8use std::ops::Range;
9
10use bstr::{BStr, ByteSlice};
11use gix_object::tree::{EntryKind, EntryMode};
12
13use crate::rewrites::tracker::visit::SourceKind;
14use crate::tree::visit::{Action, ChangeId, Relation};
15use crate::{
16    blob::{platform::prepare_diff::Operation, DiffLineStats, ResourceKind},
17    rewrites::{CopySource, Outcome, Tracker},
18    Rewrites,
19};
20
21/// The kind of a change.
22#[derive(Debug, Copy, Clone, Ord, PartialOrd, PartialEq, Eq)]
23pub enum ChangeKind {
24    /// The change represents the *deletion* of an item.
25    Deletion,
26    /// The change represents the *modification* of an item.
27    Modification,
28    /// The change represents the *addition* of an item.
29    Addition,
30}
31
32/// A trait providing all functionality to abstract over the concept of a change, as seen by the [`Tracker`].
33pub trait Change: Clone {
34    /// Return the hash of the object behind this change for identification.
35    ///
36    /// Note that this is the id of the object as stored in `git`, i.e. it must have gone through workspace
37    /// conversions. What matters is that the IDs are comparable.
38    fn id(&self) -> &gix_hash::oid;
39    /// Return the relation that this change may have with other changes.
40    ///
41    /// It allows to associate a directory with its children that are added or removed at the same moment.
42    /// Note that this is ignored for modifications.
43    ///
44    /// If rename-tracking should always be on leaf-level, this should be set to `None` consistently.
45    /// Note that trees will never be looked up by their `id` as their children are assumed to be passed in
46    /// with the respective relationship.
47    ///
48    /// Also note that the tracker only sees what's given to it, it will not lookup trees or match paths itself.
49    fn relation(&self) -> Option<Relation>;
50    /// Return the kind of this change.
51    fn kind(&self) -> ChangeKind;
52    /// Return more information about the kind of entry affected by this change.
53    fn entry_mode(&self) -> EntryMode;
54    /// Return the id of the change along with its mode.
55    fn id_and_entry_mode(&self) -> (&gix_hash::oid, EntryMode);
56}
57
58/// A set of tracked items allows to figure out their relations by figuring out their similarity.
59pub(crate) struct Item<T> {
60    /// The underlying raw change
61    change: T,
62    /// That slice into the backing for paths.
63    path: Range<usize>,
64    /// If true, this item was already emitted, i.e. seen by the caller.
65    emitted: bool,
66}
67
68impl<T: Change> Item<T> {
69    fn location<'a>(&self, backing: &'a [u8]) -> &'a BStr {
70        backing[self.path.clone()].as_ref()
71    }
72    fn entry_mode_compatible(&self, other: EntryMode) -> bool {
73        use EntryKind::*;
74        matches!(
75            (other.kind(), self.change.entry_mode().kind()),
76            (Blob | BlobExecutable, Blob | BlobExecutable) | (Link, Link) | (Tree, Tree)
77        )
78    }
79
80    fn is_source_for_destination_of(&self, kind: visit::SourceKind, dest_item_mode: EntryMode) -> bool {
81        self.entry_mode_compatible(dest_item_mode)
82            && match kind {
83                visit::SourceKind::Rename => !self.emitted && matches!(self.change.kind(), ChangeKind::Deletion),
84                visit::SourceKind::Copy => {
85                    matches!(self.change.kind(), ChangeKind::Modification)
86                }
87            }
88    }
89}
90
91/// A module with types used in the user-callback in [Tracker::emit()](crate::rewrites::Tracker::emit()).
92pub mod visit {
93    use bstr::BStr;
94    use gix_object::tree::EntryMode;
95
96    use crate::blob::DiffLineStats;
97
98    /// The source of a rewrite, rename or copy.
99    #[derive(Debug, Clone, PartialEq, PartialOrd)]
100    pub struct Source<'a, T> {
101        /// The kind of entry.
102        pub entry_mode: EntryMode,
103        /// The hash of the state of the source as seen in the object database.
104        pub id: gix_hash::ObjectId,
105        /// Further specify what kind of source this is.
106        pub kind: SourceKind,
107        /// The repository-relative location of this entry.
108        pub location: &'a BStr,
109        /// The change that was registered as source.
110        pub change: &'a T,
111        /// If this is a rewrite, indicate how many lines would need to change to turn this source into the destination.
112        pub diff: Option<DiffLineStats>,
113    }
114
115    /// Further identify the kind of [Source].
116    #[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
117    pub enum SourceKind {
118        /// This is the source of an entry that was renamed, as `source` was renamed to `destination`.
119        Rename,
120        /// This is the source of a copy, as `source` was copied into `destination`.
121        Copy,
122    }
123
124    /// A change along with a location.
125    #[derive(Debug, Clone)]
126    pub struct Destination<'a, T: Clone> {
127        /// The change at the given `location`.
128        pub change: T,
129        /// The repository-relative location of this destination.
130        pub location: &'a BStr,
131    }
132}
133
134///
135pub mod emit {
136    /// The error returned by [Tracker::emit()](super::Tracker::emit()).
137    #[derive(Debug, thiserror::Error)]
138    #[allow(missing_docs)]
139    pub enum Error {
140        #[error("Could not find blob for similarity checking")]
141        FindExistingBlob(#[from] gix_object::find::existing_object::Error),
142        #[error("Could not obtain exhaustive item set to use as possible sources for copy detection")]
143        GetItemsForExhaustiveCopyDetection(#[source] Box<dyn std::error::Error + Send + Sync>),
144        #[error(transparent)]
145        SetResource(#[from] crate::blob::platform::set_resource::Error),
146        #[error(transparent)]
147        PrepareDiff(#[from] crate::blob::platform::prepare_diff::Error),
148    }
149}
150
151/// Lifecycle
152impl<T: Change> Tracker<T> {
153    /// Create a new instance with `rewrites` configuration.
154    pub fn new(rewrites: Rewrites) -> Self {
155        Tracker {
156            items: vec![],
157            path_backing: vec![],
158            rewrites,
159            child_renames: Default::default(),
160        }
161    }
162}
163
164/// build state and find matches.
165impl<T: Change> Tracker<T> {
166    /// We may refuse the push if that information isn't needed for what we have to track.
167    pub fn try_push_change(&mut self, change: T, location: &BStr) -> Option<T> {
168        let change_kind = change.kind();
169        if let (None, ChangeKind::Modification { .. }) = (self.rewrites.copies, change_kind) {
170            return Some(change);
171        };
172
173        let entry_kind = change.entry_mode().kind();
174        if entry_kind == EntryKind::Commit {
175            return Some(change);
176        }
177        let relation = change
178            .relation()
179            .filter(|_| matches!(change_kind, ChangeKind::Addition | ChangeKind::Deletion));
180        if let (None, EntryKind::Tree) = (relation, entry_kind) {
181            return Some(change);
182        };
183
184        let start = self.path_backing.len();
185        self.path_backing.extend_from_slice(location);
186        let path = start..self.path_backing.len();
187
188        self.items.push(Item {
189            path,
190            change,
191            emitted: false,
192        });
193        None
194    }
195
196    /// Can only be called once effectively as it alters its own state to assure each item is only emitted once.
197    ///
198    /// `cb(destination, source)` is called for each item, either with `Some(source)` if it's
199    /// the destination of a copy or rename, or with `None` for source if no relation to other
200    /// items in the tracked set exist, which is like saying 'no rename or rewrite or copy' happened.
201    /// Note that directories with [relation](Relation) will be emitted if there is a match, along with all their matching
202    /// child-items which are similarly bundled as rename.
203    ///
204    /// `objects` is used to access blob data for similarity checks if required and is taken directly from the object database.
205    /// Worktree filters and text conversions will be applied afterwards automatically. Note that object-caching *should not*
206    /// be enabled as caching is implemented by `diff_cache`, after all, the blob that's actually diffed is going
207    /// through conversion steps.
208    ///
209    /// `diff_cache` is a way to retain a cache of resources that are prepared for rapid diffing, and it also controls
210    /// the diff-algorithm (provided no user-algorithm is set).
211    /// Note that we control a few options of `diff_cache` to assure it will ignore external commands.
212    /// Note that we do not control how the `diff_cache` converts resources, it's left to the caller to decide
213    /// if it should look at what's stored in `git`, or in the working tree, along with all diff-specific conversions.
214    ///
215    /// `push_source_tree(push_fn: push(change, location))` is a function that is called when the entire tree of the source
216    /// should be added as modifications by calling `push` repeatedly to use for perfect copy tracking. Note that `push`
217    /// will panic if `change` is not a modification, and it's valid to not call `push` at all.
218    pub fn emit<PushSourceTreeFn, E>(
219        &mut self,
220        mut cb: impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
221        diff_cache: &mut crate::blob::Platform,
222        objects: &impl gix_object::FindObjectOrHeader,
223        mut push_source_tree: PushSourceTreeFn,
224    ) -> Result<Outcome, emit::Error>
225    where
226        PushSourceTreeFn: FnMut(&mut dyn FnMut(T, &BStr)) -> Result<(), E>,
227        E: std::error::Error + Send + Sync + 'static,
228    {
229        fn is_parent(change: &impl Change) -> bool {
230            matches!(change.relation(), Some(Relation::Parent(_)))
231        }
232        diff_cache.options.skip_internal_diff_if_external_is_configured = false;
233
234        // The point of this is to optimize for identity-based lookups, which should be easy to find
235        // by partitioning.
236        fn by_id_and_location<T: Change>(a: &Item<T>, b: &Item<T>) -> std::cmp::Ordering {
237            a.change
238                .id()
239                .cmp(b.change.id())
240                .then_with(|| a.path.start.cmp(&b.path.start).then(a.path.end.cmp(&b.path.end)))
241        }
242
243        let mut out = Outcome {
244            options: self.rewrites,
245            ..Default::default()
246        };
247        self.items.sort_by(by_id_and_location);
248
249        // Rewrites by directory (without local changes) can be pruned out quickly,
250        // by finding only parents, their counterpart, and then all children can be matched by
251        // relationship ID.
252        self.match_pairs_of_kind(
253            visit::SourceKind::Rename,
254            &mut cb,
255            None, /* by identity for parents */
256            &mut out,
257            diff_cache,
258            objects,
259            Some(is_parent),
260        )?;
261
262        self.match_pairs_of_kind(
263            visit::SourceKind::Rename,
264            &mut cb,
265            self.rewrites.percentage,
266            &mut out,
267            diff_cache,
268            objects,
269            None,
270        )?;
271
272        self.match_renamed_directories(&mut cb)?;
273
274        if let Some(copies) = self.rewrites.copies {
275            self.match_pairs_of_kind(
276                visit::SourceKind::Copy,
277                &mut cb,
278                copies.percentage,
279                &mut out,
280                diff_cache,
281                objects,
282                None,
283            )?;
284
285            match copies.source {
286                CopySource::FromSetOfModifiedFiles => {}
287                CopySource::FromSetOfModifiedFilesAndAllSources => {
288                    push_source_tree(&mut |change, location| {
289                        if self.try_push_change(change, location).is_none() {
290                            // make sure these aren't viable to be emitted anymore.
291                            self.items.last_mut().expect("just pushed").emitted = true;
292                        }
293                    })
294                    .map_err(|err| emit::Error::GetItemsForExhaustiveCopyDetection(Box::new(err)))?;
295                    self.items.sort_by(by_id_and_location);
296
297                    self.match_pairs_of_kind(
298                        visit::SourceKind::Copy,
299                        &mut cb,
300                        copies.percentage,
301                        &mut out,
302                        diff_cache,
303                        objects,
304                        None,
305                    )?;
306                }
307            }
308        }
309
310        self.items
311            .sort_by(|a, b| a.location(&self.path_backing).cmp(b.location(&self.path_backing)));
312        for item in self.items.drain(..).filter(|item| !item.emitted) {
313            if cb(
314                visit::Destination {
315                    location: item.location(&self.path_backing),
316                    change: item.change,
317                },
318                None,
319            ) == Action::Cancel
320            {
321                break;
322            }
323        }
324        Ok(out)
325    }
326}
327
328impl<T: Change> Tracker<T> {
329    #[allow(clippy::too_many_arguments)]
330    fn match_pairs_of_kind(
331        &mut self,
332        kind: visit::SourceKind,
333        cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
334        percentage: Option<f32>,
335        out: &mut Outcome,
336        diff_cache: &mut crate::blob::Platform,
337        objects: &impl gix_object::FindObjectOrHeader,
338        filter: Option<fn(&T) -> bool>,
339    ) -> Result<(), emit::Error> {
340        // we try to cheaply reduce the set of possibilities first, before possibly looking more exhaustively.
341        let needs_second_pass = !needs_exact_match(percentage);
342
343        // https://github.com/git/git/blob/cc01bad4a9f566cf4453c7edd6b433851b0835e2/diffcore-rename.c#L350-L369
344        // We would need a hashmap to be OK to not use the limit here, otherwise the performance is too bad.
345        // This also means we don't find all renames if we hit the rename limit.
346        if self.match_pairs(cb, None /* by identity */, kind, out, diff_cache, objects, filter)? == Action::Cancel {
347            return Ok(());
348        }
349        if needs_second_pass {
350            let is_limited = if self.rewrites.limit == 0 {
351                false
352            } else {
353                let (num_src, num_dst) =
354                    estimate_involved_items(self.items.iter().map(|item| (item.emitted, item.change.kind())), kind);
355                let permutations = num_src * num_dst;
356                if permutations > self.rewrites.limit {
357                    match kind {
358                        visit::SourceKind::Rename => {
359                            out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit = permutations;
360                        }
361                        visit::SourceKind::Copy => {
362                            out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit = permutations;
363                        }
364                    }
365                    true
366                } else {
367                    false
368                }
369            };
370            if !is_limited {
371                self.match_pairs(cb, percentage, kind, out, diff_cache, objects, None)?;
372            }
373        }
374        Ok(())
375    }
376
377    #[allow(clippy::too_many_arguments)]
378    fn match_pairs(
379        &mut self,
380        cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
381        percentage: Option<f32>,
382        kind: visit::SourceKind,
383        stats: &mut Outcome,
384        diff_cache: &mut crate::blob::Platform,
385        objects: &impl gix_object::FindObjectOrHeader,
386        filter: Option<fn(&T) -> bool>,
387    ) -> Result<Action, emit::Error> {
388        let mut dest_ofs = 0;
389        let mut num_checks = 0;
390        let max_checks = {
391            let limit = self.rewrites.limit.saturating_pow(2);
392            // There can be trees with a lot of entries and pathological search behaviour, as they can be repeated
393            // and then have a lot of similar hashes. This also means we have to search a lot of candidates which
394            // can be too slow despite best attempts. So play it save and detect such cases 'roughly' by amount of items.
395            if self.items.len() < 100_000 {
396                0
397            } else {
398                limit
399            }
400        };
401
402        while let Some((mut dest_idx, dest)) = self.items[dest_ofs..].iter().enumerate().find_map(|(idx, item)| {
403            (!item.emitted
404                && matches!(item.change.kind(), ChangeKind::Addition)
405                && filter.map_or_else(
406                    || {
407                        self.rewrites.track_empty
408                            // We always want to keep track of entries that are involved of a directory rename.
409                            // Note that this may still match them up arbitrarily if empty, but empty is empty.
410                            || matches!(item.change.relation(), Some(Relation::ChildOfParent(_)))
411                            || {
412                                let id = item.change.id();
413                                id != gix_hash::ObjectId::empty_blob(id.kind())
414                            }
415                    },
416                    |f| f(&item.change),
417                ))
418            .then_some((idx, item))
419        }) {
420            dest_idx += dest_ofs;
421            dest_ofs = dest_idx + 1;
422            self.items[dest_idx].location(&self.path_backing);
423            let src = find_match(
424                &self.items,
425                dest,
426                dest_idx,
427                percentage,
428                kind,
429                stats,
430                objects,
431                diff_cache,
432                &self.path_backing,
433                &mut num_checks,
434            )?
435            .map(|(src_idx, src, diff)| {
436                let (id, entry_mode) = src.change.id_and_entry_mode();
437                let id = id.to_owned();
438                let location = src.location(&self.path_backing);
439                (
440                    visit::Source {
441                        entry_mode,
442                        id,
443                        kind,
444                        location,
445                        change: &src.change,
446                        diff,
447                    },
448                    src_idx,
449                )
450            });
451            if max_checks != 0 && num_checks > max_checks {
452                gix_trace::warn!(
453                    "Cancelled rename matching as there were too many iterations ({num_checks} > {max_checks})"
454                );
455                return Ok(Action::Cancel);
456            }
457            let Some((src, src_idx)) = src else {
458                continue;
459            };
460            let location = dest.location(&self.path_backing);
461            let change = dest.change.clone();
462            let dest = visit::Destination { change, location };
463            let relations = if percentage.is_none() {
464                src.change.relation().zip(dest.change.relation())
465            } else {
466                None
467            };
468            let res = cb(dest, Some(src));
469
470            self.items[dest_idx].emitted = true;
471            self.items[src_idx].emitted = true;
472
473            if res == Action::Cancel {
474                return Ok(Action::Cancel);
475            }
476
477            match relations {
478                Some((Relation::Parent(src), Relation::Parent(dst))) => {
479                    let res = self.emit_child_renames_matching_identity(cb, kind, src, dst)?;
480                    if res == Action::Cancel {
481                        return Ok(Action::Cancel);
482                    }
483                }
484                Some((Relation::ChildOfParent(src), Relation::ChildOfParent(dst))) => {
485                    self.child_renames.insert((src, dst));
486                }
487                _ => {}
488            }
489        }
490        Ok(Action::Continue)
491    }
492
493    /// Emit the children of `src_parent_id` and `dst_parent_id` as pairs of exact matches, which are assumed
494    /// as `src` and `dst` were an exact match (so all children have to match exactly).
495    /// Note that we intentionally do not record them as their parents will be emitted, too.
496    fn emit_child_renames_matching_identity(
497        &mut self,
498        cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
499        kind: visit::SourceKind,
500        src_parent_id: ChangeId,
501        dst_parent_id: ChangeId,
502    ) -> Result<Action, emit::Error> {
503        debug_assert_ne!(
504            src_parent_id, dst_parent_id,
505            "src and destination directories must be distinct"
506        );
507        let (mut src_items, mut dst_items) = (Vec::with_capacity(1), Vec::with_capacity(1));
508        for item in self.items.iter_mut().filter(|item| !item.emitted) {
509            match item.change.relation() {
510                Some(Relation::ChildOfParent(id)) if id == src_parent_id => {
511                    src_items.push((item.change.id().to_owned(), item));
512                }
513                Some(Relation::ChildOfParent(id)) if id == dst_parent_id => {
514                    dst_items.push((item.change.id().to_owned(), item));
515                }
516                _ => continue,
517            };
518        }
519
520        for ((src_id, src_item), (dst_id, dst_item)) in src_items.into_iter().zip(dst_items) {
521            // Since the parent items are already identical by ID, we know that the children will also match, we just
522            // double-check to still have a chance to be correct in case some of that goes wrong.
523            if src_id == dst_id
524                && filename(src_item.location(&self.path_backing)) == filename(dst_item.location(&self.path_backing))
525            {
526                let entry_mode = src_item.change.entry_mode();
527                let location = src_item.location(&self.path_backing);
528                let src = visit::Source {
529                    entry_mode,
530                    id: src_id,
531                    kind,
532                    location,
533                    change: &src_item.change,
534                    diff: None,
535                };
536                let location = dst_item.location(&self.path_backing);
537                let change = dst_item.change.clone();
538                let dst = visit::Destination { change, location };
539                let res = cb(dst, Some(src));
540
541                src_item.emitted = true;
542                dst_item.emitted = true;
543
544                if res == Action::Cancel {
545                    return Ok(res);
546                }
547            } else {
548                gix_trace::warn!("Children of parents with change-id {src_parent_id} and {dst_parent_id} were not equal, even though their parents claimed to be");
549                break;
550            }
551        }
552        Ok(Action::Continue)
553    }
554
555    /// Find directories with relation id that haven't been emitted yet and store them for lookup.
556    /// Then use the previously stored emitted renames with relation id to learn which directories they 'link'
557    /// and emit them, too.
558    /// Note that this works whenever top-level directories are renamed because they are always added and deleted,
559    /// and we only match those. Thus, one rewrite inside the directory is enough.
560    fn match_renamed_directories(
561        &mut self,
562        cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
563    ) -> Result<(), emit::Error> {
564        fn unemitted_directory_matching_relation_id<T: Change>(items: &[Item<T>], child_id: ChangeId) -> Option<usize> {
565            items.iter().position(|i| {
566                !i.emitted && matches!(i.change.relation(), Some(Relation::Parent(pid)) if pid == child_id)
567            })
568        }
569        for (deleted_child_id, added_child_id) in &self.child_renames {
570            let Some(src_idx) = unemitted_directory_matching_relation_id(&self.items, *deleted_child_id) else {
571                continue;
572            };
573            let Some(dst_idx) = unemitted_directory_matching_relation_id(&self.items, *added_child_id) else {
574                // This could go wrong in case there are mismatches, so be defensive here.
575                // But generally, we'd expect the destination item to exist.
576                continue;
577            };
578
579            let (src_item, dst_item) = (&self.items[src_idx], &self.items[dst_idx]);
580            let entry_mode = src_item.change.entry_mode();
581            let location = src_item.location(&self.path_backing);
582            let src = visit::Source {
583                entry_mode,
584                id: src_item.change.id().to_owned(),
585                kind: SourceKind::Rename,
586                location,
587                change: &src_item.change,
588                diff: None,
589            };
590            let location = dst_item.location(&self.path_backing);
591            let change = dst_item.change.clone();
592            let dst = visit::Destination { change, location };
593            let res = cb(dst, Some(src));
594
595            self.items[src_idx].emitted = true;
596            self.items[dst_idx].emitted = true;
597
598            if res == Action::Cancel {
599                return Ok(());
600            }
601        }
602        Ok(())
603    }
604}
605
606fn filename(path: &BStr) -> &BStr {
607    path.rfind_byte(b'/').map_or(path, |idx| path[idx + 1..].as_bstr())
608}
609
610/// Returns the amount of viable sources and destinations for `items` as eligible for the given `kind` of operation.
611fn estimate_involved_items(
612    items: impl IntoIterator<Item = (bool, ChangeKind)>,
613    kind: visit::SourceKind,
614) -> (usize, usize) {
615    items
616        .into_iter()
617        .filter(|(emitted, _)| match kind {
618            visit::SourceKind::Rename => !*emitted,
619            visit::SourceKind::Copy => true,
620        })
621        .fold((0, 0), |(mut src, mut dest), (emitted, change_kind)| {
622            match change_kind {
623                ChangeKind::Addition => {
624                    if kind == visit::SourceKind::Rename || !emitted {
625                        dest += 1;
626                    }
627                }
628                ChangeKind::Deletion => {
629                    if kind == visit::SourceKind::Rename {
630                        src += 1;
631                    }
632                }
633                ChangeKind::Modification => {
634                    if kind == visit::SourceKind::Copy {
635                        src += 1;
636                    }
637                }
638            }
639            (src, dest)
640        })
641}
642
643fn needs_exact_match(percentage: Option<f32>) -> bool {
644    percentage.map_or(true, |p| p >= 1.0)
645}
646
647/// <`src_idx`, src, possibly diff stat>
648type SourceTuple<'a, T> = (usize, &'a Item<T>, Option<DiffLineStats>);
649
650/// Find `item` in our set of items ignoring `item_idx` to avoid finding ourselves, by similarity indicated by `percentage`.
651/// The latter can be `None` or `Some(x)` where `x>=1` for identity, and anything else for similarity.
652/// We also ignore emitted items entirely.
653/// Use `kind` to indicate what kind of match we are looking for, which might be deletions matching an `item` addition, or
654/// any non-deletion otherwise.
655/// Note that we always try to find by identity first even if a percentage is given as it's much faster and may reduce the set
656/// of items to be searched.
657#[allow(clippy::too_many_arguments)]
658fn find_match<'a, T: Change>(
659    items: &'a [Item<T>],
660    item: &Item<T>,
661    item_idx: usize,
662    percentage: Option<f32>,
663    kind: visit::SourceKind,
664    stats: &mut Outcome,
665    objects: &impl gix_object::FindObjectOrHeader,
666    diff_cache: &mut crate::blob::Platform,
667    path_backing: &[u8],
668    num_checks: &mut usize,
669) -> Result<Option<SourceTuple<'a, T>>, emit::Error> {
670    let (item_id, item_mode) = item.change.id_and_entry_mode();
671    if needs_exact_match(percentage) || item_mode.is_link() {
672        let first_idx = items.partition_point(|a| a.change.id() < item_id);
673        let range = items.get(first_idx..).map(|slice| {
674            let end = slice
675                .iter()
676                .position(|a| a.change.id() != item_id)
677                .map_or(items.len(), |idx| first_idx + idx);
678            first_idx..end
679        });
680        let range = match range {
681            Some(range) => range,
682            None => return Ok(None),
683        };
684        if range.is_empty() {
685            return Ok(None);
686        }
687        let res = items[range.clone()].iter().enumerate().find_map(|(mut src_idx, src)| {
688            src_idx += range.start;
689            *num_checks += 1;
690            (src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)).then_some((src_idx, src, None))
691        });
692        if let Some(src) = res {
693            return Ok(Some(src));
694        }
695    } else if item_mode.is_blob() {
696        let mut has_new = false;
697        let percentage = percentage.expect("it's set to something below 1.0 and we assured this");
698
699        for (can_idx, src) in items
700            .iter()
701            .enumerate()
702            .filter(|(src_idx, src)| *src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode))
703        {
704            if !has_new {
705                diff_cache.set_resource(
706                    item_id.to_owned(),
707                    item_mode.kind(),
708                    item.location(path_backing),
709                    ResourceKind::NewOrDestination,
710                    objects,
711                )?;
712                has_new = true;
713            }
714            let (src_id, src_mode) = src.change.id_and_entry_mode();
715            diff_cache.set_resource(
716                src_id.to_owned(),
717                src_mode.kind(),
718                src.location(path_backing),
719                ResourceKind::OldOrSource,
720                objects,
721            )?;
722            let prep = diff_cache.prepare_diff()?;
723            stats.num_similarity_checks += 1;
724            *num_checks += 1;
725            match prep.operation {
726                Operation::InternalDiff { algorithm } => {
727                    let tokens =
728                        crate::blob::intern::InternedInput::new(prep.old.intern_source(), prep.new.intern_source());
729                    let counts = crate::blob::diff(
730                        algorithm,
731                        &tokens,
732                        crate::blob::sink::Counter::new(diff::Statistics {
733                            removed_bytes: 0,
734                            input: &tokens,
735                        }),
736                    );
737                    let old_data_len = prep.old.data.as_slice().unwrap_or_default().len();
738                    let new_data_len = prep.new.data.as_slice().unwrap_or_default().len();
739                    let similarity = (old_data_len - counts.wrapped) as f32 / old_data_len.max(new_data_len) as f32;
740                    if similarity >= percentage {
741                        return Ok(Some((
742                            can_idx,
743                            src,
744                            DiffLineStats {
745                                removals: counts.removals,
746                                insertions: counts.insertions,
747                                before: tokens.before.len().try_into().expect("interner handles only u32"),
748                                after: tokens.after.len().try_into().expect("interner handles only u32"),
749                                similarity,
750                            }
751                            .into(),
752                        )));
753                    }
754                }
755                Operation::ExternalCommand { .. } => {
756                    unreachable!("we have disabled this possibility with an option")
757                }
758                Operation::SourceOrDestinationIsBinary => {
759                    // TODO: figure out if git does more here
760                }
761            };
762        }
763    }
764    Ok(None)
765}
766
767mod diff {
768    use std::ops::Range;
769
770    pub struct Statistics<'a, 'data> {
771        pub removed_bytes: usize,
772        pub input: &'a crate::blob::intern::InternedInput<&'data [u8]>,
773    }
774
775    impl crate::blob::Sink for Statistics<'_, '_> {
776        type Out = usize;
777
778        fn process_change(&mut self, before: Range<u32>, _after: Range<u32>) {
779            self.removed_bytes += self.input.before[before.start as usize..before.end as usize]
780                .iter()
781                .map(|token| self.input.interner[*token].len())
782                .sum::<usize>();
783        }
784
785        fn finish(self) -> Self::Out {
786            self.removed_bytes
787        }
788    }
789}
790
791#[cfg(test)]
792mod estimate_involved_items {
793    use super::estimate_involved_items;
794    use crate::rewrites::tracker::{visit::SourceKind, ChangeKind};
795
796    #[test]
797    fn renames_count_unemitted_as_sources_and_destinations() {
798        let items = [
799            (false, ChangeKind::Addition),
800            (true, ChangeKind::Deletion),
801            (true, ChangeKind::Deletion),
802        ];
803        assert_eq!(
804            estimate_involved_items(items, SourceKind::Rename),
805            (0, 1),
806            "here we only have one eligible source, hence nothing to do"
807        );
808        assert_eq!(
809            estimate_involved_items(items.into_iter().map(|t| (false, t.1)), SourceKind::Rename),
810            (2, 1),
811            "now we have more possibilities as renames count un-emitted deletions as source"
812        );
813    }
814
815    #[test]
816    fn copies_do_not_count_additions_as_sources() {
817        let items = [
818            (false, ChangeKind::Addition),
819            (true, ChangeKind::Addition),
820            (true, ChangeKind::Deletion),
821        ];
822        assert_eq!(
823            estimate_involved_items(items, SourceKind::Copy),
824            (0, 1),
825            "one addition as source, the other isn't counted as it's emitted, nor is it considered a copy-source.\
826            deletions don't count"
827        );
828    }
829
830    #[test]
831    fn copies_count_modifications_as_sources() {
832        let items = [
833            (false, ChangeKind::Addition),
834            (true, ChangeKind::Modification),
835            (false, ChangeKind::Modification),
836        ];
837        assert_eq!(
838            estimate_involved_items(items, SourceKind::Copy),
839            (2, 1),
840            "any modifications is a valid source, emitted or not"
841        );
842    }
843}