1use std::ops::Range;
9
10use bstr::{BStr, ByteSlice};
11use gix_object::tree::{EntryKind, EntryMode};
12
13use crate::{
14 blob::{platform::prepare_diff::Operation, DiffLineStats, ResourceKind},
15 rewrites::{tracker::visit::SourceKind, CopySource, Outcome, Tracker},
16 tree::visit::{Action, ChangeId, Relation},
17 Rewrites,
18};
19
20#[derive(Debug, Copy, Clone, Ord, PartialOrd, PartialEq, Eq)]
22pub enum ChangeKind {
23 Deletion,
25 Modification,
27 Addition,
29}
30
31pub trait Change: Clone {
33 fn id(&self) -> &gix_hash::oid;
38 fn relation(&self) -> Option<Relation>;
49 fn kind(&self) -> ChangeKind;
51 fn entry_mode(&self) -> EntryMode;
53 fn id_and_entry_mode(&self) -> (&gix_hash::oid, EntryMode);
55}
56
57pub(crate) struct Item<T> {
59 change: T,
61 path: Range<usize>,
63 emitted: bool,
65}
66
67impl<T: Change> Item<T> {
68 fn location<'a>(&self, backing: &'a [u8]) -> &'a BStr {
69 backing[self.path.clone()].as_ref()
70 }
71 fn entry_mode_compatible(&self, other: EntryMode) -> bool {
72 use EntryKind::*;
73 matches!(
74 (other.kind(), self.change.entry_mode().kind()),
75 (Blob | BlobExecutable, Blob | BlobExecutable) | (Link, Link) | (Tree, Tree)
76 )
77 }
78
79 fn is_source_for_destination_of(&self, kind: visit::SourceKind, dest_item_mode: EntryMode) -> bool {
80 self.entry_mode_compatible(dest_item_mode)
81 && match kind {
82 visit::SourceKind::Rename => !self.emitted && matches!(self.change.kind(), ChangeKind::Deletion),
83 visit::SourceKind::Copy => {
84 matches!(self.change.kind(), ChangeKind::Modification)
85 }
86 }
87 }
88}
89
90pub mod visit {
92 use bstr::BStr;
93 use gix_object::tree::EntryMode;
94
95 use crate::blob::DiffLineStats;
96
97 #[derive(Debug, Clone, PartialEq, PartialOrd)]
99 pub struct Source<'a, T> {
100 pub entry_mode: EntryMode,
102 pub id: gix_hash::ObjectId,
104 pub kind: SourceKind,
106 pub location: &'a BStr,
108 pub change: &'a T,
110 pub diff: Option<DiffLineStats>,
112 }
113
114 #[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
116 pub enum SourceKind {
117 Rename,
119 Copy,
121 }
122
123 #[derive(Debug, Clone)]
125 pub struct Destination<'a, T: Clone> {
126 pub change: T,
128 pub location: &'a BStr,
130 }
131}
132
133pub mod emit {
135 #[derive(Debug, thiserror::Error)]
137 #[allow(missing_docs)]
138 pub enum Error {
139 #[error("Could not find blob for similarity checking")]
140 FindExistingBlob(#[from] gix_object::find::existing_object::Error),
141 #[error("Could not obtain exhaustive item set to use as possible sources for copy detection")]
142 GetItemsForExhaustiveCopyDetection(#[source] Box<dyn std::error::Error + Send + Sync>),
143 #[error(transparent)]
144 SetResource(#[from] crate::blob::platform::set_resource::Error),
145 #[error(transparent)]
146 PrepareDiff(#[from] crate::blob::platform::prepare_diff::Error),
147 }
148}
149
150impl<T: Change> Tracker<T> {
152 pub fn new(rewrites: Rewrites) -> Self {
154 Tracker {
155 items: vec![],
156 path_backing: vec![],
157 rewrites,
158 child_renames: Default::default(),
159 }
160 }
161}
162
163impl<T: Change> Tracker<T> {
165 pub fn try_push_change(&mut self, change: T, location: &BStr) -> Option<T> {
167 let change_kind = change.kind();
168 if let (None, ChangeKind::Modification) = (self.rewrites.copies, change_kind) {
169 return Some(change);
170 }
171
172 let entry_kind = change.entry_mode().kind();
173 if entry_kind == EntryKind::Commit {
174 return Some(change);
175 }
176 let relation = change
177 .relation()
178 .filter(|_| matches!(change_kind, ChangeKind::Addition | ChangeKind::Deletion));
179 if let (None, EntryKind::Tree) = (relation, entry_kind) {
180 return Some(change);
181 }
182
183 let start = self.path_backing.len();
184 self.path_backing.extend_from_slice(location);
185 let path = start..self.path_backing.len();
186
187 self.items.push(Item {
188 path,
189 change,
190 emitted: false,
191 });
192 None
193 }
194
195 pub fn emit<PushSourceTreeFn, E>(
218 &mut self,
219 mut cb: impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
220 diff_cache: &mut crate::blob::Platform,
221 objects: &impl gix_object::FindObjectOrHeader,
222 mut push_source_tree: PushSourceTreeFn,
223 ) -> Result<Outcome, emit::Error>
224 where
225 PushSourceTreeFn: FnMut(&mut dyn FnMut(T, &BStr)) -> Result<(), E>,
226 E: std::error::Error + Send + Sync + 'static,
227 {
228 fn is_parent(change: &impl Change) -> bool {
229 matches!(change.relation(), Some(Relation::Parent(_)))
230 }
231 diff_cache.options.skip_internal_diff_if_external_is_configured = false;
232
233 fn by_id_and_location<T: Change>(a: &Item<T>, b: &Item<T>) -> std::cmp::Ordering {
236 a.change
237 .id()
238 .cmp(b.change.id())
239 .then_with(|| a.path.start.cmp(&b.path.start).then(a.path.end.cmp(&b.path.end)))
240 }
241
242 let mut out = Outcome {
243 options: self.rewrites,
244 ..Default::default()
245 };
246 self.items.sort_by(by_id_and_location);
247
248 self.match_pairs_of_kind(
252 visit::SourceKind::Rename,
253 &mut cb,
254 None, &mut out,
256 diff_cache,
257 objects,
258 Some(is_parent),
259 )?;
260
261 self.match_pairs_of_kind(
262 visit::SourceKind::Rename,
263 &mut cb,
264 self.rewrites.percentage,
265 &mut out,
266 diff_cache,
267 objects,
268 None,
269 )?;
270
271 self.match_renamed_directories(&mut cb)?;
272
273 if let Some(copies) = self.rewrites.copies {
274 self.match_pairs_of_kind(
275 visit::SourceKind::Copy,
276 &mut cb,
277 copies.percentage,
278 &mut out,
279 diff_cache,
280 objects,
281 None,
282 )?;
283
284 match copies.source {
285 CopySource::FromSetOfModifiedFiles => {}
286 CopySource::FromSetOfModifiedFilesAndAllSources => {
287 push_source_tree(&mut |change, location| {
288 if self.try_push_change(change, location).is_none() {
289 self.items.last_mut().expect("just pushed").emitted = true;
291 }
292 })
293 .map_err(|err| emit::Error::GetItemsForExhaustiveCopyDetection(Box::new(err)))?;
294 self.items.sort_by(by_id_and_location);
295
296 self.match_pairs_of_kind(
297 visit::SourceKind::Copy,
298 &mut cb,
299 copies.percentage,
300 &mut out,
301 diff_cache,
302 objects,
303 None,
304 )?;
305 }
306 }
307 }
308
309 self.items
310 .sort_by(|a, b| a.location(&self.path_backing).cmp(b.location(&self.path_backing)));
311 for item in self.items.drain(..).filter(|item| !item.emitted) {
312 if cb(
313 visit::Destination {
314 location: item.location(&self.path_backing),
315 change: item.change,
316 },
317 None,
318 ) == Action::Cancel
319 {
320 break;
321 }
322 }
323 Ok(out)
324 }
325}
326
327impl<T: Change> Tracker<T> {
328 #[allow(clippy::too_many_arguments)]
329 fn match_pairs_of_kind(
330 &mut self,
331 kind: visit::SourceKind,
332 cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
333 percentage: Option<f32>,
334 out: &mut Outcome,
335 diff_cache: &mut crate::blob::Platform,
336 objects: &impl gix_object::FindObjectOrHeader,
337 filter: Option<fn(&T) -> bool>,
338 ) -> Result<(), emit::Error> {
339 let needs_second_pass = !needs_exact_match(percentage);
341
342 if self.match_pairs(cb, None , kind, out, diff_cache, objects, filter)? == Action::Cancel {
346 return Ok(());
347 }
348 if needs_second_pass {
349 let is_limited = if self.rewrites.limit == 0 {
350 false
351 } else {
352 let (num_src, num_dst) =
353 estimate_involved_items(self.items.iter().map(|item| (item.emitted, item.change.kind())), kind);
354 let permutations = num_src * num_dst;
355 if permutations > self.rewrites.limit {
356 match kind {
357 visit::SourceKind::Rename => {
358 out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit = permutations;
359 }
360 visit::SourceKind::Copy => {
361 out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit = permutations;
362 }
363 }
364 true
365 } else {
366 false
367 }
368 };
369 if !is_limited {
370 self.match_pairs(cb, percentage, kind, out, diff_cache, objects, None)?;
371 }
372 }
373 Ok(())
374 }
375
376 #[allow(clippy::too_many_arguments)]
377 fn match_pairs(
378 &mut self,
379 cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
380 percentage: Option<f32>,
381 kind: visit::SourceKind,
382 stats: &mut Outcome,
383 diff_cache: &mut crate::blob::Platform,
384 objects: &impl gix_object::FindObjectOrHeader,
385 filter: Option<fn(&T) -> bool>,
386 ) -> Result<Action, emit::Error> {
387 let mut dest_ofs = 0;
388 let mut num_checks = 0;
389 let max_checks = {
390 let limit = self.rewrites.limit.saturating_pow(2);
391 if self.items.len() < 100_000 {
395 0
396 } else {
397 limit
398 }
399 };
400
401 while let Some((mut dest_idx, dest)) = self.items[dest_ofs..].iter().enumerate().find_map(|(idx, item)| {
402 (!item.emitted
403 && matches!(item.change.kind(), ChangeKind::Addition)
404 && filter.map_or_else(
405 || {
406 self.rewrites.track_empty
407 || matches!(item.change.relation(), Some(Relation::ChildOfParent(_)))
410 || {
411 let id = item.change.id();
412 id != gix_hash::ObjectId::empty_blob(id.kind())
413 }
414 },
415 |f| f(&item.change),
416 ))
417 .then_some((idx, item))
418 }) {
419 dest_idx += dest_ofs;
420 dest_ofs = dest_idx + 1;
421 self.items[dest_idx].location(&self.path_backing);
422 let src = find_match(
423 &self.items,
424 dest,
425 dest_idx,
426 percentage,
427 kind,
428 stats,
429 objects,
430 diff_cache,
431 &self.path_backing,
432 &mut num_checks,
433 )?
434 .map(|(src_idx, src, diff)| {
435 let (id, entry_mode) = src.change.id_and_entry_mode();
436 let id = id.to_owned();
437 let location = src.location(&self.path_backing);
438 (
439 visit::Source {
440 entry_mode,
441 id,
442 kind,
443 location,
444 change: &src.change,
445 diff,
446 },
447 src_idx,
448 )
449 });
450 if max_checks != 0 && num_checks > max_checks {
451 gix_trace::warn!(
452 "Cancelled rename matching as there were too many iterations ({num_checks} > {max_checks})"
453 );
454 return Ok(Action::Cancel);
455 }
456 let Some((src, src_idx)) = src else {
457 continue;
458 };
459 let location = dest.location(&self.path_backing);
460 let change = dest.change.clone();
461 let dest = visit::Destination { change, location };
462 let relations = if percentage.is_none() {
463 src.change.relation().zip(dest.change.relation())
464 } else {
465 None
466 };
467 let res = cb(dest, Some(src));
468
469 self.items[dest_idx].emitted = true;
470 self.items[src_idx].emitted = true;
471
472 if res == Action::Cancel {
473 return Ok(Action::Cancel);
474 }
475
476 match relations {
477 Some((Relation::Parent(src), Relation::Parent(dst))) => {
478 let res = self.emit_child_renames_matching_identity(cb, kind, src, dst)?;
479 if res == Action::Cancel {
480 return Ok(Action::Cancel);
481 }
482 }
483 Some((Relation::ChildOfParent(src), Relation::ChildOfParent(dst))) => {
484 self.child_renames.insert((src, dst));
485 }
486 _ => {}
487 }
488 }
489 Ok(Action::Continue)
490 }
491
492 fn emit_child_renames_matching_identity(
496 &mut self,
497 cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
498 kind: visit::SourceKind,
499 src_parent_id: ChangeId,
500 dst_parent_id: ChangeId,
501 ) -> Result<Action, emit::Error> {
502 debug_assert_ne!(
503 src_parent_id, dst_parent_id,
504 "src and destination directories must be distinct"
505 );
506 let (mut src_items, mut dst_items) = (Vec::with_capacity(1), Vec::with_capacity(1));
507 for item in self.items.iter_mut().filter(|item| !item.emitted) {
508 match item.change.relation() {
509 Some(Relation::ChildOfParent(id)) if id == src_parent_id => {
510 src_items.push((item.change.id().to_owned(), item));
511 }
512 Some(Relation::ChildOfParent(id)) if id == dst_parent_id => {
513 dst_items.push((item.change.id().to_owned(), item));
514 }
515 _ => continue,
516 }
517 }
518
519 for ((src_id, src_item), (dst_id, dst_item)) in src_items.into_iter().zip(dst_items) {
520 if src_id == dst_id
523 && filename(src_item.location(&self.path_backing)) == filename(dst_item.location(&self.path_backing))
524 {
525 let entry_mode = src_item.change.entry_mode();
526 let location = src_item.location(&self.path_backing);
527 let src = visit::Source {
528 entry_mode,
529 id: src_id,
530 kind,
531 location,
532 change: &src_item.change,
533 diff: None,
534 };
535 let location = dst_item.location(&self.path_backing);
536 let change = dst_item.change.clone();
537 let dst = visit::Destination { change, location };
538 let res = cb(dst, Some(src));
539
540 src_item.emitted = true;
541 dst_item.emitted = true;
542
543 if res == Action::Cancel {
544 return Ok(res);
545 }
546 } else {
547 gix_trace::warn!("Children of parents with change-id {src_parent_id} and {dst_parent_id} were not equal, even though their parents claimed to be");
548 break;
549 }
550 }
551 Ok(Action::Continue)
552 }
553
554 fn match_renamed_directories(
560 &mut self,
561 cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
562 ) -> Result<(), emit::Error> {
563 fn unemitted_directory_matching_relation_id<T: Change>(items: &[Item<T>], child_id: ChangeId) -> Option<usize> {
564 items.iter().position(|i| {
565 !i.emitted && matches!(i.change.relation(), Some(Relation::Parent(pid)) if pid == child_id)
566 })
567 }
568 for (deleted_child_id, added_child_id) in &self.child_renames {
569 let Some(src_idx) = unemitted_directory_matching_relation_id(&self.items, *deleted_child_id) else {
570 continue;
571 };
572 let Some(dst_idx) = unemitted_directory_matching_relation_id(&self.items, *added_child_id) else {
573 continue;
576 };
577
578 let (src_item, dst_item) = (&self.items[src_idx], &self.items[dst_idx]);
579 let entry_mode = src_item.change.entry_mode();
580 let location = src_item.location(&self.path_backing);
581 let src = visit::Source {
582 entry_mode,
583 id: src_item.change.id().to_owned(),
584 kind: SourceKind::Rename,
585 location,
586 change: &src_item.change,
587 diff: None,
588 };
589 let location = dst_item.location(&self.path_backing);
590 let change = dst_item.change.clone();
591 let dst = visit::Destination { change, location };
592 let res = cb(dst, Some(src));
593
594 self.items[src_idx].emitted = true;
595 self.items[dst_idx].emitted = true;
596
597 if res == Action::Cancel {
598 return Ok(());
599 }
600 }
601 Ok(())
602 }
603}
604
605fn filename(path: &BStr) -> &BStr {
606 path.rfind_byte(b'/').map_or(path, |idx| path[idx + 1..].as_bstr())
607}
608
609fn estimate_involved_items(
611 items: impl IntoIterator<Item = (bool, ChangeKind)>,
612 kind: visit::SourceKind,
613) -> (usize, usize) {
614 items
615 .into_iter()
616 .filter(|(emitted, _)| match kind {
617 visit::SourceKind::Rename => !*emitted,
618 visit::SourceKind::Copy => true,
619 })
620 .fold((0, 0), |(mut src, mut dest), (emitted, change_kind)| {
621 match change_kind {
622 ChangeKind::Addition => {
623 if kind == visit::SourceKind::Rename || !emitted {
624 dest += 1;
625 }
626 }
627 ChangeKind::Deletion => {
628 if kind == visit::SourceKind::Rename {
629 src += 1;
630 }
631 }
632 ChangeKind::Modification => {
633 if kind == visit::SourceKind::Copy {
634 src += 1;
635 }
636 }
637 }
638 (src, dest)
639 })
640}
641
642fn needs_exact_match(percentage: Option<f32>) -> bool {
643 percentage.map_or(true, |p| p >= 1.0)
644}
645
646type SourceTuple<'a, T> = (usize, &'a Item<T>, Option<DiffLineStats>);
648
649#[allow(clippy::too_many_arguments)]
657fn find_match<'a, T: Change>(
658 items: &'a [Item<T>],
659 item: &Item<T>,
660 item_idx: usize,
661 percentage: Option<f32>,
662 kind: visit::SourceKind,
663 stats: &mut Outcome,
664 objects: &impl gix_object::FindObjectOrHeader,
665 diff_cache: &mut crate::blob::Platform,
666 path_backing: &[u8],
667 num_checks: &mut usize,
668) -> Result<Option<SourceTuple<'a, T>>, emit::Error> {
669 let (item_id, item_mode) = item.change.id_and_entry_mode();
670 if needs_exact_match(percentage) || item_mode.is_link() {
671 let first_idx = items.partition_point(|a| a.change.id() < item_id);
672 let range = items.get(first_idx..).map(|slice| {
673 let end = slice
674 .iter()
675 .position(|a| a.change.id() != item_id)
676 .map_or(items.len(), |idx| first_idx + idx);
677 first_idx..end
678 });
679 let range = match range {
680 Some(range) => range,
681 None => return Ok(None),
682 };
683 if range.is_empty() {
684 return Ok(None);
685 }
686 let res = items[range.clone()].iter().enumerate().find_map(|(mut src_idx, src)| {
687 src_idx += range.start;
688 *num_checks += 1;
689 (src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)).then_some((src_idx, src, None))
690 });
691 if let Some(src) = res {
692 return Ok(Some(src));
693 }
694 } else if item_mode.is_blob() {
695 let mut has_new = false;
696 let percentage = percentage.expect("it's set to something below 1.0 and we assured this");
697
698 for (can_idx, src) in items
699 .iter()
700 .enumerate()
701 .filter(|(src_idx, src)| *src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode))
702 {
703 if !has_new {
704 diff_cache.set_resource(
705 item_id.to_owned(),
706 item_mode.kind(),
707 item.location(path_backing),
708 ResourceKind::NewOrDestination,
709 objects,
710 )?;
711 has_new = true;
712 }
713 let (src_id, src_mode) = src.change.id_and_entry_mode();
714 diff_cache.set_resource(
715 src_id.to_owned(),
716 src_mode.kind(),
717 src.location(path_backing),
718 ResourceKind::OldOrSource,
719 objects,
720 )?;
721 let prep = diff_cache.prepare_diff()?;
722 stats.num_similarity_checks += 1;
723 *num_checks += 1;
724 match prep.operation {
725 Operation::InternalDiff { algorithm } => {
726 let tokens =
727 crate::blob::intern::InternedInput::new(prep.old.intern_source(), prep.new.intern_source());
728 let counts = crate::blob::diff(
729 algorithm,
730 &tokens,
731 crate::blob::sink::Counter::new(diff::Statistics {
732 removed_bytes: 0,
733 input: &tokens,
734 }),
735 );
736 let old_data_len = prep.old.data.as_slice().unwrap_or_default().len();
737 let new_data_len = prep.new.data.as_slice().unwrap_or_default().len();
738 let similarity = (old_data_len - counts.wrapped) as f32 / old_data_len.max(new_data_len) as f32;
739 if similarity >= percentage {
740 return Ok(Some((
741 can_idx,
742 src,
743 DiffLineStats {
744 removals: counts.removals,
745 insertions: counts.insertions,
746 before: tokens.before.len().try_into().expect("interner handles only u32"),
747 after: tokens.after.len().try_into().expect("interner handles only u32"),
748 similarity,
749 }
750 .into(),
751 )));
752 }
753 }
754 Operation::ExternalCommand { .. } => {
755 unreachable!("we have disabled this possibility with an option")
756 }
757 Operation::SourceOrDestinationIsBinary => {
758 }
760 }
761 }
762 }
763 Ok(None)
764}
765
766mod diff {
767 use std::ops::Range;
768
769 pub struct Statistics<'a, 'data> {
770 pub removed_bytes: usize,
771 pub input: &'a crate::blob::intern::InternedInput<&'data [u8]>,
772 }
773
774 impl crate::blob::Sink for Statistics<'_, '_> {
775 type Out = usize;
776
777 fn process_change(&mut self, before: Range<u32>, _after: Range<u32>) {
778 self.removed_bytes += self.input.before[before.start as usize..before.end as usize]
779 .iter()
780 .map(|token| self.input.interner[*token].len())
781 .sum::<usize>();
782 }
783
784 fn finish(self) -> Self::Out {
785 self.removed_bytes
786 }
787 }
788}
789
790#[cfg(test)]
791mod estimate_involved_items {
792 use super::estimate_involved_items;
793 use crate::rewrites::tracker::{visit::SourceKind, ChangeKind};
794
795 #[test]
796 fn renames_count_unemitted_as_sources_and_destinations() {
797 let items = [
798 (false, ChangeKind::Addition),
799 (true, ChangeKind::Deletion),
800 (true, ChangeKind::Deletion),
801 ];
802 assert_eq!(
803 estimate_involved_items(items, SourceKind::Rename),
804 (0, 1),
805 "here we only have one eligible source, hence nothing to do"
806 );
807 assert_eq!(
808 estimate_involved_items(items.into_iter().map(|t| (false, t.1)), SourceKind::Rename),
809 (2, 1),
810 "now we have more possibilities as renames count un-emitted deletions as source"
811 );
812 }
813
814 #[test]
815 fn copies_do_not_count_additions_as_sources() {
816 let items = [
817 (false, ChangeKind::Addition),
818 (true, ChangeKind::Addition),
819 (true, ChangeKind::Deletion),
820 ];
821 assert_eq!(
822 estimate_involved_items(items, SourceKind::Copy),
823 (0, 1),
824 "one addition as source, the other isn't counted as it's emitted, nor is it considered a copy-source.\
825 deletions don't count"
826 );
827 }
828
829 #[test]
830 fn copies_count_modifications_as_sources() {
831 let items = [
832 (false, ChangeKind::Addition),
833 (true, ChangeKind::Modification),
834 (false, ChangeKind::Modification),
835 ];
836 assert_eq!(
837 estimate_involved_items(items, SourceKind::Copy),
838 (2, 1),
839 "any modifications is a valid source, emitted or not"
840 );
841 }
842}