1use std::ops::Range;
9
10use bstr::{BStr, ByteSlice};
11use gix_object::tree::{EntryKind, EntryMode};
12
13use crate::rewrites::tracker::visit::SourceKind;
14use crate::tree::visit::{Action, ChangeId, Relation};
15use crate::{
16 blob::{platform::prepare_diff::Operation, DiffLineStats, ResourceKind},
17 rewrites::{CopySource, Outcome, Tracker},
18 Rewrites,
19};
20
21#[derive(Debug, Copy, Clone, Ord, PartialOrd, PartialEq, Eq)]
23pub enum ChangeKind {
24 Deletion,
26 Modification,
28 Addition,
30}
31
32pub trait Change: Clone {
34 fn id(&self) -> &gix_hash::oid;
39 fn relation(&self) -> Option<Relation>;
50 fn kind(&self) -> ChangeKind;
52 fn entry_mode(&self) -> EntryMode;
54 fn id_and_entry_mode(&self) -> (&gix_hash::oid, EntryMode);
56}
57
58pub(crate) struct Item<T> {
60 change: T,
62 path: Range<usize>,
64 emitted: bool,
66}
67
68impl<T: Change> Item<T> {
69 fn location<'a>(&self, backing: &'a [u8]) -> &'a BStr {
70 backing[self.path.clone()].as_ref()
71 }
72 fn entry_mode_compatible(&self, other: EntryMode) -> bool {
73 use EntryKind::*;
74 matches!(
75 (other.kind(), self.change.entry_mode().kind()),
76 (Blob | BlobExecutable, Blob | BlobExecutable) | (Link, Link) | (Tree, Tree)
77 )
78 }
79
80 fn is_source_for_destination_of(&self, kind: visit::SourceKind, dest_item_mode: EntryMode) -> bool {
81 self.entry_mode_compatible(dest_item_mode)
82 && match kind {
83 visit::SourceKind::Rename => !self.emitted && matches!(self.change.kind(), ChangeKind::Deletion),
84 visit::SourceKind::Copy => {
85 matches!(self.change.kind(), ChangeKind::Modification)
86 }
87 }
88 }
89}
90
91pub mod visit {
93 use bstr::BStr;
94 use gix_object::tree::EntryMode;
95
96 use crate::blob::DiffLineStats;
97
98 #[derive(Debug, Clone, PartialEq, PartialOrd)]
100 pub struct Source<'a, T> {
101 pub entry_mode: EntryMode,
103 pub id: gix_hash::ObjectId,
105 pub kind: SourceKind,
107 pub location: &'a BStr,
109 pub change: &'a T,
111 pub diff: Option<DiffLineStats>,
113 }
114
115 #[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
117 pub enum SourceKind {
118 Rename,
120 Copy,
122 }
123
124 #[derive(Debug, Clone)]
126 pub struct Destination<'a, T: Clone> {
127 pub change: T,
129 pub location: &'a BStr,
131 }
132}
133
134pub mod emit {
136 #[derive(Debug, thiserror::Error)]
138 #[allow(missing_docs)]
139 pub enum Error {
140 #[error("Could not find blob for similarity checking")]
141 FindExistingBlob(#[from] gix_object::find::existing_object::Error),
142 #[error("Could not obtain exhaustive item set to use as possible sources for copy detection")]
143 GetItemsForExhaustiveCopyDetection(#[source] Box<dyn std::error::Error + Send + Sync>),
144 #[error(transparent)]
145 SetResource(#[from] crate::blob::platform::set_resource::Error),
146 #[error(transparent)]
147 PrepareDiff(#[from] crate::blob::platform::prepare_diff::Error),
148 }
149}
150
151impl<T: Change> Tracker<T> {
153 pub fn new(rewrites: Rewrites) -> Self {
155 Tracker {
156 items: vec![],
157 path_backing: vec![],
158 rewrites,
159 child_renames: Default::default(),
160 }
161 }
162}
163
164impl<T: Change> Tracker<T> {
166 pub fn try_push_change(&mut self, change: T, location: &BStr) -> Option<T> {
168 let change_kind = change.kind();
169 if let (None, ChangeKind::Modification { .. }) = (self.rewrites.copies, change_kind) {
170 return Some(change);
171 };
172
173 let entry_kind = change.entry_mode().kind();
174 if entry_kind == EntryKind::Commit {
175 return Some(change);
176 }
177 let relation = change
178 .relation()
179 .filter(|_| matches!(change_kind, ChangeKind::Addition | ChangeKind::Deletion));
180 if let (None, EntryKind::Tree) = (relation, entry_kind) {
181 return Some(change);
182 };
183
184 let start = self.path_backing.len();
185 self.path_backing.extend_from_slice(location);
186 let path = start..self.path_backing.len();
187
188 self.items.push(Item {
189 path,
190 change,
191 emitted: false,
192 });
193 None
194 }
195
196 pub fn emit<PushSourceTreeFn, E>(
219 &mut self,
220 mut cb: impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
221 diff_cache: &mut crate::blob::Platform,
222 objects: &impl gix_object::FindObjectOrHeader,
223 mut push_source_tree: PushSourceTreeFn,
224 ) -> Result<Outcome, emit::Error>
225 where
226 PushSourceTreeFn: FnMut(&mut dyn FnMut(T, &BStr)) -> Result<(), E>,
227 E: std::error::Error + Send + Sync + 'static,
228 {
229 fn is_parent(change: &impl Change) -> bool {
230 matches!(change.relation(), Some(Relation::Parent(_)))
231 }
232 diff_cache.options.skip_internal_diff_if_external_is_configured = false;
233
234 fn by_id_and_location<T: Change>(a: &Item<T>, b: &Item<T>) -> std::cmp::Ordering {
237 a.change
238 .id()
239 .cmp(b.change.id())
240 .then_with(|| a.path.start.cmp(&b.path.start).then(a.path.end.cmp(&b.path.end)))
241 }
242
243 let mut out = Outcome {
244 options: self.rewrites,
245 ..Default::default()
246 };
247 self.items.sort_by(by_id_and_location);
248
249 self.match_pairs_of_kind(
253 visit::SourceKind::Rename,
254 &mut cb,
255 None, &mut out,
257 diff_cache,
258 objects,
259 Some(is_parent),
260 )?;
261
262 self.match_pairs_of_kind(
263 visit::SourceKind::Rename,
264 &mut cb,
265 self.rewrites.percentage,
266 &mut out,
267 diff_cache,
268 objects,
269 None,
270 )?;
271
272 self.match_renamed_directories(&mut cb)?;
273
274 if let Some(copies) = self.rewrites.copies {
275 self.match_pairs_of_kind(
276 visit::SourceKind::Copy,
277 &mut cb,
278 copies.percentage,
279 &mut out,
280 diff_cache,
281 objects,
282 None,
283 )?;
284
285 match copies.source {
286 CopySource::FromSetOfModifiedFiles => {}
287 CopySource::FromSetOfModifiedFilesAndAllSources => {
288 push_source_tree(&mut |change, location| {
289 if self.try_push_change(change, location).is_none() {
290 self.items.last_mut().expect("just pushed").emitted = true;
292 }
293 })
294 .map_err(|err| emit::Error::GetItemsForExhaustiveCopyDetection(Box::new(err)))?;
295 self.items.sort_by(by_id_and_location);
296
297 self.match_pairs_of_kind(
298 visit::SourceKind::Copy,
299 &mut cb,
300 copies.percentage,
301 &mut out,
302 diff_cache,
303 objects,
304 None,
305 )?;
306 }
307 }
308 }
309
310 self.items
311 .sort_by(|a, b| a.location(&self.path_backing).cmp(b.location(&self.path_backing)));
312 for item in self.items.drain(..).filter(|item| !item.emitted) {
313 if cb(
314 visit::Destination {
315 location: item.location(&self.path_backing),
316 change: item.change,
317 },
318 None,
319 ) == Action::Cancel
320 {
321 break;
322 }
323 }
324 Ok(out)
325 }
326}
327
328impl<T: Change> Tracker<T> {
329 #[allow(clippy::too_many_arguments)]
330 fn match_pairs_of_kind(
331 &mut self,
332 kind: visit::SourceKind,
333 cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
334 percentage: Option<f32>,
335 out: &mut Outcome,
336 diff_cache: &mut crate::blob::Platform,
337 objects: &impl gix_object::FindObjectOrHeader,
338 filter: Option<fn(&T) -> bool>,
339 ) -> Result<(), emit::Error> {
340 let needs_second_pass = !needs_exact_match(percentage);
342
343 if self.match_pairs(cb, None , kind, out, diff_cache, objects, filter)? == Action::Cancel {
347 return Ok(());
348 }
349 if needs_second_pass {
350 let is_limited = if self.rewrites.limit == 0 {
351 false
352 } else {
353 let (num_src, num_dst) =
354 estimate_involved_items(self.items.iter().map(|item| (item.emitted, item.change.kind())), kind);
355 let permutations = num_src * num_dst;
356 if permutations > self.rewrites.limit {
357 match kind {
358 visit::SourceKind::Rename => {
359 out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit = permutations;
360 }
361 visit::SourceKind::Copy => {
362 out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit = permutations;
363 }
364 }
365 true
366 } else {
367 false
368 }
369 };
370 if !is_limited {
371 self.match_pairs(cb, percentage, kind, out, diff_cache, objects, None)?;
372 }
373 }
374 Ok(())
375 }
376
377 #[allow(clippy::too_many_arguments)]
378 fn match_pairs(
379 &mut self,
380 cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
381 percentage: Option<f32>,
382 kind: visit::SourceKind,
383 stats: &mut Outcome,
384 diff_cache: &mut crate::blob::Platform,
385 objects: &impl gix_object::FindObjectOrHeader,
386 filter: Option<fn(&T) -> bool>,
387 ) -> Result<Action, emit::Error> {
388 let mut dest_ofs = 0;
389 let mut num_checks = 0;
390 let max_checks = {
391 let limit = self.rewrites.limit.saturating_pow(2);
392 if self.items.len() < 100_000 {
396 0
397 } else {
398 limit
399 }
400 };
401
402 while let Some((mut dest_idx, dest)) = self.items[dest_ofs..].iter().enumerate().find_map(|(idx, item)| {
403 (!item.emitted
404 && matches!(item.change.kind(), ChangeKind::Addition)
405 && filter.map_or_else(
406 || {
407 self.rewrites.track_empty
408 || matches!(item.change.relation(), Some(Relation::ChildOfParent(_)))
411 || {
412 let id = item.change.id();
413 id != gix_hash::ObjectId::empty_blob(id.kind())
414 }
415 },
416 |f| f(&item.change),
417 ))
418 .then_some((idx, item))
419 }) {
420 dest_idx += dest_ofs;
421 dest_ofs = dest_idx + 1;
422 self.items[dest_idx].location(&self.path_backing);
423 let src = find_match(
424 &self.items,
425 dest,
426 dest_idx,
427 percentage,
428 kind,
429 stats,
430 objects,
431 diff_cache,
432 &self.path_backing,
433 &mut num_checks,
434 )?
435 .map(|(src_idx, src, diff)| {
436 let (id, entry_mode) = src.change.id_and_entry_mode();
437 let id = id.to_owned();
438 let location = src.location(&self.path_backing);
439 (
440 visit::Source {
441 entry_mode,
442 id,
443 kind,
444 location,
445 change: &src.change,
446 diff,
447 },
448 src_idx,
449 )
450 });
451 if max_checks != 0 && num_checks > max_checks {
452 gix_trace::warn!(
453 "Cancelled rename matching as there were too many iterations ({num_checks} > {max_checks})"
454 );
455 return Ok(Action::Cancel);
456 }
457 let Some((src, src_idx)) = src else {
458 continue;
459 };
460 let location = dest.location(&self.path_backing);
461 let change = dest.change.clone();
462 let dest = visit::Destination { change, location };
463 let relations = if percentage.is_none() {
464 src.change.relation().zip(dest.change.relation())
465 } else {
466 None
467 };
468 let res = cb(dest, Some(src));
469
470 self.items[dest_idx].emitted = true;
471 self.items[src_idx].emitted = true;
472
473 if res == Action::Cancel {
474 return Ok(Action::Cancel);
475 }
476
477 match relations {
478 Some((Relation::Parent(src), Relation::Parent(dst))) => {
479 let res = self.emit_child_renames_matching_identity(cb, kind, src, dst)?;
480 if res == Action::Cancel {
481 return Ok(Action::Cancel);
482 }
483 }
484 Some((Relation::ChildOfParent(src), Relation::ChildOfParent(dst))) => {
485 self.child_renames.insert((src, dst));
486 }
487 _ => {}
488 }
489 }
490 Ok(Action::Continue)
491 }
492
493 fn emit_child_renames_matching_identity(
497 &mut self,
498 cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
499 kind: visit::SourceKind,
500 src_parent_id: ChangeId,
501 dst_parent_id: ChangeId,
502 ) -> Result<Action, emit::Error> {
503 debug_assert_ne!(
504 src_parent_id, dst_parent_id,
505 "src and destination directories must be distinct"
506 );
507 let (mut src_items, mut dst_items) = (Vec::with_capacity(1), Vec::with_capacity(1));
508 for item in self.items.iter_mut().filter(|item| !item.emitted) {
509 match item.change.relation() {
510 Some(Relation::ChildOfParent(id)) if id == src_parent_id => {
511 src_items.push((item.change.id().to_owned(), item));
512 }
513 Some(Relation::ChildOfParent(id)) if id == dst_parent_id => {
514 dst_items.push((item.change.id().to_owned(), item));
515 }
516 _ => continue,
517 };
518 }
519
520 for ((src_id, src_item), (dst_id, dst_item)) in src_items.into_iter().zip(dst_items) {
521 if src_id == dst_id
524 && filename(src_item.location(&self.path_backing)) == filename(dst_item.location(&self.path_backing))
525 {
526 let entry_mode = src_item.change.entry_mode();
527 let location = src_item.location(&self.path_backing);
528 let src = visit::Source {
529 entry_mode,
530 id: src_id,
531 kind,
532 location,
533 change: &src_item.change,
534 diff: None,
535 };
536 let location = dst_item.location(&self.path_backing);
537 let change = dst_item.change.clone();
538 let dst = visit::Destination { change, location };
539 let res = cb(dst, Some(src));
540
541 src_item.emitted = true;
542 dst_item.emitted = true;
543
544 if res == Action::Cancel {
545 return Ok(res);
546 }
547 } else {
548 gix_trace::warn!("Children of parents with change-id {src_parent_id} and {dst_parent_id} were not equal, even though their parents claimed to be");
549 break;
550 }
551 }
552 Ok(Action::Continue)
553 }
554
555 fn match_renamed_directories(
561 &mut self,
562 cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
563 ) -> Result<(), emit::Error> {
564 fn unemitted_directory_matching_relation_id<T: Change>(items: &[Item<T>], child_id: ChangeId) -> Option<usize> {
565 items.iter().position(|i| {
566 !i.emitted && matches!(i.change.relation(), Some(Relation::Parent(pid)) if pid == child_id)
567 })
568 }
569 for (deleted_child_id, added_child_id) in &self.child_renames {
570 let Some(src_idx) = unemitted_directory_matching_relation_id(&self.items, *deleted_child_id) else {
571 continue;
572 };
573 let Some(dst_idx) = unemitted_directory_matching_relation_id(&self.items, *added_child_id) else {
574 continue;
577 };
578
579 let (src_item, dst_item) = (&self.items[src_idx], &self.items[dst_idx]);
580 let entry_mode = src_item.change.entry_mode();
581 let location = src_item.location(&self.path_backing);
582 let src = visit::Source {
583 entry_mode,
584 id: src_item.change.id().to_owned(),
585 kind: SourceKind::Rename,
586 location,
587 change: &src_item.change,
588 diff: None,
589 };
590 let location = dst_item.location(&self.path_backing);
591 let change = dst_item.change.clone();
592 let dst = visit::Destination { change, location };
593 let res = cb(dst, Some(src));
594
595 self.items[src_idx].emitted = true;
596 self.items[dst_idx].emitted = true;
597
598 if res == Action::Cancel {
599 return Ok(());
600 }
601 }
602 Ok(())
603 }
604}
605
606fn filename(path: &BStr) -> &BStr {
607 path.rfind_byte(b'/').map_or(path, |idx| path[idx + 1..].as_bstr())
608}
609
610fn estimate_involved_items(
612 items: impl IntoIterator<Item = (bool, ChangeKind)>,
613 kind: visit::SourceKind,
614) -> (usize, usize) {
615 items
616 .into_iter()
617 .filter(|(emitted, _)| match kind {
618 visit::SourceKind::Rename => !*emitted,
619 visit::SourceKind::Copy => true,
620 })
621 .fold((0, 0), |(mut src, mut dest), (emitted, change_kind)| {
622 match change_kind {
623 ChangeKind::Addition => {
624 if kind == visit::SourceKind::Rename || !emitted {
625 dest += 1;
626 }
627 }
628 ChangeKind::Deletion => {
629 if kind == visit::SourceKind::Rename {
630 src += 1;
631 }
632 }
633 ChangeKind::Modification => {
634 if kind == visit::SourceKind::Copy {
635 src += 1;
636 }
637 }
638 }
639 (src, dest)
640 })
641}
642
643fn needs_exact_match(percentage: Option<f32>) -> bool {
644 percentage.map_or(true, |p| p >= 1.0)
645}
646
647type SourceTuple<'a, T> = (usize, &'a Item<T>, Option<DiffLineStats>);
649
650#[allow(clippy::too_many_arguments)]
658fn find_match<'a, T: Change>(
659 items: &'a [Item<T>],
660 item: &Item<T>,
661 item_idx: usize,
662 percentage: Option<f32>,
663 kind: visit::SourceKind,
664 stats: &mut Outcome,
665 objects: &impl gix_object::FindObjectOrHeader,
666 diff_cache: &mut crate::blob::Platform,
667 path_backing: &[u8],
668 num_checks: &mut usize,
669) -> Result<Option<SourceTuple<'a, T>>, emit::Error> {
670 let (item_id, item_mode) = item.change.id_and_entry_mode();
671 if needs_exact_match(percentage) || item_mode.is_link() {
672 let first_idx = items.partition_point(|a| a.change.id() < item_id);
673 let range = items.get(first_idx..).map(|slice| {
674 let end = slice
675 .iter()
676 .position(|a| a.change.id() != item_id)
677 .map_or(items.len(), |idx| first_idx + idx);
678 first_idx..end
679 });
680 let range = match range {
681 Some(range) => range,
682 None => return Ok(None),
683 };
684 if range.is_empty() {
685 return Ok(None);
686 }
687 let res = items[range.clone()].iter().enumerate().find_map(|(mut src_idx, src)| {
688 src_idx += range.start;
689 *num_checks += 1;
690 (src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)).then_some((src_idx, src, None))
691 });
692 if let Some(src) = res {
693 return Ok(Some(src));
694 }
695 } else if item_mode.is_blob() {
696 let mut has_new = false;
697 let percentage = percentage.expect("it's set to something below 1.0 and we assured this");
698
699 for (can_idx, src) in items
700 .iter()
701 .enumerate()
702 .filter(|(src_idx, src)| *src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode))
703 {
704 if !has_new {
705 diff_cache.set_resource(
706 item_id.to_owned(),
707 item_mode.kind(),
708 item.location(path_backing),
709 ResourceKind::NewOrDestination,
710 objects,
711 )?;
712 has_new = true;
713 }
714 let (src_id, src_mode) = src.change.id_and_entry_mode();
715 diff_cache.set_resource(
716 src_id.to_owned(),
717 src_mode.kind(),
718 src.location(path_backing),
719 ResourceKind::OldOrSource,
720 objects,
721 )?;
722 let prep = diff_cache.prepare_diff()?;
723 stats.num_similarity_checks += 1;
724 *num_checks += 1;
725 match prep.operation {
726 Operation::InternalDiff { algorithm } => {
727 let tokens =
728 crate::blob::intern::InternedInput::new(prep.old.intern_source(), prep.new.intern_source());
729 let counts = crate::blob::diff(
730 algorithm,
731 &tokens,
732 crate::blob::sink::Counter::new(diff::Statistics {
733 removed_bytes: 0,
734 input: &tokens,
735 }),
736 );
737 let old_data_len = prep.old.data.as_slice().unwrap_or_default().len();
738 let new_data_len = prep.new.data.as_slice().unwrap_or_default().len();
739 let similarity = (old_data_len - counts.wrapped) as f32 / old_data_len.max(new_data_len) as f32;
740 if similarity >= percentage {
741 return Ok(Some((
742 can_idx,
743 src,
744 DiffLineStats {
745 removals: counts.removals,
746 insertions: counts.insertions,
747 before: tokens.before.len().try_into().expect("interner handles only u32"),
748 after: tokens.after.len().try_into().expect("interner handles only u32"),
749 similarity,
750 }
751 .into(),
752 )));
753 }
754 }
755 Operation::ExternalCommand { .. } => {
756 unreachable!("we have disabled this possibility with an option")
757 }
758 Operation::SourceOrDestinationIsBinary => {
759 }
761 };
762 }
763 }
764 Ok(None)
765}
766
767mod diff {
768 use std::ops::Range;
769
770 pub struct Statistics<'a, 'data> {
771 pub removed_bytes: usize,
772 pub input: &'a crate::blob::intern::InternedInput<&'data [u8]>,
773 }
774
775 impl crate::blob::Sink for Statistics<'_, '_> {
776 type Out = usize;
777
778 fn process_change(&mut self, before: Range<u32>, _after: Range<u32>) {
779 self.removed_bytes += self.input.before[before.start as usize..before.end as usize]
780 .iter()
781 .map(|token| self.input.interner[*token].len())
782 .sum::<usize>();
783 }
784
785 fn finish(self) -> Self::Out {
786 self.removed_bytes
787 }
788 }
789}
790
791#[cfg(test)]
792mod estimate_involved_items {
793 use super::estimate_involved_items;
794 use crate::rewrites::tracker::{visit::SourceKind, ChangeKind};
795
796 #[test]
797 fn renames_count_unemitted_as_sources_and_destinations() {
798 let items = [
799 (false, ChangeKind::Addition),
800 (true, ChangeKind::Deletion),
801 (true, ChangeKind::Deletion),
802 ];
803 assert_eq!(
804 estimate_involved_items(items, SourceKind::Rename),
805 (0, 1),
806 "here we only have one eligible source, hence nothing to do"
807 );
808 assert_eq!(
809 estimate_involved_items(items.into_iter().map(|t| (false, t.1)), SourceKind::Rename),
810 (2, 1),
811 "now we have more possibilities as renames count un-emitted deletions as source"
812 );
813 }
814
815 #[test]
816 fn copies_do_not_count_additions_as_sources() {
817 let items = [
818 (false, ChangeKind::Addition),
819 (true, ChangeKind::Addition),
820 (true, ChangeKind::Deletion),
821 ];
822 assert_eq!(
823 estimate_involved_items(items, SourceKind::Copy),
824 (0, 1),
825 "one addition as source, the other isn't counted as it's emitted, nor is it considered a copy-source.\
826 deletions don't count"
827 );
828 }
829
830 #[test]
831 fn copies_count_modifications_as_sources() {
832 let items = [
833 (false, ChangeKind::Addition),
834 (true, ChangeKind::Modification),
835 (false, ChangeKind::Modification),
836 ];
837 assert_eq!(
838 estimate_involved_items(items, SourceKind::Copy),
839 (2, 1),
840 "any modifications is a valid source, emitted or not"
841 );
842 }
843}