gix_diff/rewrites/tracker.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806
//! ### Deviation
//!
//! Note that the algorithm implemented here is in many ways different from what `git` does.
//!
//! - it's less sophisticated and doesn't use any ranking of candidates. Instead, it picks the first possible match.
//! - the set used for copy-detection is probably smaller by default.
use std::ops::Range;
use bstr::{BStr, ByteSlice};
use gix_object::tree::{EntryKind, EntryMode};
use crate::rewrites::tracker::visit::SourceKind;
use crate::tree::visit::{Action, ChangeId, Relation};
use crate::{
blob::{platform::prepare_diff::Operation, DiffLineStats, ResourceKind},
rewrites::{CopySource, Outcome, Tracker},
Rewrites,
};
/// The kind of a change.
#[derive(Debug, Copy, Clone, Ord, PartialOrd, PartialEq, Eq)]
pub enum ChangeKind {
/// The change represents the *deletion* of an item.
Deletion,
/// The change represents the *modification* of an item.
Modification,
/// The change represents the *addition* of an item.
Addition,
}
/// A trait providing all functionality to abstract over the concept of a change, as seen by the [`Tracker`].
pub trait Change: Clone {
/// Return the hash of the object behind this change for identification.
///
/// Note that this is the id of the object as stored in `git`, i.e. it must have gone through workspace
/// conversions. What matters is that the IDs are comparable.
fn id(&self) -> &gix_hash::oid;
/// Return the relation that this change may have with other changes.
///
/// It allows to associate a directory with its children that are added or removed at the same moment.
/// Note that this is ignored for modifications.
///
/// If rename-tracking should always be on leaf-level, this should be set to `None` consistently.
/// Note that trees will never be looked up by their `id` as their children are assumed to be passed in
/// with the respective relationship.
///
/// Also note that the tracker only sees what's given to it, it will not lookup trees or match paths itself.
fn relation(&self) -> Option<Relation>;
/// Return the kind of this change.
fn kind(&self) -> ChangeKind;
/// Return more information about the kind of entry affected by this change.
fn entry_mode(&self) -> EntryMode;
/// Return the id of the change along with its mode.
fn id_and_entry_mode(&self) -> (&gix_hash::oid, EntryMode);
}
/// A set of tracked items allows to figure out their relations by figuring out their similarity.
pub(crate) struct Item<T> {
/// The underlying raw change
change: T,
/// That slice into the backing for paths.
path: Range<usize>,
/// If true, this item was already emitted, i.e. seen by the caller.
emitted: bool,
}
impl<T: Change> Item<T> {
fn location<'a>(&self, backing: &'a [u8]) -> &'a BStr {
backing[self.path.clone()].as_ref()
}
fn entry_mode_compatible(&self, other: EntryMode) -> bool {
use EntryKind::*;
matches!(
(other.kind(), self.change.entry_mode().kind()),
(Blob | BlobExecutable, Blob | BlobExecutable) | (Link, Link) | (Tree, Tree)
)
}
fn is_source_for_destination_of(&self, kind: visit::SourceKind, dest_item_mode: EntryMode) -> bool {
self.entry_mode_compatible(dest_item_mode)
&& match kind {
visit::SourceKind::Rename => !self.emitted && matches!(self.change.kind(), ChangeKind::Deletion),
visit::SourceKind::Copy => {
matches!(self.change.kind(), ChangeKind::Modification)
}
}
}
}
/// A module with types used in the user-callback in [Tracker::emit()](crate::rewrites::Tracker::emit()).
pub mod visit {
use bstr::BStr;
use gix_object::tree::EntryMode;
use crate::blob::DiffLineStats;
/// The source of a rewrite, rename or copy.
#[derive(Debug, Clone, PartialEq, PartialOrd)]
pub struct Source<'a, T> {
/// The kind of entry.
pub entry_mode: EntryMode,
/// The hash of the state of the source as seen in the object database.
pub id: gix_hash::ObjectId,
/// Further specify what kind of source this is.
pub kind: SourceKind,
/// The repository-relative location of this entry.
pub location: &'a BStr,
/// The change that was registered as source.
pub change: &'a T,
/// If this is a rewrite, indicate how many lines would need to change to turn this source into the destination.
pub diff: Option<DiffLineStats>,
}
/// Further identify the kind of [Source].
#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub enum SourceKind {
/// This is the source of an entry that was renamed, as `source` was renamed to `destination`.
Rename,
/// This is the source of a copy, as `source` was copied into `destination`.
Copy,
}
/// A change along with a location.
#[derive(Debug, Clone)]
pub struct Destination<'a, T: Clone> {
/// The change at the given `location`.
pub change: T,
/// The repository-relative location of this destination.
pub location: &'a BStr,
}
}
///
pub mod emit {
/// The error returned by [Tracker::emit()](super::Tracker::emit()).
#[derive(Debug, thiserror::Error)]
#[allow(missing_docs)]
pub enum Error {
#[error("Could not find blob for similarity checking")]
FindExistingBlob(#[from] gix_object::find::existing_object::Error),
#[error("Could not obtain exhaustive item set to use as possible sources for copy detection")]
GetItemsForExhaustiveCopyDetection(#[source] Box<dyn std::error::Error + Send + Sync>),
#[error(transparent)]
SetResource(#[from] crate::blob::platform::set_resource::Error),
#[error(transparent)]
PrepareDiff(#[from] crate::blob::platform::prepare_diff::Error),
}
}
/// Lifecycle
impl<T: Change> Tracker<T> {
/// Create a new instance with `rewrites` configuration.
pub fn new(rewrites: Rewrites) -> Self {
Tracker {
items: vec![],
path_backing: vec![],
rewrites,
child_renames: Default::default(),
}
}
}
/// build state and find matches.
impl<T: Change> Tracker<T> {
/// We may refuse the push if that information isn't needed for what we have to track.
pub fn try_push_change(&mut self, change: T, location: &BStr) -> Option<T> {
let change_kind = change.kind();
if let (None, ChangeKind::Modification { .. }) = (self.rewrites.copies, change_kind) {
return Some(change);
};
let entry_kind = change.entry_mode().kind();
if entry_kind == EntryKind::Commit {
return Some(change);
}
let relation = change
.relation()
.filter(|_| matches!(change_kind, ChangeKind::Addition | ChangeKind::Deletion));
if let (None, EntryKind::Tree) = (relation, entry_kind) {
return Some(change);
};
let start = self.path_backing.len();
self.path_backing.extend_from_slice(location);
let path = start..self.path_backing.len();
self.items.push(Item {
path,
change,
emitted: false,
});
None
}
/// Can only be called once effectively as it alters its own state to assure each item is only emitted once.
///
/// `cb(destination, source)` is called for each item, either with `Some(source)` if it's
/// the destination of a copy or rename, or with `None` for source if no relation to other
/// items in the tracked set exist, which is like saying 'no rename or rewrite or copy' happened.
/// Note that directories with [relation](Relation) will be emitted if there is a match, along with all their matching
/// child-items which are similarly bundled as rename.
///
/// `objects` is used to access blob data for similarity checks if required and is taken directly from the object database.
/// Worktree filters and text conversions will be applied afterwards automatically. Note that object-caching *should not*
/// be enabled as caching is implemented by `diff_cache`, after all, the blob that's actually diffed is going
/// through conversion steps.
///
/// `diff_cache` is a way to retain a cache of resources that are prepared for rapid diffing, and it also controls
/// the diff-algorithm (provided no user-algorithm is set).
/// Note that we control a few options of `diff_cache` to assure it will ignore external commands.
/// Note that we do not control how the `diff_cache` converts resources, it's left to the caller to decide
/// if it should look at what's stored in `git`, or in the working tree, along with all diff-specific conversions.
///
/// `push_source_tree(push_fn: push(change, location))` is a function that is called when the entire tree of the source
/// should be added as modifications by calling `push` repeatedly to use for perfect copy tracking. Note that `push`
/// will panic if `change` is not a modification, and it's valid to not call `push` at all.
pub fn emit<PushSourceTreeFn, E>(
&mut self,
mut cb: impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
diff_cache: &mut crate::blob::Platform,
objects: &impl gix_object::FindObjectOrHeader,
mut push_source_tree: PushSourceTreeFn,
) -> Result<Outcome, emit::Error>
where
PushSourceTreeFn: FnMut(&mut dyn FnMut(T, &BStr)) -> Result<(), E>,
E: std::error::Error + Send + Sync + 'static,
{
fn is_parent(change: &impl Change) -> bool {
matches!(change.relation(), Some(Relation::Parent(_)))
}
diff_cache.options.skip_internal_diff_if_external_is_configured = false;
// The point of this is to optimize for identity-based lookups, which should be easy to find
// by partitioning.
fn by_id_and_location<T: Change>(a: &Item<T>, b: &Item<T>) -> std::cmp::Ordering {
a.change
.id()
.cmp(b.change.id())
.then_with(|| a.path.start.cmp(&b.path.start).then(a.path.end.cmp(&b.path.end)))
}
let mut out = Outcome {
options: self.rewrites,
..Default::default()
};
self.items.sort_by(by_id_and_location);
// Rewrites by directory (without local changes) can be pruned out quickly,
// by finding only parents, their counterpart, and then all children can be matched by
// relationship ID.
self.match_pairs_of_kind(
visit::SourceKind::Rename,
&mut cb,
None, /* by identity for parents */
&mut out,
diff_cache,
objects,
Some(is_parent),
)?;
self.match_pairs_of_kind(
visit::SourceKind::Rename,
&mut cb,
self.rewrites.percentage,
&mut out,
diff_cache,
objects,
None,
)?;
self.match_renamed_directories(&mut cb)?;
if let Some(copies) = self.rewrites.copies {
self.match_pairs_of_kind(
visit::SourceKind::Copy,
&mut cb,
copies.percentage,
&mut out,
diff_cache,
objects,
None,
)?;
match copies.source {
CopySource::FromSetOfModifiedFiles => {}
CopySource::FromSetOfModifiedFilesAndAllSources => {
push_source_tree(&mut |change, location| {
assert!(
self.try_push_change(change, location).is_none(),
"we must accept every change"
);
// make sure these aren't viable to be emitted anymore.
self.items.last_mut().expect("just pushed").emitted = true;
})
.map_err(|err| emit::Error::GetItemsForExhaustiveCopyDetection(Box::new(err)))?;
self.items.sort_by(by_id_and_location);
self.match_pairs_of_kind(
visit::SourceKind::Copy,
&mut cb,
copies.percentage,
&mut out,
diff_cache,
objects,
None,
)?;
}
}
}
self.items
.sort_by(|a, b| a.location(&self.path_backing).cmp(b.location(&self.path_backing)));
for item in self.items.drain(..).filter(|item| !item.emitted) {
if cb(
visit::Destination {
location: item.location(&self.path_backing),
change: item.change,
},
None,
) == Action::Cancel
{
break;
}
}
Ok(out)
}
}
impl<T: Change> Tracker<T> {
#[allow(clippy::too_many_arguments)]
fn match_pairs_of_kind(
&mut self,
kind: visit::SourceKind,
cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
percentage: Option<f32>,
out: &mut Outcome,
diff_cache: &mut crate::blob::Platform,
objects: &impl gix_object::FindObjectOrHeader,
filter: Option<fn(&T) -> bool>,
) -> Result<(), emit::Error> {
// we try to cheaply reduce the set of possibilities first, before possibly looking more exhaustively.
let needs_second_pass = !needs_exact_match(percentage);
if self.match_pairs(cb, None /* by identity */, kind, out, diff_cache, objects, filter)? == Action::Cancel {
return Ok(());
}
if needs_second_pass {
let is_limited = if self.rewrites.limit == 0 {
false
} else {
let (num_src, num_dst) =
estimate_involved_items(self.items.iter().map(|item| (item.emitted, item.change.kind())), kind);
let permutations = num_src * num_dst;
if permutations > self.rewrites.limit {
match kind {
visit::SourceKind::Rename => {
out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit = permutations;
}
visit::SourceKind::Copy => {
out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit = permutations;
}
}
true
} else {
false
}
};
if !is_limited {
self.match_pairs(cb, percentage, kind, out, diff_cache, objects, None)?;
}
}
Ok(())
}
#[allow(clippy::too_many_arguments)]
fn match_pairs(
&mut self,
cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
percentage: Option<f32>,
kind: visit::SourceKind,
stats: &mut Outcome,
diff_cache: &mut crate::blob::Platform,
objects: &impl gix_object::FindObjectOrHeader,
filter: Option<fn(&T) -> bool>,
) -> Result<Action, emit::Error> {
let mut dest_ofs = 0;
while let Some((mut dest_idx, dest)) = self.items[dest_ofs..].iter().enumerate().find_map(|(idx, item)| {
(!item.emitted
&& matches!(item.change.kind(), ChangeKind::Addition)
&& filter.map_or(true, |f| f(&item.change)))
.then_some((idx, item))
}) {
dest_idx += dest_ofs;
dest_ofs = dest_idx + 1;
self.items[dest_idx].location(&self.path_backing);
let src = find_match(
&self.items,
dest,
dest_idx,
percentage,
kind,
stats,
objects,
diff_cache,
&self.path_backing,
)?
.map(|(src_idx, src, diff)| {
let (id, entry_mode) = src.change.id_and_entry_mode();
let id = id.to_owned();
let location = src.location(&self.path_backing);
(
visit::Source {
entry_mode,
id,
kind,
location,
change: &src.change,
diff,
},
src_idx,
)
});
let Some((src, src_idx)) = src else {
continue;
};
let location = dest.location(&self.path_backing);
let change = dest.change.clone();
let dest = visit::Destination { change, location };
let relations = if percentage.is_none() {
src.change.relation().zip(dest.change.relation())
} else {
None
};
let res = cb(dest, Some(src));
self.items[dest_idx].emitted = true;
self.items[src_idx].emitted = true;
if res == Action::Cancel {
return Ok(Action::Cancel);
}
match relations {
Some((Relation::Parent(src), Relation::Parent(dst))) => {
let res = self.emit_child_renames_matching_identity(cb, kind, src, dst)?;
if res == Action::Cancel {
return Ok(Action::Cancel);
}
}
Some((Relation::ChildOfParent(src), Relation::ChildOfParent(dst))) => {
self.child_renames.insert((src, dst));
}
_ => {}
}
}
Ok(Action::Continue)
}
/// Emit the children of `src_parent_id` and `dst_parent_id` as pairs of exact matches, which are assumed
/// as `src` and `dst` were an exact match (so all children have to match exactly).
/// Note that we intentionally do not record them as their parents will be emitted, too.
fn emit_child_renames_matching_identity(
&mut self,
cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
kind: visit::SourceKind,
src_parent_id: ChangeId,
dst_parent_id: ChangeId,
) -> Result<Action, emit::Error> {
debug_assert_ne!(
src_parent_id, dst_parent_id,
"src and destination directories must be distinct"
);
let (mut src_items, mut dst_items) = (Vec::with_capacity(1), Vec::with_capacity(1));
for item in self.items.iter_mut().filter(|item| !item.emitted) {
match item.change.relation() {
Some(Relation::ChildOfParent(id)) if id == src_parent_id => {
src_items.push((item.change.id().to_owned(), item));
}
Some(Relation::ChildOfParent(id)) if id == dst_parent_id => {
dst_items.push((item.change.id().to_owned(), item));
}
_ => continue,
};
}
for ((src_id, src_item), (dst_id, dst_item)) in src_items.into_iter().zip(dst_items) {
// Since the parent items are already identical by ID, we know that the children will also match, we just
// double-check to still have a chance to be correct in case some of that goes wrong.
if src_id == dst_id
&& filename(src_item.location(&self.path_backing)) == filename(dst_item.location(&self.path_backing))
{
let entry_mode = src_item.change.entry_mode();
let location = src_item.location(&self.path_backing);
let src = visit::Source {
entry_mode,
id: src_id,
kind,
location,
change: &src_item.change,
diff: None,
};
let location = dst_item.location(&self.path_backing);
let change = dst_item.change.clone();
let dst = visit::Destination { change, location };
let res = cb(dst, Some(src));
src_item.emitted = true;
dst_item.emitted = true;
if res == Action::Cancel {
return Ok(res);
}
} else {
gix_trace::warn!("Children of parents with change-id {src_parent_id} and {dst_parent_id} were not equal, even though their parents claimed to be");
break;
}
}
Ok(Action::Continue)
}
/// Find directories with relation id that haven't been emitted yet and store them for lookup.
/// Then use the previously stored emitted renames with relation id to learn which directories they 'link'
/// and emit them, too.
/// Note that this works whenever top-level directories are renamed because they are always added and deleted,
/// and we only match those. Thus, one rewrite inside the directory is enough.
fn match_renamed_directories(
&mut self,
cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> Action,
) -> Result<(), emit::Error> {
fn unemitted_directory_matching_relation_id<T: Change>(items: &[Item<T>], child_id: ChangeId) -> Option<usize> {
items.iter().position(|i| {
!i.emitted && matches!(i.change.relation(), Some(Relation::Parent(pid)) if pid == child_id)
})
}
for (deleted_child_id, added_child_id) in &self.child_renames {
let Some(src_idx) = unemitted_directory_matching_relation_id(&self.items, *deleted_child_id) else {
continue;
};
let Some(dst_idx) = unemitted_directory_matching_relation_id(&self.items, *added_child_id) else {
// This could go wrong in case there are mismatches, so be defensive here.
// But generally, we'd expect the destination item to exist.
continue;
};
let (src_item, dst_item) = (&self.items[src_idx], &self.items[dst_idx]);
let entry_mode = src_item.change.entry_mode();
let location = src_item.location(&self.path_backing);
let src = visit::Source {
entry_mode,
id: src_item.change.id().to_owned(),
kind: SourceKind::Rename,
location,
change: &src_item.change,
diff: None,
};
let location = dst_item.location(&self.path_backing);
let change = dst_item.change.clone();
let dst = visit::Destination { change, location };
let res = cb(dst, Some(src));
self.items[src_idx].emitted = true;
self.items[dst_idx].emitted = true;
if res == Action::Cancel {
return Ok(());
}
}
Ok(())
}
}
fn filename(path: &BStr) -> &BStr {
path.rfind_byte(b'/').map_or(path, |idx| path[idx + 1..].as_bstr())
}
/// Returns the amount of viable sources and destinations for `items` as eligible for the given `kind` of operation.
fn estimate_involved_items(
items: impl IntoIterator<Item = (bool, ChangeKind)>,
kind: visit::SourceKind,
) -> (usize, usize) {
items
.into_iter()
.filter(|(emitted, _)| match kind {
visit::SourceKind::Rename => !*emitted,
visit::SourceKind::Copy => true,
})
.fold((0, 0), |(mut src, mut dest), (emitted, change_kind)| {
match change_kind {
ChangeKind::Addition => {
if kind == visit::SourceKind::Rename || !emitted {
dest += 1;
}
}
ChangeKind::Deletion => {
if kind == visit::SourceKind::Rename {
src += 1;
}
}
ChangeKind::Modification => {
if kind == visit::SourceKind::Copy {
src += 1;
}
}
}
(src, dest)
})
}
fn needs_exact_match(percentage: Option<f32>) -> bool {
percentage.map_or(true, |p| p >= 1.0)
}
/// <`src_idx`, src, possibly diff stat>
type SourceTuple<'a, T> = (usize, &'a Item<T>, Option<DiffLineStats>);
/// Find `item` in our set of items ignoring `item_idx` to avoid finding ourselves, by similarity indicated by `percentage`.
/// The latter can be `None` or `Some(x)` where `x>=1` for identity, and anything else for similarity.
/// We also ignore emitted items entirely.
/// Use `kind` to indicate what kind of match we are looking for, which might be deletions matching an `item` addition, or
/// any non-deletion otherwise.
/// Note that we always try to find by identity first even if a percentage is given as it's much faster and may reduce the set
/// of items to be searched.
#[allow(clippy::too_many_arguments)]
fn find_match<'a, T: Change>(
items: &'a [Item<T>],
item: &Item<T>,
item_idx: usize,
percentage: Option<f32>,
kind: visit::SourceKind,
stats: &mut Outcome,
objects: &impl gix_object::FindObjectOrHeader,
diff_cache: &mut crate::blob::Platform,
path_backing: &[u8],
) -> Result<Option<SourceTuple<'a, T>>, emit::Error> {
let (item_id, item_mode) = item.change.id_and_entry_mode();
if needs_exact_match(percentage) || item_mode.is_link() {
let first_idx = items.partition_point(|a| a.change.id() < item_id);
let range = items.get(first_idx..).map(|slice| {
let end = slice
.iter()
.position(|a| a.change.id() != item_id)
.map_or(items.len(), |idx| first_idx + idx);
first_idx..end
});
let range = match range {
Some(range) => range,
None => return Ok(None),
};
if range.is_empty() {
return Ok(None);
}
let res = items[range.clone()].iter().enumerate().find_map(|(mut src_idx, src)| {
src_idx += range.start;
(src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)).then_some((src_idx, src, None))
});
if let Some(src) = res {
return Ok(Some(src));
}
} else if item_mode.is_blob() {
let mut has_new = false;
let percentage = percentage.expect("it's set to something below 1.0 and we assured this");
for (can_idx, src) in items
.iter()
.enumerate()
.filter(|(src_idx, src)| *src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode))
{
if !has_new {
diff_cache.set_resource(
item_id.to_owned(),
item_mode.kind(),
item.location(path_backing),
ResourceKind::NewOrDestination,
objects,
)?;
has_new = true;
}
let (src_id, src_mode) = src.change.id_and_entry_mode();
diff_cache.set_resource(
src_id.to_owned(),
src_mode.kind(),
src.location(path_backing),
ResourceKind::OldOrSource,
objects,
)?;
let prep = diff_cache.prepare_diff()?;
stats.num_similarity_checks += 1;
match prep.operation {
Operation::InternalDiff { algorithm } => {
let tokens =
crate::blob::intern::InternedInput::new(prep.old.intern_source(), prep.new.intern_source());
let counts = crate::blob::diff(
algorithm,
&tokens,
crate::blob::sink::Counter::new(diff::Statistics {
removed_bytes: 0,
input: &tokens,
}),
);
let old_data_len = prep.old.data.as_slice().unwrap_or_default().len();
let new_data_len = prep.new.data.as_slice().unwrap_or_default().len();
let similarity = (old_data_len - counts.wrapped) as f32 / old_data_len.max(new_data_len) as f32;
if similarity >= percentage {
return Ok(Some((
can_idx,
src,
DiffLineStats {
removals: counts.removals,
insertions: counts.insertions,
before: tokens.before.len().try_into().expect("interner handles only u32"),
after: tokens.after.len().try_into().expect("interner handles only u32"),
similarity,
}
.into(),
)));
}
}
Operation::ExternalCommand { .. } => {
unreachable!("we have disabled this possibility with an option")
}
Operation::SourceOrDestinationIsBinary => {
// TODO: figure out if git does more here
}
};
}
}
Ok(None)
}
mod diff {
use std::ops::Range;
pub struct Statistics<'a, 'data> {
pub removed_bytes: usize,
pub input: &'a crate::blob::intern::InternedInput<&'data [u8]>,
}
impl crate::blob::Sink for Statistics<'_, '_> {
type Out = usize;
fn process_change(&mut self, before: Range<u32>, _after: Range<u32>) {
self.removed_bytes += self.input.before[before.start as usize..before.end as usize]
.iter()
.map(|token| self.input.interner[*token].len())
.sum::<usize>();
}
fn finish(self) -> Self::Out {
self.removed_bytes
}
}
}
#[cfg(test)]
mod estimate_involved_items {
use super::estimate_involved_items;
use crate::rewrites::tracker::{visit::SourceKind, ChangeKind};
#[test]
fn renames_count_unemitted_as_sources_and_destinations() {
let items = [
(false, ChangeKind::Addition),
(true, ChangeKind::Deletion),
(true, ChangeKind::Deletion),
];
assert_eq!(
estimate_involved_items(items, SourceKind::Rename),
(0, 1),
"here we only have one eligible source, hence nothing to do"
);
assert_eq!(
estimate_involved_items(items.into_iter().map(|t| (false, t.1)), SourceKind::Rename),
(2, 1),
"now we have more possibilities as renames count un-emitted deletions as source"
);
}
#[test]
fn copies_do_not_count_additions_as_sources() {
let items = [
(false, ChangeKind::Addition),
(true, ChangeKind::Addition),
(true, ChangeKind::Deletion),
];
assert_eq!(
estimate_involved_items(items, SourceKind::Copy),
(0, 1),
"one addition as source, the other isn't counted as it's emitted, nor is it considered a copy-source.\
deletions don't count"
);
}
#[test]
fn copies_count_modifications_as_sources() {
let items = [
(false, ChangeKind::Addition),
(true, ChangeKind::Modification),
(false, ChangeKind::Modification),
];
assert_eq!(
estimate_involved_items(items, SourceKind::Copy),
(2, 1),
"any modifications is a valid source, emitted or not"
);
}
}