use std::ops::Range;
use bstr::BStr;
use gix_object::tree::{EntryKind, EntryMode};
use crate::{
blob::{platform::prepare_diff::Operation, DiffLineStats, ResourceKind},
rewrites::{CopySource, Outcome, Tracker},
Rewrites,
};
/// The kind of a change, as returned by [`Change::kind()`].
///
/// The derived ordering follows the declaration order, i.e. `Deletion < Modification < Addition`.
#[derive(Debug, Copy, Clone, Ord, PartialOrd, PartialEq, Eq)]
pub enum ChangeKind {
/// The change represents the *deletion* of an item. Deletions are the candidates for rename sources.
Deletion,
/// The change represents the *modification* of an item. Modifications are the candidates for copy sources.
Modification,
/// The change represents the *addition* of an item. Additions are the destinations for which sources are searched.
Addition,
}
/// A trait to abstract over the concept of a change, providing everything the [`Tracker`] needs to know about it.
///
/// Changes must be `Clone` as destinations are handed to the callback by value while the tracker keeps its own copy.
pub trait Change: Clone {
/// Return the object id of this change. Items are sorted by it, and equal ids are what makes an exact match.
fn id(&self) -> &gix_hash::oid;
/// Return the kind of this change.
fn kind(&self) -> ChangeKind;
/// Return the mode of the entry affected by this change.
fn entry_mode(&self) -> EntryMode;
/// Return the id of the change along with its entry mode.
fn id_and_entry_mode(&self) -> (&gix_hash::oid, EntryMode);
}
/// A tracked change along with the bookkeeping needed to relate it to other changes.
pub(crate) struct Item<T> {
/// The underlying raw change.
change: T,
/// The byte range of this item's path within the tracker's shared path buffer.
path: Range<usize>,
/// If `true`, this item was already passed to the callback as part of a match, or it was added purely
/// as a copy source. Such items are not emitted again on their own and cannot serve as rename source.
emitted: bool,
}
impl<T: Change> Item<T> {
fn location<'a>(&self, backing: &'a [u8]) -> &'a BStr {
backing[self.path.clone()].as_ref()
}
fn entry_mode_compatible(&self, mode: EntryMode) -> bool {
use EntryKind::*;
matches!(
(mode.kind(), self.change.entry_mode().kind()),
(Blob | BlobExecutable, Blob | BlobExecutable) | (Link, Link)
)
}
fn is_source_for_destination_of(&self, kind: visit::SourceKind, dest_item_mode: EntryMode) -> bool {
self.entry_mode_compatible(dest_item_mode)
&& match kind {
visit::SourceKind::Rename => !self.emitted && matches!(self.change.kind(), ChangeKind::Deletion),
visit::SourceKind::Copy => {
matches!(self.change.kind(), ChangeKind::Modification)
}
}
}
}
/// Types passed to the callback of [`Tracker::emit()`](crate::rewrites::Tracker).
pub mod visit {
use bstr::BStr;
use gix_object::tree::EntryMode;
use crate::blob::DiffLineStats;
/// The source of a rewrite, i.e. the item that a destination was renamed or copied from.
#[derive(Debug, Clone, PartialEq, PartialOrd)]
pub struct Source<'a, T> {
/// The entry mode of the source change.
pub entry_mode: EntryMode,
/// The object id of the source change.
pub id: gix_hash::ObjectId,
/// Whether the destination is a rename or a copy of this source.
pub kind: SourceKind,
/// The path of the source.
pub location: &'a BStr,
/// The change that was matched as source.
pub change: &'a T,
/// Line statistics of the diff between source and destination. It is `None` if the match
/// was made by identity, and set if it was found by similarity.
pub diff: Option<DiffLineStats>,
}
/// Further identify the kind of [`Source`].
#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub enum SourceKind {
/// The source was deleted and reappeared as the destination.
Rename,
/// The source still exists, with the destination being added in addition to it.
Copy,
}
/// The destination of a rewrite, or a plain change if the callback receives it without a source.
#[derive(Clone)]
pub struct Destination<'a, T: Clone> {
/// The change itself, passed by value.
pub change: T,
/// The path of the change.
pub location: &'a BStr,
}
}
/// The error type used when emitting tracked changes and their matches.
#[allow(clippy::empty_docs)]
pub mod emit {
/// The error returned when emitting tracked changes fails.
#[derive(Debug, thiserror::Error)]
#[allow(missing_docs)]
pub enum Error {
// NOTE(review): this variant is not constructed in the code visible here; presumably it is produced
// by other callers or kept for compatibility - confirm before removing.
#[error("Could not find blob for similarity checking")]
FindExistingBlob(#[from] gix_object::find::existing_object::Error),
// Wraps whatever error the caller-provided function for pushing the source tree returned.
#[error("Could not obtain exhaustive item set to use as possible sources for copy detection")]
GetItemsForExhaustiveCopyDetection(#[source] Box<dyn std::error::Error + Send + Sync>),
// Setting the source or destination of a similarity check on the diff platform failed.
#[error(transparent)]
SetResource(#[from] crate::blob::platform::set_resource::Error),
// Preparing the diff between source and destination failed.
#[error(transparent)]
PrepareDiff(#[from] crate::blob::platform::prepare_diff::Error),
}
}
/// Lifecycle
impl<T: Change> Tracker<T> {
    /// Create a new, empty tracker that finds rewrites as configured by `rewrites`.
    pub fn new(rewrites: Rewrites) -> Self {
        Self {
            rewrites,
            items: Vec::new(),
            path_backing: Vec::new(),
        }
    }
}
/// Collect changes and emit them along with the rewrites found among them.
impl<T: Change> Tracker<T> {
    /// Offer `change` at `location` for tracking.
    ///
    /// Returns `None` if the change is now tracked, in which case it will be seen again in the callback of
    /// [`emit()`](Self::emit). Returns `Some(change)` if it was rejected and has to be handled by the caller,
    /// which is the case if its entry is neither blob nor symlink, or if it is a modification while
    /// copy-tracking is disabled.
    pub fn try_push_change(&mut self, change: T, location: &BStr) -> Option<T> {
        if !change.entry_mode().is_blob_or_symlink() {
            return Some(change);
        }
        // Modifications are only ever used as copy sources, so they are of no use without copy-tracking.
        let is_modification = matches!(change.kind(), ChangeKind::Modification);
        if is_modification && self.rewrites.copies.is_none() {
            return Some(change);
        }

        let path_start = self.path_backing.len();
        self.path_backing.extend_from_slice(location);
        let path = path_start..self.path_backing.len();
        self.items.push(Item {
            change,
            path,
            emitted: false,
        });
        None
    }

    /// Match all tracked changes with each other and pass the results to `cb(destination, source)`.
    ///
    /// * Renames are matched first, followed by copies if these are enabled.
    /// * If copies should be found among all sources, `push_source_tree(push_fn)` is called to obtain
    ///   these sources by calling `push_fn(change, location)` for each of them. They serve as copy sources
    ///   only and are never passed to `cb` on their own.
    /// * Finally, all tracked changes that did not take part in a match are passed to `cb` without source,
    ///   ordered by their location. `cb` may return `Action::Cancel` to stop this final iteration.
    /// * `diff_cache` and `objects` are used for similarity checks. Note that `diff_cache` is reconfigured
    ///   to never skip the internal diff, and that this setting is not restored.
    ///
    /// All tracked changes are consumed, and statistics about the operation are returned.
    pub fn emit<PushSourceTreeFn, E>(
        &mut self,
        mut cb: impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> crate::tree::visit::Action,
        diff_cache: &mut crate::blob::Platform,
        objects: &impl gix_object::FindObjectOrHeader,
        mut push_source_tree: PushSourceTreeFn,
    ) -> Result<Outcome, emit::Error>
    where
        PushSourceTreeFn: FnMut(&mut dyn FnMut(T, &BStr)) -> Result<(), E>,
        E: std::error::Error + Send + Sync + 'static,
    {
        /// Order by object id first to allow binary searches for it, then by position in the path buffer.
        fn by_id_and_location<T: Change>(lhs: &Item<T>, rhs: &Item<T>) -> std::cmp::Ordering {
            let by_id = lhs.change.id().cmp(rhs.change.id());
            by_id.then_with(|| (lhs.path.start, lhs.path.end).cmp(&(rhs.path.start, rhs.path.end)))
        }

        // Similarity is computed from the internal diff, which thus must always be available.
        diff_cache.options.skip_internal_diff_if_external_is_configured = false;
        self.items.sort_by(by_id_and_location);

        let rewrites = self.rewrites;
        let mut out = Outcome {
            options: rewrites,
            ..Default::default()
        };
        self.match_pairs_of_kind(
            visit::SourceKind::Rename,
            &mut cb,
            rewrites.percentage,
            &mut out,
            diff_cache,
            objects,
        )?;

        if let Some(copies) = rewrites.copies {
            self.match_pairs_of_kind(
                visit::SourceKind::Copy,
                &mut cb,
                copies.percentage,
                &mut out,
                diff_cache,
                objects,
            )?;

            match copies.source {
                CopySource::FromSetOfModifiedFilesAndAllSources => {
                    push_source_tree(&mut |change, location| {
                        assert!(
                            self.try_push_change(change, location).is_none(),
                            "we must accept every change"
                        );
                        // These are sources only, so mark them to prevent them from being emitted.
                        self.items.last_mut().expect("just pushed").emitted = true;
                    })
                    .map_err(|err| emit::Error::GetItemsForExhaustiveCopyDetection(Box::new(err)))?;
                    // New items were appended, so the order by id has to be restored.
                    self.items.sort_by(by_id_and_location);

                    self.match_pairs_of_kind(
                        visit::SourceKind::Copy,
                        &mut cb,
                        copies.percentage,
                        &mut out,
                        diff_cache,
                        objects,
                    )?;
                }
                CopySource::FromSetOfModifiedFiles => {}
            }
        }

        // Everything that wasn't matched is passed on as is, ordered by path.
        let path_backing = &self.path_backing;
        self.items
            .sort_by(|lhs, rhs| lhs.location(path_backing).cmp(rhs.location(path_backing)));
        for item in self.items.drain(..) {
            if item.emitted {
                continue;
            }
            let destination = visit::Destination {
                location: item.location(path_backing),
                change: item.change,
            };
            if cb(destination, None) == crate::tree::visit::Action::Cancel {
                break;
            }
        }
        Ok(out)
    }
}
/// Matching of destinations with their sources.
impl<T: Change> Tracker<T> {
    /// Find sources of the given `kind` for all additions, in up to two passes.
    ///
    /// The first pass matches by identity only. The second pass matches by similarity using `percentage`,
    /// and only runs if `percentage` allows fuzzy matches and if the estimated amount of comparisons
    /// doesn't exceed the configured limit. If it does, the amount of skipped comparisons is recorded in `out`.
    ///
    /// Note that a callback cancelling the first pass only prevents the second pass, it is not
    /// communicated to the caller.
    fn match_pairs_of_kind(
        &mut self,
        kind: visit::SourceKind,
        cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> crate::tree::visit::Action,
        percentage: Option<f32>,
        out: &mut Outcome,
        diff_cache: &mut crate::blob::Platform,
        objects: &impl gix_object::FindObjectOrHeader,
    ) -> Result<(), emit::Error> {
        let exact_matches_suffice = needs_exact_match(percentage);
        // Cheaply reduce the set of candidates by identity before comparing content.
        let first_pass = self.match_pairs(cb, None, kind, out, diff_cache, objects)?;
        if first_pass == crate::tree::visit::Action::Cancel || exact_matches_suffice {
            return Ok(());
        }

        let limit = self.rewrites.limit;
        if limit != 0 {
            let candidates = self.items.iter().map(|item| (item.emitted, item.change.kind()));
            let (num_src, num_dst) = estimate_involved_items(candidates, kind);
            let permutations = num_src * num_dst;
            if permutations > limit {
                let skipped = match kind {
                    visit::SourceKind::Rename => {
                        &mut out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit
                    }
                    visit::SourceKind::Copy => &mut out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit,
                };
                *skipped = permutations;
                return Ok(());
            }
        }

        self.match_pairs(cb, percentage, kind, out, diff_cache, objects)?;
        Ok(())
    }

    /// Try to find a source of `kind` for each addition that wasn't emitted yet, and pass each pair to `cb`.
    ///
    /// With `percentage` being `None`, sources are matched by identity, otherwise by similarity.
    /// Both parts of a pair are marked as emitted once `cb` has seen them. Additions without a source
    /// are left untouched.
    ///
    /// Returns `Action::Cancel` as soon as `cb` returns it, or `Action::Continue` after all additions were visited.
    fn match_pairs(
        &mut self,
        cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> crate::tree::visit::Action,
        percentage: Option<f32>,
        kind: visit::SourceKind,
        stats: &mut Outcome,
        diff_cache: &mut crate::blob::Platform,
        objects: &impl gix_object::FindObjectOrHeader,
    ) -> Result<crate::tree::visit::Action, emit::Error> {
        let mut search_from = 0;
        while let Some(offset) = self.items[search_from..]
            .iter()
            .position(|item| !item.emitted && matches!(item.change.kind(), ChangeKind::Addition))
        {
            let dest_idx = search_from + offset;
            // Continue behind this destination, no matter if a source can be found for it.
            search_from = dest_idx + 1;

            let dest = &self.items[dest_idx];
            let found = find_match(
                &self.items,
                dest,
                dest_idx,
                percentage,
                kind,
                stats,
                objects,
                diff_cache,
                &self.path_backing,
            )?;
            let (src_idx, src, diff) = match found {
                Some(source) => source,
                None => continue,
            };

            let (src_id, entry_mode) = src.change.id_and_entry_mode();
            let source = visit::Source {
                entry_mode,
                id: src_id.to_owned(),
                kind,
                location: src.location(&self.path_backing),
                change: &src.change,
                diff,
            };
            let destination = visit::Destination {
                location: dest.location(&self.path_backing),
                change: dest.change.clone(),
            };
            let action = cb(destination, Some(source));

            self.items[dest_idx].emitted = true;
            self.items[src_idx].emitted = true;

            if action == crate::tree::visit::Action::Cancel {
                return Ok(crate::tree::visit::Action::Cancel);
            }
        }
        Ok(crate::tree::visit::Action::Continue)
    }
}
/// Return the amount of `(sources, destinations)` among `items` that a similarity search for `kind`
/// would have to consider, with each item given as `(emitted, change_kind)`.
///
/// * For renames, emitted items are ignored entirely. Deletions are sources and additions are destinations.
/// * For copies, all modifications are sources, emitted or not, and additions are destinations
///   only if they were not emitted yet.
fn estimate_involved_items(
    items: impl IntoIterator<Item = (bool, ChangeKind)>,
    kind: visit::SourceKind,
) -> (usize, usize) {
    let mut num_sources = 0;
    let mut num_destinations = 0;
    for (emitted, change_kind) in items {
        match (kind, change_kind) {
            // Nothing that was emitted can take part in a rename anymore.
            (visit::SourceKind::Rename, _) if emitted => {}
            (visit::SourceKind::Rename, ChangeKind::Deletion) => num_sources += 1,
            (visit::SourceKind::Rename, ChangeKind::Addition) => num_destinations += 1,
            (visit::SourceKind::Rename, ChangeKind::Modification) => {}
            (visit::SourceKind::Copy, ChangeKind::Modification) => num_sources += 1,
            (visit::SourceKind::Copy, ChangeKind::Addition) => {
                if !emitted {
                    num_destinations += 1;
                }
            }
            (visit::SourceKind::Copy, ChangeKind::Deletion) => {}
        }
    }
    (num_sources, num_destinations)
}
/// Return `true` if only identical items may be matched, which is the case if no `percentage` is set
/// or if it is at 100% (`1.0`) or above.
fn needs_exact_match(percentage: Option<f32>) -> bool {
    match percentage {
        Some(threshold) => threshold >= 1.0,
        None => true,
    }
}
/// A matched source: its index in the list of items, the item itself, and the line statistics of the diff
/// with the destination if the match was found by similarity.
type SourceTuple<'a, T> = (usize, &'a Item<T>, Option<DiffLineStats>);

/// Find the source of the given `kind` for the destination `item`, which is located at `item_idx` in `items`.
///
/// * If `percentage` demands an exact match, or if `item` is a symbolic link, only sources with the
///   same object id are considered. This relies on `items` being sorted by id as the run of identical
///   ids is located by binary search.
/// * Otherwise `item` is compared to each possible source using `diff_cache` with objects obtained from
///   `objects`, and the first source whose similarity reaches `percentage` is returned along with the
///   line statistics of the diff. Each comparison is counted in `stats`.
///
/// `path_backing` is the buffer that the paths of all `items` refer to.
/// Returns `Ok(None)` if no suitable source exists.
///
/// # Errors
///
/// Fails if a resource can't be set on `diff_cache`, or if the diff can't be prepared.
#[allow(clippy::too_many_arguments)]
fn find_match<'a, T: Change>(
    items: &'a [Item<T>],
    item: &Item<T>,
    item_idx: usize,
    percentage: Option<f32>,
    kind: visit::SourceKind,
    stats: &mut Outcome,
    objects: &impl gix_object::FindObjectOrHeader,
    diff_cache: &mut crate::blob::Platform,
    path_backing: &[u8],
) -> Result<Option<SourceTuple<'a, T>>, emit::Error> {
    let (item_id, item_mode) = item.change.id_and_entry_mode();
    if needs_exact_match(percentage) || item_mode.is_link() {
        let first_idx = items.partition_point(|a| a.change.id() < item_id);
        let candidates = match items.get(first_idx..) {
            Some(candidates) => candidates,
            None => return Ok(None),
        };
        // The run of identical ids ends at the first item with a different id, or at the end of *all* items.
        // `position()` is relative to `first_idx`, whereas the fallback must be the absolute end of `items`.
        // Using the length of `candidates` instead would cut the run short whenever it reaches the last
        // item, causing exact matches to be missed.
        let end = candidates
            .iter()
            .position(|a| a.change.id() != item_id)
            .map_or(items.len(), |offset| first_idx + offset);
        let range = first_idx..end;
        if range.is_empty() {
            return Ok(None);
        }
        let res = items[range.clone()].iter().enumerate().find_map(|(mut src_idx, src)| {
            src_idx += range.start;
            // The destination itself is part of the run, so it must be skipped.
            (src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)).then_some((src_idx, src, None))
        });
        return Ok(res);
    }

    // The destination is set lazily so nothing is loaded if there is no source to compare it with.
    let mut has_new = false;
    let percentage = percentage.expect("it's set to something below 1.0 and we assured this");
    debug_assert!(
        item_mode.is_blob(),
        "symlinks are matched exactly, and trees aren't used here"
    );

    for (can_idx, src) in items
        .iter()
        .enumerate()
        .filter(|(src_idx, src)| *src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode))
    {
        if !has_new {
            diff_cache.set_resource(
                item_id.to_owned(),
                item_mode.kind(),
                item.location(path_backing),
                ResourceKind::NewOrDestination,
                objects,
            )?;
            has_new = true;
        }
        let (src_id, src_mode) = src.change.id_and_entry_mode();
        diff_cache.set_resource(
            src_id.to_owned(),
            src_mode.kind(),
            src.location(path_backing),
            ResourceKind::OldOrSource,
            objects,
        )?;
        let prep = diff_cache.prepare_diff()?;
        stats.num_similarity_checks += 1;
        match prep.operation {
            Operation::InternalDiff { algorithm } => {
                let tokens =
                    crate::blob::intern::InternedInput::new(prep.old.intern_source(), prep.new.intern_source());
                let counts = crate::blob::diff(
                    algorithm,
                    &tokens,
                    crate::blob::sink::Counter::new(diff::Statistics {
                        removed_bytes: 0,
                        input: &tokens,
                    }),
                );
                let old_data_len = prep.old.data.as_slice().unwrap_or_default().len();
                let new_data_len = prep.new.data.as_slice().unwrap_or_default().len();
                // The share of bytes of the source that are retained, in relation to the larger of both sides.
                // `counts.wrapped` carries the amount of removed bytes as accumulated by `diff::Statistics`.
                let similarity = (old_data_len - counts.wrapped) as f32 / old_data_len.max(new_data_len) as f32;
                if similarity >= percentage {
                    let line_stats = DiffLineStats {
                        removals: counts.removals,
                        insertions: counts.insertions,
                        before: tokens.before.len().try_into().expect("interner handles only u32"),
                        after: tokens.after.len().try_into().expect("interner handles only u32"),
                        similarity,
                    };
                    return Ok(Some((can_idx, src, Some(line_stats))));
                }
            }
            Operation::ExternalCommand { .. } => {
                unreachable!("we have disabled this possibility with an option")
            }
            Operation::SourceOrDestinationIsBinary => {
                // Binary resources can't be compared by content, so they can only be matched by identity.
            }
        };
    }
    Ok(None)
}
/// Utilities to compute statistics while diffing.
mod diff {
    use std::ops::Range;

    /// A sink that counts the bytes of all tokens that a diff removes from the old side of `input`.
    pub struct Statistics<'a, 'data> {
        /// The amount of removed bytes seen so far.
        pub removed_bytes: usize,
        /// The interned input that the token ranges passed to `process_change()` refer to.
        pub input: &'a crate::blob::intern::InternedInput<&'data [u8]>,
    }

    impl crate::blob::Sink for Statistics<'_, '_> {
        type Out = usize;

        /// Add the byte length of each token in the `before` range, ignoring what was added.
        fn process_change(&mut self, before: Range<u32>, _after: Range<u32>) {
            let (start, end) = (before.start as usize, before.end as usize);
            let removed_tokens = &self.input.before[start..end];
            for token in removed_tokens {
                self.removed_bytes += self.input.interner[*token].len();
            }
        }

        /// Return the total amount of removed bytes.
        fn finish(self) -> Self::Out {
            self.removed_bytes
        }
    }
}
#[cfg(test)]
mod estimate_involved_items {
    use super::estimate_involved_items;
    use crate::rewrites::tracker::{visit::SourceKind, ChangeKind};

    /// Emitted items don't take part in rename tracking at all.
    #[test]
    fn renames_count_unemitted_as_sources_and_destinations() {
        let items = [
            (false, ChangeKind::Addition),
            (true, ChangeKind::Deletion),
            (true, ChangeKind::Deletion),
        ];
        let with_emitted_deletions = estimate_involved_items(items, SourceKind::Rename);
        assert_eq!(
            with_emitted_deletions,
            (0, 1),
            "here we only have one eligible source, hence nothing to do"
        );

        let all_unemitted = items.iter().map(|&(_, change_kind)| (false, change_kind));
        assert_eq!(
            estimate_involved_items(all_unemitted, SourceKind::Rename),
            (2, 1),
            "now we have more possibilities as renames count un-emitted deletions as source"
        );
    }

    /// Neither additions nor deletions can be the source of a copy.
    #[test]
    fn copies_do_not_count_additions_as_sources() {
        let items = [
            (false, ChangeKind::Addition),
            (true, ChangeKind::Addition),
            (true, ChangeKind::Deletion),
        ];
        let actual = estimate_involved_items(items, SourceKind::Copy);
        assert_eq!(
            actual,
            (0, 1),
            "one addition as source, the other isn't counted as it's emitted, nor is it considered a copy-source.\
            deletions don't count"
        );
    }

    /// Modifications are copy sources, whether they were emitted or not.
    #[test]
    fn copies_count_modifications_as_sources() {
        let items = [
            (false, ChangeKind::Addition),
            (true, ChangeKind::Modification),
            (false, ChangeKind::Modification),
        ];
        let actual = estimate_involved_items(items, SourceKind::Copy);
        assert_eq!(
            actual,
            (2, 1),
            "any modifications is a valid source, emitted or not"
        );
    }
}