1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
//! ### Deviation
//!
//! Note that the algorithm implemented here is in many ways different from what `git` does.
//!
//! - it's less sophisticated and doesn't use any ranking of candidates. Instead, it picks the first possible match.
//! - the set used for copy-detection is probably smaller by default.
use std::ops::Range;

use bstr::BStr;
use gix_object::tree::{EntryKind, EntryMode};

use crate::{
    blob::{platform::prepare_diff::Operation, DiffLineStats, ResourceKind},
    rewrites::{CopySource, Outcome, Tracker},
    Rewrites,
};

/// The kind of a change.
#[derive(Debug, Copy, Clone, Ord, PartialOrd, PartialEq, Eq)]
pub enum ChangeKind {
    /// The change represents the *deletion* of an item.
    Deletion,
    /// The change represents the *modification* of an item.
    Modification,
    /// The change represents the *addition* of an item.
    Addition,
}

/// A trait providing all functionality to abstract over the concept of a change, as seen by the [`Tracker`].
pub trait Change: Clone {
    /// Return the hash of this change for identification.
    ///
    /// Note that this is the id of the object as stored in `git`, i.e. it must have gone through workspace
    /// conversions.
    fn id(&self) -> &gix_hash::oid;
    /// Return the kind of this change.
    fn kind(&self) -> ChangeKind;
    /// Return more information about the kind of entry affected by this change.
    fn entry_mode(&self) -> EntryMode;
    /// Return the id of the change along with its mode.
    fn id_and_entry_mode(&self) -> (&gix_hash::oid, EntryMode);
}

/// A set of tracked items allows to figure out their relations by figuring out their similarity.
pub(crate) struct Item<T> {
    /// The underlying raw change
    change: T,
    /// That slice into the backing for paths.
    path: Range<usize>,
    /// If true, this item was already emitted, i.e. seen by the caller.
    emitted: bool,
}

impl<T: Change> Item<T> {
    fn location<'a>(&self, backing: &'a [u8]) -> &'a BStr {
        backing[self.path.clone()].as_ref()
    }
    fn entry_mode_compatible(&self, mode: EntryMode) -> bool {
        use EntryKind::*;
        matches!(
            (mode.kind(), self.change.entry_mode().kind()),
            (Blob | BlobExecutable, Blob | BlobExecutable) | (Link, Link)
        )
    }

    fn is_source_for_destination_of(&self, kind: visit::SourceKind, dest_item_mode: EntryMode) -> bool {
        self.entry_mode_compatible(dest_item_mode)
            && match kind {
                visit::SourceKind::Rename => !self.emitted && matches!(self.change.kind(), ChangeKind::Deletion),
                visit::SourceKind::Copy => {
                    matches!(self.change.kind(), ChangeKind::Modification)
                }
            }
    }
}

/// A module with types used in the user-callback in [Tracker::emit()](crate::rewrites::Tracker::emit()).
pub mod visit {
    use bstr::BStr;
    use gix_object::tree::EntryMode;

    use crate::blob::DiffLineStats;

    /// The source of a rewrite, rename or copy.
    #[derive(Debug, Clone, PartialEq, PartialOrd)]
    pub struct Source<'a> {
        /// The kind of entry.
        pub entry_mode: EntryMode,
        /// The hash of the state of the source as seen in the object database.
        pub id: gix_hash::ObjectId,
        /// Further specify what kind of source this is.
        pub kind: SourceKind,
        /// The repository-relative location of this entry.
        pub location: &'a BStr,
        /// If this is a rewrite, indicate how many lines would need to change to turn this source into the destination.
        pub diff: Option<DiffLineStats>,
    }

    /// Further identify the kind of [Source].
    #[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
    pub enum SourceKind {
        /// This is the source of an entry that was renamed, as `source` was renamed to `destination`.
        Rename,
        /// This is the source of a copy, as `source` was copied into `destination`.
        Copy,
    }

    /// A change along with a location.
    #[derive(Clone)]
    pub struct Destination<'a, T: Clone> {
        /// The change at the given `location`.
        pub change: T,
        /// The repository-relative location of this destination.
        pub location: &'a BStr,
    }
}

///
pub mod emit {
    /// The error returned by [Tracker::emit()](super::Tracker::emit()).
    #[derive(Debug, thiserror::Error)]
    #[allow(missing_docs)]
    pub enum Error {
        #[error("Could not find blob for similarity checking")]
        FindExistingBlob(#[from] gix_object::find::existing_object::Error),
        #[error("Could not obtain exhaustive item set to use as possible sources for copy detection")]
        GetItemsForExhaustiveCopyDetection(#[source] Box<dyn std::error::Error + Send + Sync>),
        #[error(transparent)]
        SetResource(#[from] crate::blob::platform::set_resource::Error),
        #[error(transparent)]
        PrepareDiff(#[from] crate::blob::platform::prepare_diff::Error),
    }
}

/// Lifecycle
impl<T: Change> Tracker<T> {
    /// Create a new instance with `rewrites` configuration.
    pub fn new(rewrites: Rewrites) -> Self {
        Tracker {
            items: vec![],
            path_backing: vec![],
            rewrites,
        }
    }
}

/// build state and find matches.
impl<T: Change> Tracker<T> {
    /// We may refuse the push if that information isn't needed for what we have to track.
    pub fn try_push_change(&mut self, change: T, location: &BStr) -> Option<T> {
        if !change.entry_mode().is_blob_or_symlink() {
            return Some(change);
        }
        let keep = match (self.rewrites.copies, change.kind()) {
            (Some(_find_copies), _) => true,
            (None, ChangeKind::Modification { .. }) => false,
            (None, _) => true,
        };

        if !keep {
            return Some(change);
        }

        let start = self.path_backing.len();
        self.path_backing.extend_from_slice(location);
        self.items.push(Item {
            path: start..self.path_backing.len(),
            change,
            emitted: false,
        });
        None
    }

    /// Can only be called once effectively as it alters its own state to assure each item is only emitted once.
    ///
    /// `cb(destination, source)` is called for each item, either with `Some(source)` if it's
    /// the destination of a copy or rename, or with `None` for source if no relation to other
    /// items in the tracked set exist, which is like saying 'no rename or rewrite or copy' happened.
    ///
    /// `objects` is used to access blob data for similarity checks if required and is taken directly from the object database.
    /// Worktree filters and text conversions will be applied afterwards automatically. Note that object-caching *should not*
    /// be enabled as caching is implemented by `diff_cache`, after all, the blob that's actually diffed is going
    /// through conversion steps.
    ///
    /// `diff_cache` is a way to retain a cache of resources that are prepared for rapid diffing, and it also controls
    /// the diff-algorithm (provided no user-algorithm is set).
    /// Note that we control a few options of `diff_cache` to assure it will ignore external commands.
    /// Note that we do not control how the `diff_cache` converts resources, it's left to the caller to decide
    /// if it should look at what's stored in `git`, or in the working tree, along with all diff-specific conversions.
    ///
    /// `push_source_tree(push_fn: push(change, location))` is a function that is called when the entire tree of the source
    /// should be added as modifications by calling `push` repeatedly to use for perfect copy tracking. Note that `push`
    /// will panic if `change` is not a modification, and it's valid to not call `push` at all.
    pub fn emit<PushSourceTreeFn, E>(
        &mut self,
        mut cb: impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_>>) -> crate::tree::visit::Action,
        diff_cache: &mut crate::blob::Platform,
        objects: &impl gix_object::FindObjectOrHeader,
        mut push_source_tree: PushSourceTreeFn,
    ) -> Result<Outcome, emit::Error>
    where
        PushSourceTreeFn: FnMut(&mut dyn FnMut(T, &BStr)) -> Result<(), E>,
        E: std::error::Error + Send + Sync + 'static,
    {
        diff_cache.options.skip_internal_diff_if_external_is_configured = false;

        fn by_id_and_location<T: Change>(a: &Item<T>, b: &Item<T>) -> std::cmp::Ordering {
            a.change
                .id()
                .cmp(b.change.id())
                .then_with(|| a.path.start.cmp(&b.path.start).then(a.path.end.cmp(&b.path.end)))
        }
        self.items.sort_by(by_id_and_location);

        let mut out = Outcome {
            options: self.rewrites,
            ..Default::default()
        };
        self.match_pairs_of_kind(
            visit::SourceKind::Rename,
            &mut cb,
            self.rewrites.percentage,
            &mut out,
            diff_cache,
            objects,
        )?;

        if let Some(copies) = self.rewrites.copies {
            self.match_pairs_of_kind(
                visit::SourceKind::Copy,
                &mut cb,
                copies.percentage,
                &mut out,
                diff_cache,
                objects,
            )?;

            match copies.source {
                CopySource::FromSetOfModifiedFiles => {}
                CopySource::FromSetOfModifiedFilesAndAllSources => {
                    push_source_tree(&mut |change, location| {
                        assert!(
                            self.try_push_change(change, location).is_none(),
                            "we must accept every change"
                        );
                        // make sure these aren't viable to be emitted anymore.
                        self.items.last_mut().expect("just pushed").emitted = true;
                    })
                    .map_err(|err| emit::Error::GetItemsForExhaustiveCopyDetection(Box::new(err)))?;
                    self.items.sort_by(by_id_and_location);

                    self.match_pairs_of_kind(
                        visit::SourceKind::Copy,
                        &mut cb,
                        copies.percentage,
                        &mut out,
                        diff_cache,
                        objects,
                    )?;
                }
            }
        }

        self.items
            .sort_by(|a, b| a.location(&self.path_backing).cmp(b.location(&self.path_backing)));
        for item in self.items.drain(..).filter(|item| !item.emitted) {
            if cb(
                visit::Destination {
                    location: item.location(&self.path_backing),
                    change: item.change,
                },
                None,
            ) == crate::tree::visit::Action::Cancel
            {
                break;
            }
        }
        Ok(out)
    }
}

impl<T: Change> Tracker<T> {
    fn match_pairs_of_kind(
        &mut self,
        kind: visit::SourceKind,
        cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_>>) -> crate::tree::visit::Action,
        percentage: Option<f32>,
        out: &mut Outcome,
        diff_cache: &mut crate::blob::Platform,
        objects: &impl gix_object::FindObjectOrHeader,
    ) -> Result<(), emit::Error> {
        // we try to cheaply reduce the set of possibilities first, before possibly looking more exhaustively.
        let needs_second_pass = !needs_exact_match(percentage);
        if self.match_pairs(cb, None /* by identity */, kind, out, diff_cache, objects)?
            == crate::tree::visit::Action::Cancel
        {
            return Ok(());
        }
        if needs_second_pass {
            let is_limited = if self.rewrites.limit == 0 {
                false
            } else {
                let (num_src, num_dst) =
                    estimate_involved_items(self.items.iter().map(|item| (item.emitted, item.change.kind())), kind);
                let permutations = num_src * num_dst;
                if permutations > self.rewrites.limit {
                    match kind {
                        visit::SourceKind::Rename => {
                            out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit = permutations;
                        }
                        visit::SourceKind::Copy => {
                            out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit = permutations;
                        }
                    }
                    true
                } else {
                    false
                }
            };
            if !is_limited {
                self.match_pairs(cb, percentage, kind, out, diff_cache, objects)?;
            }
        }
        Ok(())
    }

    fn match_pairs(
        &mut self,
        cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_>>) -> crate::tree::visit::Action,
        percentage: Option<f32>,
        kind: visit::SourceKind,
        stats: &mut Outcome,
        diff_cache: &mut crate::blob::Platform,
        objects: &impl gix_object::FindObjectOrHeader,
    ) -> Result<crate::tree::visit::Action, emit::Error> {
        let mut dest_ofs = 0;
        while let Some((mut dest_idx, dest)) = self.items[dest_ofs..].iter().enumerate().find_map(|(idx, item)| {
            (!item.emitted && matches!(item.change.kind(), ChangeKind::Addition)).then_some((idx, item))
        }) {
            dest_idx += dest_ofs;
            dest_ofs = dest_idx + 1;
            let src = find_match(
                &self.items,
                dest,
                dest_idx,
                percentage,
                kind,
                stats,
                objects,
                diff_cache,
                &self.path_backing,
            )?
            .map(|(src_idx, src, diff)| {
                let (id, entry_mode) = src.change.id_and_entry_mode();
                let id = id.to_owned();
                let location = src.location(&self.path_backing);
                (
                    visit::Source {
                        entry_mode,
                        id,
                        kind,
                        location,
                        diff,
                    },
                    src_idx,
                )
            });
            if src.is_none() {
                continue;
            }
            let location = dest.location(&self.path_backing);
            let change = dest.change.clone();
            let dest = visit::Destination { change, location };
            self.items[dest_idx].emitted = true;
            if let Some(src_idx) = src.as_ref().map(|t| t.1) {
                self.items[src_idx].emitted = true;
            }
            if cb(dest, src.map(|t| t.0)) == crate::tree::visit::Action::Cancel {
                return Ok(crate::tree::visit::Action::Cancel);
            }
        }
        Ok(crate::tree::visit::Action::Continue)
    }
}

/// Returns the amount of viable sources and destinations for `items` as eligible for the given `kind` of operation.
fn estimate_involved_items(
    items: impl IntoIterator<Item = (bool, ChangeKind)>,
    kind: visit::SourceKind,
) -> (usize, usize) {
    items
        .into_iter()
        .filter(|(emitted, _)| match kind {
            visit::SourceKind::Rename => !*emitted,
            visit::SourceKind::Copy => true,
        })
        .fold((0, 0), |(mut src, mut dest), (emitted, change_kind)| {
            match change_kind {
                ChangeKind::Addition => {
                    if kind == visit::SourceKind::Rename || !emitted {
                        dest += 1;
                    }
                }
                ChangeKind::Deletion => {
                    if kind == visit::SourceKind::Rename {
                        src += 1
                    }
                }
                ChangeKind::Modification => {
                    if kind == visit::SourceKind::Copy {
                        src += 1
                    }
                }
            }
            (src, dest)
        })
}

fn needs_exact_match(percentage: Option<f32>) -> bool {
    percentage.map_or(true, |p| p >= 1.0)
}

/// <`src_idx`, src, possibly diff stat>
type SourceTuple<'a, T> = (usize, &'a Item<T>, Option<DiffLineStats>);

/// Find `item` in our set of items ignoring `item_idx` to avoid finding ourselves, by similarity indicated by `percentage`.
/// The latter can be `None` or `Some(x)` where `x>=1` for identity, and anything else for similarity.
/// We also ignore emitted items entirely.
/// Use `kind` to indicate what kind of match we are looking for, which might be deletions matching an `item` addition, or
/// any non-deletion otherwise.
/// Note that we always try to find by identity first even if a percentage is given as it's much faster and may reduce the set
/// of items to be searched.
#[allow(clippy::too_many_arguments)]
fn find_match<'a, T: Change>(
    items: &'a [Item<T>],
    item: &Item<T>,
    item_idx: usize,
    percentage: Option<f32>,
    kind: visit::SourceKind,
    stats: &mut Outcome,
    objects: &impl gix_object::FindObjectOrHeader,
    diff_cache: &mut crate::blob::Platform,
    path_backing: &[u8],
) -> Result<Option<SourceTuple<'a, T>>, emit::Error> {
    let (item_id, item_mode) = item.change.id_and_entry_mode();
    if needs_exact_match(percentage) || item_mode.is_link() {
        let first_idx = items.partition_point(|a| a.change.id() < item_id);
        let range = match items.get(first_idx..).map(|items| {
            let end = items
                .iter()
                .position(|a| a.change.id() != item_id)
                .map_or(items.len(), |idx| first_idx + idx);
            first_idx..end
        }) {
            Some(range) => range,
            None => return Ok(None),
        };
        if range.is_empty() {
            return Ok(None);
        }
        let res = items[range.clone()].iter().enumerate().find_map(|(mut src_idx, src)| {
            src_idx += range.start;
            (src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)).then_some((src_idx, src, None))
        });
        if let Some(src) = res {
            return Ok(Some(src));
        }
    } else {
        let mut has_new = false;
        let percentage = percentage.expect("it's set to something below 1.0 and we assured this");
        debug_assert!(
            item_mode.is_blob(),
            "symlinks are matched exactly, and trees aren't used here"
        );

        for (can_idx, src) in items
            .iter()
            .enumerate()
            .filter(|(src_idx, src)| *src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode))
        {
            if !has_new {
                diff_cache.set_resource(
                    item_id.to_owned(),
                    item_mode.kind(),
                    item.location(path_backing),
                    ResourceKind::NewOrDestination,
                    objects,
                )?;
                has_new = true;
            }
            let (src_id, src_mode) = src.change.id_and_entry_mode();
            diff_cache.set_resource(
                src_id.to_owned(),
                src_mode.kind(),
                src.location(path_backing),
                ResourceKind::OldOrSource,
                objects,
            )?;
            let prep = diff_cache.prepare_diff()?;
            stats.num_similarity_checks += 1;
            match prep.operation {
                Operation::InternalDiff { algorithm } => {
                    let tokens =
                        crate::blob::intern::InternedInput::new(prep.old.intern_source(), prep.new.intern_source());
                    let counts = crate::blob::diff(
                        algorithm,
                        &tokens,
                        crate::blob::sink::Counter::new(diff::Statistics {
                            removed_bytes: 0,
                            input: &tokens,
                        }),
                    );
                    let old_data_len = prep.old.data.as_slice().unwrap_or_default().len();
                    let new_data_len = prep.new.data.as_slice().unwrap_or_default().len();
                    let similarity = (old_data_len - counts.wrapped) as f32 / old_data_len.max(new_data_len) as f32;
                    if similarity >= percentage {
                        return Ok(Some((
                            can_idx,
                            src,
                            DiffLineStats {
                                removals: counts.removals,
                                insertions: counts.insertions,
                                before: tokens.before.len().try_into().expect("interner handles only u32"),
                                after: tokens.after.len().try_into().expect("interner handles only u32"),
                                similarity,
                            }
                            .into(),
                        )));
                    }
                }
                Operation::ExternalCommand { .. } => {
                    unreachable!("we have disabled this possibility with an option")
                }
                Operation::SourceOrDestinationIsBinary => {
                    // TODO: figure out if git does more here
                }
            };
        }
    }
    Ok(None)
}

mod diff {
    use std::ops::Range;

    pub struct Statistics<'a, 'data> {
        pub removed_bytes: usize,
        pub input: &'a crate::blob::intern::InternedInput<&'data [u8]>,
    }

    impl<'a, 'data> crate::blob::Sink for Statistics<'a, 'data> {
        type Out = usize;

        fn process_change(&mut self, before: Range<u32>, _after: Range<u32>) {
            self.removed_bytes = self.input.before[before.start as usize..before.end as usize]
                .iter()
                .map(|token| self.input.interner[*token].len())
                .sum();
        }

        fn finish(self) -> Self::Out {
            self.removed_bytes
        }
    }
}

#[cfg(test)]
mod estimate_involved_items {
    use super::estimate_involved_items;
    use crate::rewrites::tracker::{visit::SourceKind, ChangeKind};

    #[test]
    fn renames_count_unemitted_as_sources_and_destinations() {
        let items = [
            (false, ChangeKind::Addition),
            (true, ChangeKind::Deletion),
            (true, ChangeKind::Deletion),
        ];
        assert_eq!(
            estimate_involved_items(items, SourceKind::Rename),
            (0, 1),
            "here we only have one eligible source, hence nothing to do"
        );
        assert_eq!(
            estimate_involved_items(items.into_iter().map(|t| (false, t.1)), SourceKind::Rename),
            (2, 1),
            "now we have more possibilities as renames count un-emitted deletions as source"
        );
    }

    #[test]
    fn copies_do_not_count_additions_as_sources() {
        let items = [
            (false, ChangeKind::Addition),
            (true, ChangeKind::Addition),
            (true, ChangeKind::Deletion),
        ];
        assert_eq!(
            estimate_involved_items(items, SourceKind::Copy),
            (0, 1),
            "one addition as source, the other isn't counted as it's emitted, nor is it considered a copy-source.\
            deletions don't count"
        );
    }

    #[test]
    fn copies_count_modifications_as_sources() {
        let items = [
            (false, ChangeKind::Addition),
            (true, ChangeKind::Modification),
            (false, ChangeKind::Modification),
        ];
        assert_eq!(
            estimate_involved_items(items, SourceKind::Copy),
            (2, 1),
            "any modifications is a valid source, emitted or not"
        );
    }
}