gix_odb/store_impls/dynamic/
iter.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
use std::{ops::Deref, option::Option::None, sync::Arc, vec::IntoIter};

use gix_hash::ObjectId;

use crate::{
    loose,
    store::{handle, handle::SingleOrMultiIndex, types::PackId},
    store_impls::dynamic,
};

struct EntryForOrdering {
    pack_offset: u64,
    entry_index: u32,
    pack_index: u16,
}

enum State {
    Pack {
        index_iter: IntoIter<handle::IndexLookup>,
        index: handle::IndexLookup,
        ordered_entries: Option<Vec<EntryForOrdering>>,
        entry_index: u32,
        num_objects: u32,
    },
    Loose {
        iter: loose::Iter,
        index: usize,
    },
    Depleted,
}

/// Define the order in which objects are returned.
#[derive(Default, Debug, Copy, Clone)]
pub enum Ordering {
    /// Traverse packs first as sorted by their index files in lexicographical order (sorted by object id), then traverse loose objects
    /// as sorted by their names as well.
    ///
    /// This mode uses no memory as it's the natural ordering of objects, and is best to obtain all object ids as quickly as possible,
    /// while noting that these may contain duplicates. However, it's very costly to obtain object information or decode them with this
    /// scheme as cache-hits are unlikely with it and memory maps are less efficient when loading them in random order.
    #[default]
    PackLexicographicalThenLooseLexicographical,
    /// Traverse packs first yielding object ids sorted by their position in the pack, with those at the beginning of the pack file coming first.
    /// Then follow loose objects sorted by their names.
    ///
    /// This mode allocates and as to pre-sort objects by their offsets, delaying the start of the iteration once per pack while keeping
    /// memory allocated once per pack. This price is usually worth paying once querying object information is planned as pack caches
    /// are more efficiently used that way.
    PackAscendingOffsetThenLooseLexicographical,
}

/// An iterator over all, _possibly duplicate_, objects of an object store, which by default uses no extra memory but yields an
/// order that is costly to traverse when querying object information or decoding them.
///
/// Use [`with_ordering()`][AllObjects::with_ordering()] to choose a performance trade-off.
pub struct AllObjects {
    state: State,
    num_objects: usize,
    loose_dbs: Arc<Vec<loose::Store>>,
    order: Ordering,
}

/// Builder
impl AllObjects {
    /// Set the ordering of the objects returned, trading off memory and latency for object query performance.
    pub fn with_ordering(mut self, order: Ordering) -> Self {
        self.order = order;
        self
    }
}

impl AllObjects {
    /// Create a new iterator from a dynamic store, which will be forced to load all indices eagerly and in the current thread.
    pub fn new(db: &dynamic::Store) -> Result<Self, crate::store::load_index::Error> {
        let snapshot = db.load_all_indices()?;

        let packed_objects = snapshot
            .indices
            .iter()
            .fold(0usize, |dbc, index| dbc.saturating_add(index.num_objects() as usize));
        let mut index_iter = snapshot.indices.into_iter();
        let loose_dbs = snapshot.loose_dbs;
        let order = Default::default();
        let state = match index_iter.next() {
            Some(index) => {
                let num_objects = index.num_objects();
                State::Pack {
                    index_iter,
                    ordered_entries: maybe_sort_entries(&index, order),
                    index,
                    entry_index: 0,
                    num_objects,
                }
            }
            None => {
                let index = 0;
                State::Loose {
                    iter: loose_dbs.get(index).expect("at least one loose db").iter(),
                    index,
                }
            }
        };
        Ok(AllObjects {
            state,
            loose_dbs,
            num_objects: packed_objects,
            order,
        })
    }
}

fn maybe_sort_entries(index: &handle::IndexLookup, order: Ordering) -> Option<Vec<EntryForOrdering>> {
    let mut order: Vec<_> = match order {
        Ordering::PackLexicographicalThenLooseLexicographical => return None,
        Ordering::PackAscendingOffsetThenLooseLexicographical => match &index.file {
            // We know that we cannot have more than u32 entry indices per pack.
            SingleOrMultiIndex::Single { index, .. } => index
                .iter()
                .enumerate()
                .map(|(idx, e)| EntryForOrdering {
                    pack_offset: e.pack_offset,
                    entry_index: idx as u32,
                    pack_index: 0,
                })
                .collect(),
            SingleOrMultiIndex::Multi { index, .. } => index
                .iter()
                .enumerate()
                .map(|(idx, e)| EntryForOrdering {
                    pack_offset: e.pack_offset,
                    entry_index: idx as u32,
                    pack_index: {
                        debug_assert!(
                            e.pack_index < PackId::max_packs_in_multi_index(),
                            "this shows the relation between u16 and pack_index (u32) and why this is OK"
                        );
                        e.pack_index as u16
                    },
                })
                .collect(),
        },
    };
    order.sort_by(|a, b| {
        a.pack_index
            .cmp(&b.pack_index)
            .then_with(|| a.pack_offset.cmp(&b.pack_offset))
    });
    Some(order)
}

impl Iterator for AllObjects {
    type Item = Result<ObjectId, loose::iter::Error>;

    fn next(&mut self) -> Option<Self::Item> {
        match &mut self.state {
            State::Depleted => None,
            State::Pack {
                index_iter,
                ordered_entries,
                index,
                entry_index,
                num_objects,
            } => {
                if *entry_index < *num_objects {
                    let oid = match ordered_entries {
                        Some(entries) => index.oid_at_index(entries[*entry_index as usize].entry_index),
                        None => index.oid_at_index(*entry_index),
                    }
                    .to_owned();
                    *entry_index += 1;
                    Some(Ok(oid))
                } else {
                    match index_iter.next() {
                        Some(new_index) => {
                            *ordered_entries = maybe_sort_entries(&new_index, self.order);
                            *index = new_index;
                            *entry_index = 0;
                            *num_objects = index.num_objects();
                        }
                        None => {
                            let index = 0;
                            self.state = State::Loose {
                                iter: self.loose_dbs.get(index).expect("at least one loose odb").iter(),
                                index,
                            }
                        }
                    }
                    self.next()
                }
            }
            State::Loose { iter, index } => match iter.next() {
                Some(id) => Some(id),
                None => {
                    *index += 1;
                    match self.loose_dbs.get(*index).map(loose::Store::iter) {
                        Some(new_iter) => {
                            *iter = new_iter;
                            self.next()
                        }
                        None => {
                            self.state = State::Depleted;
                            None
                        }
                    }
                }
            },
        }
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        (self.num_objects, None)
    }
}

impl<S> super::Handle<S>
where
    S: Deref<Target = super::Store> + Clone,
{
    /// Return an iterator over all, _possibly duplicate_, objects, first the ones in all packs of all linked databases (via alternates),
    /// followed by all loose objects.
    pub fn iter(&self) -> Result<AllObjects, dynamic::load_index::Error> {
        AllObjects::new(self.store_ref())
    }
}

impl dynamic::Store {
    /// Like [`Handle::iter()`][super::Handle::iter()], but accessible directly on the store.
    pub fn iter(&self) -> Result<AllObjects, dynamic::load_index::Error> {
        AllObjects::new(self)
    }
}