use filetime::FileTime;

use crate::{entry, extension, Entry, State, Version};

mod entries;
/// Decoding of the header of an index file.
pub mod header;

mod error {

    use crate::{decode, extension};

    /// The error returned by [`State::from_bytes()`][crate::State::from_bytes()].
    #[derive(Debug, thiserror::Error)]
    #[allow(missing_docs)]
    pub enum Error {
        #[error(transparent)]
        Header(#[from] decode::header::Error),
        #[error("Could not parse entry at index {index}")]
        Entry { index: u32 },
        #[error("Mandatory extension wasn't implemented or malformed.")]
        Extension(#[from] extension::decode::Error),
        #[error("Index trailer should have been {expected} bytes long, but was {actual}")]
        UnexpectedTrailerLength { expected: usize, actual: usize },
        #[error("Shared index checksum was {actual_checksum} but should have been {expected_checksum}")]
        ChecksumMismatch {
            actual_checksum: gix_hash::ObjectId,
            expected_checksum: gix_hash::ObjectId,
        },
    }
}
pub use error::Error;
use gix_features::parallel::InOrderIter;

use crate::util::read_u32;

/// Options to define how to decode an index state [from bytes][State::from_bytes()].
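///
/// A minimal construction sketch, limiting decoding to at most four threads (the `4` is an
/// arbitrary illustrative value):
///
/// ```
/// let opts = gix_index::decode::Options {
///     thread_limit: Some(4),
///     ..Default::default()
/// };
/// ```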
#[derive(Default, Clone, Copy)]
pub struct Options {
    /// If `Some(N)` with `N > 0`, use no more than `N` threads. If `Some(0)` or `None`, use as many threads as there
    /// are logical cores.
    ///
    /// This applies to loading extensions in parallel to entries if the common EOIE extension is available.
    /// It also allows using multiple threads for loading entries if the IEOT extension is present.
    pub thread_limit: Option<usize>,
    /// The minimum size of the extension block in bytes for it to be loaded in a separate thread, assuming enough
    /// threads are available. If set to 0, extensions will always be read in their own thread whenever one can be spared.
    pub min_extension_block_in_bytes_for_threading: usize,
    /// The expected hash of this index, to be set if this index is read as part of a `link` extension.
    ///
    /// Reading the file will be aborted if its actual checksum doesn't match this value.
    pub expected_checksum: Option<gix_hash::ObjectId>,
}

impl State {
    /// Decode an index state from `data`, storing `timestamp` in the resulting instance for pass-through and assuming
    /// `object_hash` to be used throughout the file. Also return the stored checksum over all bytes in `data`, or `None`
    /// if none was written due to `index.skipHash`.
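    ///
    /// A minimal usage sketch, assuming `index_bytes` holds the raw contents of a `.git/index` file
    /// (the variable name is illustrative):
    ///
    /// ```no_run
    /// # fn read(index_bytes: &[u8]) -> Result<(), gix_index::decode::Error> {
    /// let (state, checksum) = gix_index::State::from_bytes(
    ///     index_bytes,
    ///     filetime::FileTime::now(),
    ///     gix_hash::Kind::Sha1,
    ///     gix_index::decode::Options::default(),
    /// )?;
    /// # Ok(())
    /// # }
    /// ```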
    pub fn from_bytes(
        data: &[u8],
        timestamp: FileTime,
        object_hash: gix_hash::Kind,
        Options {
            thread_limit,
            min_extension_block_in_bytes_for_threading,
            expected_checksum,
        }: Options,
    ) -> Result<(Self, Option<gix_hash::ObjectId>), Error> {
        let _span = gix_features::trace::detail!("gix_index::State::from_bytes()");
        let (version, num_entries, post_header_data) = header::decode(data, object_hash)?;
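        // The EOIE extension, if present, records where the entries end and the extensions begin,
        // which is what makes loading entries and extensions in parallel possible.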
        let start_of_extensions = extension::end_of_index_entry::decode(data, object_hash);

        let mut num_threads = gix_features::parallel::num_threads(thread_limit);
        let path_backing_buffer_size = entries::estimate_path_storage_requirements_in_bytes(
            num_entries,
            data.len(),
            start_of_extensions,
            object_hash,
            version,
        );

        let (entries, ext, data) = match start_of_extensions {
            Some(offset) if num_threads > 1 => {
                let extensions_data = &data[offset..];
                let index_offsets_table = extension::index_entry_offset_table::find(extensions_data, object_hash);
                let (entries_res, ext_res) = gix_features::parallel::threads(|scope| {
                    let extension_loading =
                        (extensions_data.len() > min_extension_block_in_bytes_for_threading).then(|| {
                            // Reserve a thread for extension-loading only if the loader is actually spawned.
                            num_threads -= 1;
                            gix_features::parallel::build_thread()
                                .name("gix-index.from_bytes.load-extensions".into())
                                .spawn_scoped(scope, || extension::decode::all(extensions_data, object_hash))
                                .expect("valid name")
                        });
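                    // With the IEOT extension present, entry offsets are known upfront, so entries
                    // can be decoded in chunks, one scoped thread per chunk.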
                    let entries_res = match index_offsets_table {
                        Some(entry_offsets) => {
                            let chunk_size = (entry_offsets.len() as f32 / num_threads as f32).ceil() as usize;
                            let num_chunks = entry_offsets.chunks(chunk_size).count();
                            let mut threads = Vec::with_capacity(num_chunks);
                            for (id, chunks) in entry_offsets.chunks(chunk_size).enumerate() {
                                let chunks = chunks.to_vec();
                                threads.push(
                                    gix_features::parallel::build_thread()
                                        .name(format!("gix-index.from_bytes.read-entries.{id}"))
                                        .spawn_scoped(scope, move || {
                                            let num_entries_for_chunks =
                                                chunks.iter().map(|c| c.num_entries).sum::<u32>() as usize;
                                            let mut entries = Vec::with_capacity(num_entries_for_chunks);
                                            let path_backing_buffer_size_for_chunks =
                                                entries::estimate_path_storage_requirements_in_bytes(
                                                    num_entries_for_chunks as u32,
                                                    data.len() / num_chunks,
                                                    start_of_extensions.map(|ofs| ofs / num_chunks),
                                                    object_hash,
                                                    version,
                                                );
                                            let mut path_backing =
                                                Vec::with_capacity(path_backing_buffer_size_for_chunks);
                                            let mut is_sparse = false;
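                                            // Decode this thread's chunks one after another, each starting
                                            // at its recorded offset from the beginning of the file.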
                                            for offset in chunks {
                                                let (
                                                    entries::Outcome {
                                                        is_sparse: chunk_is_sparse,
                                                    },
                                                    _data,
                                                ) = entries::chunk(
                                                    &data[offset.from_beginning_of_file as usize..],
                                                    &mut entries,
                                                    &mut path_backing,
                                                    offset.num_entries,
                                                    object_hash,
                                                    version,
                                                )?;
                                                is_sparse |= chunk_is_sparse;
                                            }
                                            Ok::<_, Error>((
                                                id,
                                                EntriesOutcome {
                                                    entries,
                                                    path_backing,
                                                    is_sparse,
                                                },
                                            ))
                                        })
                                        .expect("valid name"),
                                );
                            }
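                            // Join the threads in spawn order and fold their outcomes into the first one,
                            // rebasing path offsets so they point into the combined path backing.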
                            let mut results =
                                InOrderIter::from(threads.into_iter().map(|thread| thread.join().unwrap()));
                            let mut acc = results.next().expect("have at least one result, one per thread");
                            // We explicitly don't adjust the reserve in `acc` and instead allow more copying
                            // to happen as vectors grow, to keep the peak memory usage low.
                            // NOTE: one day, we might use a memory pool for paths. We could encode the block of memory
                            //       in some bytes in the path offset. That way there is more indirection/slower access
                            //       to the path, but it would save time here.
                            //       As it stands, `git` is definitely more efficient at this and probably uses less memory too.
                            //       Maybe benchmarks can tell if that is noticeable later at 200/400GB/s memory bandwidth, or maybe just
                            //       100GB/s on a single core.
                            while let (Ok(lhs), Some(res)) = (acc.as_mut(), results.next()) {
                                match res {
                                    Ok(rhs) => {
                                        lhs.is_sparse |= rhs.is_sparse;
                                        let ofs = lhs.path_backing.len();
                                        lhs.path_backing.extend(rhs.path_backing);
                                        lhs.entries.extend(rhs.entries.into_iter().map(|mut e| {
                                            e.path.start += ofs;
                                            e.path.end += ofs;
                                            e
                                        }));
                                    }
                                    Err(err) => {
                                        acc = Err(err);
                                    }
                                }
                            }
                            acc.map(|acc| (acc, &data[data.len() - object_hash.len_in_bytes()..]))
                        }
                        None => entries(
                            post_header_data,
                            path_backing_buffer_size,
                            num_entries,
                            object_hash,
                            version,
                        ),
                    };
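                    // Join the extension-loading thread if one was spawned, otherwise decode
                    // the extensions on this thread now that the entries are done.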
                    let ext_res = extension_loading.map_or_else(
                        || extension::decode::all(extensions_data, object_hash),
                        |thread| thread.join().unwrap(),
                    );
                    (entries_res, ext_res)
                });
                let (ext, data) = ext_res?;
                (entries_res?.0, ext, data)
            }
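            // Sequential fallback: without an EOIE offset, or with just one thread available,
            // decode all entries first and then all extensions.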
            None | Some(_) => {
                let (entries, data) = entries(
                    post_header_data,
                    path_backing_buffer_size,
                    num_entries,
                    object_hash,
                    version,
                )?;
                let (ext, data) = extension::decode::all(data, object_hash)?;
                (entries, ext, data)
            }
        };

        if data.len() != object_hash.len_in_bytes() {
            return Err(Error::UnexpectedTrailerLength {
                expected: object_hash.len_in_bytes(),
                actual: data.len(),
            });
        }

        let checksum = gix_hash::ObjectId::from_bytes_or_panic(data);
        let checksum = (!checksum.is_null()).then_some(checksum);
        if let Some((expected_checksum, actual_checksum)) = expected_checksum.zip(checksum) {
            if actual_checksum != expected_checksum {
                return Err(Error::ChecksumMismatch {
                    actual_checksum,
                    expected_checksum,
                });
            }
        }
        let EntriesOutcome {
            entries,
            path_backing,
            mut is_sparse,
        } = entries;
        let extension::decode::Outcome {
            tree,
            link,
            resolve_undo,
            untracked,
            fs_monitor,
            is_sparse: is_sparse_from_ext, // a marker is needed in case there are no directories
        } = ext;
        is_sparse |= is_sparse_from_ext;

        Ok((
            State {
                object_hash,
                timestamp,
                version,
                entries,
                path_backing,
                is_sparse,

                tree,
                link,
                resolve_undo,
                untracked,
                fs_monitor,
            },
            checksum,
        ))
    }
}

struct EntriesOutcome {
    pub entries: Vec<Entry>,
    pub path_backing: Vec<u8>,
    pub is_sparse: bool,
}

fn entries(
    post_header_data: &[u8],
    path_backing_buffer_size: usize,
    num_entries: u32,
    object_hash: gix_hash::Kind,
    version: Version,
) -> Result<(EntriesOutcome, &[u8]), Error> {
    let mut entries = Vec::with_capacity(num_entries as usize);
    let mut path_backing = Vec::with_capacity(path_backing_buffer_size);
    entries::chunk(
        post_header_data,
        &mut entries,
        &mut path_backing,
        num_entries,
        object_hash,
        version,
    )
    .map(|(entries::Outcome { is_sparse }, data): (entries::Outcome, &[u8])| {
        (
            EntriesOutcome {
                entries,
                path_backing,
                is_sparse,
            },
            data,
        )
    })
}

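/// Decode the stat data of an index entry from `data`, returning it along with the remaining unconsumed bytes.
///
/// This reads, in file order, ctime and mtime (seconds and nanoseconds each), followed by
/// `dev`, `ino`, `uid`, `gid` and `size`.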
pub(crate) fn stat(data: &[u8]) -> Option<(entry::Stat, &[u8])> {
    let (ctime_secs, data) = read_u32(data)?;
    let (ctime_nsecs, data) = read_u32(data)?;
    let (mtime_secs, data) = read_u32(data)?;
    let (mtime_nsecs, data) = read_u32(data)?;
    let (dev, data) = read_u32(data)?;
    let (ino, data) = read_u32(data)?;
    let (uid, data) = read_u32(data)?;
    let (gid, data) = read_u32(data)?;
    let (size, data) = read_u32(data)?;
    Some((
        entry::Stat {
            ctime: entry::stat::Time {
                secs: ctime_secs,
                nsecs: ctime_nsecs,
            },
            mtime: entry::stat::Time {
                secs: mtime_secs,
                nsecs: mtime_nsecs,
            },
            dev,
            ino,
            uid,
            gid,
            size,
        },
        data,
    ))
}