gix_index/decode/mod.rs

use filetime::FileTime;

use crate::{entry, extension, Entry, State, Version};

mod entries;
///
pub mod header;

mod error {
    use crate::{decode, extension};

    /// The error returned by [`State::from_bytes()`][crate::State::from_bytes()].
    #[derive(Debug, thiserror::Error)]
    #[allow(missing_docs)]
    pub enum Error {
        #[error(transparent)]
        Header(#[from] decode::header::Error),
        #[error("Could not parse entry at index {index}")]
        Entry { index: u32 },
        #[error("Mandatory extension wasn't implemented, or was malformed.")]
        Extension(#[from] extension::decode::Error),
        #[error("Index trailer should have been {expected} bytes long, but was {actual}")]
        UnexpectedTrailerLength { expected: usize, actual: usize },
        #[error("Shared index checksum was {actual_checksum} but should have been {expected_checksum}")]
        ChecksumMismatch {
            actual_checksum: gix_hash::ObjectId,
            expected_checksum: gix_hash::ObjectId,
        },
    }
}
pub use error::Error;
use gix_features::parallel::InOrderIter;

use crate::util::read_u32;

/// Options to define how to decode an index state [from bytes][State::from_bytes()].
#[derive(Debug, Default, Clone, Copy)]
pub struct Options {
    /// If `None` or `Some(0)`, use as many threads as there are logical cores. If `Some(N)` with `N > 0`, use no more than `N` threads.
    ///
    /// This applies to loading extensions in parallel to entries if the common EOIE extension is available.
    /// It also allows the use of multiple threads for loading entries if the IEOT extension is present.
    pub thread_limit: Option<usize>,
    /// The minimum size of the extension block in bytes for it to be loaded in its own thread, assuming enough threads are available.
    /// If set to 0, for example, extensions will always be read in their own thread if enough threads are available.
    pub min_extension_block_in_bytes_for_threading: usize,
    /// The expected hash of this index, to be set if it is read as part of a `link` extension.
    ///
    /// We will abort reading this file if it doesn't match.
    pub expected_checksum: Option<gix_hash::ObjectId>,
}
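
// A minimal usage sketch, assuming `index_bytes` holds the raw contents of an index file
// and SHA-1 is in use (both are assumptions for illustration, not part of this module):
//
//     let opts = Options {
//         thread_limit: Some(4), // use at most 4 threads
//         min_extension_block_in_bytes_for_threading: 0, // offload extensions whenever threads allow
//         expected_checksum: None, // we are not reading as part of a `link` extension
//     };
//     let (state, checksum) =
//         State::from_bytes(&index_bytes, FileTime::now(), gix_hash::Kind::Sha1, opts)?;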

impl State {
    /// Decode an index state from `data`, storing `timestamp` in the resulting instance for pass-through and assuming `object_hash`
    /// to be used throughout the file. Also return the stored hash over all bytes in `data`, or `None` if none was written due to `index.skipHash`.
    pub fn from_bytes(
        data: &[u8],
        timestamp: FileTime,
        object_hash: gix_hash::Kind,
        _options @ Options {
            thread_limit,
            min_extension_block_in_bytes_for_threading,
            expected_checksum,
        }: Options,
    ) -> Result<(Self, Option<gix_hash::ObjectId>), Error> {
        let _span = gix_features::trace::detail!("gix_index::State::from_bytes()", options = ?_options);
        let (version, num_entries, post_header_data) = header::decode(data, object_hash)?;
        let start_of_extensions = extension::end_of_index_entry::decode(data, object_hash);

        let mut num_threads = gix_features::parallel::num_threads(thread_limit);
        let path_backing_buffer_size = entries::estimate_path_storage_requirements_in_bytes(
            num_entries,
            data.len(),
            start_of_extensions,
            object_hash,
            version,
        );

        let (entries, ext, data) = match start_of_extensions {
            Some(offset) if num_threads > 1 => {
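                // Parallel path: the EOIE extension tells us where entries end, so extensions can be
                // decoded on their own thread while entries are parsed, and the IEOT extension
                // (if present) additionally allows entry chunks to be parsed on multiple threads.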
                let extensions_data = &data[offset..];
                let index_offsets_table = extension::index_entry_offset_table::find(extensions_data, object_hash);
                let (entries_res, ext_res) = gix_features::parallel::threads(|scope| {
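                    // Set one thread aside for loading extensions if their block is big enough to warrant it.
                    // Note that the thread budget shrinks eagerly here: the block passed to `then()` is
                    // evaluated unconditionally, so `num_threads` is decremented even if no extension
                    // thread ends up being spawned.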
                    let extension_loading =
                        (extensions_data.len() > min_extension_block_in_bytes_for_threading).then({
                            num_threads -= 1;
                            || {
                                gix_features::parallel::build_thread()
                                    .name("gix-index.from_bytes.load-extensions".into())
                                    .spawn_scoped(scope, || extension::decode::all(extensions_data, object_hash))
                                    .expect("valid name")
                            }
                        });
                    let entries_res = match index_offsets_table {
                        Some(entry_offsets) => {
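                            // Distribute the IEOT offset blocks evenly across the remaining threads,
                            // rounding up so the final chunk may be smaller than the rest.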
                            let chunk_size = (entry_offsets.len() as f32 / num_threads as f32).ceil() as usize;
                            let entry_offsets_chunked = entry_offsets.chunks(chunk_size);
                            let num_chunks = entry_offsets_chunked.len();
                            let mut threads = Vec::with_capacity(num_chunks);
                            for (id, chunks) in entry_offsets_chunked.enumerate() {
                                let chunks = chunks.to_vec();
                                threads.push(
                                    gix_features::parallel::build_thread()
                                        .name(format!("gix-index.from_bytes.read-entries.{id}"))
                                        .spawn_scoped(scope, move || {
                                            let num_entries_for_chunks =
                                                chunks.iter().map(|c| c.num_entries).sum::<u32>() as usize;
                                            let mut entries = Vec::with_capacity(num_entries_for_chunks);
                                            let path_backing_buffer_size_for_chunks =
                                                entries::estimate_path_storage_requirements_in_bytes(
                                                    num_entries_for_chunks as u32,
                                                    data.len() / num_chunks,
                                                    start_of_extensions.map(|ofs| ofs / num_chunks),
                                                    object_hash,
                                                    version,
                                                );
                                            let mut path_backing =
                                                Vec::with_capacity(path_backing_buffer_size_for_chunks);
                                            let mut is_sparse = false;
                                            for offset in chunks {
                                                let (
                                                    entries::Outcome {
                                                        is_sparse: chunk_is_sparse,
                                                    },
                                                    _data,
                                                ) = entries::chunk(
                                                    &data[offset.from_beginning_of_file as usize..],
                                                    &mut entries,
                                                    &mut path_backing,
                                                    offset.num_entries,
                                                    object_hash,
                                                    version,
                                                )?;
                                                is_sparse |= chunk_is_sparse;
                                            }
                                            Ok::<_, Error>((
                                                id,
                                                EntriesOutcome {
                                                    entries,
                                                    path_backing,
                                                    is_sparse,
                                                },
                                            ))
                                        })
                                        .expect("valid name"),
                                );
                            }
                            let mut results =
                                InOrderIter::from(threads.into_iter().map(|thread| thread.join().unwrap()));
                            let mut acc = results.next().expect("have at least two results, one per thread");
                            // We explicitly don't adjust the reserve in acc and rather allow for more copying
                            // to happen as vectors grow to keep the peak memory size low.
                            // NOTE: one day, we might use a memory pool for paths. We could encode the block of memory
                            //       in some bytes in the path offset. That way there is more indirection/slower access
                            //       to the path, but it would save time here.
                            //       As it stands, `git` is definitely more efficient at this and probably uses less memory too.
                            //       Maybe benchmarks can tell if that is noticeable later at 200/400GB/s memory bandwidth, or maybe just
                            //       100GB/s on a single core.
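                            // Merge the per-thread outcomes in order, rebasing each entry's path range
                            // onto the end of the accumulated path backing; the first error encountered wins.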
                            while let (Ok(lhs), Some(res)) = (acc.as_mut(), results.next()) {
                                match res {
                                    Ok(mut rhs) => {
                                        lhs.is_sparse |= rhs.is_sparse;
                                        let ofs = lhs.path_backing.len();
                                        lhs.path_backing.append(&mut rhs.path_backing);
                                        lhs.entries.extend(rhs.entries.into_iter().map(|mut e| {
                                            e.path.start += ofs;
                                            e.path.end += ofs;
                                            e
                                        }));
                                    }
                                    Err(err) => {
                                        acc = Err(err);
                                    }
                                }
                            }
                            acc.map(|acc| (acc, &data[data.len() - object_hash.len_in_bytes()..]))
                        }
                        None => entries(
                            post_header_data,
                            path_backing_buffer_size,
                            num_entries,
                            object_hash,
                            version,
                        ),
                    };
                    let ext_res = extension_loading.map_or_else(
                        || extension::decode::all(extensions_data, object_hash),
                        |thread| thread.join().unwrap(),
                    );
                    (entries_res, ext_res)
                });
                let (ext, data) = ext_res?;
                (entries_res?.0, ext, data)
            }
            None | Some(_) => {
                let (entries, data) = entries(
                    post_header_data,
                    path_backing_buffer_size,
                    num_entries,
                    object_hash,
                    version,
                )?;
                let (ext, data) = extension::decode::all(data, object_hash)?;
                (entries, ext, data)
            }
        };

        if data.len() != object_hash.len_in_bytes() {
            return Err(Error::UnexpectedTrailerLength {
                expected: object_hash.len_in_bytes(),
                actual: data.len(),
            });
        }
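
        // A null trailer hash means no checksum was written, e.g. due to `index.skipHash=true`,
        // which we surface as `None`.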
        let checksum = gix_hash::ObjectId::from_bytes_or_panic(data);
        let checksum = (!checksum.is_null()).then_some(checksum);
        if let Some((expected_checksum, actual_checksum)) = expected_checksum.zip(checksum) {
            if actual_checksum != expected_checksum {
                return Err(Error::ChecksumMismatch {
                    actual_checksum,
                    expected_checksum,
                });
            }
        }
        let EntriesOutcome {
            entries,
            path_backing,
            mut is_sparse,
        } = entries;
        let extension::decode::Outcome {
            tree,
            link,
            resolve_undo,
            untracked,
            fs_monitor,
            is_sparse: is_sparse_from_ext, // a marker is needed in case there are no directories
            end_of_index,
            offset_table,
        } = ext;
        is_sparse |= is_sparse_from_ext;

        Ok((
            State {
                object_hash,
                timestamp,
                version,
                entries,
                path_backing,
                is_sparse,

                end_of_index_at_decode_time: end_of_index,
                offset_table_at_decode_time: offset_table,
                tree,
                link,
                resolve_undo,
                untracked,
                fs_monitor,
            },
            checksum,
        ))
    }
}
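
/// The accumulated result of decoding all index entries, either sequentially or by merging
/// the outcomes of multiple entry-parsing threads.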
struct EntriesOutcome {
    pub entries: Vec<Entry>,
    pub path_backing: Vec<u8>,
    pub is_sparse: bool,
}
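
/// Decode all `num_entries` entries sequentially from `post_header_data`, pre-allocating
/// `path_backing_buffer_size` bytes for the buffer that backs all entry paths.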
fn entries(
    post_header_data: &[u8],
    path_backing_buffer_size: usize,
    num_entries: u32,
    object_hash: gix_hash::Kind,
    version: Version,
) -> Result<(EntriesOutcome, &[u8]), Error> {
    let mut entries = Vec::with_capacity(num_entries as usize);
    let mut path_backing = Vec::with_capacity(path_backing_buffer_size);
    entries::chunk(
        post_header_data,
        &mut entries,
        &mut path_backing,
        num_entries,
        object_hash,
        version,
    )
    .map(|(entries::Outcome { is_sparse }, data): (entries::Outcome, &[u8])| {
        (
            EntriesOutcome {
                entries,
                path_backing,
                is_sparse,
            },
            data,
        )
    })
}
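
/// Parse a stat record of nine `u32` fields in on-disk order: ctime (seconds, nanoseconds),
/// mtime (seconds, nanoseconds), dev, ino, uid, gid and size, returning it along with the
/// remaining unconsumed bytes.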
pub(crate) fn stat(data: &[u8]) -> Option<(entry::Stat, &[u8])> {
    let (ctime_secs, data) = read_u32(data)?;
    let (ctime_nsecs, data) = read_u32(data)?;
    let (mtime_secs, data) = read_u32(data)?;
    let (mtime_nsecs, data) = read_u32(data)?;
    let (dev, data) = read_u32(data)?;
    let (ino, data) = read_u32(data)?;
    let (uid, data) = read_u32(data)?;
    let (gid, data) = read_u32(data)?;
    let (size, data) = read_u32(data)?;
    Some((
        entry::Stat {
            mtime: entry::stat::Time {
                secs: mtime_secs,
                nsecs: mtime_nsecs,
            },
            ctime: entry::stat::Time {
                secs: ctime_secs,
                nsecs: ctime_nsecs,
            },
            dev,
            ino,
            uid,
            gid,
            size,
        },
        data,
    ))
}