gix_pack/multi_index/chunk.rs

/// Information for the chunk about index names.
pub mod index_names {
    use std::path::{Path, PathBuf};

    use gix_object::bstr::{BString, ByteSlice};

    /// The ID used for the index-names chunk.
    pub const ID: gix_chunk::Id = *b"PNAM";

    /// Decoding of the index-names chunk.
    pub mod decode {
        use gix_object::bstr::BString;

        /// The error returned by [`from_bytes()`][super::from_bytes()].
        #[derive(Debug, thiserror::Error)]
        #[allow(missing_docs)]
        pub enum Error {
            #[error("The pack names were not ordered alphabetically.")]
            NotOrderedAlphabetically,
            #[error("Each pack path name must be terminated with a null byte.")]
            MissingNullByte,
            #[error("Couldn't turn path '{path}' into an OS path due to encoding issues.")]
            PathEncoding { path: BString },
            #[error("Non-padding bytes found after all paths were read.")]
            UnknownTrailerBytes,
        }
    }

    /// Parse `num_packs` null-terminated index names from the given `chunk` of bytes.
    /// Trailing padding bytes, which are typically `\0`, are ignored.
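    /// For example, a chunk of `b"pack-a.idx\0pack-b.idx\0\0"` with `num_packs == 2` yields the
    /// paths `pack-a.idx` and `pack-b.idx`, with the remaining zero byte treated as padding.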
    pub fn from_bytes(mut chunk: &[u8], num_packs: u32) -> Result<Vec<PathBuf>, decode::Error> {
        let mut out = Vec::new();
        for _ in 0..num_packs {
            let null_byte_pos = chunk.find_byte(b'\0').ok_or(decode::Error::MissingNullByte)?;

            let path = &chunk[..null_byte_pos];
            let path = gix_path::try_from_byte_slice(path)
                .map_err(|_| decode::Error::PathEncoding {
                    path: BString::from(path),
                })?
                .to_owned();

            if let Some(previous) = out.last() {
                if previous >= &path {
                    return Err(decode::Error::NotOrderedAlphabetically);
                }
            }
            out.push(path);

            chunk = &chunk[null_byte_pos + 1..];
        }

        if !chunk.is_empty() && !chunk.iter().all(|b| *b == 0) {
            return Err(decode::Error::UnknownTrailerBytes);
        }
        // NOTE: git writes garbage into this chunk, usually extra \0 bytes, which we simply ignore. If we were strict
        // about it we couldn't read this chunk data at all.
        Ok(out)
    }

    /// Calculate the size on disk for our chunk with the given index paths. Note that these are expected
    /// to have been processed already to actually be file names.
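    /// For example, a single path `pack-a.idx` takes 10 bytes plus a null byte, which is then
    /// padded to 12 bytes to meet the 4-byte chunk alignment.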
    pub fn storage_size(paths: impl IntoIterator<Item = impl AsRef<Path>>) -> u64 {
        let mut count = 0u64;
        for path in paths {
            let path = path.as_ref();
            let ascii_path = path.to_str().expect("UTF-8 compatible paths");
            assert!(
                ascii_path.is_ascii(),
                "must use ascii bytes for correct size computation"
            );
            count += (ascii_path.len() + 1 /* null byte */) as u64;
        }

        let needed_alignment = CHUNK_ALIGNMENT - (count % CHUNK_ALIGNMENT);
        if needed_alignment < CHUNK_ALIGNMENT {
            count += needed_alignment;
        }
        count
    }

    /// Write all `paths` in order to `out`, including padding.
    pub fn write(
        paths: impl IntoIterator<Item = impl AsRef<Path>>,
        out: &mut dyn std::io::Write,
    ) -> std::io::Result<()> {
        let mut written_bytes = 0;
        for path in paths {
            let path = path.as_ref().to_str().expect("UTF-8 path");
            out.write_all(path.as_bytes())?;
            out.write_all(&[0])?;
            written_bytes += path.len() as u64 + 1;
        }

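        // Pad the chunk to the next 4-byte boundary, mirroring `storage_size()`.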
        let needed_alignment = CHUNK_ALIGNMENT - (written_bytes % CHUNK_ALIGNMENT);
        if needed_alignment < CHUNK_ALIGNMENT {
            let padding = [0u8; CHUNK_ALIGNMENT as usize];
            out.write_all(&padding[..needed_alignment as usize])?;
        }
        Ok(())
    }

    const CHUNK_ALIGNMENT: u64 = 4;
}

/// Information for the chunk with the fanout table.
pub mod fanout {
    use crate::multi_index;

    /// The size of the fanout table in bytes.
    pub const SIZE: usize = 4 * 256;

    /// The id uniquely identifying the fanout table.
    pub const ID: gix_chunk::Id = *b"OIDF";

    /// Decode the fanout table contained in `chunk`, or return `None` if it didn't have the expected size.
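    /// Each of the 256 entries is a big-endian `u32` holding the cumulative count of objects
    /// whose first hash byte is less than or equal to the entry's index.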
    pub fn from_bytes(chunk: &[u8]) -> Option<[u32; 256]> {
        if chunk.len() != SIZE {
            return None;
        }
        let mut out = [0; 256];
        for (c, f) in chunk.chunks_exact(4).zip(out.iter_mut()) {
            *f = u32::from_be_bytes(c.try_into().unwrap());
        }
        Some(out)
    }

    /// Write the fanout for the given entries, which must be sorted by oid.
    pub(crate) fn write(
        sorted_entries: &[multi_index::write::Entry],
        out: &mut dyn std::io::Write,
    ) -> std::io::Result<()> {
        let fanout = crate::index::encode::fanout(&mut sorted_entries.iter().map(|e| e.id.first_byte()));

        for value in fanout.iter() {
            out.write_all(&value.to_be_bytes())?;
        }
        Ok(())
    }
}

/// Information about the oid lookup table.
pub mod lookup {
    use std::ops::Range;

    use crate::multi_index;

    /// The id uniquely identifying the oid lookup table.
    pub const ID: gix_chunk::Id = *b"OIDL";

    /// Return the amount of bytes needed to store the data on disk for the given number of `entries`.
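    /// For SHA-1, for example, that is 20 bytes per entry.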
    pub fn storage_size(entries: usize, object_hash: gix_hash::Kind) -> u64 {
        (entries * object_hash.len_in_bytes()) as u64
    }

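    /// Write the object ids of `sorted_entries` to `out` back to back, forming the lookup table.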
    pub(crate) fn write(
        sorted_entries: &[multi_index::write::Entry],
        out: &mut dyn std::io::Write,
    ) -> std::io::Result<()> {
        for entry in sorted_entries {
            out.write_all(entry.id.as_slice())?;
        }
        Ok(())
    }

    /// Return true if the size of the `offset` range seems to match for a `hash` of the given kind and the amount of objects.
    pub fn is_valid(offset: &Range<usize>, hash: gix_hash::Kind, num_objects: u32) -> bool {
        (offset.end - offset.start) / hash.len_in_bytes() == num_objects as usize
    }
}

/// Information about the offsets table.
pub mod offsets {
    use std::ops::Range;

    use crate::multi_index;

    /// The id uniquely identifying the offsets table.
    pub const ID: gix_chunk::Id = *b"OOFF";

    /// Return the amount of bytes needed to store the offset data for `entries` entries.
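    /// Each entry needs 8 bytes: a 4-byte pack id followed by a 4-byte pack offset.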
    pub fn storage_size(entries: usize) -> u64 {
        (entries * (4 /* pack-id */ + 4 /* pack-offset */)) as u64
    }

    pub(crate) fn write(
        sorted_entries: &[multi_index::write::Entry],
        large_offsets_needed: bool,
        out: &mut dyn std::io::Write,
    ) -> std::io::Result<()> {
        use crate::index::encode::{HIGH_BIT, LARGE_OFFSET_THRESHOLD};
        let mut num_large_offsets = 0u32;

        for entry in sorted_entries {
            out.write_all(&entry.pack_index.to_be_bytes())?;

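            // Offsets beyond the threshold are replaced by an index into the large-offsets
            // chunk, marked by setting the high bit.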
            let offset: u32 = if large_offsets_needed {
                if entry.pack_offset > LARGE_OFFSET_THRESHOLD {
                    let res = num_large_offsets | HIGH_BIT;
                    num_large_offsets += 1;
                    res
                } else {
                    entry.pack_offset as u32
                }
            } else {
                entry
                    .pack_offset
                    .try_into()
                    .expect("without large offsets, pack-offset fits u32")
            };
            out.write_all(&offset.to_be_bytes())?;
        }
        Ok(())
    }

    /// Returns true if the `offset` range seems to match the size required for `num_objects`.
    pub fn is_valid(offset: &Range<usize>, num_objects: u32) -> bool {
        let entry_size = 4 /* pack-id */ + 4 /* pack-offset */;
        ((offset.end - offset.start) / num_objects as usize) == entry_size
    }
}

/// Information about the large offsets table.
pub mod large_offsets {
    use std::ops::Range;

    use crate::{index::encode::LARGE_OFFSET_THRESHOLD, multi_index};

    /// The id uniquely identifying the large offsets table (with 64 bit offsets).
    pub const ID: gix_chunk::Id = *b"LOFF";

    /// Returns `Some(num_large_offsets)` if at least one offset is too large to fit into a `u32`.
    pub(crate) fn num_large_offsets(entries: &[multi_index::write::Entry]) -> Option<usize> {
        let mut num_large_offsets = 0;
        let mut needs_large_offsets = false;
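        // Offsets above the threshold will go into the large-offsets chunk, but the chunk is
        // only required once at least one offset doesn't fit into a u32.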
        for entry in entries {
            if entry.pack_offset > LARGE_OFFSET_THRESHOLD {
                num_large_offsets += 1;
            }
            if entry.pack_offset > crate::data::Offset::from(u32::MAX) {
                needs_large_offsets = true;
            }
        }

        needs_large_offsets.then_some(num_large_offsets)
    }

    /// Returns true if the `offset` range seems to be properly aligned for the data we expect.
    pub fn is_valid(offset: &Range<usize>) -> bool {
        (offset.end - offset.start) % 8 == 0
    }

    pub(crate) fn write(
        sorted_entries: &[multi_index::write::Entry],
        mut num_large_offsets: usize,
        out: &mut dyn std::io::Write,
    ) -> std::io::Result<()> {
        for offset in sorted_entries
            .iter()
            .filter_map(|e| (e.pack_offset > LARGE_OFFSET_THRESHOLD).then_some(e.pack_offset))
        {
            out.write_all(&offset.to_be_bytes())?;
            num_large_offsets = num_large_offsets
                .checked_sub(1)
                .expect("BUG: wrote more offsets than previously found");
        }
        assert_eq!(num_large_offsets, 0, "BUG: wrote fewer offsets than initially counted");
        Ok(())
    }

    /// Return the amount of bytes needed to store the given number of `large_offsets`, at 8 bytes each.
    pub(crate) fn storage_size(large_offsets: usize) -> u64 {
        8 * large_offsets as u64
    }
}