string_interner/backend/
buffer.rs

1#![cfg(feature = "backends")]
2
3use super::Backend;
4use crate::{symbol::expect_valid_symbol, DefaultSymbol, Symbol};
5use alloc::vec::Vec;
6use core::{marker::PhantomData, mem, str};
7
8/// An interner backend that appends all interned string information in a single buffer.
9///
10/// # Usage Hint
11///
12/// Use this backend if memory consumption is what matters most to you.
13/// Note though that unlike all other backends symbol values are not contigous!
14///
15/// # Usage
16///
17/// - **Fill:** Efficiency of filling an empty string interner.
18/// - **Resolve:** Efficiency of interned string look-up given a symbol.
19/// - **Allocations:** The number of allocations performed by the backend.
20/// - **Footprint:** The total heap memory consumed by the backend.
21/// - **Contiguous:** True if the returned symbols have contiguous values.
22/// - **Iteration:** Efficiency of iterating over the interned strings.
23///
24/// Rating varies between **bad**, **ok**, **good** and **best**.
25///
26/// | Scenario    |  Rating  |
27/// |:------------|:--------:|
28/// | Fill        | **best** |
29/// | Resolve     | **bad**  |
30/// | Allocations | **best** |
31/// | Footprint   | **best** |
32/// | Supports `get_or_intern_static` | **no** |
33/// | `Send` + `Sync` | **yes** |
34/// | Contiguous  | **no**   |
35/// | Iteration   | **bad** |
36#[derive(Debug)]
37pub struct BufferBackend<S = DefaultSymbol> {
38    len_strings: usize,
39    buffer: Vec<u8>,
40    marker: PhantomData<fn() -> S>,
41}
42
43impl<S> PartialEq for BufferBackend<S>
44where
45    S: Symbol,
46{
47    fn eq(&self, other: &Self) -> bool {
48        self.len_strings.eq(&other.len_strings) && self.buffer.eq(&other.buffer)
49    }
50}
51
52impl<S> Eq for BufferBackend<S> where S: Symbol {}
53
54impl<S> Clone for BufferBackend<S> {
55    fn clone(&self) -> Self {
56        Self {
57            len_strings: self.len_strings,
58            buffer: self.buffer.clone(),
59            marker: Default::default(),
60        }
61    }
62}
63
64impl<S> Default for BufferBackend<S> {
65    #[cfg_attr(feature = "inline-more", inline)]
66    fn default() -> Self {
67        Self {
68            len_strings: 0,
69            buffer: Default::default(),
70            marker: Default::default(),
71        }
72    }
73}
74
75impl<S> BufferBackend<S>
76where
77    S: Symbol,
78{
79    /// Returns the next available symbol.
80    #[inline]
81    fn next_symbol(&self) -> S {
82        expect_valid_symbol(self.buffer.len())
83    }
84
85    /// Resolves the string for the given symbol if any.
86    ///
87    /// # Note
88    ///
89    /// Returns the string from the given index if any as well
90    /// as the index of the next string in the buffer.
91    fn resolve_index_to_str(&self, index: usize) -> Option<(&[u8], usize)> {
92        let bytes = self.buffer.get(index..)?;
93        let (str_len, str_len_bytes) = decode_var_usize(bytes)?;
94        let index_str = index + str_len_bytes;
95        let str_bytes = self.buffer.get(index_str..index_str + str_len)?;
96        Some((str_bytes, index_str + str_len))
97    }
98
99    /// Resolves the string for the given symbol.
100    ///
101    /// # Note
102    ///
103    /// It is undefined behavior if the index does not resemble a string.
104    ///
105    /// # Safety
106    ///
107    /// The caller of the function has to ensure that calling this method
108    /// is safe to do.
109    unsafe fn resolve_index_to_str_unchecked(&self, index: usize) -> &str {
110        // SAFETY: The function is marked unsafe so that the caller guarantees
111        //         that required invariants are checked.
112        let bytes = unsafe { self.buffer.get_unchecked(index..) };
113        // SAFETY: The function is marked unsafe so that the caller guarantees
114        //         that required invariants are checked.
115        let (str_len, str_len_bytes) = unsafe { decode_var_usize_unchecked(bytes) };
116        let index_str = index + str_len_bytes;
117        let str_bytes =
118            // SAFETY: The function is marked unsafe so that the caller guarantees
119            //         that required invariants are checked.
120            unsafe { self.buffer.get_unchecked(index_str..index_str + str_len) };
121        // SAFETY: It is guaranteed by the backend that only valid strings
122        //         are stored in this portion of the buffer.
123        unsafe { str::from_utf8_unchecked(str_bytes) }
124    }
125
126    /// Pushes the given value onto the buffer with `var7` encoding.
127    ///
128    /// Returns the amount of `var7` encoded bytes.
129    #[inline]
130    fn encode_var_usize(&mut self, value: usize) -> usize {
131        encode_var_usize(&mut self.buffer, value)
132    }
133
134    /// Pushes the given string into the buffer and returns its span.
135    ///
136    /// # Panics
137    ///
138    /// If the backend ran out of symbols.
139    fn push_string(&mut self, string: &str) -> S {
140        let symbol = self.next_symbol();
141        let str_len = string.len();
142        let str_bytes = string.as_bytes();
143        self.encode_var_usize(str_len);
144        self.buffer.extend(str_bytes);
145        self.len_strings += 1;
146        symbol
147    }
148}
149
150impl<S> Backend for BufferBackend<S>
151where
152    S: Symbol,
153{
154    type Symbol = S;
155    type Iter<'a>
156        = Iter<'a, S>
157    where
158        Self: 'a;
159
160    #[cfg_attr(feature = "inline-more", inline)]
161    fn with_capacity(capacity: usize) -> Self {
162        /// We encode the `usize` string length into the buffer as well.
163        const LEN_USIZE: usize = mem::size_of::<usize>();
164        /// According to google the approx. word length is 5.
165        const DEFAULT_STR_LEN: usize = 5;
166        let bytes_per_string = DEFAULT_STR_LEN + LEN_USIZE;
167        Self {
168            len_strings: 0,
169            buffer: Vec::with_capacity(capacity * bytes_per_string),
170            marker: Default::default(),
171        }
172    }
173
174    #[inline]
175    fn intern(&mut self, string: &str) -> Self::Symbol {
176        self.push_string(string)
177    }
178
179    #[inline]
180    fn resolve(&self, symbol: Self::Symbol) -> Option<&str> {
181        match self.resolve_index_to_str(symbol.to_usize()) {
182            None => None,
183            Some((bytes, _)) => str::from_utf8(bytes).ok(),
184        }
185    }
186
187    fn shrink_to_fit(&mut self) {
188        self.buffer.shrink_to_fit();
189    }
190
191    #[inline]
192    unsafe fn resolve_unchecked(&self, symbol: Self::Symbol) -> &str {
193        // SAFETY: The function is marked unsafe so that the caller guarantees
194        //         that required invariants are checked.
195        unsafe { self.resolve_index_to_str_unchecked(symbol.to_usize()) }
196    }
197
198    #[inline]
199    fn iter(&self) -> Self::Iter<'_> {
200        Iter::new(self)
201    }
202}
203
204/// Encodes the value using variable length encoding into the buffer.
205///
206/// Returns the amount of bytes used for the encoding.
207#[inline]
208fn encode_var_usize(buffer: &mut Vec<u8>, mut value: usize) -> usize {
209    if value <= 0x7F {
210        // Shortcut the common case for low value.
211        buffer.push(value as u8);
212        return 1;
213    }
214    let mut len_chunks = 0;
215    loop {
216        let mut chunk = (value as u8) & 0x7F_u8;
217        value >>= 7;
218        chunk |= ((value != 0) as u8) << 7;
219        buffer.push(chunk);
220        len_chunks += 1;
221        if value == 0 {
222            break;
223        }
224    }
225    len_chunks
226}
227
228/// Decodes from a variable length encoded `usize` from the buffer.
229///
230/// Returns the decoded value as first return value.
231/// Returns the number of decoded bytes as second return value.
232///
233/// # Safety
234///
235/// The caller has to make sure that the buffer contains the necessary
236/// bytes needed to properly decode a valid `usize` value.
237#[inline]
238unsafe fn decode_var_usize_unchecked(buffer: &[u8]) -> (usize, usize) {
239    let first = unsafe { *buffer.get_unchecked(0) };
240    match first {
241        byte if byte <= 0x7F_u8 => (byte as usize, 1),
242        _ => unsafe { decode_var_usize_unchecked_cold(buffer) },
243    }
244}
245
246/// Decodes from a variable length encoded `usize` from the buffer.
247///
248/// Returns the decoded value as first return value.
249/// Returns the number of decoded bytes as second return value.
250///
251/// # Safety
252///
253/// The caller has to make sure that the buffer contains the necessary
254/// bytes needed to properly decode a valid `usize` value.
255///
256/// Uncommon case for string lengths of 254 or greater.
257#[inline]
258#[cold]
259unsafe fn decode_var_usize_unchecked_cold(buffer: &[u8]) -> (usize, usize) {
260    let mut result: usize = 0;
261    let mut i = 0;
262    loop {
263        let byte = unsafe { *buffer.get_unchecked(i) };
264        let shifted = ((byte & 0x7F_u8) as usize) << ((i * 7) as u32);
265        result += shifted;
266        if (byte & 0x80) == 0 {
267            break;
268        }
269        i += 1;
270    }
271    (result, i + 1)
272}
273
274/// Decodes from a variable length encoded `usize` from the buffer.
275///
276/// Returns the decoded value as first return value.
277/// Returns the number of decoded bytes as second return value.
278#[inline]
279fn decode_var_usize(buffer: &[u8]) -> Option<(usize, usize)> {
280    match buffer.first() {
281        None => None,
282        Some(&byte) if byte <= 0x7F_u8 => Some((byte as usize, 1)),
283        _ => decode_var_usize_cold(buffer),
284    }
285}
286
287/// Decodes from a variable length encoded `usize` from the buffer.
288///
289/// Returns the decoded value as first return value.
290/// Returns the number of decoded bytes as second return value.
291///
292/// Uncommon case for string lengths of 254 or greater.
293#[inline]
294#[cold]
295fn decode_var_usize_cold(buffer: &[u8]) -> Option<(usize, usize)> {
296    let mut result: usize = 0;
297    let mut i = 0;
298    loop {
299        let byte = *buffer.get(i)?;
300        let shifted = ((byte & 0x7F_u8) as usize).checked_shl((i * 7) as u32)?;
301        result = result.checked_add(shifted)?;
302        if (byte & 0x80) == 0 {
303            break;
304        }
305        i += 1;
306    }
307    Some((result, i + 1))
308}
309
310impl<'a, S> IntoIterator for &'a BufferBackend<S>
311where
312    S: Symbol,
313{
314    type Item = (S, &'a str);
315    type IntoIter = Iter<'a, S>;
316
317    #[cfg_attr(feature = "inline-more", inline)]
318    fn into_iter(self) -> Self::IntoIter {
319        self.iter()
320    }
321}
322
323pub struct Iter<'a, S> {
324    backend: &'a BufferBackend<S>,
325    remaining: usize,
326    next: usize,
327}
328
329impl<'a, S> Iter<'a, S> {
330    #[cfg_attr(feature = "inline-more", inline)]
331    pub fn new(backend: &'a BufferBackend<S>) -> Self {
332        Self {
333            backend,
334            remaining: backend.len_strings,
335            next: 0,
336        }
337    }
338}
339
340impl<'a, S> Iterator for Iter<'a, S>
341where
342    S: Symbol,
343{
344    type Item = (S, &'a str);
345
346    #[inline]
347    fn size_hint(&self) -> (usize, Option<usize>) {
348        let remaining = self.len();
349        (remaining, Some(remaining))
350    }
351
352    #[inline]
353    fn next(&mut self) -> Option<Self::Item> {
354        self.backend
355            .resolve_index_to_str(self.next)
356            .and_then(|(bytes, next)| {
357                // SAFETY: Within the iterator all indices given to `resolv_index_to_str`
358                //         are properly pointing to the start of each interned string.
359                let string = unsafe { str::from_utf8_unchecked(bytes) };
360                let symbol = S::try_from_usize(self.next)?;
361                self.next = next;
362                self.remaining -= 1;
363                Some((symbol, string))
364            })
365    }
366}
367
368impl<S> ExactSizeIterator for Iter<'_, S>
369where
370    S: Symbol,
371{
372    #[inline]
373    fn len(&self) -> usize {
374        self.remaining
375    }
376}
377
378#[cfg(test)]
379mod tests {
380    use super::{decode_var_usize, encode_var_usize};
381    use alloc::vec::Vec;
382
383    #[test]
384    fn encode_var_usize_1_byte_works() {
385        let mut buffer = Vec::new();
386        for i in 0..2usize.pow(7) {
387            buffer.clear();
388            assert_eq!(encode_var_usize(&mut buffer, i), 1);
389            assert_eq!(buffer, [i as u8]);
390            assert_eq!(decode_var_usize(&buffer), Some((i, 1)));
391        }
392    }
393
394    #[test]
395    fn encode_var_usize_2_bytes_works() {
396        let mut buffer = Vec::new();
397        for i in 2usize.pow(7)..2usize.pow(14) {
398            buffer.clear();
399            assert_eq!(encode_var_usize(&mut buffer, i), 2);
400            assert_eq!(buffer, [0x80 | ((i & 0x7F) as u8), (0x7F & (i >> 7) as u8)]);
401            assert_eq!(decode_var_usize(&buffer), Some((i, 2)));
402        }
403    }
404
405    #[test]
406    #[cfg_attr(any(miri), ignore)]
407    fn encode_var_usize_3_bytes_works() {
408        let mut buffer = Vec::new();
409        for i in 2usize.pow(14)..2usize.pow(21) {
410            buffer.clear();
411            assert_eq!(encode_var_usize(&mut buffer, i), 3);
412            assert_eq!(
413                buffer,
414                [
415                    0x80 | ((i & 0x7F) as u8),
416                    0x80 | (0x7F & (i >> 7) as u8),
417                    (0x7F & (i >> 14) as u8),
418                ]
419            );
420            assert_eq!(decode_var_usize(&buffer), Some((i, 3)));
421        }
422    }
423
424    /// Allows to split up the test into multiple fragments that can run in parallel.
425    #[cfg_attr(any(miri), ignore)]
426    fn assert_encode_var_usize_4_bytes(range: core::ops::Range<usize>) {
427        let mut buffer = Vec::new();
428        for i in range {
429            buffer.clear();
430            assert_eq!(encode_var_usize(&mut buffer, i), 4);
431            assert_eq!(
432                buffer,
433                [
434                    0x80 | ((i & 0x7F) as u8),
435                    0x80 | (0x7F & (i >> 7) as u8),
436                    0x80 | (0x7F & (i >> 14) as u8),
437                    (0x7F & (i >> 21) as u8),
438                ]
439            );
440            assert_eq!(decode_var_usize(&buffer), Some((i, 4)));
441        }
442    }
443
444    #[test]
445    #[cfg_attr(any(miri), ignore)]
446    fn encode_var_usize_4_bytes_01_works() {
447        assert_encode_var_usize_4_bytes(2usize.pow(21)..2usize.pow(24));
448    }
449
450    #[test]
451    #[cfg_attr(any(miri), ignore)]
452    fn encode_var_usize_4_bytes_02_works() {
453        assert_encode_var_usize_4_bytes(2usize.pow(24)..2usize.pow(26));
454    }
455
456    #[test]
457    #[cfg_attr(any(miri), ignore)]
458    fn encode_var_usize_4_bytes_03_works() {
459        assert_encode_var_usize_4_bytes(2usize.pow(26)..2usize.pow(27));
460    }
461
462    #[test]
463    #[cfg_attr(any(miri), ignore)]
464    fn encode_var_usize_4_bytes_04_works() {
465        assert_encode_var_usize_4_bytes(2usize.pow(27)..2usize.pow(28));
466    }
467
468    #[test]
469    fn encode_var_u32_max_works() {
470        let mut buffer = Vec::new();
471        let i = u32::MAX as usize;
472        assert_eq!(encode_var_usize(&mut buffer, i), 5);
473        assert_eq!(buffer, [0xFF, 0xFF, 0xFF, 0xFF, 0x0F]);
474        assert_eq!(decode_var_usize(&buffer), Some((i, 5)));
475    }
476
477    #[test]
478    fn encode_var_u64_max_works() {
479        let mut buffer = Vec::new();
480        let i = u64::MAX as usize;
481        assert_eq!(encode_var_usize(&mut buffer, i), 10);
482        assert_eq!(
483            buffer,
484            [0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01]
485        );
486        assert_eq!(decode_var_usize(&buffer), Some((i, 10)));
487    }
488
489    #[test]
490    fn decode_var_fail() {
491        // Empty buffer.
492        assert_eq!(decode_var_usize(&[]), None);
493        // Missing buffer bytes.
494        assert_eq!(decode_var_usize(&[0x80]), None);
495        // Out of range encoded value.
496        // assert_eq!(
497        //     decode_var_usize(&[
498        //         0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x03
499        //     ]),
500        //     None,
501        // );
502    }
503}