string_interner/backend/
string.rs

1#![cfg(feature = "backends")]
2
3use super::Backend;
4use crate::{symbol::expect_valid_symbol, DefaultSymbol, Symbol};
5use alloc::{string::String, vec::Vec};
6use core::{iter::Enumerate, marker::PhantomData, slice};
7
8/// An interner backend that accumulates all interned string contents into one string.
9///
10/// # Note
11///
12/// Implementation inspired by [CAD97's](https://github.com/CAD97) research
13/// project [`strena`](https://github.com/CAD97/strena).
14///
15/// # Usage Hint
16///
17/// Use this backend if runtime performance is what matters most to you.
18///
19/// # Usage
20///
21/// - **Fill:** Efficiency of filling an empty string interner.
22/// - **Resolve:** Efficiency of interned string look-up given a symbol.
23/// - **Allocations:** The number of allocations performed by the backend.
24/// - **Footprint:** The total heap memory consumed by the backend.
25/// - **Contiguous:** True if the returned symbols have contiguous values.
26/// - **Iteration:** Efficiency of iterating over the interned strings.
27///
28/// Rating varies between **bad**, **ok**, **good** and **best**.
29///
30/// | Scenario    |  Rating  |
31/// |:------------|:--------:|
32/// | Fill        | **good** |
33/// | Resolve     | **ok**   |
34/// | Allocations | **good** |
35/// | Footprint   | **good** |
36/// | Supports `get_or_intern_static` | **no** |
37/// | `Send` + `Sync` | **yes** |
38/// | Contiguous  | **yes**  |
39/// | Iteration   | **good** |
40#[derive(Debug)]
41pub struct StringBackend<S = DefaultSymbol> {
42    ends: Vec<usize>,
43    buffer: String,
44    marker: PhantomData<fn() -> S>,
45}
46
47/// Represents a `[from, to)` index into the `StringBackend` buffer.
48#[derive(Debug, Copy, Clone, PartialEq, Eq)]
49pub struct Span {
50    from: usize,
51    to: usize,
52}
53
54impl<S> PartialEq for StringBackend<S>
55where
56    S: Symbol,
57{
58    fn eq(&self, other: &Self) -> bool {
59        if self.ends.len() != other.ends.len() {
60            return false;
61        }
62        for ((_, lhs), (_, rhs)) in self.into_iter().zip(other) {
63            if lhs != rhs {
64                return false;
65            }
66        }
67        true
68    }
69}
70
71impl<S> Eq for StringBackend<S> where S: Symbol {}
72
73impl<S> Clone for StringBackend<S> {
74    fn clone(&self) -> Self {
75        Self {
76            ends: self.ends.clone(),
77            buffer: self.buffer.clone(),
78            marker: Default::default(),
79        }
80    }
81}
82
83impl<S> Default for StringBackend<S> {
84    #[cfg_attr(feature = "inline-more", inline)]
85    fn default() -> Self {
86        Self {
87            ends: Vec::default(),
88            buffer: String::default(),
89            marker: Default::default(),
90        }
91    }
92}
93
94impl<S> StringBackend<S>
95where
96    S: Symbol,
97{
98    /// Returns the next available symbol.
99    fn next_symbol(&self) -> S {
100        expect_valid_symbol(self.ends.len())
101    }
102
103    /// Returns the string associated to the span.
104    fn span_to_str(&self, span: Span) -> &str {
105        // SAFETY: - We convert a `String` into its underlying bytes and then
106        //           directly reinterpret it as `&str` again which is safe.
107        //         - Nothing mutates the string in between since this is a `&self`
108        //           method.
109        //         - The spans we use for `(start..end]` ranges are always
110        //           constructed in accordance to valid utf8 byte ranges.
111        unsafe { core::str::from_utf8_unchecked(&self.buffer.as_bytes()[span.from..span.to]) }
112    }
113
114    /// Returns the span for the given symbol if any.
115    fn symbol_to_span(&self, symbol: S) -> Option<Span> {
116        let index = symbol.to_usize();
117        self.ends.get(index).copied().map(|to| {
118            let from = self.ends.get(index.wrapping_sub(1)).copied().unwrap_or(0);
119            Span { from, to }
120        })
121    }
122
123    /// Returns the span for the given symbol if any.
124    unsafe fn symbol_to_span_unchecked(&self, symbol: S) -> Span {
125        let index = symbol.to_usize();
126        // SAFETY: The function is marked unsafe so that the caller guarantees
127        //         that required invariants are checked.
128        let to = unsafe { *self.ends.get_unchecked(index) };
129        let from = self.ends.get(index.wrapping_sub(1)).copied().unwrap_or(0);
130        Span { from, to }
131    }
132
133    /// Pushes the given string into the buffer and returns its span.
134    ///
135    /// # Panics
136    ///
137    /// If the backend ran out of symbols.
138    fn push_string(&mut self, string: &str) -> S {
139        self.buffer.push_str(string);
140        let to = self.buffer.len();
141        let symbol = self.next_symbol();
142        self.ends.push(to);
143        symbol
144    }
145}
146
147impl<S> Backend for StringBackend<S>
148where
149    S: Symbol,
150{
151    type Symbol = S;
152    type Iter<'a>
153        = Iter<'a, S>
154    where
155        Self: 'a;
156
157    #[cfg_attr(feature = "inline-more", inline)]
158    fn with_capacity(cap: usize) -> Self {
159        // According to google the approx. word length is 5.
160        let default_word_len = 5;
161        Self {
162            ends: Vec::with_capacity(cap),
163            buffer: String::with_capacity(cap * default_word_len),
164            marker: Default::default(),
165        }
166    }
167
168    #[inline]
169    fn intern(&mut self, string: &str) -> Self::Symbol {
170        self.push_string(string)
171    }
172
173    #[inline]
174    fn resolve(&self, symbol: Self::Symbol) -> Option<&str> {
175        self.symbol_to_span(symbol)
176            .map(|span| self.span_to_str(span))
177    }
178
179    fn shrink_to_fit(&mut self) {
180        self.ends.shrink_to_fit();
181        self.buffer.shrink_to_fit();
182    }
183
184    #[inline]
185    unsafe fn resolve_unchecked(&self, symbol: Self::Symbol) -> &str {
186        // SAFETY: The function is marked unsafe so that the caller guarantees
187        //         that required invariants are checked.
188        unsafe { self.span_to_str(self.symbol_to_span_unchecked(symbol)) }
189    }
190
191    #[inline]
192    fn iter(&self) -> Self::Iter<'_> {
193        Iter::new(self)
194    }
195}
196
197impl<'a, S> IntoIterator for &'a StringBackend<S>
198where
199    S: Symbol,
200{
201    type Item = (S, &'a str);
202    type IntoIter = Iter<'a, S>;
203
204    #[cfg_attr(feature = "inline-more", inline)]
205    fn into_iter(self) -> Self::IntoIter {
206        self.iter()
207    }
208}
209
210pub struct Iter<'a, S> {
211    backend: &'a StringBackend<S>,
212    start: usize,
213    ends: Enumerate<slice::Iter<'a, usize>>,
214}
215
216impl<'a, S> Iter<'a, S> {
217    #[cfg_attr(feature = "inline-more", inline)]
218    pub fn new(backend: &'a StringBackend<S>) -> Self {
219        Self {
220            backend,
221            start: 0,
222            ends: backend.ends.iter().enumerate(),
223        }
224    }
225}
226
227impl<'a, S> Iterator for Iter<'a, S>
228where
229    S: Symbol,
230{
231    type Item = (S, &'a str);
232
233    #[inline]
234    fn size_hint(&self) -> (usize, Option<usize>) {
235        self.ends.size_hint()
236    }
237
238    #[inline]
239    fn next(&mut self) -> Option<Self::Item> {
240        self.ends.next().map(|(id, &to)| {
241            let from = core::mem::replace(&mut self.start, to);
242            (
243                expect_valid_symbol(id),
244                self.backend.span_to_str(Span { from, to }),
245            )
246        })
247    }
248}