ustr_fxhash/lib.rs
1//! Fast, FFI-friendly string interning. A [`Ustr`] (**U**nique **Str**) is a
2//! lightweight handle representing a static, immutable entry in a global string
3//! cache, allowing for:
4//!
5//! * Extremely fast string assignment and comparisons -- it's just a pointer
6//! comparison.
7//!
8//! * Efficient storage -- only one copy of the string is held in memory, and
9//! getting access to it is just a pointer indirection.
10//!
11//! * Fast hashing -- the precomputed hash is stored with the string.
12//!
13//! * Fast FFI -- the string is stored with a terminating null byte so can be
14//! passed to C directly without doing the `CString` dance.
15//!
16//! The downside is no strings are ever freed, so if you're creating lots and
17//! lots of strings, you might run out of memory. On the other hand, War and
18//! Peace is only 3MB, so it's probably fine.
19//!
20//! This crate is based on [OpenImageIO's](https://openimageio.readthedocs.io/en/v2.4.10.0/)
21//! (OIIO) [`ustring`](https://github.com/OpenImageIO/oiio/blob/master/src/include/OpenImageIO/ustring.h)
22//! but it is *not* binary-compatible (yet). The underlying hash map
23//! implementation is directy ported from OIIO.
24//!
25//! # Usage
26//!
27//! ```
28//! use ustr_fxhash::{Ustr, ustr, ustr as u};
29//!
30//! # unsafe { ustr_fxhash::_clear_cache() };
31//! // Creation is quick and easy using either `Ustr::from` or the ustr function
32//! // and only one copy of any string is stored.
33//! let u1 = Ustr::from("the quick brown fox");
34//! let u2 = ustr("the quick brown fox");
35//!
36//! // Comparisons and copies are extremely cheap.
37//! let u3 = u1;
38//! assert_eq!(u2, u3);
39//!
40//! // You can pass straight to FFI.
41//! let len = unsafe {
42//! libc::strlen(u1.as_char_ptr())
43//! };
44//! assert_eq!(len, 19);
45//!
46//! // Use as_str() to get a `str`.
47//! let words: Vec<&str> = u1.as_str().split_whitespace().collect();
48//! assert_eq!(words, ["the", "quick", "brown", "fox"]);
49//!
50//! // For best performance when using Ustr as key for a HashMap or HashSet,
51//! // you'll want to use the precomputed hash. To make this easier, just use
52//! // the UstrMap and UstrSet exports:
53//! use ustr_fxhash::UstrMap;
54//!
55//! // Key type is always `Ustr`.
56//! let mut map: UstrMap<usize> = UstrMap::default();
57//! map.insert(u1, 17);
58//! assert_eq!(*map.get(&u1).unwrap(), 17);
59//! ```
60//!
61//! By enabling the `"serde"` feature you can serialize individual `Ustr`s
62//! or the whole cache with serde.
63//!
64//! ```
65//! # #[cfg(feature = "serde")] {
66//! use ustr_fxhash::{Ustr, ustr};
67//! let u_ser = ustr("serde");
68//! let json = serde_json::to_string(&u_ser).unwrap();
69//! let u_de : Ustr = serde_json::from_str(&json).unwrap();
70//! assert_eq!(u_ser, u_de);
71//! # }
72//! ```
73//!
74//! Since the cache is global, use the `ustr_fxhash::DeserializedCache` dummy object to
75//! drive the deserialization.
76//!
77//! ```
78//! # #[cfg(feature = "serde")] {
79//! use ustr_fxhash::{Ustr, ustr};
80//! ustr("Send me to JSON and back");
81//! let json = serde_json::to_string(ustr_fxhash::cache()).unwrap();
82//!
83//! // ... some time later ...
84//! let _: ustr_fxhash::DeserializedCache = serde_json::from_str(&json).unwrap();
85//! assert_eq!(ustr_fxhash::num_entries(), 1);
86//! assert_eq!(ustr_fxhash::string_cache_iter().collect::<Vec<_>>(), vec!["Send me to JSON and back"]);
87//! # }
88//! ```
89//!
90//! ## Why?
91//!
92//! It is common in certain types of applications to use strings as identifiers,
93//! but not really do any processing with them.
94//! To paraphrase from OIIO's `Ustring` documentation -- compared to standard
95//! strings, `Ustr`s have several advantages:
96//!
97//! - Each individual `Ustr` is very small -- in fact, we guarantee that a
98//! `Ustr` is the same size and memory layout as an ordinary `*u8`.
99//!
100//! - Storage is frugal, since there is only one allocated copy of each unique
101//! character sequence, throughout the lifetime of the program.
102//!
103//! - Assignment from one `Ustr` to another is just copy of the pointer; no
104//! allocation, no character copying, no reference counting.
105//!
106//! - Equality testing (do the strings contain the same characters) is a
107//! single operation, the comparison of the pointer.
108//!
109//! - Memory allocation only occurs when a new `Ustr` is constructed from raw
110//! characters the FIRST time -- subsequent constructions of the same string
111//! just finds it in the canonial string set, but doesn't need to allocate
112//! new storage. Destruction of a `Ustr` is trivial, there is no
113//! de-allocation because the canonical version stays in the set. Also,
114//! therefore, no user code mistake can lead to memory leaks.
115//!
116//! But there are some problems, too. Canonical strings are never freed
117//! from the table. So in some sense all the strings "leak", but they
118//! only leak one copy for each unique string that the program ever comes
119//! across.
120//!
121//! On the whole, `Ustr`s are a really great string representation
122//!
123//! - if you tend to have (relatively) few unique strings, but many copies of
124//! those strings;
125//!
126//! - if the creation of strings from raw characters is relatively rare
127//! compared to copying or comparing to existing strings;
128//!
129//! - if you tend to make the same strings over and over again, and if it's
130//! relatively rare that a single unique character sequence is used only
131//! once in the entire lifetime of the program;
132//!
133//! - if your most common string operations are assignment and equality
134//! testing and you want them to be as fast as possible;
135//!
136//! - if you are doing relatively little character-by-character assembly of
137//! strings, string concatenation, or other "string manipulation" (other
138//! than equality testing).
139//!
140//! `Ustr`s are not so hot
141//!
142//! - if your program tends to have very few copies of each character sequence
143//! over the entire lifetime of the program;
144//!
145//! - if your program tends to generate a huge variety of unique strings over
146//! its lifetime, each of which is used only a short time and then
147//! discarded, never to be needed again;
148//!
149//! - if you don't need to do a lot of string assignment or equality testing,
150//! but lots of more complex string manipulation.
151//!
152//! ## Safety and Compatibility
153//!
154//! This crate contains a significant amount of unsafe but usage has been
155//! checked and is well-documented. It is also run through Miri as part of the
156//! CI process. I use it regularly on 64-bit systems, and it has passed Miri on
157//! a 32-bit system as well, bit 32-bit is not checked regularly. If you want to
158//! use it on 32-bit, please make sure to run Miri and open and issue if you
159//! find any problems.
160use parking_lot::Mutex;
161use std::{
162 borrow::Cow,
163 cmp::Ordering,
164 ffi::{CStr, OsStr},
165 fmt,
166 hash::{Hash, Hasher},
167 ops::Deref,
168 os::raw::c_char,
169 path::Path,
170 ptr::NonNull,
171 rc::Rc,
172 slice, str,
173 str::FromStr,
174 sync::Arc,
175};
176
177mod hash;
178pub use hash::*;
179mod bumpalloc;
180
181mod stringcache;
182pub use stringcache::*;
183#[cfg(feature = "serde")]
184pub mod serialization;
185#[cfg(feature = "serde")]
186pub use serialization::DeserializedCache;
187
188/// A handle representing a string in the global string cache.
189///
190/// To use, create one using [`Ustr::from`] or the [`ustr`] function. You can
191/// freely copy, destroy or send `Ustr`s to other threads: the underlying string
192/// is always valid in memory (and is never destroyed).
193#[derive(Copy, Clone, PartialEq)]
194#[repr(transparent)]
195pub struct Ustr {
196 char_ptr: NonNull<u8>,
197}
198
199/// Defer to `str` for equality.
200///
201/// Lexicographic ordering will be slower than pointer comparison, but much less
202/// surprising if you use `Ustr`s as keys in e.g. a `BTreeMap`.
203impl Ord for Ustr {
204 fn cmp(&self, other: &Self) -> Ordering {
205 self.as_str().cmp(other.as_str())
206 }
207}
208
209/// Defer to `str` for equality.
210///
211/// Lexicographic ordering will be slower thanpointer comparison, but much less
212/// surprising if you use `Ustr`s as keys in e.g. a `BTreeMap`.
213#[allow(clippy::non_canonical_partial_ord_impl)]
214impl PartialOrd for Ustr {
215 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
216 Some(self.cmp(other))
217 }
218}
219
220impl Ustr {
221 /// Create a new `Ustr` from the given `str`.
222 ///
223 /// You can also use the [`ustr`] function.
224 ///
225 /// # Examples
226 ///
227 /// ```
228 /// use ustr_fxhash::{Ustr, ustr as u};
229 /// # unsafe { ustr_fxhash::_clear_cache() };
230 ///
231 /// let u1 = Ustr::from("the quick brown fox");
232 /// let u2 = u("the quick brown fox");
233 /// assert_eq!(u1, u2);
234 /// assert_eq!(ustr_fxhash::num_entries(), 1);
235 /// ```
236 pub fn from(string: &str) -> Ustr {
237 let hash = {
238 let mut hasher = rustc_hash::FxHasher::default();
239 hasher.write(string.as_bytes());
240 hasher.finish()
241 };
242 let mut sc = STRING_CACHE.0[whichbin(hash)].lock();
243 Ustr {
244 // SAFETY: sc.insert does not give back a null pointer
245 char_ptr: unsafe {
246 NonNull::new_unchecked(sc.insert(string, hash) as *mut _)
247 },
248 }
249 }
250
251 pub fn from_existing(string: &str) -> Option<Ustr> {
252 let hash = {
253 let mut hasher = rustc_hash::FxHasher::default();
254 hasher.write(string.as_bytes());
255 hasher.finish()
256 };
257 let sc = STRING_CACHE.0[whichbin(hash)].lock();
258 sc.get_existing(string, hash).map(|ptr| Ustr {
259 char_ptr: unsafe { NonNull::new_unchecked(ptr as *mut _) },
260 })
261 }
262
263 /// Get the cached `Ustr` as a `str`.
264 ///
265 /// # Examples
266 ///
267 /// ```
268 /// use ustr_fxhash::ustr as u;
269 /// # unsafe { ustr_fxhash::_clear_cache() };
270 ///
271 /// let u_fox = u("the quick brown fox");
272 /// let words: Vec<&str> = u_fox.as_str().split_whitespace().collect();
273 /// assert_eq!(words, ["the", "quick", "brown", "fox"]);
274 /// ```
275 pub fn as_str(&self) -> &'static str {
276 // This is safe if:
277 // 1) self.char_ptr points to a valid address
278 // 2) len is a usize stored usize aligned usize bytes before char_ptr
279 // 3) char_ptr points to a valid UTF-8 string of len bytes.
280 // All these are guaranteed by StringCache::insert() and by the fact
281 // we can only construct a Ustr from a valid &str.
282 unsafe {
283 str::from_utf8_unchecked(slice::from_raw_parts(
284 self.char_ptr.as_ptr(),
285 self.len(),
286 ))
287 }
288 }
289
290 /// Get the cached string as a C `char*`.
291 ///
292 /// This includes the null terminator so is safe to pass straight to FFI.
293 ///
294 /// # Examples
295 ///
296 /// ```
297 /// use ustr_fxhash::ustr as u;
298 /// # unsafe { ustr_fxhash::_clear_cache() };
299 ///
300 /// let u_fox = u("the quick brown fox");
301 /// let len = unsafe {
302 /// libc::strlen(u_fox.as_char_ptr())
303 /// };
304 /// assert_eq!(len, 19);
305 /// ```
306 ///
307 /// # Safety
308 ///
309 /// This is just passing a raw byte array with a null terminator to C. If
310 /// your source string contains non-ascii bytes then this will pass them
311 /// straight along with no checking.
312 ///
313 /// The string is **immutable**. That means that if you modify it across the
314 /// FFI boundary then all sorts of terrible things will happen.
315 pub fn as_char_ptr(&self) -> *const c_char {
316 self.char_ptr.as_ptr() as *const c_char
317 }
318
319 /// Get this `Ustr` as a [`CStr`]
320 ///
321 /// This is useful for passing to APIs (like ash) that use `CStr`.
322 ///
323 /// # Safety
324 ///
325 /// This function by itself is safe as the pointer and length are guaranteed
326 /// to be valid. All the same caveats for the use of the `CStr` as given in
327 /// the `CStr` docs apply.
328 pub fn as_cstr(&self) -> &CStr {
329 unsafe {
330 CStr::from_bytes_with_nul_unchecked(slice::from_raw_parts(
331 self.as_ptr(),
332 self.len() + 1,
333 ))
334 }
335 }
336
337 /// Get a raw pointer to the `StringCacheEntry`.
338 #[inline]
339 fn as_string_cache_entry(&self) -> &StringCacheEntry {
340 // The allocator guarantees that the alignment is correct and that
341 // this pointer is non-null
342 unsafe { &*(self.char_ptr.as_ptr().cast::<StringCacheEntry>().sub(1)) }
343 }
344
345 /// Get the length (in bytes) of this string.
346 #[inline]
347 pub fn len(&self) -> usize {
348 self.as_string_cache_entry().len
349 }
350
351 /// Returns true if the length is zero.
352 pub fn is_empty(&self) -> bool {
353 self.len() == 0
354 }
355
356 /// Get the precomputed hash for this string.
357 #[inline]
358 pub fn precomputed_hash(&self) -> u64 {
359 self.as_string_cache_entry().hash
360 }
361
362 /// Get an owned String copy of this string.
363 pub fn to_owned(&self) -> String {
364 self.as_str().to_owned()
365 }
366}
367
368// We're safe to impl these because the strings they reference are immutable
369// and for all intents and purposes 'static since they're never deleted after
370// being created
371unsafe impl Send for Ustr {}
372unsafe impl Sync for Ustr {}
373
374impl PartialEq<str> for Ustr {
375 fn eq(&self, other: &str) -> bool {
376 self.as_str() == other
377 }
378}
379
380impl PartialEq<Ustr> for str {
381 fn eq(&self, u: &Ustr) -> bool {
382 self == u.as_str()
383 }
384}
385
386impl PartialEq<&str> for Ustr {
387 fn eq(&self, other: &&str) -> bool {
388 self.as_str() == *other
389 }
390}
391
392impl PartialEq<Ustr> for &str {
393 fn eq(&self, u: &Ustr) -> bool {
394 *self == u.as_str()
395 }
396}
397
398impl PartialEq<&&str> for Ustr {
399 fn eq(&self, other: &&&str) -> bool {
400 self.as_str() == **other
401 }
402}
403
404impl PartialEq<Ustr> for &&str {
405 fn eq(&self, u: &Ustr) -> bool {
406 **self == u.as_str()
407 }
408}
409
410impl PartialEq<String> for Ustr {
411 fn eq(&self, other: &String) -> bool {
412 self.as_str() == other
413 }
414}
415
416impl PartialEq<Ustr> for String {
417 fn eq(&self, u: &Ustr) -> bool {
418 self == u.as_str()
419 }
420}
421
422impl PartialEq<&String> for Ustr {
423 fn eq(&self, other: &&String) -> bool {
424 self.as_str() == *other
425 }
426}
427
428impl PartialEq<Ustr> for &String {
429 fn eq(&self, u: &Ustr) -> bool {
430 *self == u.as_str()
431 }
432}
433
434impl PartialEq<Box<str>> for Ustr {
435 fn eq(&self, other: &Box<str>) -> bool {
436 self.as_str() == &**other
437 }
438}
439
440impl PartialEq<Ustr> for Box<str> {
441 fn eq(&self, u: &Ustr) -> bool {
442 &**self == u.as_str()
443 }
444}
445
446impl PartialEq<Ustr> for &Box<str> {
447 fn eq(&self, u: &Ustr) -> bool {
448 &***self == u.as_str()
449 }
450}
451
452impl PartialEq<Cow<'_, str>> for Ustr {
453 fn eq(&self, other: &Cow<'_, str>) -> bool {
454 self.as_str() == other
455 }
456}
457
458impl PartialEq<Ustr> for Cow<'_, str> {
459 fn eq(&self, u: &Ustr) -> bool {
460 self == u.as_str()
461 }
462}
463
464impl PartialEq<&Cow<'_, str>> for Ustr {
465 fn eq(&self, other: &&Cow<'_, str>) -> bool {
466 self.as_str() == *other
467 }
468}
469
470impl PartialEq<Ustr> for &Cow<'_, str> {
471 fn eq(&self, u: &Ustr) -> bool {
472 *self == u.as_str()
473 }
474}
475
476impl PartialEq<Ustr> for Path {
477 fn eq(&self, u: &Ustr) -> bool {
478 self == Path::new(u)
479 }
480}
481
482impl PartialEq<Ustr> for &Path {
483 fn eq(&self, u: &Ustr) -> bool {
484 *self == Path::new(u)
485 }
486}
487
488impl PartialEq<Ustr> for OsStr {
489 fn eq(&self, u: &Ustr) -> bool {
490 self == OsStr::new(u)
491 }
492}
493
494impl PartialEq<Ustr> for &OsStr {
495 fn eq(&self, u: &Ustr) -> bool {
496 *self == OsStr::new(u)
497 }
498}
499
500impl Eq for Ustr {}
501
502impl<T: ?Sized> AsRef<T> for Ustr
503where
504 str: AsRef<T>,
505{
506 fn as_ref(&self) -> &T {
507 self.as_str().as_ref()
508 }
509}
510
511impl FromStr for Ustr {
512 type Err = std::string::ParseError;
513
514 #[inline]
515 fn from_str(s: &str) -> Result<Self, Self::Err> {
516 Ok(Ustr::from(s))
517 }
518}
519
520impl From<&str> for Ustr {
521 fn from(s: &str) -> Ustr {
522 Ustr::from(s)
523 }
524}
525
526impl From<Ustr> for &'static str {
527 fn from(s: Ustr) -> &'static str {
528 s.as_str()
529 }
530}
531
532impl From<Ustr> for String {
533 fn from(u: Ustr) -> Self {
534 String::from(u.as_str())
535 }
536}
537
538impl From<Ustr> for Box<str> {
539 fn from(u: Ustr) -> Self {
540 Box::from(u.as_str())
541 }
542}
543
544impl From<Ustr> for Rc<str> {
545 fn from(u: Ustr) -> Self {
546 Rc::from(u.as_str())
547 }
548}
549
550impl From<Ustr> for Arc<str> {
551 fn from(u: Ustr) -> Self {
552 Arc::from(u.as_str())
553 }
554}
555
556impl From<Ustr> for Cow<'static, str> {
557 fn from(u: Ustr) -> Self {
558 Cow::Borrowed(u.as_str())
559 }
560}
561
562impl From<String> for Ustr {
563 fn from(s: String) -> Ustr {
564 Ustr::from(&s)
565 }
566}
567
568impl From<&String> for Ustr {
569 fn from(s: &String) -> Ustr {
570 Ustr::from(s)
571 }
572}
573
574impl From<Box<str>> for Ustr {
575 fn from(s: Box<str>) -> Ustr {
576 Ustr::from(&s)
577 }
578}
579
580impl From<Rc<str>> for Ustr {
581 fn from(s: Rc<str>) -> Ustr {
582 Ustr::from(&s)
583 }
584}
585
586impl From<Arc<str>> for Ustr {
587 fn from(s: Arc<str>) -> Ustr {
588 Ustr::from(&s)
589 }
590}
591
592impl From<Cow<'_, str>> for Ustr {
593 fn from(s: Cow<'_, str>) -> Ustr {
594 Ustr::from(&s)
595 }
596}
597
598impl Default for Ustr {
599 fn default() -> Self {
600 Ustr::from("")
601 }
602}
603
604impl Deref for Ustr {
605 type Target = str;
606 fn deref(&self) -> &Self::Target {
607 self.as_str()
608 }
609}
610
611impl fmt::Display for Ustr {
612 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
613 write!(f, "{}", self.as_str())
614 }
615}
616
617impl fmt::Debug for Ustr {
618 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
619 write!(f, "u!({:?})", self.as_str())
620 }
621}
622
623// Just feed the precomputed hash into the Hasher. Note that this will of course
624// be terrible unless the Hasher in question is expecting a precomputed hash.
625impl Hash for Ustr {
626 fn hash<H: Hasher>(&self, state: &mut H) {
627 self.precomputed_hash().hash(state);
628 }
629}
630
631/// DO NOT CALL THIS.
632///
633/// Clears the cache -- used for benchmarking and testing purposes to clear the
634/// cache. Calling this will invalidate any previously created `UStr`s and
635/// probably cause your house to burn down. DO NOT CALL THIS.
636///
637/// # Safety
638///
639/// DO NOT CALL THIS.
640#[doc(hidden)]
641pub unsafe fn _clear_cache() {
642 for m in STRING_CACHE.0.iter() {
643 m.lock().clear();
644 }
645}
646
647/// Returns the total amount of memory allocated and in use by the cache in
648/// bytes.
649pub fn total_allocated() -> usize {
650 STRING_CACHE
651 .0
652 .iter()
653 .map(|sc| {
654 let t = sc.lock().total_allocated();
655
656 t
657 })
658 .sum()
659}
660
661/// Returns the total amount of memory reserved by the cache in bytes.
662pub fn total_capacity() -> usize {
663 STRING_CACHE
664 .0
665 .iter()
666 .map(|sc| {
667 let t = sc.lock().total_capacity();
668 t
669 })
670 .sum()
671}
672
673/// Create a new `Ustr` from the given `str`.
674///
675/// # Examples
676///
677/// ```
678/// use ustr_fxhash::ustr;
679/// # unsafe { ustr_fxhash::_clear_cache() };
680///
681/// let u1 = ustr("the quick brown fox");
682/// let u2 = ustr("the quick brown fox");
683/// assert_eq!(u1, u2);
684/// assert_eq!(ustr_fxhash::num_entries(), 1);
685/// ```
686#[inline]
687pub fn ustr(s: &str) -> Ustr {
688 Ustr::from(s)
689}
690
691/// Create a new `Ustr` from the given `str` but only if it already exists in
692/// the string cache.
693///
694/// # Examples
695///
696/// ```
697/// use ustr_fxhash::{ustr, existing_ustr};
698/// # unsafe { ustr_fxhash::_clear_cache() };
699///
700/// let u1 = existing_ustr("the quick brown fox");
701/// let u2 = ustr("the quick brown fox");
702/// let u3 = existing_ustr("the quick brown fox");
703/// assert_eq!(u1, None);
704/// assert_eq!(u3, Some(u2));
705/// ```
706#[inline]
707pub fn existing_ustr(s: &str) -> Option<Ustr> {
708 Ustr::from_existing(s)
709}
710
711/// Utility function to get a reference to the main cache object for use with
712/// serialization.
713///
714/// # Examples
715///
716/// ```
717/// # use ustr_fxhash::{Ustr, ustr, ustr as u};
718/// # #[cfg(feature="serde")]
719/// # {
720/// # unsafe { ustr_fxhash::_clear_cache() };
721/// ustr("Send me to JSON and back");
722/// let json = serde_json::to_string(ustr_fxhash::cache()).unwrap();
723/// # }
724pub fn cache() -> &'static Bins {
725 &STRING_CACHE
726}
727
728/// Returns the number of unique strings in the cache.
729///
730/// This may be an underestimate if other threads are writing to the cache
731/// concurrently.
732///
733/// # Examples
734///
735/// ```
736/// use ustr_fxhash::ustr as u;
737///
738/// let _ = u("Hello");
739/// let _ = u(", World!");
740/// assert_eq!(ustr_fxhash::num_entries(), 2);
741/// ```
742pub fn num_entries() -> usize {
743 STRING_CACHE
744 .0
745 .iter()
746 .map(|sc| {
747 let t = sc.lock().num_entries();
748 t
749 })
750 .sum()
751}
752
753#[doc(hidden)]
754pub fn num_entries_per_bin() -> Vec<usize> {
755 STRING_CACHE
756 .0
757 .iter()
758 .map(|sc| {
759 let t = sc.lock().num_entries();
760 t
761 })
762 .collect::<Vec<_>>()
763}
764
765/// Return an iterator over the entire string cache.
766///
767/// If another thread is adding strings concurrently to this call then they
768/// might not show up in the view of the cache presented by this iterator.
769///
770/// # Safety
771///
772/// This returns an iterator to the state of the cache at the time when
773/// `string_cache_iter()` was called. It is of course possible that another
774/// thread will add more strings to the cache after this, but since we never
775/// destroy the strings, they remain valid, meaning it's safe to iterate over
776/// them, the list just might not be completely up to date.
777pub fn string_cache_iter() -> StringCacheIterator {
778 let mut allocs = Vec::new();
779 for m in STRING_CACHE.0.iter() {
780 let sc = m.lock();
781 // the start of the allocator's data is actually the ptr, start() just
782 // points to the beginning of the allocated region. The first bytes will
783 // be uninitialized since we're bumping down
784 for a in &sc.old_allocs {
785 allocs.push((a.ptr(), a.end()));
786 }
787 let ptr = sc.alloc.ptr();
788 let end = sc.alloc.end();
789 if ptr != end {
790 allocs.push((sc.alloc.ptr(), sc.alloc.end()));
791 }
792 }
793
794 let current_ptr =
795 allocs.first().map(|s| s.0).unwrap_or_else(std::ptr::null);
796
797 StringCacheIterator {
798 allocs,
799 current_alloc: 0,
800 current_ptr,
801 }
802}
803
804/// The type used for the global string cache.
805///
806/// This is exposed to allow e.g. serialization of the data returned by the
807/// [`cache()`] function.
808#[repr(transparent)]
809pub struct Bins(pub(crate) [Mutex<StringCache>; NUM_BINS]);
810
811#[cfg(test)]
812lazy_static::lazy_static! {
813 static ref TEST_LOCK: Mutex<()> = Mutex::new(());
814}
815
816#[cfg(test)]
817mod tests {
818 use super::TEST_LOCK;
819
820 use std::ffi::OsStr;
821 use std::path::Path;
822
823 #[test]
824 fn it_works() {
825 let _t = TEST_LOCK.lock();
826 use super::ustr as u;
827
828 let u_hello = u("hello");
829 assert_eq!(u_hello, "hello");
830 let u_world = u("world");
831 assert_eq!(u_world, String::from("world"));
832 }
833
834 #[test]
835 fn empty_string() {
836 let _t = TEST_LOCK.lock();
837 use super::ustr as u;
838
839 unsafe {
840 super::_clear_cache();
841 }
842
843 let _empty = u("");
844 let empty = u("");
845
846 assert!(empty.as_str().is_empty());
847 assert_eq!(super::num_entries(), 1);
848 }
849
850 #[test]
851 fn c_str_works() {
852 let _t = TEST_LOCK.lock();
853 use super::ustr as u;
854 use std::ffi::CStr;
855
856 let s_fox = "The quick brown fox jumps over the lazy dog.";
857 let u_fox = u(s_fox);
858 let fox = unsafe { CStr::from_ptr(u_fox.as_char_ptr()) }
859 .to_string_lossy()
860 .into_owned();
861 assert_eq!(fox, s_fox);
862
863 let s_odys = "Τη γλώσσα μου έδωσαν ελληνική";
864 let u_odys = u(s_odys);
865 let odys = unsafe { CStr::from_ptr(u_odys.as_char_ptr()) }
866 .to_string_lossy()
867 .into_owned();
868 assert_eq!(odys, s_odys);
869 }
870
871 #[test]
872 // We have to disable miri here as it's far too slow unfortunately
873 #[cfg_attr(miri, ignore)]
874 fn blns() {
875 let _t = TEST_LOCK.lock();
876 use super::{string_cache_iter, ustr as u};
877 use std::collections::HashSet;
878
879 // clear the cache first or our results will be wrong
880 unsafe { super::_clear_cache() };
881
882 // let path =
883 // std::path::Path::new(&std::env::var("CARGO_MANIFEST_DIR").unwrap())
884 // .join("data")
885 // .join("blns.txt");
886 // let blns = std::fs::read_to_string(path).unwrap();
887 let blns = include_str!("../data/blns.txt");
888
889 let mut hs = HashSet::new();
890 for s in blns.split_whitespace() {
891 hs.insert(s);
892 }
893
894 let mut us = Vec::new();
895 let mut ss = Vec::new();
896
897 for s in blns.split_whitespace().cycle().take(100_000) {
898 let u = u(s);
899 us.push(u);
900 ss.push(s.to_owned());
901 }
902
903 let mut hs_u = HashSet::new();
904 for s in string_cache_iter() {
905 hs_u.insert(s);
906 }
907 let diff: HashSet<_> = hs.difference(&hs_u).collect();
908
909 // check that the number of entries is the same
910 assert_eq!(super::num_entries(), hs.len());
911
912 // check that we have the exact same (unique) strings in the cache as in
913 // the source data
914 assert_eq!(diff.len(), 0);
915
916 let nbs = super::num_entries_per_bin();
917 println!("{:?}", nbs);
918
919 println!("Total allocated: {}", super::total_allocated());
920 println!("Total capacity: {}", super::total_capacity());
921
922 println!(
923 "size of StringCache: {}",
924 std::mem::size_of::<super::StringCache>()
925 );
926 }
927
928 #[test]
929 // We have to disable miri here as it's far too slow unfortunately
930 #[cfg_attr(miri, ignore)]
931 fn raft() {
932 let _t = TEST_LOCK.lock();
933 use super::ustr as u;
934 use std::sync::Arc;
935
936 // let path =
937 // std::path::Path::new(&std::env::var("CARGO_MANIFEST_DIR").unwrap())
938 // .join("data")
939 // .join("raft-large-directories.txt");
940 // let raft = std::fs::read_to_string(path).unwrap();
941 let raft = include_str!("../data/raft-large-directories.txt");
942 let raft = Arc::new(
943 raft.split_whitespace()
944 .collect::<Vec<_>>()
945 .chunks(3)
946 .map(|s| {
947 if s.len() == 3 {
948 format!("{}/{}/{}", s[0], s[1], s[2])
949 } else {
950 s[0].to_owned()
951 }
952 })
953 .collect::<Vec<_>>(),
954 );
955
956 let s = raft.clone();
957 for _ in 0..600 {
958 let mut v = Vec::with_capacity(20_000);
959 unsafe { super::_clear_cache() };
960 for s in s.iter().cycle().take(20_000) {
961 v.push(u(s));
962 }
963 }
964 }
965
966 // This test is to have miri check the allocation code paths, but miri
967 // can't open files so it's not usable right now
968 // #[test]
969 // fn words() {
970 // let _t = TEST_LOCK.lock();
971 // use super::ustr as u;
972 // use std::sync::Arc;
973
974 // let path = std::path::Path::new("/usr/share/dict/words");
975 // let wordlist = std::fs::read_to_string(path).unwrap();
976 // let wordlist = Arc::new(
977 // wordlist
978 // .split_whitespace()
979 // .collect::<Vec<_>>()
980 // .chunks(7)
981 // .cycle()
982 // .take(4_000_000)
983 // .enumerate()
984 // .map(|(i, s)| u(&format!("{}{}", i, s.join("-"))))
985 // .collect::<Vec<_>>(),
986 // );
987 // }
988
989 #[cfg(all(feature = "serde", not(miri)))]
990 #[test]
991 fn serialization() {
992 let _t = TEST_LOCK.lock();
993 use super::{string_cache_iter, ustr as u};
994 use std::collections::HashSet;
995
996 // clear the cache first or our results will be wrong
997 unsafe { super::_clear_cache() };
998
999 let path = std::path::Path::new(
1000 &std::env::var("CARGO_MANIFEST_DIR")
1001 .expect("CARGO_MANIFEST_DIR not set"),
1002 )
1003 .join("data")
1004 .join("blns.txt");
1005 let blns = std::fs::read_to_string(path).unwrap();
1006
1007 let mut hs = HashSet::new();
1008 for s in blns.split_whitespace() {
1009 hs.insert(s);
1010 }
1011
1012 let mut us = Vec::new();
1013 let mut ss = Vec::new();
1014
1015 for s in blns.split_whitespace().cycle().take(100_000) {
1016 let u = u(s);
1017 us.push(u);
1018 ss.push(s.to_owned());
1019 }
1020
1021 let json = serde_json::to_string(super::cache()).unwrap();
1022 unsafe {
1023 super::_clear_cache();
1024 }
1025 let _: super::DeserializedCache = serde_json::from_str(&json).unwrap();
1026
1027 // now check that we've got the same data in the cache still
1028 let mut hs_u = HashSet::new();
1029 for s in string_cache_iter() {
1030 hs_u.insert(s);
1031 }
1032 let diff: HashSet<_> = hs.difference(&hs_u).collect();
1033
1034 // check that the number of entries is the same
1035 assert_eq!(super::num_entries(), hs.len());
1036
1037 // check that we have the exact same (unique) strings in the cache as in
1038 // the source data
1039 assert_eq!(diff.len(), 0);
1040 }
1041
1042 #[cfg(all(feature = "serde", not(miri)))]
1043 #[test]
1044 fn serialization_ustr() {
1045 let _t = TEST_LOCK.lock();
1046
1047 use super::{ustr, Ustr};
1048
1049 let u_hello = ustr("hello");
1050
1051 let json = serde_json::to_string(&u_hello).unwrap();
1052 let me_hello: Ustr = serde_json::from_str(&json).unwrap();
1053
1054 assert_eq!(u_hello, me_hello);
1055 }
1056
1057 #[test]
1058 fn partial_ord() {
1059 let _t = TEST_LOCK.lock();
1060 use super::ustr;
1061 let str_a = ustr("aaa");
1062 let str_z = ustr("zzz");
1063 let str_k = ustr("kkk");
1064 assert!(str_a < str_k);
1065 assert!(str_k < str_z);
1066 }
1067
1068 #[test]
1069 fn ord() {
1070 let _t = TEST_LOCK.lock();
1071 use super::ustr;
1072 let u_apple = ustr("apple");
1073 let u_bravo = ustr("bravo");
1074 let u_charlie = ustr("charlie");
1075 let u_delta = ustr("delta");
1076
1077 let mut v = vec![u_delta, u_bravo, u_charlie, u_apple];
1078 v.sort();
1079 assert_eq!(v, vec![u_apple, u_bravo, u_charlie, u_delta]);
1080 }
1081
1082 fn takes_into_str<'a, S: Into<&'a str>>(s: S) -> &'a str {
1083 s.into()
1084 }
1085
1086 #[test]
1087 fn test_into_str() {
1088 let _t = TEST_LOCK.lock();
1089 use super::ustr;
1090
1091 assert_eq!("converted", takes_into_str(ustr("converted")));
1092 }
1093
1094 #[test]
1095 fn test_existing_ustr() {
1096 let _t = TEST_LOCK.lock();
1097 use super::{existing_ustr, ustr};
1098 assert_eq!(existing_ustr("hello world!"), None);
1099 let s1 = ustr("hello world!");
1100 let s2 = existing_ustr("hello world!");
1101 assert_eq!(Some(s1), s2);
1102 }
1103
1104 #[test]
1105 fn test_empty_cache() {
1106 unsafe { super::_clear_cache() };
1107 assert_eq!(
1108 super::string_cache_iter().collect::<Vec<_>>(),
1109 Vec::<&'static str>::new()
1110 );
1111 }
1112
1113 #[test]
1114 fn as_refs() {
1115 let _t = TEST_LOCK.lock();
1116
1117 let u = super::ustr("test");
1118
1119 let s: String = u.to_owned();
1120 assert_eq!(u, s);
1121 assert_eq!(s, u);
1122
1123 let p: &Path = u.as_ref();
1124 assert_eq!(p, u);
1125
1126 let _: &[u8] = u.as_ref();
1127
1128 let o: &OsStr = u.as_ref();
1129 assert_eq!(p, o);
1130 assert_eq!(o, p);
1131
1132 let cow = std::borrow::Cow::from(u);
1133 assert_eq!(cow, u);
1134 assert_eq!(u, cow);
1135
1136 let boxed: Box<str> = u.into();
1137 assert_eq!(boxed, u);
1138 }
1139}
1140
1141lazy_static::lazy_static! {
1142 static ref STRING_CACHE: Bins = {
1143 use std::mem::{self, MaybeUninit};
1144 // This deeply unsafe feeling dance allows us to initialize an array of
1145 // arbitrary size and will have to tide us over until const generics
1146 // land. See:
1147 // https://doc.rust-lang.org/beta/std/mem/union.MaybeUninit.html#initializing-an-array-element-by-element
1148
1149 // Create an uninitialized array of `MaybeUninit`. The `assume_init` is
1150 // safe because the type we are claiming to have initialized here is a
1151 // bunch of `MaybeUninit`s, which do not require initialization.
1152 let mut bins: [MaybeUninit<Mutex<StringCache>>; NUM_BINS] = unsafe {
1153 MaybeUninit::uninit().assume_init()
1154 };
1155
1156 // Dropping a `MaybeUninit` does nothing. Thus using raw pointer
1157 // assignment instead of `ptr::write` does not cause the old
1158 // uninitialized value to be dropped. Also if there is a panic during
1159 // this loop, we have a memory leak, but there is no memory safety
1160 // issue.
1161 for bin in &mut bins[..] {
1162 *bin = MaybeUninit::new(Mutex::new(StringCache::default()));
1163 }
1164
1165 // Everything is initialized. Transmute the array to the
1166 // initialized type.
1167 unsafe { mem::transmute::<_, Bins>(bins) }
1168 };
1169}
1170
1171// Use the top bits of the hash to choose a bin
1172#[inline]
1173fn whichbin(hash: u64) -> usize {
1174 ((hash >> TOP_SHIFT as u64) % NUM_BINS as u64) as usize
1175}