sanakirja_core/
lib.rs

#![no_std]

//! This crate defines tools to implement datastructures that can live
//! in main memory or on-disk, meaning that their natural habitat is
//! memory-mapped files, but if that environment is threatened, they
//! might seek refuge in lower-level environments.
//!
//! One core building block of this library is the notion of virtual
//! memory pages, which are allocated and freed by an
//! externally-provided allocator (see how the `sanakirja` crate does
//! this). The particular implementation used here is meant to allow a
//! transactional system with readers reading the structures
//! concurrently with one writer at a time.
//!
//! At the moment, only B trees are implemented, as well as the
//! following general traits:
//!
//! - [`LoadPage`] is a trait used to get a pointer to a page. In the
//!   most basic version, this may just return a pointer to the file,
//!   offset by the requested offset. In more sophisticated versions,
//!   this can be used to implement page encryption and compression.
//! - [`AllocPage`] allocates and frees pages: since these
//!   datastructures need to be persisted on disk, we can't rely on
//!   Rust's memory management to do this for us. Users of this crate
//!   don't have to worry about this, though.
//!
//! Moreover, two other traits can be used to store things on pages:
//! [`Storable`] is a simple trait that all `Sized + Ord` types
//! without references can readily implement (the [`direct_repr!`]
//! macro does that). For types containing references to pages
//! allocated in the database, the comparison function can be
//! customised. Moreover, these types must supply an iterator over
//! these references, in order for reference-counting to work properly
//! when the datastructures referencing these types are forked.
//!
//! Dynamically-sized types, or types that need to be represented in a
//! dynamically-sized way, can use the [`UnsizedStorable`] format.
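//!
//! For the simplest case, a fixed-sized `repr(C)` type without
//! references can be made storable with the [`direct_repr!`] macro
//! defined below. A minimal sketch (`Pair` is an illustration, not
//! part of this crate):
//!
//! ```ignore
//! use sanakirja_core::*;
//!
//! #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
//! #[repr(C)]
//! struct Pair {
//!     a: u64,
//!     b: u64,
//! }
//!
//! // Implements both `Storable` and `UnsizedStorable` for `Pair`,
//! // comparing values with `Ord::cmp`.
//! direct_repr!(Pair);
//! ```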

pub mod btree;

/// There's a hard-coded assumption that pages are 4096 bytes. This
/// is true of normal memory pages on almost all platforms.
pub const PAGE_SIZE: usize = 4096;

/// Types that can be stored on disk. This trait may be used in
/// conjunction with `Sized` in order to determine the on-disk size,
/// or with [`UnsizedStorable`] when special arrangements are needed.
///
/// The size limit for an entry depends on the datastructure. For B
/// trees, nodes are guaranteed to be at least half-full (i.e. to hold
/// at least 2KiB of entries), and we need at least two entries in
/// each page in order to be able to rebalance during a deletion. A
/// sufficient condition for this is that the size of any entry is
/// less than a quarter of the space available on a page. Internal
/// nodes need a 16-byte block header, plus 8 extra bytes for each
/// entry; entries must therefore not exceed (4096 - 16)/4 - 8 = 1012
/// bytes.
pub trait Storable: core::fmt::Debug {
    /// This is required for B trees, not necessarily for other
    /// structures. The default implementation panics.
    fn compare<T: LoadPage>(&self, _txn: &T, _b: &Self) -> core::cmp::Ordering {
        unimplemented!()
    }

    /// Returns an iterator over the offsets of the pages referenced
    /// by this value (see [`Self::PageReferences`]).
    fn page_references(&self) -> Self::PageReferences;

    /// Drop this value, decrementing the reference count of each
    /// page it references, and freeing those pages when their count
    /// reaches 0.
    ///
    /// # Safety
    ///
    /// The caller must not keep any reference to the dropped value.
    unsafe fn drop<T: AllocPage>(&self, txn: &mut T) -> Result<(), T::Error> {
        for p in self.page_references() {
            txn.decr_rc(p)?;
        }
        Ok(())
    }

    /// An iterator over the offsets to pages contained in this
    /// value. Only values from this crate can generate non-empty
    /// iterators, but combined values (like tuples) must chain the
    /// iterators returned by method `page_references`, as in the
    /// sketch below.
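    ///
    /// A hedged sketch (`a: A` and `b: B` are the fields of a
    /// hypothetical composite type, not an API of this crate):
    ///
    /// ```ignore
    /// type PageReferences = core::iter::Chain<A::PageReferences, B::PageReferences>;
    /// fn page_references(&self) -> Self::PageReferences {
    ///     self.a.page_references().chain(self.b.page_references())
    /// }
    /// ```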
    type PageReferences: Iterator<Item = u64>;
}

/// A trait to serialize the identity of complex types and check the
/// validity of database schemas.
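///
/// A hedged sketch for a user-defined type (`MyKey` is hypothetical),
/// mirroring the [`Slice`] implementation below:
///
/// ```ignore
/// use sanakirja_core::sha2::{Digest, Sha256};
///
/// impl TypeId for MyKey {
///     fn type_id() -> [u8; 32] {
///         let mut h = Sha256::new();
///         h.update(b"my_crate::MyKey");
///         h.finalize().into()
///     }
/// }
/// ```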
#[cfg(feature = "typeids")]
pub trait TypeId {
    fn type_id() -> [u8; 32];
}

#[cfg(feature = "typeids")]
pub use sha2;
#[cfg(feature = "typeids")]
use sha2::Digest;

#[cfg(feature = "typeids")]
impl TypeId for () {
    fn type_id() -> [u8; 32] {
        [0; 32]
    }
}

/// A macro to implement [`Storable`] on "plain" types,
/// i.e. fixed-sized types that are `repr(C)` and don't hold
/// references.
#[macro_export]
macro_rules! direct_repr {
    ($t: ty) => {
        impl Storable for $t {
            type PageReferences = core::iter::Empty<u64>;
            fn page_references(&self) -> Self::PageReferences {
                core::iter::empty()
            }
            fn compare<T>(&self, _: &T, b: &Self) -> core::cmp::Ordering {
                self.cmp(b)
            }
        }
        impl UnsizedStorable for $t {
            const ALIGN: usize = core::mem::align_of::<$t>();

            /// For a fixed-sized type, the on-page size is just the
            /// in-memory size.
            fn size(&self) -> usize {
                core::mem::size_of::<Self>()
            }

            /// Read the size from an on-page entry.
            unsafe fn onpage_size(_: *const u8) -> usize {
                core::mem::size_of::<Self>()
            }

            unsafe fn write_to_page(&self, p: *mut u8) {
                core::ptr::copy_nonoverlapping(self, p as *mut Self, 1)
            }

            unsafe fn from_raw_ptr<'a, T>(_: &T, p: *const u8) -> &'a Self {
                &*(p as *const Self)
            }
        }
    };
}

direct_repr!(());
direct_repr!(u8);
direct_repr!(i8);
direct_repr!(u16);
direct_repr!(i16);
direct_repr!(u32);
direct_repr!(i32);
direct_repr!(u64);
direct_repr!(i64);
direct_repr!([u8; 16]);

#[cfg(feature = "std")]
extern crate std;
#[cfg(feature = "std")]
direct_repr!(std::net::Ipv4Addr);
#[cfg(feature = "std")]
direct_repr!(std::net::Ipv6Addr);
#[cfg(feature = "std")]
direct_repr!(std::net::IpAddr);
#[cfg(feature = "std")]
direct_repr!(std::net::SocketAddr);
#[cfg(feature = "std")]
direct_repr!(std::time::SystemTime);
#[cfg(feature = "std")]
direct_repr!(std::time::Duration);
#[cfg(feature = "uuid")]
direct_repr!(uuid::Uuid);
#[cfg(feature = "ed25519")]
direct_repr!(ed25519_zebra::VerificationKeyBytes);

/// Types that can be stored on disk in a dynamically-sized way.
pub trait UnsizedStorable: Storable {
    const ALIGN: usize;

    /// The size of this value on a page, in bytes.
    fn size(&self) -> usize;

    /// Read the size of an entry directly from its on-page
    /// representation. Must agree with [`Self::size`].
    unsafe fn onpage_size(_: *const u8) -> usize;

    /// Write to a page. Must not overwrite the allocated size, but
    /// this isn't checked (which is why it's unsafe).
    unsafe fn write_to_page(&self, _: *mut u8) {
        unimplemented!()
    }

    /// Write to a page. Must not overwrite the allocated size, but
    /// this isn't checked (which is why it's unsafe).
    ///
    /// This is similar to `write_to_page`, but allows the user to
    /// allocate pages as needed when inserting the value into the
    /// database; do not implement both methods, since only
    /// `write_to_page_alloc` gets called by the library.
    ///
    /// The default implementation just calls `write_to_page`.
    unsafe fn write_to_page_alloc<T: AllocPage>(&self, _: &mut T, p: *mut u8) {
        self.write_to_page(p)
    }

    /// Read a value from its on-page representation. See the `[u8]`
    /// implementation below for a minimal length-prefixed example.
    unsafe fn from_raw_ptr<'a, T>(_: &T, p: *const u8) -> &'a Self;
}

/// On-page representation of a slice stored externally: the page
/// offset (tagged in its low 12 bits) and the length in bytes.
#[derive(Debug, Clone, Copy)]
#[repr(C)]
struct Ref {
    p: u64,
    len: u64,
}

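/// A slice of bytes, stored in one of three ways: externally, on
/// contiguous pages of the database; in memory, outside of any page;
/// or inline, directly after a 16-bit tag.
///
/// The first field (`len`, little-endian) acts as the tag: 512 means
/// "stored externally", 513 means "stored in memory", and any other
/// value is the length of an inline slice (at most 510 bytes).
///
/// A hedged usage sketch, for some `txn` implementing [`LoadPage`]:
///
/// ```ignore
/// let s = Slice::from(&b"hello"[..]); // starts in memory (tag 513)
/// assert_eq!(s.as_bytes(&txn)?, &b"hello"[..]);
/// ```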
pub union Slice<'b> {
    len: u16,
    page: Ref,
    mem: Mem<'b>,
}

#[derive(Clone, Copy)]
#[repr(C)]
struct Mem<'b> {
    _len: u16,
    m: &'b [u8],
}

impl<'a> core::fmt::Debug for Slice<'a> {
    fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result {
        write!(fmt, "Slice({:?})", unsafe { self.len })
    }
}

impl<'a> core::convert::From<&'a [u8]> for Slice<'a> {
    fn from(m: &'a [u8]) -> Slice<'a> {
        // 513 is the tag for "stored in memory, not on any page".
        Slice {
            mem: Mem { _len: 513, m },
        }
    }
}

impl<'a> Slice<'a> {
    pub fn as_bytes<T: LoadPage>(&self, txn: &T) -> Result<&[u8], T::Error> {
        Ok(unsafe {
            let len = u16::from_le(self.len) & 0xfff;
            if len == 512 {
                // Stored externally: load the pages at the
                // (4096-byte aligned) offset stored in `self.page.p`.
                let p = txn.load_page(u64::from_le(self.page.p) & !0xfff)?;
                core::slice::from_raw_parts(p.data, u64::from_le(self.page.len) as usize)
            } else if len == 513 {
                // Stored in memory, not on any page.
                self.mem.m
            } else {
                // Stored inline, directly after the 2-byte length.
                core::slice::from_raw_parts(
                    (&self.len as *const u16 as *const u8).add(2),
                    len as usize,
                )
            }
        })
    }
}

#[cfg(feature = "typeids")]
impl<'a> TypeId for Slice<'a> {
    fn type_id() -> [u8; 32] {
        let mut h = sha2::Sha256::new();
        h.update(b"sanakirja-core::Slice");
        h.finalize().into()
    }
}

impl<'a> Storable for Slice<'a> {
    type PageReferences = Pages;
    fn page_references(&self) -> Self::PageReferences {
        unsafe {
            let len = u16::from_le(self.len);
            if len == 512 {
                // Stored externally: reference-counting must see
                // every page of the allocation, so round the length
                // up to a multiple of the page size.
                let plen = u64::from_le(self.page.len);
                let len_up = ((plen + PAGE_SIZE as u64 - 1) / PAGE_SIZE as u64) * PAGE_SIZE as u64;
                let offset = u64::from_le(self.page.p) & !0xfff;
                Pages {
                    offset,
                    limit: offset + len_up,
                }
            } else {
                // Inline and in-memory slices reference no pages.
                Pages {
                    offset: 0,
                    limit: 0,
                }
            }
        }
    }
    fn compare<T: LoadPage>(&self, t: &T, b: &Self) -> core::cmp::Ordering {
        self.as_bytes(t).unwrap().cmp(b.as_bytes(t).unwrap())
    }
}

/// An iterator over the offsets of the pages of an externally-stored
/// [`Slice`], in increments of [`PAGE_SIZE`].
pub struct Pages {
    offset: u64,
    limit: u64,
}

impl Iterator for Pages {
    type Item = u64;
    fn next(&mut self) -> Option<Self::Item> {
        if self.offset >= self.limit {
            None
        } else {
            let o = self.offset;
            self.offset += PAGE_SIZE as u64;
            Some(o)
        }
    }
}

impl<'b> UnsizedStorable for Slice<'b> {
    const ALIGN: usize = 8;
    fn size(&self) -> usize {
        unsafe {
            if u16::from_le(self.len) == 512 {
                // Stored externally: a 16-byte `Ref`.
                16
            } else if u16::from_le(self.len) == 513 {
                // Stored in memory: will be written inline if it
                // fits, externally otherwise.
                if self.mem.m.len() > 510 {
                    16
                } else {
                    2 + self.mem.m.len()
                }
            } else {
                // Stored inline.
                u16::from_le(self.len) as usize
            }
        }
    }
    unsafe fn from_raw_ptr<'a, T>(_: &T, p: *const u8) -> &'a Self {
        &*(p as *const Self)
    }
    unsafe fn onpage_size(p: *const u8) -> usize {
        let p = &*(p as *const Self);
        if u16::from_le(p.len) == 512 {
            // Stored externally: a 16-byte `Ref`.
            16
        } else if u16::from_le(p.len) == 513 {
            // Stored in memory, not on any page.
            2 + p.mem.m.len()
        } else {
            // Stored inline.
            u16::from_le(p.len) as usize
        }
    }

    unsafe fn write_to_page_alloc<T: AllocPage>(&self, t: &mut T, p: *mut u8) {
        if u16::from_le(self.len) == 512 {
            // Already stored externally: copy the 16-byte `Ref`.
            core::ptr::copy(&self.page as *const Ref as *const u8, p, 16)
        } else if u16::from_le(self.len) == 513 {
            // In memory: allocate external pages if the slice is too
            // large to be inlined.
            if self.mem.m.len() > 510 {
                let len = self.mem.m.len();
                let page = t
                    .alloc_contiguous((((len + PAGE_SIZE - 1) / PAGE_SIZE) * PAGE_SIZE) as u64)
                    .unwrap();
                assert!(page.0.offset > 0);
                core::ptr::copy_nonoverlapping(self.mem.m.as_ptr(), page.0.data, len);
                let p = &mut *(p as *mut Ref);
                p.len = (self.mem.m.len() as u64).to_le();
                p.p = (page.0.offset | 512).to_le();
            } else {
                // Small enough: write the 2-byte length, then the
                // bytes, inline.
                let len = self.mem.m.len();
                *(p as *mut u16) = (len as u16).to_le();
                core::ptr::copy_nonoverlapping(self.mem.m.as_ptr(), p.add(2), len)
            }
        } else {
            // Already inline: copy the length and the bytes.
            core::ptr::copy(
                &self.len as *const u16 as *const u8,
                p,
                2 + u16::from_le(self.len) as usize,
            )
        }
    }
}

impl Storable for [u8] {
    type PageReferences = core::iter::Empty<u64>;
    fn page_references(&self) -> Self::PageReferences {
        core::iter::empty()
    }
    fn compare<T>(&self, _: &T, b: &Self) -> core::cmp::Ordering {
        self.cmp(b)
    }
}

impl UnsizedStorable for [u8] {
    const ALIGN: usize = 2;
    fn size(&self) -> usize {
        // A 2-byte length, followed by the bytes themselves.
        2 + self.len()
    }
    unsafe fn from_raw_ptr<'a, T>(_: &T, p: *const u8) -> &'a Self {
        let len = u16::from_le(*(p as *const u16));
        assert_ne!(len, 0);
        assert_eq!(len & 0xf000, 0);
        core::slice::from_raw_parts(p.add(2), len as usize)
    }
    unsafe fn onpage_size(p: *const u8) -> usize {
        let len = u16::from_le(*(p as *const u16));
        2 + len as usize
    }
    unsafe fn write_to_page_alloc<T: AllocPage>(&self, _txn: &mut T, p: *mut u8) {
        assert!(self.len() <= 510);
        *(p as *mut u16) = (self.len() as u16).to_le();
        core::ptr::copy_nonoverlapping(self.as_ptr(), p.add(2), self.len())
    }
}

/// Returns pointers to the key and the value of the entry starting
/// at `k`, skipping over the key and aligning for `V`.
unsafe fn read<T: LoadPage, K: UnsizedStorable + ?Sized, V: UnsizedStorable + ?Sized>(
    _txn: &T,
    k: *const u8,
) -> (*const u8, *const u8) {
    let s = K::onpage_size(k);
    // The value is stored after the key, aligned to `V::ALIGN`.
    let v = k.add(s);
    let al = v.align_offset(V::ALIGN);
    let v = v.add(al);
    (k, v)
}

/// Total on-page size of an entry: the key, padding to align the
/// value, the value itself, and trailing padding so that the next
/// entry is aligned for both `K` and `V`.
unsafe fn entry_size<K: UnsizedStorable + ?Sized, V: UnsizedStorable + ?Sized>(
    k: *const u8,
) -> usize {
    assert_eq!(k.align_offset(K::ALIGN), 0);
    let ks = K::onpage_size(k);
    // Next multiple of `V::ALIGN`, assuming `V::ALIGN` is a power of 2.
    let v_off = (ks + V::ALIGN - 1) & !(V::ALIGN - 1);
    let v_ptr = k.add(v_off);
    let vs = V::onpage_size(v_ptr);
    let ka = K::ALIGN.max(V::ALIGN);
    let size = v_off + vs;
    // Round up to the coarser of the two alignments.
    (size + ka - 1) & !(ka - 1)
}
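
// A worked example of the arithmetic above (the concrete types are
// only an illustration): for K = [u8] with a 5-byte key and V = u64,
// the key occupies ks = 2 + 5 = 7 bytes on the page, the value must
// be aligned to V::ALIGN = 8, so v_off = (7 + 7) & !7 = 8, vs = 8,
// and the whole entry occupies (8 + 8 + 7) & !7 = 16 bytes.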

/// Representation of a mutable or shared page. This is an owned page
/// (like `Vec` in Rust's std), but we do not know whether we can
/// mutate it or not.
///
/// The least-significant bit of the first byte of each page is 1 if
/// and only if the page was allocated by the current transaction (and
/// hence isn't visible to any other transaction, meaning we can write
/// on it).
#[derive(Debug)]
#[repr(C)]
pub struct CowPage {
    pub data: *mut u8,
    pub offset: u64,
}

/// Representation of a borrowed, or immutable page, like a slice in
/// Rust.
#[derive(Debug, Clone, Copy)]
#[repr(C)]
pub struct Page<'a> {
    pub data: &'a [u8; PAGE_SIZE],
    pub offset: u64,
}

impl CowPage {
    /// Borrows the page.
    pub fn as_page(&self) -> Page {
        Page {
            data: unsafe { &*(self.data as *const [u8; PAGE_SIZE]) },
            offset: self.offset,
        }
    }

    #[cfg(feature = "crc32")]
    pub unsafe fn crc(&self, hasher: &crc32fast::Hasher) -> u32 {
        crc(self.data, hasher)
    }

    #[cfg(feature = "crc32")]
    pub unsafe fn crc_check(&self, hasher: &crc32fast::Hasher) -> bool {
        crc_check(self.data, hasher)
    }
}

/// An owned page on which we can write. This is just a wrapper around
/// `CowPage` to avoid checking the "dirty" bit at runtime.
#[derive(Debug)]
pub struct MutPage(pub CowPage);

impl MutPage {
    #[cfg(not(feature = "crc32"))]
    pub unsafe fn clear_dirty(&mut self) {
        // The dirty bit is the least-significant bit of the first byte.
        *self.0.data &= 0xfe
    }

    #[cfg(feature = "crc32")]
    pub unsafe fn clear_dirty(&mut self, hasher: &crc32fast::Hasher) {
        *self.0.data &= 0xfe;
        // The CRC is stored in bytes 4..8 of the page.
        let crc_ = (self.0.data as *mut u32).add(1);
        *crc_ = crc(self.0.data, hasher)
    }
}

#[cfg(feature = "crc32")]
pub unsafe fn crc(data: *mut u8, hasher: &crc32fast::Hasher) -> u32 {
    let mut hasher = hasher.clone();
    hasher.reset();
    // Hash the beginning and the end of the page, skipping bytes 4..8
    // (i.e. the CRC itself).
    unsafe {
        // Remove the dirty bit.
        let x = [(*data) & 0xfe];
        hasher.update(&x[..]);
        hasher.update(core::slice::from_raw_parts(data.add(1), 3));
        hasher.update(core::slice::from_raw_parts(data.add(8), PAGE_SIZE - 8));
    }
    hasher.finalize()
}

#[cfg(feature = "crc32")]
pub unsafe fn crc_check(data: *mut u8, hasher: &crc32fast::Hasher) -> bool {
    let crc_ = unsafe { u32::from_le(*(data as *const u32).add(1)) };
    crc(data, hasher) == crc_
}

#[cfg(not(feature = "crc32"))]
pub fn clear_dirty(p: *mut u8) {
    unsafe { *p &= 0xfe }
}

#[cfg(feature = "crc32")]
pub fn clear_dirty(p: *mut u8, hasher: &crc32fast::Hasher) {
    unsafe {
        *p &= 0xfe;
        let crc_ = (p as *mut u32).add(1);
        *crc_ = crc(p, hasher)
    }
}

unsafe impl Sync for CowPage {}
unsafe impl Send for CowPage {}

impl CowPage {
    /// Checks the dirty bit of a page.
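    ///
    /// A hedged usage sketch: under the dirty-bit convention
    /// described on [`CowPage`], this tells a writer whether a page
    /// can be mutated in place:
    ///
    /// ```ignore
    /// if page.is_dirty() {
    ///     // Allocated by the current transaction: writable in place.
    /// } else {
    ///     // Visible to other transactions: copy before writing.
    /// }
    /// ```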
    pub fn is_dirty(&self) -> bool {
        unsafe { (*self.data) & 1 != 0 }
    }
}

/// Trait for loading a page.
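///
/// A hedged sketch of the most basic implementation, in which all
/// pages live in a single memory mapping and loading a page is plain
/// pointer arithmetic (`MemTxn` is hypothetical, not part of this
/// crate):
///
/// ```ignore
/// struct MemTxn {
///     base: *mut u8,
/// }
///
/// impl LoadPage for MemTxn {
///     type Error = core::convert::Infallible;
///     unsafe fn load_page(&self, off: u64) -> Result<CowPage, Self::Error> {
///         Ok(CowPage {
///             data: self.base.add(off as usize),
///             offset: off,
///         })
///     }
/// }
/// ```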
pub trait LoadPage {
    type Error: core::fmt::Debug;
    /// Load the page at offset `off`.
    unsafe fn load_page(&self, off: u64) -> Result<CowPage, Self::Error>;

    /// Load multiple pages written contiguously in the underlying
    /// storage media.
    ///
    /// If the type also implements [`AllocPage`], attention must be
    /// paid to the compatibility with `alloc_contiguous`.
    unsafe fn load_page_contiguous(&self, _off: u64, _len: u64) -> Result<CowPage, Self::Error> {
        unimplemented!()
    }

    /// Reference-counting. Since reference counts are designed to be
    /// storable into B trees by external allocators, pages referenced
    /// once aren't stored, and hence are indistinguishable from pages
    /// that are never referenced. The default implementation returns
    /// 0.
    ///
    /// This has the extra benefit of requiring less disk space, and
    /// isn't any more unsafe than storing the reference count, since
    /// we aren't supposed to hold a reference to a page with "logical
    /// RC" 0, so storing "1" for that page would be redundant anyway.
    fn rc(&self, _off: u64) -> Result<u64, Self::Error> {
        Ok(0)
    }
}

/// Trait for allocating and freeing pages.
pub trait AllocPage: LoadPage {
    /// Allocate a new page.
    unsafe fn alloc_page(&mut self) -> Result<MutPage, Self::Error>;
    /// Allocate a new page, in a context where we cannot use the
    /// "dirty bit" trick directly on the page.
    unsafe fn alloc_page_no_dirty(&mut self) -> Result<MutPage, Self::Error> {
        unimplemented!()
    }
    /// Allocate many contiguous pages, returning the first one. The
    /// dirty bit is not needed.
    unsafe fn alloc_contiguous(&mut self, length: u64) -> Result<MutPage, Self::Error>;
    /// Increment the page's reference count.
    fn incr_rc(&mut self, off: u64) -> Result<usize, Self::Error>;
    /// Decrement the page's reference count, assuming the page was
    /// first allocated by another transaction. If the RC reaches 0,
    /// free the page. Must return the new RC (0 if freed).
    unsafe fn decr_rc(&mut self, off: u64) -> Result<usize, Self::Error>;
    /// Same as [`Self::decr_rc`], but for pages allocated by the
    /// current transaction. This is an important distinction, as pages
    /// allocated by the current transaction can be reused immediately
    /// after being freed.
    unsafe fn decr_rc_owned(&mut self, off: u64) -> Result<usize, Self::Error>;
}
607}