use std::fmt::{self, Debug};
pub const BLOOM_HASH_MASK: u32 = 0x00ffffff;
const KEY_SIZE: usize = 12;
const ARRAY_SIZE: usize = 1 << KEY_SIZE;
const KEY_MASK: u32 = (1 << KEY_SIZE) - 1;
pub type BloomFilter = CountingBloomFilter<BloomStorageU8>;
#[derive(Clone, Default)]
pub struct CountingBloomFilter<S>
where
S: BloomStorage,
{
storage: S,
}
impl<S> CountingBloomFilter<S>
where
S: BloomStorage,
{
#[inline]
pub fn new() -> Self {
Default::default()
}
#[inline]
pub fn clear(&mut self) {
self.storage = Default::default();
}
#[cfg(debug_assertions)]
pub fn is_zeroed(&self) -> bool {
self.storage.is_zeroed()
}
#[cfg(not(debug_assertions))]
pub fn is_zeroed(&self) -> bool {
unreachable!()
}
#[inline]
pub fn insert_hash(&mut self, hash: u32) {
self.storage.adjust_first_slot(hash, true);
self.storage.adjust_second_slot(hash, true);
}
#[inline]
pub fn remove_hash(&mut self, hash: u32) {
self.storage.adjust_first_slot(hash, false);
self.storage.adjust_second_slot(hash, false);
}
#[inline]
pub fn might_contain_hash(&self, hash: u32) -> bool {
!self.storage.first_slot_is_empty(hash) && !self.storage.second_slot_is_empty(hash)
}
}
impl<S> Debug for CountingBloomFilter<S>
where
S: BloomStorage,
{
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut slots_used = 0;
for i in 0..ARRAY_SIZE {
if !self.storage.slot_is_empty(i) {
slots_used += 1;
}
}
write!(f, "BloomFilter({}/{})", slots_used, ARRAY_SIZE)
}
}
pub trait BloomStorage: Clone + Default {
fn slot_is_empty(&self, index: usize) -> bool;
fn adjust_slot(&mut self, index: usize, increment: bool);
fn is_zeroed(&self) -> bool;
#[inline]
fn first_slot_is_empty(&self, hash: u32) -> bool {
self.slot_is_empty(Self::first_slot_index(hash))
}
#[inline]
fn second_slot_is_empty(&self, hash: u32) -> bool {
self.slot_is_empty(Self::second_slot_index(hash))
}
#[inline]
fn adjust_first_slot(&mut self, hash: u32, increment: bool) {
self.adjust_slot(Self::first_slot_index(hash), increment)
}
#[inline]
fn adjust_second_slot(&mut self, hash: u32, increment: bool) {
self.adjust_slot(Self::second_slot_index(hash), increment)
}
#[inline]
fn first_slot_index(hash: u32) -> usize {
hash1(hash) as usize
}
#[inline]
fn second_slot_index(hash: u32) -> usize {
hash2(hash) as usize
}
}
pub struct BloomStorageU8 {
counters: [u8; ARRAY_SIZE],
}
impl BloomStorage for BloomStorageU8 {
#[inline]
fn adjust_slot(&mut self, index: usize, increment: bool) {
let slot = &mut self.counters[index];
if *slot != 0xff {
if increment {
*slot += 1;
} else {
*slot -= 1;
}
}
}
#[inline]
fn slot_is_empty(&self, index: usize) -> bool {
self.counters[index] == 0
}
#[inline]
fn is_zeroed(&self) -> bool {
self.counters.iter().all(|x| *x == 0)
}
}
impl Default for BloomStorageU8 {
fn default() -> Self {
BloomStorageU8 {
counters: [0; ARRAY_SIZE],
}
}
}
impl Clone for BloomStorageU8 {
fn clone(&self) -> Self {
BloomStorageU8 {
counters: self.counters,
}
}
}
pub struct BloomStorageBool {
counters: [u8; ARRAY_SIZE / 8],
}
impl BloomStorage for BloomStorageBool {
#[inline]
fn adjust_slot(&mut self, index: usize, increment: bool) {
let bit = 1 << (index % 8);
let byte = &mut self.counters[index / 8];
assert!(
increment || (*byte & bit) != 0,
"should not decrement if slot is already false"
);
if increment {
*byte |= bit;
}
}
#[inline]
fn slot_is_empty(&self, index: usize) -> bool {
let bit = 1 << (index % 8);
(self.counters[index / 8] & bit) == 0
}
#[inline]
fn is_zeroed(&self) -> bool {
self.counters.iter().all(|x| *x == 0)
}
}
impl Default for BloomStorageBool {
fn default() -> Self {
BloomStorageBool {
counters: [0; ARRAY_SIZE / 8],
}
}
}
impl Clone for BloomStorageBool {
fn clone(&self) -> Self {
BloomStorageBool {
counters: self.counters,
}
}
}
#[inline]
fn hash1(hash: u32) -> u32 {
hash & KEY_MASK
}
#[inline]
fn hash2(hash: u32) -> u32 {
(hash >> KEY_SIZE) & KEY_MASK
}
#[test]
fn create_and_insert_some_stuff() {
use fxhash::FxHasher;
use std::hash::{Hash, Hasher};
use std::mem::transmute;
fn hash_as_str(i: usize) -> u32 {
let mut hasher = FxHasher::default();
let s = i.to_string();
s.hash(&mut hasher);
let hash: u64 = hasher.finish();
(hash >> 32) as u32 ^ (hash as u32)
}
let mut bf = BloomFilter::new();
unsafe {
transmute::<[u8; ARRAY_SIZE % 8], [u8; 0]>([]);
}
for i in 0_usize..1000 {
bf.insert_hash(hash_as_str(i));
}
for i in 0_usize..1000 {
assert!(bf.might_contain_hash(hash_as_str(i)));
}
let false_positives = (1001_usize..2000)
.filter(|i| bf.might_contain_hash(hash_as_str(*i)))
.count();
assert!(false_positives < 190, "{} is not < 190", false_positives);
for i in 0_usize..100 {
bf.remove_hash(hash_as_str(i));
}
for i in 100_usize..1000 {
assert!(bf.might_contain_hash(hash_as_str(i)));
}
let false_positives = (0_usize..100)
.filter(|i| bf.might_contain_hash(hash_as_str(*i)))
.count();
assert!(false_positives < 20, "{} is not < 20", false_positives);
bf.clear();
for i in 0_usize..2000 {
assert!(!bf.might_contain_hash(hash_as_str(i)));
}
}
#[cfg(feature = "bench")]
#[cfg(test)]
mod bench {
extern crate test;
use super::BloomFilter;
#[derive(Default)]
struct HashGenerator(u32);
impl HashGenerator {
fn next(&mut self) -> u32 {
self.0 += (65) + (65 << super::KEY_SIZE);
self.0
}
}
#[bench]
fn create_insert_1000_remove_100_lookup_100(b: &mut test::Bencher) {
b.iter(|| {
let mut gen1 = HashGenerator::default();
let mut gen2 = HashGenerator::default();
let mut bf = BloomFilter::new();
for _ in 0_usize..1000 {
bf.insert_hash(gen1.next());
}
for _ in 0_usize..100 {
bf.remove_hash(gen2.next());
}
for _ in 100_usize..200 {
test::black_box(bf.might_contain_hash(gen2.next()));
}
});
}
#[bench]
fn might_contain_10(b: &mut test::Bencher) {
let bf = BloomFilter::new();
let mut gen = HashGenerator::default();
b.iter(|| {
for _ in 0..10 {
test::black_box(bf.might_contain_hash(gen.next()));
}
});
}
#[bench]
fn clear(b: &mut test::Bencher) {
let mut bf = Box::new(BloomFilter::new());
b.iter(|| test::black_box(&mut bf).clear());
}
#[bench]
fn insert_10(b: &mut test::Bencher) {
let mut bf = BloomFilter::new();
let mut gen = HashGenerator::default();
b.iter(|| {
for _ in 0..10 {
test::black_box(bf.insert_hash(gen.next()));
}
});
}
#[bench]
fn remove_10(b: &mut test::Bencher) {
let mut bf = BloomFilter::new();
let mut gen = HashGenerator::default();
b.iter(|| {
for _ in 0..10 {
bf.remove_hash(gen.next())
}
});
}
}