1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
use bv::BitVec;
use fnv::FnvHasher;
use rand::{self, Rng};
use serde::{Deserialize, Serialize};
use std::cmp;
use std::hash::Hasher;
use std::marker::PhantomData;
pub trait BloomHashIndex {
fn hash_at_index(&self, hash_index: u64) -> u64;
}
#[derive(Serialize, Deserialize, Default, Clone, Debug, PartialEq)]
pub struct Bloom<T: BloomHashIndex> {
pub keys: Vec<u64>,
pub bits: BitVec<u8>,
_phantom: PhantomData<T>,
}
impl<T: BloomHashIndex> Bloom<T> {
pub fn new(num_bits: usize, keys: Vec<u64>) -> Self {
let bits = BitVec::new_fill(false, num_bits as u64);
Bloom {
keys,
bits,
_phantom: PhantomData::default(),
}
}
pub fn random(num: usize, false_rate: f64, max_bits: usize) -> Self {
let min_num_bits = ((num as f64 * false_rate.log(2f64))
/ (1f64 / 2f64.powf(2f64.log(2f64))).log(2f64))
.ceil() as usize;
let num_bits = cmp::max(1, cmp::min(min_num_bits, max_bits));
let num_keys = ((num_bits as f64 / num as f64) * 2f64.log(2f64)).round() as usize;
let keys: Vec<u64> = (0..num_keys).map(|_| rand::thread_rng().gen()).collect();
Self::new(num_bits, keys)
}
fn pos(&self, key: &T, k: u64) -> u64 {
key.hash_at_index(k) % self.bits.len()
}
pub fn clear(&mut self) {
self.bits = BitVec::new_fill(false, self.bits.len());
}
pub fn add(&mut self, key: &T) {
for k in &self.keys {
let pos = self.pos(key, *k);
self.bits.set(pos, true);
}
}
pub fn contains(&self, key: &T) -> bool {
for k in &self.keys {
let pos = self.pos(key, *k);
if !self.bits.get(pos) {
return false;
}
}
true
}
}
fn slice_hash(slice: &[u8], hash_index: u64) -> u64 {
let mut hasher = FnvHasher::with_key(hash_index);
hasher.write(slice);
hasher.finish()
}
impl<T: AsRef<[u8]>> BloomHashIndex for T {
fn hash_at_index(&self, hash_index: u64) -> u64 {
slice_hash(self.as_ref(), hash_index)
}
}
#[cfg(test)]
mod test {
use super::*;
use solana_sdk::hash::{hash, Hash};
#[test]
fn test_bloom_filter() {
let bloom: Bloom<Hash> = Bloom::random(0, 0.1, 100);
assert_eq!(bloom.keys.len(), 0);
assert_eq!(bloom.bits.len(), 1);
let bloom: Bloom<Hash> = Bloom::random(10, 0.1, 100);
assert_eq!(bloom.keys.len(), 3);
assert_eq!(bloom.bits.len(), 34);
let bloom: Bloom<Hash> = Bloom::random(100, 0.1, 100);
assert_eq!(bloom.keys.len(), 1);
assert_eq!(bloom.bits.len(), 100);
}
#[test]
fn test_add_contains() {
let mut bloom: Bloom<Hash> = Bloom::random(100, 0.1, 100);
bloom.keys = vec![0, 1, 2, 3];
let key = hash(b"hello");
assert!(!bloom.contains(&key));
bloom.add(&key);
assert!(bloom.contains(&key));
let key = hash(b"world");
assert!(!bloom.contains(&key));
bloom.add(&key);
assert!(bloom.contains(&key));
}
#[test]
fn test_random() {
let mut b1: Bloom<Hash> = Bloom::random(10, 0.1, 100);
let mut b2: Bloom<Hash> = Bloom::random(10, 0.1, 100);
b1.keys.sort();
b2.keys.sort();
assert_ne!(b1.keys, b2.keys);
}
}