| /* This Source Code Form is subject to the terms of the Mozilla Public |
| * License, v. 2.0. If a copy of the MPL was not distributed with this |
| * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ |
| |
| //! Counting and non-counting Bloom filters tuned for use as ancestor filters |
| //! for selector matching. |
| |
| use std::fmt::{self, Debug}; |
| |
| // The top 8 bits of the 32-bit hash value are not used by the bloom filter. |
| // Consumers may rely on this to pack hashes more efficiently. |
| pub const BLOOM_HASH_MASK: u32 = 0x00ffffff; |
| const KEY_SIZE: usize = 12; |
| |
| const ARRAY_SIZE: usize = 1 << KEY_SIZE; |
| const KEY_MASK: u32 = (1 << KEY_SIZE) - 1; |
| |
| /// A counting Bloom filter with 8-bit counters. |
| pub type BloomFilter = CountingBloomFilter<BloomStorageU8>; |
| |
| /// A counting Bloom filter with parameterized storage to handle |
| /// counters of different sizes. For now we assume that having two hash |
| /// functions is enough, but we may revisit that decision later. |
| /// |
| /// The filter uses an array with 2**KeySize entries. |
| /// |
| /// Assuming a well-distributed hash function, a Bloom filter with |
| /// array size M containing N elements and |
| /// using k hash function has expected false positive rate exactly |
| /// |
| /// $ (1 - (1 - 1/M)^{kN})^k $ |
| /// |
| /// because each array slot has a |
| /// |
| /// $ (1 - 1/M)^{kN} $ |
| /// |
| /// chance of being 0, and the expected false positive rate is the |
| /// probability that all of the k hash functions will hit a nonzero |
| /// slot. |
| /// |
| /// For reasonable assumptions (M large, kN large, which should both |
| /// hold if we're worried about false positives) about M and kN this |
| /// becomes approximately |
| /// |
| /// $$ (1 - \exp(-kN/M))^k $$ |
| /// |
| /// For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$, |
| /// or in other words |
| /// |
| /// $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$ |
| /// |
| /// where r is the false positive rate. This can be used to compute |
| /// the desired KeySize for a given load N and false positive rate r. |
| /// |
| /// If N/M is assumed small, then the false positive rate can |
| /// further be approximated as 4*N^2/M^2. So increasing KeySize by |
| /// 1, which doubles M, reduces the false positive rate by about a |
| /// factor of 4, and a false positive rate of 1% corresponds to |
| /// about M/N == 20. |
| /// |
| /// What this means in practice is that for a few hundred keys using a |
| /// KeySize of 12 gives false positive rates on the order of 0.25-4%. |
| /// |
| /// Similarly, using a KeySize of 10 would lead to a 4% false |
| /// positive rate for N == 100 and to quite bad false positive |
| /// rates for larger N. |
| #[derive(Clone, Default)] |
| pub struct CountingBloomFilter<S> |
| where |
| S: BloomStorage, |
| { |
| storage: S, |
| } |
| |
| impl<S> CountingBloomFilter<S> |
| where |
| S: BloomStorage, |
| { |
| /// Creates a new bloom filter. |
| #[inline] |
| pub fn new() -> Self { |
| Default::default() |
| } |
| |
| #[inline] |
| pub fn clear(&mut self) { |
| self.storage = Default::default(); |
| } |
| |
| // Slow linear accessor to make sure the bloom filter is zeroed. This should |
| // never be used in release builds. |
| #[cfg(debug_assertions)] |
| pub fn is_zeroed(&self) -> bool { |
| self.storage.is_zeroed() |
| } |
| |
| #[cfg(not(debug_assertions))] |
| pub fn is_zeroed(&self) -> bool { |
| unreachable!() |
| } |
| |
| /// Inserts an item with a particular hash into the bloom filter. |
| #[inline] |
| pub fn insert_hash(&mut self, hash: u32) { |
| self.storage.adjust_first_slot(hash, true); |
| self.storage.adjust_second_slot(hash, true); |
| } |
| |
| /// Removes an item with a particular hash from the bloom filter. |
| #[inline] |
| pub fn remove_hash(&mut self, hash: u32) { |
| self.storage.adjust_first_slot(hash, false); |
| self.storage.adjust_second_slot(hash, false); |
| } |
| |
| /// Check whether the filter might contain an item with the given hash. |
| /// This can sometimes return true even if the item is not in the filter, |
| /// but will never return false for items that are actually in the filter. |
| #[inline] |
| pub fn might_contain_hash(&self, hash: u32) -> bool { |
| !self.storage.first_slot_is_empty(hash) && !self.storage.second_slot_is_empty(hash) |
| } |
| } |
| |
| impl<S> Debug for CountingBloomFilter<S> |
| where |
| S: BloomStorage, |
| { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| let mut slots_used = 0; |
| for i in 0..ARRAY_SIZE { |
| if !self.storage.slot_is_empty(i) { |
| slots_used += 1; |
| } |
| } |
| write!(f, "BloomFilter({}/{})", slots_used, ARRAY_SIZE) |
| } |
| } |
| |
| pub trait BloomStorage: Clone + Default { |
| fn slot_is_empty(&self, index: usize) -> bool; |
| fn adjust_slot(&mut self, index: usize, increment: bool); |
| fn is_zeroed(&self) -> bool; |
| |
| #[inline] |
| fn first_slot_is_empty(&self, hash: u32) -> bool { |
| self.slot_is_empty(Self::first_slot_index(hash)) |
| } |
| |
| #[inline] |
| fn second_slot_is_empty(&self, hash: u32) -> bool { |
| self.slot_is_empty(Self::second_slot_index(hash)) |
| } |
| |
| #[inline] |
| fn adjust_first_slot(&mut self, hash: u32, increment: bool) { |
| self.adjust_slot(Self::first_slot_index(hash), increment) |
| } |
| |
| #[inline] |
| fn adjust_second_slot(&mut self, hash: u32, increment: bool) { |
| self.adjust_slot(Self::second_slot_index(hash), increment) |
| } |
| |
| #[inline] |
| fn first_slot_index(hash: u32) -> usize { |
| hash1(hash) as usize |
| } |
| |
| #[inline] |
| fn second_slot_index(hash: u32) -> usize { |
| hash2(hash) as usize |
| } |
| } |
| |
| /// Storage class for a CountingBloomFilter that has 8-bit counters. |
| pub struct BloomStorageU8 { |
| counters: [u8; ARRAY_SIZE], |
| } |
| |
| impl BloomStorage for BloomStorageU8 { |
| #[inline] |
| fn adjust_slot(&mut self, index: usize, increment: bool) { |
| let slot = &mut self.counters[index]; |
| if *slot != 0xff { |
| // full |
| if increment { |
| *slot += 1; |
| } else { |
| *slot -= 1; |
| } |
| } |
| } |
| |
| #[inline] |
| fn slot_is_empty(&self, index: usize) -> bool { |
| self.counters[index] == 0 |
| } |
| |
| #[inline] |
| fn is_zeroed(&self) -> bool { |
| self.counters.iter().all(|x| *x == 0) |
| } |
| } |
| |
| impl Default for BloomStorageU8 { |
| fn default() -> Self { |
| BloomStorageU8 { |
| counters: [0; ARRAY_SIZE], |
| } |
| } |
| } |
| |
| impl Clone for BloomStorageU8 { |
| fn clone(&self) -> Self { |
| BloomStorageU8 { |
| counters: self.counters, |
| } |
| } |
| } |
| |
| /// Storage class for a CountingBloomFilter that has 1-bit counters. |
| pub struct BloomStorageBool { |
| counters: [u8; ARRAY_SIZE / 8], |
| } |
| |
| impl BloomStorage for BloomStorageBool { |
| #[inline] |
| fn adjust_slot(&mut self, index: usize, increment: bool) { |
| let bit = 1 << (index % 8); |
| let byte = &mut self.counters[index / 8]; |
| |
| // Since we have only one bit for storage, decrementing it |
| // should never do anything. Assert against an accidental |
| // decrementing of a bit that was never set. |
| assert!( |
| increment || (*byte & bit) != 0, |
| "should not decrement if slot is already false" |
| ); |
| |
| if increment { |
| *byte |= bit; |
| } |
| } |
| |
| #[inline] |
| fn slot_is_empty(&self, index: usize) -> bool { |
| let bit = 1 << (index % 8); |
| (self.counters[index / 8] & bit) == 0 |
| } |
| |
| #[inline] |
| fn is_zeroed(&self) -> bool { |
| self.counters.iter().all(|x| *x == 0) |
| } |
| } |
| |
| impl Default for BloomStorageBool { |
| fn default() -> Self { |
| BloomStorageBool { |
| counters: [0; ARRAY_SIZE / 8], |
| } |
| } |
| } |
| |
| impl Clone for BloomStorageBool { |
| fn clone(&self) -> Self { |
| BloomStorageBool { |
| counters: self.counters, |
| } |
| } |
| } |
| |
| #[inline] |
| fn hash1(hash: u32) -> u32 { |
| hash & KEY_MASK |
| } |
| |
| #[inline] |
| fn hash2(hash: u32) -> u32 { |
| (hash >> KEY_SIZE) & KEY_MASK |
| } |
| |
| #[test] |
| fn create_and_insert_some_stuff() { |
| use rustc_hash::FxHasher; |
| use std::hash::{Hash, Hasher}; |
| use std::mem::transmute; |
| |
| fn hash_as_str(i: usize) -> u32 { |
| let mut hasher = FxHasher::default(); |
| let s = i.to_string(); |
| s.hash(&mut hasher); |
| let hash: u64 = hasher.finish(); |
| (hash >> 32) as u32 ^ (hash as u32) |
| } |
| |
| let mut bf = BloomFilter::new(); |
| |
| // Statically assert that ARRAY_SIZE is a multiple of 8, which |
| // BloomStorageBool relies on. |
| unsafe { |
| transmute::<[u8; ARRAY_SIZE % 8], [u8; 0]>([]); |
| } |
| |
| for i in 0_usize..1000 { |
| bf.insert_hash(hash_as_str(i)); |
| } |
| |
| for i in 0_usize..1000 { |
| assert!(bf.might_contain_hash(hash_as_str(i))); |
| } |
| |
| let false_positives = (1001_usize..2000).filter(|i| bf.might_contain_hash(hash_as_str(*i))).count(); |
| |
| assert!(false_positives < 190, "{} is not < 190", false_positives); // 19%. |
| |
| for i in 0_usize..100 { |
| bf.remove_hash(hash_as_str(i)); |
| } |
| |
| for i in 100_usize..1000 { |
| assert!(bf.might_contain_hash(hash_as_str(i))); |
| } |
| |
| let false_positives = (0_usize..100).filter(|i| bf.might_contain_hash(hash_as_str(*i))).count(); |
| |
| assert!(false_positives < 20, "{} is not < 20", false_positives); // 20%. |
| |
| bf.clear(); |
| |
| for i in 0_usize..2000 { |
| assert!(!bf.might_contain_hash(hash_as_str(i))); |
| } |
| } |
| |
| #[cfg(feature = "bench")] |
| #[cfg(test)] |
| mod bench { |
| extern crate test; |
| use super::BloomFilter; |
| |
| #[derive(Default)] |
| struct HashGenerator(u32); |
| |
| impl HashGenerator { |
| fn next(&mut self) -> u32 { |
| // Each hash is split into two twelve-bit segments, which are used |
| // as an index into an array. We increment each by 64 so that we |
| // hit the next cache line, and then another 1 so that our wrapping |
| // behavior leads us to different entries each time. |
| // |
| // Trying to simulate cold caches is rather difficult with the cargo |
| // benchmarking setup, so it may all be moot depending on the number |
| // of iterations that end up being run. But we might as well. |
| self.0 += (65) + (65 << super::KEY_SIZE); |
| self.0 |
| } |
| } |
| |
| #[bench] |
| fn create_insert_1000_remove_100_lookup_100(b: &mut test::Bencher) { |
| b.iter(|| { |
| let mut gen1 = HashGenerator::default(); |
| let mut gen2 = HashGenerator::default(); |
| let mut bf = BloomFilter::new(); |
| for _ in 0_usize..1000 { |
| bf.insert_hash(gen1.next()); |
| } |
| for _ in 0_usize..100 { |
| bf.remove_hash(gen2.next()); |
| } |
| for _ in 100_usize..200 { |
| test::black_box(bf.might_contain_hash(gen2.next())); |
| } |
| }); |
| } |
| |
| #[bench] |
| fn might_contain_10(b: &mut test::Bencher) { |
| let bf = BloomFilter::new(); |
| let mut gen = HashGenerator::default(); |
| b.iter(|| { |
| for _ in 0..10 { |
| test::black_box(bf.might_contain_hash(gen.next())); |
| } |
| }); |
| } |
| |
| #[bench] |
| fn clear(b: &mut test::Bencher) { |
| let mut bf = Box::new(BloomFilter::new()); |
| b.iter(|| test::black_box(&mut bf).clear()); |
| } |
| |
| #[bench] |
| fn insert_10(b: &mut test::Bencher) { |
| let mut bf = BloomFilter::new(); |
| let mut gen = HashGenerator::default(); |
| b.iter(|| { |
| for _ in 0..10 { |
| test::black_box(bf.insert_hash(gen.next())); |
| } |
| }); |
| } |
| |
| #[bench] |
| fn remove_10(b: &mut test::Bencher) { |
| let mut bf = BloomFilter::new(); |
| let mut gen = HashGenerator::default(); |
| // Note: this will underflow, and that's ok. |
| b.iter(|| { |
| for _ in 0..10 { |
| bf.remove_hash(gen.next()) |
| } |
| }); |
| } |
| } |