1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/mfbt/BloomFilter.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,234 @@ 1.4 +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ 1.6 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.7 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.8 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.9 + 1.10 +/* 1.11 + * A counting Bloom filter implementation. This allows consumers to 1.12 + * do fast probabilistic "is item X in set Y?" testing which will 1.13 + * never answer "no" when the correct answer is "yes" (but might 1.14 + * incorrectly answer "yes" when the correct answer is "no"). 1.15 + */ 1.16 + 1.17 +#ifndef mozilla_BloomFilter_h 1.18 +#define mozilla_BloomFilter_h 1.19 + 1.20 +#include "mozilla/Assertions.h" 1.21 +#include "mozilla/Likely.h" 1.22 + 1.23 +#include <stdint.h> 1.24 +#include <string.h> 1.25 + 1.26 +namespace mozilla { 1.27 + 1.28 +/* 1.29 + * This class implements a counting Bloom filter as described at 1.30 + * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with 1.31 + * 8-bit counters. This allows quick probabilistic answers to the 1.32 + * question "is object X in set Y?" where the contents of Y might not 1.33 + * be time-invariant. The probabilistic nature of the test means that 1.34 + * sometimes the answer will be "yes" when it should be "no". If the 1.35 + * answer is "no", then X is guaranteed not to be in Y. 1.36 + * 1.37 + * The filter is parametrized on KeySize, which is the size of the key 1.38 + * generated by each of hash functions used by the filter, in bits, 1.39 + * and the type of object T being added and removed. T must implement 1.40 + * a |uint32_t hash() const| method which returns a uint32_t hash key 1.41 + * that will be used to generate the two separate hash functions for 1.42 + * the Bloom filter. This hash key MUST be well-distributed for good 1.43 + * results! KeySize is not allowed to be larger than 16. 1.44 + * 1.45 + * The filter uses exactly 2**KeySize bytes of memory. From now on we 1.46 + * will refer to the memory used by the filter as M. 1.47 + * 1.48 + * The expected rate of incorrect "yes" answers depends on M and on 1.49 + * the number N of objects in set Y. As long as N is small compared 1.50 + * to M, the rate of such answers is expected to be approximately 1.51 + * 4*(N/M)**2 for this filter. In practice, if Y has a few hundred 1.52 + * elements then using a KeySize of 12 gives a reasonably low 1.53 + * incorrect answer rate. A KeySize of 12 has the additional benefit 1.54 + * of using exactly one page for the filter in typical hardware 1.55 + * configurations. 1.56 + */ 1.57 + 1.58 +template<unsigned KeySize, class T> 1.59 +class BloomFilter 1.60 +{ 1.61 + /* 1.62 + * A counting Bloom filter with 8-bit counters. For now we assume 1.63 + * that having two hash functions is enough, but we may revisit that 1.64 + * decision later. 1.65 + * 1.66 + * The filter uses an array with 2**KeySize entries. 1.67 + * 1.68 + * Assuming a well-distributed hash function, a Bloom filter with 1.69 + * array size M containing N elements and 1.70 + * using k hash function has expected false positive rate exactly 1.71 + * 1.72 + * $ (1 - (1 - 1/M)^{kN})^k $ 1.73 + * 1.74 + * because each array slot has a 1.75 + * 1.76 + * $ (1 - 1/M)^{kN} $ 1.77 + * 1.78 + * chance of being 0, and the expected false positive rate is the 1.79 + * probability that all of the k hash functions will hit a nonzero 1.80 + * slot. 1.81 + * 1.82 + * For reasonable assumptions (M large, kN large, which should both 1.83 + * hold if we're worried about false positives) about M and kN this 1.84 + * becomes approximately 1.85 + * 1.86 + * $$ (1 - \exp(-kN/M))^k $$ 1.87 + * 1.88 + * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$, 1.89 + * or in other words 1.90 + * 1.91 + * $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$ 1.92 + * 1.93 + * where r is the false positive rate. This can be used to compute 1.94 + * the desired KeySize for a given load N and false positive rate r. 1.95 + * 1.96 + * If N/M is assumed small, then the false positive rate can 1.97 + * further be approximated as 4*N^2/M^2. So increasing KeySize by 1.98 + * 1, which doubles M, reduces the false positive rate by about a 1.99 + * factor of 4, and a false positive rate of 1% corresponds to 1.100 + * about M/N == 20. 1.101 + * 1.102 + * What this means in practice is that for a few hundred keys using a 1.103 + * KeySize of 12 gives false positive rates on the order of 0.25-4%. 1.104 + * 1.105 + * Similarly, using a KeySize of 10 would lead to a 4% false 1.106 + * positive rate for N == 100 and to quite bad false positive 1.107 + * rates for larger N. 1.108 + */ 1.109 + public: 1.110 + BloomFilter() { 1.111 + static_assert(KeySize <= keyShift, "KeySize too big"); 1.112 + 1.113 + // Should we have a custom operator new using calloc instead and 1.114 + // require that we're allocated via the operator? 1.115 + clear(); 1.116 + } 1.117 + 1.118 + /* 1.119 + * Clear the filter. This should be done before reusing it, because 1.120 + * just removing all items doesn't clear counters that hit the upper 1.121 + * bound. 1.122 + */ 1.123 + void clear(); 1.124 + 1.125 + /* 1.126 + * Add an item to the filter. 1.127 + */ 1.128 + void add(const T* t); 1.129 + 1.130 + /* 1.131 + * Remove an item from the filter. 1.132 + */ 1.133 + void remove(const T* t); 1.134 + 1.135 + /* 1.136 + * Check whether the filter might contain an item. This can 1.137 + * sometimes return true even if the item is not in the filter, 1.138 + * but will never return false for items that are actually in the 1.139 + * filter. 1.140 + */ 1.141 + bool mightContain(const T* t) const; 1.142 + 1.143 + /* 1.144 + * Methods for add/remove/contain when we already have a hash computed 1.145 + */ 1.146 + void add(uint32_t hash); 1.147 + void remove(uint32_t hash); 1.148 + bool mightContain(uint32_t hash) const; 1.149 + 1.150 + private: 1.151 + static const size_t arraySize = (1 << KeySize); 1.152 + static const uint32_t keyMask = (1 << KeySize) - 1; 1.153 + static const uint32_t keyShift = 16; 1.154 + 1.155 + static uint32_t hash1(uint32_t hash) { return hash & keyMask; } 1.156 + static uint32_t hash2(uint32_t hash) { return (hash >> keyShift) & keyMask; } 1.157 + 1.158 + uint8_t& firstSlot(uint32_t hash) { return counters[hash1(hash)]; } 1.159 + uint8_t& secondSlot(uint32_t hash) { return counters[hash2(hash)]; } 1.160 + const uint8_t& firstSlot(uint32_t hash) const { return counters[hash1(hash)]; } 1.161 + const uint8_t& secondSlot(uint32_t hash) const { return counters[hash2(hash)]; } 1.162 + 1.163 + static bool full(const uint8_t& slot) { return slot == UINT8_MAX; } 1.164 + 1.165 + uint8_t counters[arraySize]; 1.166 +}; 1.167 + 1.168 +template<unsigned KeySize, class T> 1.169 +inline void 1.170 +BloomFilter<KeySize, T>::clear() 1.171 +{ 1.172 + memset(counters, 0, arraySize); 1.173 +} 1.174 + 1.175 +template<unsigned KeySize, class T> 1.176 +inline void 1.177 +BloomFilter<KeySize, T>::add(uint32_t hash) 1.178 +{ 1.179 + uint8_t& slot1 = firstSlot(hash); 1.180 + if (MOZ_LIKELY(!full(slot1))) 1.181 + ++slot1; 1.182 + 1.183 + uint8_t& slot2 = secondSlot(hash); 1.184 + if (MOZ_LIKELY(!full(slot2))) 1.185 + ++slot2; 1.186 +} 1.187 + 1.188 +template<unsigned KeySize, class T> 1.189 +MOZ_ALWAYS_INLINE void 1.190 +BloomFilter<KeySize, T>::add(const T* t) 1.191 +{ 1.192 + uint32_t hash = t->hash(); 1.193 + return add(hash); 1.194 +} 1.195 + 1.196 +template<unsigned KeySize, class T> 1.197 +inline void 1.198 +BloomFilter<KeySize, T>::remove(uint32_t hash) 1.199 +{ 1.200 + // If the slots are full, we don't know whether we bumped them to be 1.201 + // there when we added or not, so just leave them full. 1.202 + uint8_t& slot1 = firstSlot(hash); 1.203 + if (MOZ_LIKELY(!full(slot1))) 1.204 + --slot1; 1.205 + 1.206 + uint8_t& slot2 = secondSlot(hash); 1.207 + if (MOZ_LIKELY(!full(slot2))) 1.208 + --slot2; 1.209 +} 1.210 + 1.211 +template<unsigned KeySize, class T> 1.212 +MOZ_ALWAYS_INLINE void 1.213 +BloomFilter<KeySize, T>::remove(const T* t) 1.214 +{ 1.215 + uint32_t hash = t->hash(); 1.216 + remove(hash); 1.217 +} 1.218 + 1.219 +template<unsigned KeySize, class T> 1.220 +MOZ_ALWAYS_INLINE bool 1.221 +BloomFilter<KeySize, T>::mightContain(uint32_t hash) const 1.222 +{ 1.223 + // Check that all the slots for this hash contain something 1.224 + return firstSlot(hash) && secondSlot(hash); 1.225 +} 1.226 + 1.227 +template<unsigned KeySize, class T> 1.228 +MOZ_ALWAYS_INLINE bool 1.229 +BloomFilter<KeySize, T>::mightContain(const T* t) const 1.230 +{ 1.231 + uint32_t hash = t->hash(); 1.232 + return mightContain(hash); 1.233 +} 1.234 + 1.235 +} // namespace mozilla 1.236 + 1.237 +#endif /* mozilla_BloomFilter_h */