|
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
|
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */ |
|
3 /* This Source Code Form is subject to the terms of the Mozilla Public |
|
4 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
6 |
|
7 /* |
|
8 * A counting Bloom filter implementation. This allows consumers to |
|
9 * do fast probabilistic "is item X in set Y?" testing which will |
|
10 * never answer "no" when the correct answer is "yes" (but might |
|
11 * incorrectly answer "yes" when the correct answer is "no"). |
|
12 */ |
|
13 |
|
14 #ifndef mozilla_BloomFilter_h |
|
15 #define mozilla_BloomFilter_h |
|
16 |
|
17 #include "mozilla/Assertions.h" |
|
18 #include "mozilla/Likely.h" |
|
19 |
|
20 #include <stdint.h> |
|
21 #include <string.h> |
|
22 |
|
23 namespace mozilla { |
|
24 |
|
25 /* |
|
26 * This class implements a counting Bloom filter as described at |
|
27 * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with |
|
28 * 8-bit counters. This allows quick probabilistic answers to the |
|
29 * question "is object X in set Y?" where the contents of Y might not |
|
30 * be time-invariant. The probabilistic nature of the test means that |
|
31 * sometimes the answer will be "yes" when it should be "no". If the |
|
32 * answer is "no", then X is guaranteed not to be in Y. |
|
33 * |
|
34 * The filter is parametrized on KeySize, which is the size of the key |
|
35 * generated by each of hash functions used by the filter, in bits, |
|
36 * and the type of object T being added and removed. T must implement |
|
37 * a |uint32_t hash() const| method which returns a uint32_t hash key |
|
38 * that will be used to generate the two separate hash functions for |
|
39 * the Bloom filter. This hash key MUST be well-distributed for good |
|
40 * results! KeySize is not allowed to be larger than 16. |
|
41 * |
|
42 * The filter uses exactly 2**KeySize bytes of memory. From now on we |
|
43 * will refer to the memory used by the filter as M. |
|
44 * |
|
45 * The expected rate of incorrect "yes" answers depends on M and on |
|
46 * the number N of objects in set Y. As long as N is small compared |
|
47 * to M, the rate of such answers is expected to be approximately |
|
48 * 4*(N/M)**2 for this filter. In practice, if Y has a few hundred |
|
49 * elements then using a KeySize of 12 gives a reasonably low |
|
50 * incorrect answer rate. A KeySize of 12 has the additional benefit |
|
51 * of using exactly one page for the filter in typical hardware |
|
52 * configurations. |
|
53 */ |
|
54 |
|
55 template<unsigned KeySize, class T> |
|
56 class BloomFilter |
|
57 { |
|
58 /* |
|
59 * A counting Bloom filter with 8-bit counters. For now we assume |
|
60 * that having two hash functions is enough, but we may revisit that |
|
61 * decision later. |
|
62 * |
|
63 * The filter uses an array with 2**KeySize entries. |
|
64 * |
|
65 * Assuming a well-distributed hash function, a Bloom filter with |
|
66 * array size M containing N elements and |
|
67 * using k hash function has expected false positive rate exactly |
|
68 * |
|
69 * $ (1 - (1 - 1/M)^{kN})^k $ |
|
70 * |
|
71 * because each array slot has a |
|
72 * |
|
73 * $ (1 - 1/M)^{kN} $ |
|
74 * |
|
75 * chance of being 0, and the expected false positive rate is the |
|
76 * probability that all of the k hash functions will hit a nonzero |
|
77 * slot. |
|
78 * |
|
79 * For reasonable assumptions (M large, kN large, which should both |
|
80 * hold if we're worried about false positives) about M and kN this |
|
81 * becomes approximately |
|
82 * |
|
83 * $$ (1 - \exp(-kN/M))^k $$ |
|
84 * |
|
85 * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$, |
|
86 * or in other words |
|
87 * |
|
88 * $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$ |
|
89 * |
|
90 * where r is the false positive rate. This can be used to compute |
|
91 * the desired KeySize for a given load N and false positive rate r. |
|
92 * |
|
93 * If N/M is assumed small, then the false positive rate can |
|
94 * further be approximated as 4*N^2/M^2. So increasing KeySize by |
|
95 * 1, which doubles M, reduces the false positive rate by about a |
|
96 * factor of 4, and a false positive rate of 1% corresponds to |
|
97 * about M/N == 20. |
|
98 * |
|
99 * What this means in practice is that for a few hundred keys using a |
|
100 * KeySize of 12 gives false positive rates on the order of 0.25-4%. |
|
101 * |
|
102 * Similarly, using a KeySize of 10 would lead to a 4% false |
|
103 * positive rate for N == 100 and to quite bad false positive |
|
104 * rates for larger N. |
|
105 */ |
|
106 public: |
|
107 BloomFilter() { |
|
108 static_assert(KeySize <= keyShift, "KeySize too big"); |
|
109 |
|
110 // Should we have a custom operator new using calloc instead and |
|
111 // require that we're allocated via the operator? |
|
112 clear(); |
|
113 } |
|
114 |
|
115 /* |
|
116 * Clear the filter. This should be done before reusing it, because |
|
117 * just removing all items doesn't clear counters that hit the upper |
|
118 * bound. |
|
119 */ |
|
120 void clear(); |
|
121 |
|
122 /* |
|
123 * Add an item to the filter. |
|
124 */ |
|
125 void add(const T* t); |
|
126 |
|
127 /* |
|
128 * Remove an item from the filter. |
|
129 */ |
|
130 void remove(const T* t); |
|
131 |
|
132 /* |
|
133 * Check whether the filter might contain an item. This can |
|
134 * sometimes return true even if the item is not in the filter, |
|
135 * but will never return false for items that are actually in the |
|
136 * filter. |
|
137 */ |
|
138 bool mightContain(const T* t) const; |
|
139 |
|
140 /* |
|
141 * Methods for add/remove/contain when we already have a hash computed |
|
142 */ |
|
143 void add(uint32_t hash); |
|
144 void remove(uint32_t hash); |
|
145 bool mightContain(uint32_t hash) const; |
|
146 |
|
147 private: |
|
148 static const size_t arraySize = (1 << KeySize); |
|
149 static const uint32_t keyMask = (1 << KeySize) - 1; |
|
150 static const uint32_t keyShift = 16; |
|
151 |
|
152 static uint32_t hash1(uint32_t hash) { return hash & keyMask; } |
|
153 static uint32_t hash2(uint32_t hash) { return (hash >> keyShift) & keyMask; } |
|
154 |
|
155 uint8_t& firstSlot(uint32_t hash) { return counters[hash1(hash)]; } |
|
156 uint8_t& secondSlot(uint32_t hash) { return counters[hash2(hash)]; } |
|
157 const uint8_t& firstSlot(uint32_t hash) const { return counters[hash1(hash)]; } |
|
158 const uint8_t& secondSlot(uint32_t hash) const { return counters[hash2(hash)]; } |
|
159 |
|
160 static bool full(const uint8_t& slot) { return slot == UINT8_MAX; } |
|
161 |
|
162 uint8_t counters[arraySize]; |
|
163 }; |
|
164 |
|
165 template<unsigned KeySize, class T> |
|
166 inline void |
|
167 BloomFilter<KeySize, T>::clear() |
|
168 { |
|
169 memset(counters, 0, arraySize); |
|
170 } |
|
171 |
|
172 template<unsigned KeySize, class T> |
|
173 inline void |
|
174 BloomFilter<KeySize, T>::add(uint32_t hash) |
|
175 { |
|
176 uint8_t& slot1 = firstSlot(hash); |
|
177 if (MOZ_LIKELY(!full(slot1))) |
|
178 ++slot1; |
|
179 |
|
180 uint8_t& slot2 = secondSlot(hash); |
|
181 if (MOZ_LIKELY(!full(slot2))) |
|
182 ++slot2; |
|
183 } |
|
184 |
|
185 template<unsigned KeySize, class T> |
|
186 MOZ_ALWAYS_INLINE void |
|
187 BloomFilter<KeySize, T>::add(const T* t) |
|
188 { |
|
189 uint32_t hash = t->hash(); |
|
190 return add(hash); |
|
191 } |
|
192 |
|
193 template<unsigned KeySize, class T> |
|
194 inline void |
|
195 BloomFilter<KeySize, T>::remove(uint32_t hash) |
|
196 { |
|
197 // If the slots are full, we don't know whether we bumped them to be |
|
198 // there when we added or not, so just leave them full. |
|
199 uint8_t& slot1 = firstSlot(hash); |
|
200 if (MOZ_LIKELY(!full(slot1))) |
|
201 --slot1; |
|
202 |
|
203 uint8_t& slot2 = secondSlot(hash); |
|
204 if (MOZ_LIKELY(!full(slot2))) |
|
205 --slot2; |
|
206 } |
|
207 |
|
208 template<unsigned KeySize, class T> |
|
209 MOZ_ALWAYS_INLINE void |
|
210 BloomFilter<KeySize, T>::remove(const T* t) |
|
211 { |
|
212 uint32_t hash = t->hash(); |
|
213 remove(hash); |
|
214 } |
|
215 |
|
216 template<unsigned KeySize, class T> |
|
217 MOZ_ALWAYS_INLINE bool |
|
218 BloomFilter<KeySize, T>::mightContain(uint32_t hash) const |
|
219 { |
|
220 // Check that all the slots for this hash contain something |
|
221 return firstSlot(hash) && secondSlot(hash); |
|
222 } |
|
223 |
|
224 template<unsigned KeySize, class T> |
|
225 MOZ_ALWAYS_INLINE bool |
|
226 BloomFilter<KeySize, T>::mightContain(const T* t) const |
|
227 { |
|
228 uint32_t hash = t->hash(); |
|
229 return mightContain(hash); |
|
230 } |
|
231 |
|
232 } // namespace mozilla |
|
233 |
|
234 #endif /* mozilla_BloomFilter_h */ |