mfbt/BloomFilter.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
michael@0 2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
michael@0 3 /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0 4 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 6
michael@0 7 /*
michael@0 8 * A counting Bloom filter implementation. This allows consumers to
michael@0 9 * do fast probabilistic "is item X in set Y?" testing which will
michael@0 10 * never answer "no" when the correct answer is "yes" (but might
michael@0 11 * incorrectly answer "yes" when the correct answer is "no").
michael@0 12 */
michael@0 13
michael@0 14 #ifndef mozilla_BloomFilter_h
michael@0 15 #define mozilla_BloomFilter_h
michael@0 16
michael@0 17 #include "mozilla/Assertions.h"
michael@0 18 #include "mozilla/Likely.h"
michael@0 19
michael@0 20 #include <stdint.h>
michael@0 21 #include <string.h>
michael@0 22
michael@0 23 namespace mozilla {
michael@0 24
michael@0 25 /*
michael@0 26 * This class implements a counting Bloom filter as described at
michael@0 27 * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
michael@0 28 * 8-bit counters. This allows quick probabilistic answers to the
michael@0 29 * question "is object X in set Y?" where the contents of Y might not
michael@0 30 * be time-invariant. The probabilistic nature of the test means that
michael@0 31 * sometimes the answer will be "yes" when it should be "no". If the
michael@0 32 * answer is "no", then X is guaranteed not to be in Y.
michael@0 33 *
michael@0 34 * The filter is parametrized on KeySize, which is the size of the key
michael@0 35 * generated by each of hash functions used by the filter, in bits,
michael@0 36 * and the type of object T being added and removed. T must implement
michael@0 37 * a |uint32_t hash() const| method which returns a uint32_t hash key
michael@0 38 * that will be used to generate the two separate hash functions for
michael@0 39 * the Bloom filter. This hash key MUST be well-distributed for good
michael@0 40 * results! KeySize is not allowed to be larger than 16.
michael@0 41 *
michael@0 42 * The filter uses exactly 2**KeySize bytes of memory. From now on we
michael@0 43 * will refer to the memory used by the filter as M.
michael@0 44 *
michael@0 45 * The expected rate of incorrect "yes" answers depends on M and on
michael@0 46 * the number N of objects in set Y. As long as N is small compared
michael@0 47 * to M, the rate of such answers is expected to be approximately
michael@0 48 * 4*(N/M)**2 for this filter. In practice, if Y has a few hundred
michael@0 49 * elements then using a KeySize of 12 gives a reasonably low
michael@0 50 * incorrect answer rate. A KeySize of 12 has the additional benefit
michael@0 51 * of using exactly one page for the filter in typical hardware
michael@0 52 * configurations.
michael@0 53 */
michael@0 54
michael@0 55 template<unsigned KeySize, class T>
michael@0 56 class BloomFilter
michael@0 57 {
michael@0 58 /*
michael@0 59 * A counting Bloom filter with 8-bit counters. For now we assume
michael@0 60 * that having two hash functions is enough, but we may revisit that
michael@0 61 * decision later.
michael@0 62 *
michael@0 63 * The filter uses an array with 2**KeySize entries.
michael@0 64 *
michael@0 65 * Assuming a well-distributed hash function, a Bloom filter with
michael@0 66 * array size M containing N elements and
michael@0 67 * using k hash function has expected false positive rate exactly
michael@0 68 *
michael@0 69 * $ (1 - (1 - 1/M)^{kN})^k $
michael@0 70 *
michael@0 71 * because each array slot has a
michael@0 72 *
michael@0 73 * $ (1 - 1/M)^{kN} $
michael@0 74 *
michael@0 75 * chance of being 0, and the expected false positive rate is the
michael@0 76 * probability that all of the k hash functions will hit a nonzero
michael@0 77 * slot.
michael@0 78 *
michael@0 79 * For reasonable assumptions (M large, kN large, which should both
michael@0 80 * hold if we're worried about false positives) about M and kN this
michael@0 81 * becomes approximately
michael@0 82 *
michael@0 83 * $$ (1 - \exp(-kN/M))^k $$
michael@0 84 *
michael@0 85 * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
michael@0 86 * or in other words
michael@0 87 *
michael@0 88 * $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$
michael@0 89 *
michael@0 90 * where r is the false positive rate. This can be used to compute
michael@0 91 * the desired KeySize for a given load N and false positive rate r.
michael@0 92 *
michael@0 93 * If N/M is assumed small, then the false positive rate can
michael@0 94 * further be approximated as 4*N^2/M^2. So increasing KeySize by
michael@0 95 * 1, which doubles M, reduces the false positive rate by about a
michael@0 96 * factor of 4, and a false positive rate of 1% corresponds to
michael@0 97 * about M/N == 20.
michael@0 98 *
michael@0 99 * What this means in practice is that for a few hundred keys using a
michael@0 100 * KeySize of 12 gives false positive rates on the order of 0.25-4%.
michael@0 101 *
michael@0 102 * Similarly, using a KeySize of 10 would lead to a 4% false
michael@0 103 * positive rate for N == 100 and to quite bad false positive
michael@0 104 * rates for larger N.
michael@0 105 */
michael@0 106 public:
michael@0 107 BloomFilter() {
michael@0 108 static_assert(KeySize <= keyShift, "KeySize too big");
michael@0 109
michael@0 110 // Should we have a custom operator new using calloc instead and
michael@0 111 // require that we're allocated via the operator?
michael@0 112 clear();
michael@0 113 }
michael@0 114
michael@0 115 /*
michael@0 116 * Clear the filter. This should be done before reusing it, because
michael@0 117 * just removing all items doesn't clear counters that hit the upper
michael@0 118 * bound.
michael@0 119 */
michael@0 120 void clear();
michael@0 121
michael@0 122 /*
michael@0 123 * Add an item to the filter.
michael@0 124 */
michael@0 125 void add(const T* t);
michael@0 126
michael@0 127 /*
michael@0 128 * Remove an item from the filter.
michael@0 129 */
michael@0 130 void remove(const T* t);
michael@0 131
michael@0 132 /*
michael@0 133 * Check whether the filter might contain an item. This can
michael@0 134 * sometimes return true even if the item is not in the filter,
michael@0 135 * but will never return false for items that are actually in the
michael@0 136 * filter.
michael@0 137 */
michael@0 138 bool mightContain(const T* t) const;
michael@0 139
michael@0 140 /*
michael@0 141 * Methods for add/remove/contain when we already have a hash computed
michael@0 142 */
michael@0 143 void add(uint32_t hash);
michael@0 144 void remove(uint32_t hash);
michael@0 145 bool mightContain(uint32_t hash) const;
michael@0 146
michael@0 147 private:
michael@0 148 static const size_t arraySize = (1 << KeySize);
michael@0 149 static const uint32_t keyMask = (1 << KeySize) - 1;
michael@0 150 static const uint32_t keyShift = 16;
michael@0 151
michael@0 152 static uint32_t hash1(uint32_t hash) { return hash & keyMask; }
michael@0 153 static uint32_t hash2(uint32_t hash) { return (hash >> keyShift) & keyMask; }
michael@0 154
michael@0 155 uint8_t& firstSlot(uint32_t hash) { return counters[hash1(hash)]; }
michael@0 156 uint8_t& secondSlot(uint32_t hash) { return counters[hash2(hash)]; }
michael@0 157 const uint8_t& firstSlot(uint32_t hash) const { return counters[hash1(hash)]; }
michael@0 158 const uint8_t& secondSlot(uint32_t hash) const { return counters[hash2(hash)]; }
michael@0 159
michael@0 160 static bool full(const uint8_t& slot) { return slot == UINT8_MAX; }
michael@0 161
michael@0 162 uint8_t counters[arraySize];
michael@0 163 };
michael@0 164
michael@0 165 template<unsigned KeySize, class T>
michael@0 166 inline void
michael@0 167 BloomFilter<KeySize, T>::clear()
michael@0 168 {
michael@0 169 memset(counters, 0, arraySize);
michael@0 170 }
michael@0 171
michael@0 172 template<unsigned KeySize, class T>
michael@0 173 inline void
michael@0 174 BloomFilter<KeySize, T>::add(uint32_t hash)
michael@0 175 {
michael@0 176 uint8_t& slot1 = firstSlot(hash);
michael@0 177 if (MOZ_LIKELY(!full(slot1)))
michael@0 178 ++slot1;
michael@0 179
michael@0 180 uint8_t& slot2 = secondSlot(hash);
michael@0 181 if (MOZ_LIKELY(!full(slot2)))
michael@0 182 ++slot2;
michael@0 183 }
michael@0 184
michael@0 185 template<unsigned KeySize, class T>
michael@0 186 MOZ_ALWAYS_INLINE void
michael@0 187 BloomFilter<KeySize, T>::add(const T* t)
michael@0 188 {
michael@0 189 uint32_t hash = t->hash();
michael@0 190 return add(hash);
michael@0 191 }
michael@0 192
michael@0 193 template<unsigned KeySize, class T>
michael@0 194 inline void
michael@0 195 BloomFilter<KeySize, T>::remove(uint32_t hash)
michael@0 196 {
michael@0 197 // If the slots are full, we don't know whether we bumped them to be
michael@0 198 // there when we added or not, so just leave them full.
michael@0 199 uint8_t& slot1 = firstSlot(hash);
michael@0 200 if (MOZ_LIKELY(!full(slot1)))
michael@0 201 --slot1;
michael@0 202
michael@0 203 uint8_t& slot2 = secondSlot(hash);
michael@0 204 if (MOZ_LIKELY(!full(slot2)))
michael@0 205 --slot2;
michael@0 206 }
michael@0 207
michael@0 208 template<unsigned KeySize, class T>
michael@0 209 MOZ_ALWAYS_INLINE void
michael@0 210 BloomFilter<KeySize, T>::remove(const T* t)
michael@0 211 {
michael@0 212 uint32_t hash = t->hash();
michael@0 213 remove(hash);
michael@0 214 }
michael@0 215
michael@0 216 template<unsigned KeySize, class T>
michael@0 217 MOZ_ALWAYS_INLINE bool
michael@0 218 BloomFilter<KeySize, T>::mightContain(uint32_t hash) const
michael@0 219 {
michael@0 220 // Check that all the slots for this hash contain something
michael@0 221 return firstSlot(hash) && secondSlot(hash);
michael@0 222 }
michael@0 223
michael@0 224 template<unsigned KeySize, class T>
michael@0 225 MOZ_ALWAYS_INLINE bool
michael@0 226 BloomFilter<KeySize, T>::mightContain(const T* t) const
michael@0 227 {
michael@0 228 uint32_t hash = t->hash();
michael@0 229 return mightContain(hash);
michael@0 230 }
michael@0 231
michael@0 232 } // namespace mozilla
michael@0 233
michael@0 234 #endif /* mozilla_BloomFilter_h */

mercurial