Tue, 06 Jan 2015 21:39:09 +0100
Conditionally force memory storage according to privacy.thirdparty.isolate;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.
michael@0 | 1 | /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
michael@0 | 2 | /* vim: set ts=8 sts=2 et sw=2 tw=80: */ |
michael@0 | 3 | /* This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
michael@0 | 6 | |
michael@0 | 7 | /* |
michael@0 | 8 | * A counting Bloom filter implementation. This allows consumers to |
michael@0 | 9 | * do fast probabilistic "is item X in set Y?" testing which will |
michael@0 | 10 | * never answer "no" when the correct answer is "yes" (but might |
michael@0 | 11 | * incorrectly answer "yes" when the correct answer is "no"). |
michael@0 | 12 | */ |
michael@0 | 13 | |
michael@0 | 14 | #ifndef mozilla_BloomFilter_h |
michael@0 | 15 | #define mozilla_BloomFilter_h |
michael@0 | 16 | |
michael@0 | 17 | #include "mozilla/Assertions.h" |
michael@0 | 18 | #include "mozilla/Likely.h" |
michael@0 | 19 | |
michael@0 | 20 | #include <stdint.h> |
michael@0 | 21 | #include <string.h> |
michael@0 | 22 | |
michael@0 | 23 | namespace mozilla { |
michael@0 | 24 | |
michael@0 | 25 | /* |
michael@0 | 26 | * This class implements a counting Bloom filter as described at |
michael@0 | 27 | * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with |
michael@0 | 28 | * 8-bit counters. This allows quick probabilistic answers to the |
michael@0 | 29 | * question "is object X in set Y?" where the contents of Y might not |
michael@0 | 30 | * be time-invariant. The probabilistic nature of the test means that |
michael@0 | 31 | * sometimes the answer will be "yes" when it should be "no". If the |
michael@0 | 32 | * answer is "no", then X is guaranteed not to be in Y. |
michael@0 | 33 | * |
michael@0 | 34 | * The filter is parametrized on KeySize, which is the size of the key |
michael@0 | 35 | * generated by each of hash functions used by the filter, in bits, |
michael@0 | 36 | * and the type of object T being added and removed. T must implement |
michael@0 | 37 | * a |uint32_t hash() const| method which returns a uint32_t hash key |
michael@0 | 38 | * that will be used to generate the two separate hash functions for |
michael@0 | 39 | * the Bloom filter. This hash key MUST be well-distributed for good |
michael@0 | 40 | * results! KeySize is not allowed to be larger than 16. |
michael@0 | 41 | * |
michael@0 | 42 | * The filter uses exactly 2**KeySize bytes of memory. From now on we |
michael@0 | 43 | * will refer to the memory used by the filter as M. |
michael@0 | 44 | * |
michael@0 | 45 | * The expected rate of incorrect "yes" answers depends on M and on |
michael@0 | 46 | * the number N of objects in set Y. As long as N is small compared |
michael@0 | 47 | * to M, the rate of such answers is expected to be approximately |
michael@0 | 48 | * 4*(N/M)**2 for this filter. In practice, if Y has a few hundred |
michael@0 | 49 | * elements then using a KeySize of 12 gives a reasonably low |
michael@0 | 50 | * incorrect answer rate. A KeySize of 12 has the additional benefit |
michael@0 | 51 | * of using exactly one page for the filter in typical hardware |
michael@0 | 52 | * configurations. |
michael@0 | 53 | */ |
michael@0 | 54 | |
michael@0 | 55 | template<unsigned KeySize, class T> |
michael@0 | 56 | class BloomFilter |
michael@0 | 57 | { |
michael@0 | 58 | /* |
michael@0 | 59 | * A counting Bloom filter with 8-bit counters. For now we assume |
michael@0 | 60 | * that having two hash functions is enough, but we may revisit that |
michael@0 | 61 | * decision later. |
michael@0 | 62 | * |
michael@0 | 63 | * The filter uses an array with 2**KeySize entries. |
michael@0 | 64 | * |
michael@0 | 65 | * Assuming a well-distributed hash function, a Bloom filter with |
michael@0 | 66 | * array size M containing N elements and |
michael@0 | 67 | * using k hash function has expected false positive rate exactly |
michael@0 | 68 | * |
michael@0 | 69 | * $ (1 - (1 - 1/M)^{kN})^k $ |
michael@0 | 70 | * |
michael@0 | 71 | * because each array slot has a |
michael@0 | 72 | * |
michael@0 | 73 | * $ (1 - 1/M)^{kN} $ |
michael@0 | 74 | * |
michael@0 | 75 | * chance of being 0, and the expected false positive rate is the |
michael@0 | 76 | * probability that all of the k hash functions will hit a nonzero |
michael@0 | 77 | * slot. |
michael@0 | 78 | * |
michael@0 | 79 | * For reasonable assumptions (M large, kN large, which should both |
michael@0 | 80 | * hold if we're worried about false positives) about M and kN this |
michael@0 | 81 | * becomes approximately |
michael@0 | 82 | * |
michael@0 | 83 | * $$ (1 - \exp(-kN/M))^k $$ |
michael@0 | 84 | * |
michael@0 | 85 | * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$, |
michael@0 | 86 | * or in other words |
michael@0 | 87 | * |
michael@0 | 88 | * $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$ |
michael@0 | 89 | * |
michael@0 | 90 | * where r is the false positive rate. This can be used to compute |
michael@0 | 91 | * the desired KeySize for a given load N and false positive rate r. |
michael@0 | 92 | * |
michael@0 | 93 | * If N/M is assumed small, then the false positive rate can |
michael@0 | 94 | * further be approximated as 4*N^2/M^2. So increasing KeySize by |
michael@0 | 95 | * 1, which doubles M, reduces the false positive rate by about a |
michael@0 | 96 | * factor of 4, and a false positive rate of 1% corresponds to |
michael@0 | 97 | * about M/N == 20. |
michael@0 | 98 | * |
michael@0 | 99 | * What this means in practice is that for a few hundred keys using a |
michael@0 | 100 | * KeySize of 12 gives false positive rates on the order of 0.25-4%. |
michael@0 | 101 | * |
michael@0 | 102 | * Similarly, using a KeySize of 10 would lead to a 4% false |
michael@0 | 103 | * positive rate for N == 100 and to quite bad false positive |
michael@0 | 104 | * rates for larger N. |
michael@0 | 105 | */ |
michael@0 | 106 | public: |
michael@0 | 107 | BloomFilter() { |
michael@0 | 108 | static_assert(KeySize <= keyShift, "KeySize too big"); |
michael@0 | 109 | |
michael@0 | 110 | // Should we have a custom operator new using calloc instead and |
michael@0 | 111 | // require that we're allocated via the operator? |
michael@0 | 112 | clear(); |
michael@0 | 113 | } |
michael@0 | 114 | |
michael@0 | 115 | /* |
michael@0 | 116 | * Clear the filter. This should be done before reusing it, because |
michael@0 | 117 | * just removing all items doesn't clear counters that hit the upper |
michael@0 | 118 | * bound. |
michael@0 | 119 | */ |
michael@0 | 120 | void clear(); |
michael@0 | 121 | |
michael@0 | 122 | /* |
michael@0 | 123 | * Add an item to the filter. |
michael@0 | 124 | */ |
michael@0 | 125 | void add(const T* t); |
michael@0 | 126 | |
michael@0 | 127 | /* |
michael@0 | 128 | * Remove an item from the filter. |
michael@0 | 129 | */ |
michael@0 | 130 | void remove(const T* t); |
michael@0 | 131 | |
michael@0 | 132 | /* |
michael@0 | 133 | * Check whether the filter might contain an item. This can |
michael@0 | 134 | * sometimes return true even if the item is not in the filter, |
michael@0 | 135 | * but will never return false for items that are actually in the |
michael@0 | 136 | * filter. |
michael@0 | 137 | */ |
michael@0 | 138 | bool mightContain(const T* t) const; |
michael@0 | 139 | |
michael@0 | 140 | /* |
michael@0 | 141 | * Methods for add/remove/contain when we already have a hash computed |
michael@0 | 142 | */ |
michael@0 | 143 | void add(uint32_t hash); |
michael@0 | 144 | void remove(uint32_t hash); |
michael@0 | 145 | bool mightContain(uint32_t hash) const; |
michael@0 | 146 | |
michael@0 | 147 | private: |
michael@0 | 148 | static const size_t arraySize = (1 << KeySize); |
michael@0 | 149 | static const uint32_t keyMask = (1 << KeySize) - 1; |
michael@0 | 150 | static const uint32_t keyShift = 16; |
michael@0 | 151 | |
michael@0 | 152 | static uint32_t hash1(uint32_t hash) { return hash & keyMask; } |
michael@0 | 153 | static uint32_t hash2(uint32_t hash) { return (hash >> keyShift) & keyMask; } |
michael@0 | 154 | |
michael@0 | 155 | uint8_t& firstSlot(uint32_t hash) { return counters[hash1(hash)]; } |
michael@0 | 156 | uint8_t& secondSlot(uint32_t hash) { return counters[hash2(hash)]; } |
michael@0 | 157 | const uint8_t& firstSlot(uint32_t hash) const { return counters[hash1(hash)]; } |
michael@0 | 158 | const uint8_t& secondSlot(uint32_t hash) const { return counters[hash2(hash)]; } |
michael@0 | 159 | |
michael@0 | 160 | static bool full(const uint8_t& slot) { return slot == UINT8_MAX; } |
michael@0 | 161 | |
michael@0 | 162 | uint8_t counters[arraySize]; |
michael@0 | 163 | }; |
michael@0 | 164 | |
michael@0 | 165 | template<unsigned KeySize, class T> |
michael@0 | 166 | inline void |
michael@0 | 167 | BloomFilter<KeySize, T>::clear() |
michael@0 | 168 | { |
michael@0 | 169 | memset(counters, 0, arraySize); |
michael@0 | 170 | } |
michael@0 | 171 | |
michael@0 | 172 | template<unsigned KeySize, class T> |
michael@0 | 173 | inline void |
michael@0 | 174 | BloomFilter<KeySize, T>::add(uint32_t hash) |
michael@0 | 175 | { |
michael@0 | 176 | uint8_t& slot1 = firstSlot(hash); |
michael@0 | 177 | if (MOZ_LIKELY(!full(slot1))) |
michael@0 | 178 | ++slot1; |
michael@0 | 179 | |
michael@0 | 180 | uint8_t& slot2 = secondSlot(hash); |
michael@0 | 181 | if (MOZ_LIKELY(!full(slot2))) |
michael@0 | 182 | ++slot2; |
michael@0 | 183 | } |
michael@0 | 184 | |
michael@0 | 185 | template<unsigned KeySize, class T> |
michael@0 | 186 | MOZ_ALWAYS_INLINE void |
michael@0 | 187 | BloomFilter<KeySize, T>::add(const T* t) |
michael@0 | 188 | { |
michael@0 | 189 | uint32_t hash = t->hash(); |
michael@0 | 190 | return add(hash); |
michael@0 | 191 | } |
michael@0 | 192 | |
michael@0 | 193 | template<unsigned KeySize, class T> |
michael@0 | 194 | inline void |
michael@0 | 195 | BloomFilter<KeySize, T>::remove(uint32_t hash) |
michael@0 | 196 | { |
michael@0 | 197 | // If the slots are full, we don't know whether we bumped them to be |
michael@0 | 198 | // there when we added or not, so just leave them full. |
michael@0 | 199 | uint8_t& slot1 = firstSlot(hash); |
michael@0 | 200 | if (MOZ_LIKELY(!full(slot1))) |
michael@0 | 201 | --slot1; |
michael@0 | 202 | |
michael@0 | 203 | uint8_t& slot2 = secondSlot(hash); |
michael@0 | 204 | if (MOZ_LIKELY(!full(slot2))) |
michael@0 | 205 | --slot2; |
michael@0 | 206 | } |
michael@0 | 207 | |
michael@0 | 208 | template<unsigned KeySize, class T> |
michael@0 | 209 | MOZ_ALWAYS_INLINE void |
michael@0 | 210 | BloomFilter<KeySize, T>::remove(const T* t) |
michael@0 | 211 | { |
michael@0 | 212 | uint32_t hash = t->hash(); |
michael@0 | 213 | remove(hash); |
michael@0 | 214 | } |
michael@0 | 215 | |
michael@0 | 216 | template<unsigned KeySize, class T> |
michael@0 | 217 | MOZ_ALWAYS_INLINE bool |
michael@0 | 218 | BloomFilter<KeySize, T>::mightContain(uint32_t hash) const |
michael@0 | 219 | { |
michael@0 | 220 | // Check that all the slots for this hash contain something |
michael@0 | 221 | return firstSlot(hash) && secondSlot(hash); |
michael@0 | 222 | } |
michael@0 | 223 | |
michael@0 | 224 | template<unsigned KeySize, class T> |
michael@0 | 225 | MOZ_ALWAYS_INLINE bool |
michael@0 | 226 | BloomFilter<KeySize, T>::mightContain(const T* t) const |
michael@0 | 227 | { |
michael@0 | 228 | uint32_t hash = t->hash(); |
michael@0 | 229 | return mightContain(hash); |
michael@0 | 230 | } |
michael@0 | 231 | |
michael@0 | 232 | } // namespace mozilla |
michael@0 | 233 | |
michael@0 | 234 | #endif /* mozilla_BloomFilter_h */ |