The Tor Browser: mfbt/BloomFilter.h@97036ab72558

Conditionally force memory storage according to privacy.thirdparty.isolate;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */

     2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */

     3 /* This Source Code Form is subject to the terms of the Mozilla Public

     4  * License, v. 2.0. If a copy of the MPL was not distributed with this

     5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

     7 /*

     8  * A counting Bloom filter implementation.  This allows consumers to

     9  * do fast probabilistic "is item X in set Y?" testing which will

    10  * never answer "no" when the correct answer is "yes" (but might

    11  * incorrectly answer "yes" when the correct answer is "no").

    12  */

    14 #ifndef mozilla_BloomFilter_h

    15 #define mozilla_BloomFilter_h

    17 #include "mozilla/Assertions.h"

    18 #include "mozilla/Likely.h"

    20 #include <stdint.h>

    21 #include <string.h>

    23 namespace mozilla {

    25 /*

    26  * This class implements a counting Bloom filter as described at

    27  * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with

    28  * 8-bit counters.  This allows quick probabilistic answers to the

    29  * question "is object X in set Y?" where the contents of Y might not

    30  * be time-invariant.  The probabilistic nature of the test means that

    31  * sometimes the answer will be "yes" when it should be "no".  If the

    32  * answer is "no", then X is guaranteed not to be in Y.

    33  *

    34  * The filter is parametrized on KeySize, which is the size of the key

    35  * generated by each of hash functions used by the filter, in bits,

    36  * and the type of object T being added and removed.  T must implement

    37  * a |uint32_t hash() const| method which returns a uint32_t hash key

    38  * that will be used to generate the two separate hash functions for

    39  * the Bloom filter.  This hash key MUST be well-distributed for good

    40  * results!  KeySize is not allowed to be larger than 16.

    41  *

    42  * The filter uses exactly 2**KeySize bytes of memory.  From now on we

    43  * will refer to the memory used by the filter as M.

    44  *

    45  * The expected rate of incorrect "yes" answers depends on M and on

    46  * the number N of objects in set Y.  As long as N is small compared

    47  * to M, the rate of such answers is expected to be approximately

    48  * 4*(N/M)**2 for this filter.  In practice, if Y has a few hundred

    49  * elements then using a KeySize of 12 gives a reasonably low

    50  * incorrect answer rate.  A KeySize of 12 has the additional benefit

    51  * of using exactly one page for the filter in typical hardware

    52  * configurations.

    53  */

    55 template<unsigned KeySize, class T>

    56 class BloomFilter

    57 {

    58     /*

    59      * A counting Bloom filter with 8-bit counters.  For now we assume

    60      * that having two hash functions is enough, but we may revisit that

    61      * decision later.

    62      *

    63      * The filter uses an array with 2**KeySize entries.

    64      *

    65      * Assuming a well-distributed hash function, a Bloom filter with

    66      * array size M containing N elements and

    67      * using k hash function has expected false positive rate exactly

    68      *

    69      * $  (1 - (1 - 1/M)^{kN})^k  $

    70      *

    71      * because each array slot has a

    72      *

    73      * $  (1 - 1/M)^{kN}  $

    74      *

    75      * chance of being 0, and the expected false positive rate is the

    76      * probability that all of the k hash functions will hit a nonzero

    77      * slot.

    78      *

    79      * For reasonable assumptions (M large, kN large, which should both

    80      * hold if we're worried about false positives) about M and kN this

    81      * becomes approximately

    82      *

    83      * $$  (1 - \exp(-kN/M))^k   $$

    84      *

    85      * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,

    86      * or in other words

    87      *

    88      * $$    N/M = -0.5 * \ln(1 - \sqrt(r))   $$

    89      *

    90      * where r is the false positive rate.  This can be used to compute

    91      * the desired KeySize for a given load N and false positive rate r.

    92      *

    93      * If N/M is assumed small, then the false positive rate can

    94      * further be approximated as 4*N^2/M^2.  So increasing KeySize by

    95      * 1, which doubles M, reduces the false positive rate by about a

    96      * factor of 4, and a false positive rate of 1% corresponds to

    97      * about M/N == 20.

    98      *

    99      * What this means in practice is that for a few hundred keys using a

   100      * KeySize of 12 gives false positive rates on the order of 0.25-4%.

   101      *

   102      * Similarly, using a KeySize of 10 would lead to a 4% false

   103      * positive rate for N == 100 and to quite bad false positive

   104      * rates for larger N.

   105      */

   106   public:

   107     BloomFilter() {

   108         static_assert(KeySize <= keyShift, "KeySize too big");

   110         // Should we have a custom operator new using calloc instead and

   111         // require that we're allocated via the operator?

   112         clear();

   113     }

   115     /*

   116      * Clear the filter.  This should be done before reusing it, because

   117      * just removing all items doesn't clear counters that hit the upper

   118      * bound.

   119      */

   120     void clear();

   122     /*

   123      * Add an item to the filter.

   124      */

   125     void add(const T* t);

   127     /*

   128      * Remove an item from the filter.

   129      */

   130     void remove(const T* t);

   132     /*

   133      * Check whether the filter might contain an item.  This can

   134      * sometimes return true even if the item is not in the filter,

   135      * but will never return false for items that are actually in the

   136      * filter.

   137      */

   138     bool mightContain(const T* t) const;

   140     /*

   141      * Methods for add/remove/contain when we already have a hash computed

   142      */

   143     void add(uint32_t hash);

   144     void remove(uint32_t hash);

   145     bool mightContain(uint32_t hash) const;

   147   private:

   148     static const size_t arraySize = (1 << KeySize);

   149     static const uint32_t keyMask = (1 << KeySize) - 1;

   150     static const uint32_t keyShift = 16;

   152     static uint32_t hash1(uint32_t hash) { return hash & keyMask; }

   153     static uint32_t hash2(uint32_t hash) { return (hash >> keyShift) & keyMask; }

   155     uint8_t& firstSlot(uint32_t hash) { return counters[hash1(hash)]; }

   156     uint8_t& secondSlot(uint32_t hash) { return counters[hash2(hash)]; }

   157     const uint8_t& firstSlot(uint32_t hash) const { return counters[hash1(hash)]; }

   158     const uint8_t& secondSlot(uint32_t hash) const { return counters[hash2(hash)]; }

   160     static bool full(const uint8_t& slot) { return slot == UINT8_MAX; }

   162     uint8_t counters[arraySize];

   163 };

   165 template<unsigned KeySize, class T>

   166 inline void

   167 BloomFilter<KeySize, T>::clear()

   168 {

   169   memset(counters, 0, arraySize);

   170 }

   172 template<unsigned KeySize, class T>

   173 inline void

   174 BloomFilter<KeySize, T>::add(uint32_t hash)

   175 {

   176   uint8_t& slot1 = firstSlot(hash);

   177   if (MOZ_LIKELY(!full(slot1)))

   178     ++slot1;

   180   uint8_t& slot2 = secondSlot(hash);

   181   if (MOZ_LIKELY(!full(slot2)))

   182     ++slot2;

   183 }

   185 template<unsigned KeySize, class T>

   186 MOZ_ALWAYS_INLINE void

   187 BloomFilter<KeySize, T>::add(const T* t)

   188 {

   189   uint32_t hash = t->hash();

   190   return add(hash);

   191 }

   193 template<unsigned KeySize, class T>

   194 inline void

   195 BloomFilter<KeySize, T>::remove(uint32_t hash)

   196 {

   197   // If the slots are full, we don't know whether we bumped them to be

   198   // there when we added or not, so just leave them full.

   199   uint8_t& slot1 = firstSlot(hash);

   200   if (MOZ_LIKELY(!full(slot1)))

   201     --slot1;

   203   uint8_t& slot2 = secondSlot(hash);

   204   if (MOZ_LIKELY(!full(slot2)))

   205     --slot2;

   206 }

   208 template<unsigned KeySize, class T>

   209 MOZ_ALWAYS_INLINE void

   210 BloomFilter<KeySize, T>::remove(const T* t)

   211 {

   212   uint32_t hash = t->hash();

   213   remove(hash);

   214 }

   216 template<unsigned KeySize, class T>

   217 MOZ_ALWAYS_INLINE bool

   218 BloomFilter<KeySize, T>::mightContain(uint32_t hash) const

   219 {

   220   // Check that all the slots for this hash contain something

   221   return firstSlot(hash) && secondSlot(hash);

   222 }

   224 template<unsigned KeySize, class T>

   225 MOZ_ALWAYS_INLINE bool

   226 BloomFilter<KeySize, T>::mightContain(const T* t) const

   227 {

   228   uint32_t hash = t->hash();

   229   return mightContain(hash);

   230 }

   232 } // namespace mozilla

   234 #endif /* mozilla_BloomFilter_h */

The Tor Browser / file revision

mfbt/BloomFilter.h@97036ab72558

mfbt/BloomFilter.h