mfbt/BloomFilter.h

Tue, 06 Jan 2015 21:39:09 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Tue, 06 Jan 2015 21:39:09 +0100
branch
TOR_BUG_9701
changeset 8
97036ab72558
permissions
-rw-r--r--

Conditionally force memory storage according to privacy.thirdparty.isolate;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
     3 /* This Source Code Form is subject to the terms of the Mozilla Public
     4  * License, v. 2.0. If a copy of the MPL was not distributed with this
     5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     7 /*
     8  * A counting Bloom filter implementation.  This allows consumers to
     9  * do fast probabilistic "is item X in set Y?" testing which will
    10  * never answer "no" when the correct answer is "yes" (but might
    11  * incorrectly answer "yes" when the correct answer is "no").
    12  */
    14 #ifndef mozilla_BloomFilter_h
    15 #define mozilla_BloomFilter_h
    17 #include "mozilla/Assertions.h"
    18 #include "mozilla/Likely.h"
    20 #include <stdint.h>
    21 #include <string.h>
    23 namespace mozilla {
    25 /*
    26  * This class implements a counting Bloom filter as described at
    27  * <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
    28  * 8-bit counters.  This allows quick probabilistic answers to the
    29  * question "is object X in set Y?" where the contents of Y might not
    30  * be time-invariant.  The probabilistic nature of the test means that
    31  * sometimes the answer will be "yes" when it should be "no".  If the
    32  * answer is "no", then X is guaranteed not to be in Y.
    33  *
    34  * The filter is parametrized on KeySize, which is the size of the key
    35  * generated by each of hash functions used by the filter, in bits,
    36  * and the type of object T being added and removed.  T must implement
    37  * a |uint32_t hash() const| method which returns a uint32_t hash key
    38  * that will be used to generate the two separate hash functions for
    39  * the Bloom filter.  This hash key MUST be well-distributed for good
    40  * results!  KeySize is not allowed to be larger than 16.
    41  *
    42  * The filter uses exactly 2**KeySize bytes of memory.  From now on we
    43  * will refer to the memory used by the filter as M.
    44  *
    45  * The expected rate of incorrect "yes" answers depends on M and on
    46  * the number N of objects in set Y.  As long as N is small compared
    47  * to M, the rate of such answers is expected to be approximately
    48  * 4*(N/M)**2 for this filter.  In practice, if Y has a few hundred
    49  * elements then using a KeySize of 12 gives a reasonably low
    50  * incorrect answer rate.  A KeySize of 12 has the additional benefit
    51  * of using exactly one page for the filter in typical hardware
    52  * configurations.
    53  */
    55 template<unsigned KeySize, class T>
    56 class BloomFilter
    57 {
    58     /*
    59      * A counting Bloom filter with 8-bit counters.  For now we assume
    60      * that having two hash functions is enough, but we may revisit that
    61      * decision later.
    62      *
    63      * The filter uses an array with 2**KeySize entries.
    64      *
    65      * Assuming a well-distributed hash function, a Bloom filter with
    66      * array size M containing N elements and
    67      * using k hash function has expected false positive rate exactly
    68      *
    69      * $  (1 - (1 - 1/M)^{kN})^k  $
    70      *
    71      * because each array slot has a
    72      *
    73      * $  (1 - 1/M)^{kN}  $
    74      *
    75      * chance of being 0, and the expected false positive rate is the
    76      * probability that all of the k hash functions will hit a nonzero
    77      * slot.
    78      *
    79      * For reasonable assumptions (M large, kN large, which should both
    80      * hold if we're worried about false positives) about M and kN this
    81      * becomes approximately
    82      *
    83      * $$  (1 - \exp(-kN/M))^k   $$
    84      *
    85      * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
    86      * or in other words
    87      *
    88      * $$    N/M = -0.5 * \ln(1 - \sqrt(r))   $$
    89      *
    90      * where r is the false positive rate.  This can be used to compute
    91      * the desired KeySize for a given load N and false positive rate r.
    92      *
    93      * If N/M is assumed small, then the false positive rate can
    94      * further be approximated as 4*N^2/M^2.  So increasing KeySize by
    95      * 1, which doubles M, reduces the false positive rate by about a
    96      * factor of 4, and a false positive rate of 1% corresponds to
    97      * about M/N == 20.
    98      *
    99      * What this means in practice is that for a few hundred keys using a
   100      * KeySize of 12 gives false positive rates on the order of 0.25-4%.
   101      *
   102      * Similarly, using a KeySize of 10 would lead to a 4% false
   103      * positive rate for N == 100 and to quite bad false positive
   104      * rates for larger N.
   105      */
   106   public:
   107     BloomFilter() {
   108         static_assert(KeySize <= keyShift, "KeySize too big");
   110         // Should we have a custom operator new using calloc instead and
   111         // require that we're allocated via the operator?
   112         clear();
   113     }
   115     /*
   116      * Clear the filter.  This should be done before reusing it, because
   117      * just removing all items doesn't clear counters that hit the upper
   118      * bound.
   119      */
   120     void clear();
   122     /*
   123      * Add an item to the filter.
   124      */
   125     void add(const T* t);
   127     /*
   128      * Remove an item from the filter.
   129      */
   130     void remove(const T* t);
   132     /*
   133      * Check whether the filter might contain an item.  This can
   134      * sometimes return true even if the item is not in the filter,
   135      * but will never return false for items that are actually in the
   136      * filter.
   137      */
   138     bool mightContain(const T* t) const;
   140     /*
   141      * Methods for add/remove/contain when we already have a hash computed
   142      */
   143     void add(uint32_t hash);
   144     void remove(uint32_t hash);
   145     bool mightContain(uint32_t hash) const;
   147   private:
   148     static const size_t arraySize = (1 << KeySize);
   149     static const uint32_t keyMask = (1 << KeySize) - 1;
   150     static const uint32_t keyShift = 16;
   152     static uint32_t hash1(uint32_t hash) { return hash & keyMask; }
   153     static uint32_t hash2(uint32_t hash) { return (hash >> keyShift) & keyMask; }
   155     uint8_t& firstSlot(uint32_t hash) { return counters[hash1(hash)]; }
   156     uint8_t& secondSlot(uint32_t hash) { return counters[hash2(hash)]; }
   157     const uint8_t& firstSlot(uint32_t hash) const { return counters[hash1(hash)]; }
   158     const uint8_t& secondSlot(uint32_t hash) const { return counters[hash2(hash)]; }
   160     static bool full(const uint8_t& slot) { return slot == UINT8_MAX; }
   162     uint8_t counters[arraySize];
   163 };
   165 template<unsigned KeySize, class T>
   166 inline void
   167 BloomFilter<KeySize, T>::clear()
   168 {
   169   memset(counters, 0, arraySize);
   170 }
   172 template<unsigned KeySize, class T>
   173 inline void
   174 BloomFilter<KeySize, T>::add(uint32_t hash)
   175 {
   176   uint8_t& slot1 = firstSlot(hash);
   177   if (MOZ_LIKELY(!full(slot1)))
   178     ++slot1;
   180   uint8_t& slot2 = secondSlot(hash);
   181   if (MOZ_LIKELY(!full(slot2)))
   182     ++slot2;
   183 }
   185 template<unsigned KeySize, class T>
   186 MOZ_ALWAYS_INLINE void
   187 BloomFilter<KeySize, T>::add(const T* t)
   188 {
   189   uint32_t hash = t->hash();
   190   return add(hash);
   191 }
   193 template<unsigned KeySize, class T>
   194 inline void
   195 BloomFilter<KeySize, T>::remove(uint32_t hash)
   196 {
   197   // If the slots are full, we don't know whether we bumped them to be
   198   // there when we added or not, so just leave them full.
   199   uint8_t& slot1 = firstSlot(hash);
   200   if (MOZ_LIKELY(!full(slot1)))
   201     --slot1;
   203   uint8_t& slot2 = secondSlot(hash);
   204   if (MOZ_LIKELY(!full(slot2)))
   205     --slot2;
   206 }
   208 template<unsigned KeySize, class T>
   209 MOZ_ALWAYS_INLINE void
   210 BloomFilter<KeySize, T>::remove(const T* t)
   211 {
   212   uint32_t hash = t->hash();
   213   remove(hash);
   214 }
   216 template<unsigned KeySize, class T>
   217 MOZ_ALWAYS_INLINE bool
   218 BloomFilter<KeySize, T>::mightContain(uint32_t hash) const
   219 {
   220   // Check that all the slots for this hash contain something
   221   return firstSlot(hash) && secondSlot(hash);
   222 }
   224 template<unsigned KeySize, class T>
   225 MOZ_ALWAYS_INLINE bool
   226 BloomFilter<KeySize, T>::mightContain(const T* t) const
   227 {
   228   uint32_t hash = t->hash();
   229   return mightContain(hash);
   230 }
   232 } // namespace mozilla
   234 #endif /* mozilla_BloomFilter_h */

mercurial