michael@0: /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ michael@0: /* vim: set ts=8 sts=2 et sw=2 tw=80: */ michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: /* michael@0: * A counting Bloom filter implementation. This allows consumers to michael@0: * do fast probabilistic "is item X in set Y?" testing which will michael@0: * never answer "no" when the correct answer is "yes" (but might michael@0: * incorrectly answer "yes" when the correct answer is "no"). michael@0: */ michael@0: michael@0: #ifndef mozilla_BloomFilter_h michael@0: #define mozilla_BloomFilter_h michael@0: michael@0: #include "mozilla/Assertions.h" michael@0: #include "mozilla/Likely.h" michael@0: michael@0: #include michael@0: #include michael@0: michael@0: namespace mozilla { michael@0: michael@0: /* michael@0: * This class implements a counting Bloom filter as described at michael@0: * , with michael@0: * 8-bit counters. This allows quick probabilistic answers to the michael@0: * question "is object X in set Y?" where the contents of Y might not michael@0: * be time-invariant. The probabilistic nature of the test means that michael@0: * sometimes the answer will be "yes" when it should be "no". If the michael@0: * answer is "no", then X is guaranteed not to be in Y. michael@0: * michael@0: * The filter is parametrized on KeySize, which is the size of the key michael@0: * generated by each of hash functions used by the filter, in bits, michael@0: * and the type of object T being added and removed. T must implement michael@0: * a |uint32_t hash() const| method which returns a uint32_t hash key michael@0: * that will be used to generate the two separate hash functions for michael@0: * the Bloom filter. This hash key MUST be well-distributed for good michael@0: * results! KeySize is not allowed to be larger than 16. michael@0: * michael@0: * The filter uses exactly 2**KeySize bytes of memory. From now on we michael@0: * will refer to the memory used by the filter as M. michael@0: * michael@0: * The expected rate of incorrect "yes" answers depends on M and on michael@0: * the number N of objects in set Y. As long as N is small compared michael@0: * to M, the rate of such answers is expected to be approximately michael@0: * 4*(N/M)**2 for this filter. In practice, if Y has a few hundred michael@0: * elements then using a KeySize of 12 gives a reasonably low michael@0: * incorrect answer rate. A KeySize of 12 has the additional benefit michael@0: * of using exactly one page for the filter in typical hardware michael@0: * configurations. michael@0: */ michael@0: michael@0: template michael@0: class BloomFilter michael@0: { michael@0: /* michael@0: * A counting Bloom filter with 8-bit counters. For now we assume michael@0: * that having two hash functions is enough, but we may revisit that michael@0: * decision later. michael@0: * michael@0: * The filter uses an array with 2**KeySize entries. michael@0: * michael@0: * Assuming a well-distributed hash function, a Bloom filter with michael@0: * array size M containing N elements and michael@0: * using k hash function has expected false positive rate exactly michael@0: * michael@0: * $ (1 - (1 - 1/M)^{kN})^k $ michael@0: * michael@0: * because each array slot has a michael@0: * michael@0: * $ (1 - 1/M)^{kN} $ michael@0: * michael@0: * chance of being 0, and the expected false positive rate is the michael@0: * probability that all of the k hash functions will hit a nonzero michael@0: * slot. michael@0: * michael@0: * For reasonable assumptions (M large, kN large, which should both michael@0: * hold if we're worried about false positives) about M and kN this michael@0: * becomes approximately michael@0: * michael@0: * $$ (1 - \exp(-kN/M))^k $$ michael@0: * michael@0: * For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$, michael@0: * or in other words michael@0: * michael@0: * $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$ michael@0: * michael@0: * where r is the false positive rate. This can be used to compute michael@0: * the desired KeySize for a given load N and false positive rate r. michael@0: * michael@0: * If N/M is assumed small, then the false positive rate can michael@0: * further be approximated as 4*N^2/M^2. So increasing KeySize by michael@0: * 1, which doubles M, reduces the false positive rate by about a michael@0: * factor of 4, and a false positive rate of 1% corresponds to michael@0: * about M/N == 20. michael@0: * michael@0: * What this means in practice is that for a few hundred keys using a michael@0: * KeySize of 12 gives false positive rates on the order of 0.25-4%. michael@0: * michael@0: * Similarly, using a KeySize of 10 would lead to a 4% false michael@0: * positive rate for N == 100 and to quite bad false positive michael@0: * rates for larger N. michael@0: */ michael@0: public: michael@0: BloomFilter() { michael@0: static_assert(KeySize <= keyShift, "KeySize too big"); michael@0: michael@0: // Should we have a custom operator new using calloc instead and michael@0: // require that we're allocated via the operator? michael@0: clear(); michael@0: } michael@0: michael@0: /* michael@0: * Clear the filter. This should be done before reusing it, because michael@0: * just removing all items doesn't clear counters that hit the upper michael@0: * bound. michael@0: */ michael@0: void clear(); michael@0: michael@0: /* michael@0: * Add an item to the filter. michael@0: */ michael@0: void add(const T* t); michael@0: michael@0: /* michael@0: * Remove an item from the filter. michael@0: */ michael@0: void remove(const T* t); michael@0: michael@0: /* michael@0: * Check whether the filter might contain an item. This can michael@0: * sometimes return true even if the item is not in the filter, michael@0: * but will never return false for items that are actually in the michael@0: * filter. michael@0: */ michael@0: bool mightContain(const T* t) const; michael@0: michael@0: /* michael@0: * Methods for add/remove/contain when we already have a hash computed michael@0: */ michael@0: void add(uint32_t hash); michael@0: void remove(uint32_t hash); michael@0: bool mightContain(uint32_t hash) const; michael@0: michael@0: private: michael@0: static const size_t arraySize = (1 << KeySize); michael@0: static const uint32_t keyMask = (1 << KeySize) - 1; michael@0: static const uint32_t keyShift = 16; michael@0: michael@0: static uint32_t hash1(uint32_t hash) { return hash & keyMask; } michael@0: static uint32_t hash2(uint32_t hash) { return (hash >> keyShift) & keyMask; } michael@0: michael@0: uint8_t& firstSlot(uint32_t hash) { return counters[hash1(hash)]; } michael@0: uint8_t& secondSlot(uint32_t hash) { return counters[hash2(hash)]; } michael@0: const uint8_t& firstSlot(uint32_t hash) const { return counters[hash1(hash)]; } michael@0: const uint8_t& secondSlot(uint32_t hash) const { return counters[hash2(hash)]; } michael@0: michael@0: static bool full(const uint8_t& slot) { return slot == UINT8_MAX; } michael@0: michael@0: uint8_t counters[arraySize]; michael@0: }; michael@0: michael@0: template michael@0: inline void michael@0: BloomFilter::clear() michael@0: { michael@0: memset(counters, 0, arraySize); michael@0: } michael@0: michael@0: template michael@0: inline void michael@0: BloomFilter::add(uint32_t hash) michael@0: { michael@0: uint8_t& slot1 = firstSlot(hash); michael@0: if (MOZ_LIKELY(!full(slot1))) michael@0: ++slot1; michael@0: michael@0: uint8_t& slot2 = secondSlot(hash); michael@0: if (MOZ_LIKELY(!full(slot2))) michael@0: ++slot2; michael@0: } michael@0: michael@0: template michael@0: MOZ_ALWAYS_INLINE void michael@0: BloomFilter::add(const T* t) michael@0: { michael@0: uint32_t hash = t->hash(); michael@0: return add(hash); michael@0: } michael@0: michael@0: template michael@0: inline void michael@0: BloomFilter::remove(uint32_t hash) michael@0: { michael@0: // If the slots are full, we don't know whether we bumped them to be michael@0: // there when we added or not, so just leave them full. michael@0: uint8_t& slot1 = firstSlot(hash); michael@0: if (MOZ_LIKELY(!full(slot1))) michael@0: --slot1; michael@0: michael@0: uint8_t& slot2 = secondSlot(hash); michael@0: if (MOZ_LIKELY(!full(slot2))) michael@0: --slot2; michael@0: } michael@0: michael@0: template michael@0: MOZ_ALWAYS_INLINE void michael@0: BloomFilter::remove(const T* t) michael@0: { michael@0: uint32_t hash = t->hash(); michael@0: remove(hash); michael@0: } michael@0: michael@0: template michael@0: MOZ_ALWAYS_INLINE bool michael@0: BloomFilter::mightContain(uint32_t hash) const michael@0: { michael@0: // Check that all the slots for this hash contain something michael@0: return firstSlot(hash) && secondSlot(hash); michael@0: } michael@0: michael@0: template michael@0: MOZ_ALWAYS_INLINE bool michael@0: BloomFilter::mightContain(const T* t) const michael@0: { michael@0: uint32_t hash = t->hash(); michael@0: return mightContain(hash); michael@0: } michael@0: michael@0: } // namespace mozilla michael@0: michael@0: #endif /* mozilla_BloomFilter_h */