mfbt/HashFunctions.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/mfbt/HashFunctions.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,379 @@
     1.4 +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     1.5 +/* vim: set ts=8 sts=2 et sw=2 tw=80: */
     1.6 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.7 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.8 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.9 +
    1.10 +/* Utilities for hashing. */
    1.11 +
    1.12 +/*
    1.13 + * This file exports functions for hashing data down to a 32-bit value,
    1.14 + * including:
    1.15 + *
    1.16 + *  - HashString    Hash a char* or uint16_t/wchar_t* of known or unknown
    1.17 + *                  length.
    1.18 + *
    1.19 + *  - HashBytes     Hash a byte array of known length.
    1.20 + *
    1.21 + *  - HashGeneric   Hash one or more values.  Currently, we support uint32_t,
    1.22 + *                  types which can be implicitly cast to uint32_t, data
    1.23 + *                  pointers, and function pointers.
    1.24 + *
    1.25 + *  - AddToHash     Add one or more values to the given hash.  This supports the
    1.26 + *                  same list of types as HashGeneric.
    1.27 + *
    1.28 + *
    1.29 + * You can chain these functions together to hash complex objects.  For example:
    1.30 + *
    1.31 + *  class ComplexObject
    1.32 + *  {
    1.33 + *      char* str;
    1.34 + *      uint32_t uint1, uint2;
    1.35 + *      void (*callbackFn)();
    1.36 + *
    1.37 + *    public:
    1.38 + *      uint32_t hash() {
    1.39 + *        uint32_t hash = HashString(str);
    1.40 + *        hash = AddToHash(hash, uint1, uint2);
    1.41 + *        return AddToHash(hash, callbackFn);
    1.42 + *      }
    1.43 + *  };
    1.44 + *
    1.45 + * If you want to hash an nsAString or nsACString, use the HashString functions
    1.46 + * in nsHashKeys.h.
    1.47 + */
    1.48 +
    1.49 +#ifndef mozilla_HashFunctions_h
    1.50 +#define mozilla_HashFunctions_h
    1.51 +
    1.52 +#include "mozilla/Assertions.h"
    1.53 +#include "mozilla/Attributes.h"
    1.54 +#include "mozilla/Char16.h"
    1.55 +#include "mozilla/Types.h"
    1.56 +
    1.57 +#include <stdint.h>
    1.58 +
    1.59 +#ifdef __cplusplus
    1.60 +namespace mozilla {
    1.61 +
    1.62 +/**
    1.63 + * The golden ratio as a 32-bit fixed-point value.
    1.64 + */
    1.65 +static const uint32_t GoldenRatioU32 = 0x9E3779B9U;
    1.66 +
    1.67 +inline uint32_t
    1.68 +RotateBitsLeft32(uint32_t value, uint8_t bits)
    1.69 +{
    1.70 +  MOZ_ASSERT(bits < 32);
    1.71 +  return (value << bits) | (value >> (32 - bits));
    1.72 +}
    1.73 +
    1.74 +namespace detail {
    1.75 +
    1.76 +inline uint32_t
    1.77 +AddU32ToHash(uint32_t hash, uint32_t value)
    1.78 +{
    1.79 +  /*
    1.80 +   * This is the meat of all our hash routines.  This hash function is not
    1.81 +   * particularly sophisticated, but it seems to work well for our mostly
    1.82 +   * plain-text inputs.  Implementation notes follow.
    1.83 +   *
    1.84 +   * Our use of the golden ratio here is arbitrary; we could pick almost any
    1.85 +   * number which:
    1.86 +   *
    1.87 +   *  * is odd (because otherwise, all our hash values will be even)
    1.88 +   *
    1.89 +   *  * has a reasonably-even mix of 1's and 0's (consider the extreme case
    1.90 +   *    where we multiply by 0x3 or 0xeffffff -- this will not produce good
    1.91 +   *    mixing across all bits of the hash).
    1.92 +   *
    1.93 +   * The rotation length of 5 is also arbitrary, although an odd number is again
    1.94 +   * preferable so our hash explores the whole universe of possible rotations.
    1.95 +   *
    1.96 +   * Finally, we multiply by the golden ratio *after* xor'ing, not before.
    1.97 +   * Otherwise, if |hash| is 0 (as it often is for the beginning of a message),
    1.98 +   * the expression
    1.99 +   *
   1.100 +   *   (GoldenRatioU32 * RotateBitsLeft(hash, 5)) |xor| value
   1.101 +   *
   1.102 +   * evaluates to |value|.
   1.103 +   *
   1.104 +   * (Number-theoretic aside: Because any odd number |m| is relatively prime to
   1.105 +   * our modulus (2^32), the list
   1.106 +   *
   1.107 +   *    [x * m (mod 2^32) for 0 <= x < 2^32]
   1.108 +   *
   1.109 +   * has no duplicate elements.  This means that multiplying by |m| does not
   1.110 +   * cause us to skip any possible hash values.
   1.111 +   *
   1.112 +   * It's also nice if |m| has large-ish order mod 2^32 -- that is, if the
   1.113 +   * smallest k such that m^k == 1 (mod 2^32) is large -- so we can safely
   1.114 +   * multiply our hash value by |m| a few times without negating the
   1.115 +   * multiplicative effect.  Our golden ratio constant has order 2^29, which is
   1.116 +   * more than enough for our purposes.)
   1.117 +   */
   1.118 +  return GoldenRatioU32 * (RotateBitsLeft32(hash, 5) ^ value);
   1.119 +}
   1.120 +
   1.121 +/**
   1.122 + * AddUintptrToHash takes sizeof(uintptr_t) as a template parameter.
   1.123 + */
   1.124 +template<size_t PtrSize>
   1.125 +inline uint32_t
   1.126 +AddUintptrToHash(uint32_t hash, uintptr_t value);
   1.127 +
   1.128 +template<>
   1.129 +inline uint32_t
   1.130 +AddUintptrToHash<4>(uint32_t hash, uintptr_t value)
   1.131 +{
   1.132 +  return AddU32ToHash(hash, static_cast<uint32_t>(value));
   1.133 +}
   1.134 +
   1.135 +template<>
   1.136 +inline uint32_t
   1.137 +AddUintptrToHash<8>(uint32_t hash, uintptr_t value)
   1.138 +{
   1.139 +  /*
   1.140 +   * The static cast to uint64_t below is necessary because this function
   1.141 +   * sometimes gets compiled on 32-bit platforms (yes, even though it's a
   1.142 +   * template and we never call this particular override in a 32-bit build).  If
   1.143 +   * we do value >> 32 on a 32-bit machine, we're shifting a 32-bit uintptr_t
   1.144 +   * right 32 bits, and the compiler throws an error.
   1.145 +   */
   1.146 +  uint32_t v1 = static_cast<uint32_t>(value);
   1.147 +  uint32_t v2 = static_cast<uint32_t>(static_cast<uint64_t>(value) >> 32);
   1.148 +  return AddU32ToHash(AddU32ToHash(hash, v1), v2);
   1.149 +}
   1.150 +
   1.151 +} /* namespace detail */
   1.152 +
   1.153 +/**
   1.154 + * AddToHash takes a hash and some values and returns a new hash based on the
   1.155 + * inputs.
   1.156 + *
   1.157 + * Currently, we support hashing uint32_t's, values which we can implicitly
   1.158 + * convert to uint32_t, data pointers, and function pointers.
   1.159 + */
   1.160 +template<typename A>
   1.161 +MOZ_WARN_UNUSED_RESULT
   1.162 +inline uint32_t
   1.163 +AddToHash(uint32_t hash, A a)
   1.164 +{
   1.165 +  /*
   1.166 +   * Try to convert |A| to uint32_t implicitly.  If this works, great.  If not,
   1.167 +   * we'll error out.
   1.168 +   */
   1.169 +  return detail::AddU32ToHash(hash, a);
   1.170 +}
   1.171 +
   1.172 +template<typename A>
   1.173 +MOZ_WARN_UNUSED_RESULT
   1.174 +inline uint32_t
   1.175 +AddToHash(uint32_t hash, A* a)
   1.176 +{
   1.177 +  /*
   1.178 +   * You might think this function should just take a void*.  But then we'd only
   1.179 +   * catch data pointers and couldn't handle function pointers.
   1.180 +   */
   1.181 +
   1.182 +  static_assert(sizeof(a) == sizeof(uintptr_t),
   1.183 +                "Strange pointer!");
   1.184 +
   1.185 +  return detail::AddUintptrToHash<sizeof(uintptr_t)>(hash, uintptr_t(a));
   1.186 +}
   1.187 +
   1.188 +template<>
   1.189 +MOZ_WARN_UNUSED_RESULT
   1.190 +inline uint32_t
   1.191 +AddToHash(uint32_t hash, uintptr_t a)
   1.192 +{
   1.193 +  return detail::AddUintptrToHash<sizeof(uintptr_t)>(hash, a);
   1.194 +}
   1.195 +
   1.196 +template<typename A, typename B>
   1.197 +MOZ_WARN_UNUSED_RESULT
   1.198 +uint32_t
   1.199 +AddToHash(uint32_t hash, A a, B b)
   1.200 +{
   1.201 +  return AddToHash(AddToHash(hash, a), b);
   1.202 +}
   1.203 +
   1.204 +template<typename A, typename B, typename C>
   1.205 +MOZ_WARN_UNUSED_RESULT
   1.206 +uint32_t
   1.207 +AddToHash(uint32_t hash, A a, B b, C c)
   1.208 +{
   1.209 +  return AddToHash(AddToHash(hash, a, b), c);
   1.210 +}
   1.211 +
   1.212 +template<typename A, typename B, typename C, typename D>
   1.213 +MOZ_WARN_UNUSED_RESULT
   1.214 +uint32_t
   1.215 +AddToHash(uint32_t hash, A a, B b, C c, D d)
   1.216 +{
   1.217 +  return AddToHash(AddToHash(hash, a, b, c), d);
   1.218 +}
   1.219 +
   1.220 +template<typename A, typename B, typename C, typename D, typename E>
   1.221 +MOZ_WARN_UNUSED_RESULT
   1.222 +uint32_t
   1.223 +AddToHash(uint32_t hash, A a, B b, C c, D d, E e)
   1.224 +{
   1.225 +  return AddToHash(AddToHash(hash, a, b, c, d), e);
   1.226 +}
   1.227 +
   1.228 +/**
   1.229 + * The HashGeneric class of functions let you hash one or more values.
   1.230 + *
   1.231 + * If you want to hash together two values x and y, calling HashGeneric(x, y) is
   1.232 + * much better than calling AddToHash(x, y), because AddToHash(x, y) assumes
   1.233 + * that x has already been hashed.
   1.234 + */
   1.235 +template<typename A>
   1.236 +MOZ_WARN_UNUSED_RESULT
   1.237 +inline uint32_t
   1.238 +HashGeneric(A a)
   1.239 +{
   1.240 +  return AddToHash(0, a);
   1.241 +}
   1.242 +
   1.243 +template<typename A, typename B>
   1.244 +MOZ_WARN_UNUSED_RESULT
   1.245 +inline uint32_t
   1.246 +HashGeneric(A a, B b)
   1.247 +{
   1.248 +  return AddToHash(0, a, b);
   1.249 +}
   1.250 +
   1.251 +template<typename A, typename B, typename C>
   1.252 +MOZ_WARN_UNUSED_RESULT
   1.253 +inline uint32_t
   1.254 +HashGeneric(A a, B b, C c)
   1.255 +{
   1.256 +  return AddToHash(0, a, b, c);
   1.257 +}
   1.258 +
   1.259 +template<typename A, typename B, typename C, typename D>
   1.260 +MOZ_WARN_UNUSED_RESULT
   1.261 +inline uint32_t
   1.262 +HashGeneric(A a, B b, C c, D d)
   1.263 +{
   1.264 +  return AddToHash(0, a, b, c, d);
   1.265 +}
   1.266 +
   1.267 +template<typename A, typename B, typename C, typename D, typename E>
   1.268 +MOZ_WARN_UNUSED_RESULT
   1.269 +inline uint32_t
   1.270 +HashGeneric(A a, B b, C c, D d, E e)
   1.271 +{
   1.272 +  return AddToHash(0, a, b, c, d, e);
   1.273 +}
   1.274 +
   1.275 +namespace detail {
   1.276 +
   1.277 +template<typename T>
   1.278 +uint32_t
   1.279 +HashUntilZero(const T* str)
   1.280 +{
   1.281 +  uint32_t hash = 0;
   1.282 +  for (T c; (c = *str); str++)
   1.283 +    hash = AddToHash(hash, c);
   1.284 +  return hash;
   1.285 +}
   1.286 +
   1.287 +template<typename T>
   1.288 +uint32_t
   1.289 +HashKnownLength(const T* str, size_t length)
   1.290 +{
   1.291 +  uint32_t hash = 0;
   1.292 +  for (size_t i = 0; i < length; i++)
   1.293 +    hash = AddToHash(hash, str[i]);
   1.294 +  return hash;
   1.295 +}
   1.296 +
   1.297 +} /* namespace detail */
   1.298 +
   1.299 +/**
   1.300 + * The HashString overloads below do just what you'd expect.
   1.301 + *
   1.302 + * If you have the string's length, you might as well call the overload which
   1.303 + * includes the length.  It may be marginally faster.
   1.304 + */
   1.305 +MOZ_WARN_UNUSED_RESULT
   1.306 +inline uint32_t
   1.307 +HashString(const char* str)
   1.308 +{
   1.309 +  return detail::HashUntilZero(str);
   1.310 +}
   1.311 +
   1.312 +MOZ_WARN_UNUSED_RESULT
   1.313 +inline uint32_t
   1.314 +HashString(const char* str, size_t length)
   1.315 +{
   1.316 +  return detail::HashKnownLength(str, length);
   1.317 +}
   1.318 +
   1.319 +MOZ_WARN_UNUSED_RESULT
   1.320 +inline uint32_t
   1.321 +HashString(const uint16_t* str)
   1.322 +{
   1.323 +  return detail::HashUntilZero(str);
   1.324 +}
   1.325 +
   1.326 +MOZ_WARN_UNUSED_RESULT
   1.327 +inline uint32_t
   1.328 +HashString(const uint16_t* str, size_t length)
   1.329 +{
   1.330 +  return detail::HashKnownLength(str, length);
   1.331 +}
   1.332 +
   1.333 +#ifdef MOZ_CHAR16_IS_NOT_WCHAR
   1.334 +MOZ_WARN_UNUSED_RESULT
   1.335 +inline uint32_t
   1.336 +HashString(const char16_t* str)
   1.337 +{
   1.338 +  return detail::HashUntilZero(str);
   1.339 +}
   1.340 +
   1.341 +MOZ_WARN_UNUSED_RESULT
   1.342 +inline uint32_t
   1.343 +HashString(const char16_t* str, size_t length)
   1.344 +{
   1.345 +  return detail::HashKnownLength(str, length);
   1.346 +}
   1.347 +#endif
   1.348 +
   1.349 +/*
   1.350 + * On Windows, wchar_t (char16_t) is not the same as uint16_t, even though it's
   1.351 + * the same width!
   1.352 + */
   1.353 +#ifdef WIN32
   1.354 +MOZ_WARN_UNUSED_RESULT
   1.355 +inline uint32_t
   1.356 +HashString(const wchar_t* str)
   1.357 +{
   1.358 +  return detail::HashUntilZero(str);
   1.359 +}
   1.360 +
   1.361 +MOZ_WARN_UNUSED_RESULT
   1.362 +inline uint32_t
   1.363 +HashString(const wchar_t* str, size_t length)
   1.364 +{
   1.365 +  return detail::HashKnownLength(str, length);
   1.366 +}
   1.367 +#endif
   1.368 +
   1.369 +/**
   1.370 + * Hash some number of bytes.
   1.371 + *
   1.372 + * This hash walks word-by-word, rather than byte-by-byte, so you won't get the
   1.373 + * same result out of HashBytes as you would out of HashString.
   1.374 + */
   1.375 +MOZ_WARN_UNUSED_RESULT
   1.376 +extern MFBT_API uint32_t
   1.377 +HashBytes(const void* bytes, size_t length);
   1.378 +
   1.379 +} /* namespace mozilla */
   1.380 +#endif /* __cplusplus */
   1.381 +
   1.382 +#endif /* mozilla_HashFunctions_h */

mercurial