browser/components/translation/cld2/internal/cldutil_shared.cc

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/browser/components/translation/cld2/internal/cldutil_shared.cc	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,437 @@
     1.4 +// Copyright 2013 Google Inc. All Rights Reserved.
     1.5 +//
     1.6 +// Licensed under the Apache License, Version 2.0 (the "License");
     1.7 +// you may not use this file except in compliance with the License.
     1.8 +// You may obtain a copy of the License at
     1.9 +//
    1.10 +//     http://www.apache.org/licenses/LICENSE-2.0
    1.11 +//
    1.12 +// Unless required by applicable law or agreed to in writing, software
    1.13 +// distributed under the License is distributed on an "AS IS" BASIS,
    1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    1.15 +// See the License for the specific language governing permissions and
    1.16 +// limitations under the License.
    1.17 +
    1.18 +//
    1.19 +// Author: dsites@google.com (Dick Sites)
    1.20 +//
    1.21 +
    1.22 +#include "cldutil_shared.h"
    1.23 +#include <string>
    1.24 +
    1.25 +#include "cld2tablesummary.h"
    1.26 +#include "integral_types.h"
    1.27 +#include "port.h"
    1.28 +#include "utf8statetable.h"
    1.29 +
    1.30 +namespace CLD2 {
    1.31 +
    1.32 +// Runtime routines for hashing, looking up, and scoring
    1.33 +// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
    1.34 +// Unigrams and bigrams are for CJK languages only, including simplified/
    1.35 +// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
    1.36 +// Zhuang Han characters. Surrounding spaces are not considered.
    1.37 +// Quadgrams and octagrams for for non-CJK and include two bits indicating
    1.38 +// preceding and trailing spaces (word boundaries).
    1.39 +
    1.40 +
    1.41 +// Indicator bits for leading/trailing space around quad/octagram
    1.42 +// NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of
    1.43 +// 1-, 2-, or 3-bytes each.
    1.44 +static const uint32 kPreSpaceIndicator =  0x00004444;
    1.45 +static const uint32 kPostSpaceIndicator = 0x44440000;
    1.46 +
    1.47 +// Little-endian masks for 0..24 bytes picked up as uint32's
    1.48 +static const uint32 kWordMask0[4] = {
    1.49 +  0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF
    1.50 +};
    1.51 +
    1.52 +static const int kMinCJKUTF8CharBytes = 3;
    1.53 +
    1.54 +static const int kMinGramCount = 3;
    1.55 +static const int kMaxGramCount = 16;
    1.56 +
    1.57 +static const int UTFmax = 4;        // Max number of bytes in a UTF-8 character
    1.58 +
    1.59 +
    1.60 +// Routines to access a hash table of <key:wordhash, value:probs> pairs
    1.61 +// Buckets have 4-byte wordhash for sizes < 32K buckets, but only
    1.62 +// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
    1.63 +// bucket subscript.
    1.64 +// Probs is a packed: three languages plus a subscript for probability table
    1.65 +// Buckets have all the keys together, then all the values.Key array never
    1.66 +// crosses a cache-line boundary, so no-match case takes exactly one cache miss.
    1.67 +// Match case may sometimes take an additional cache miss on value access.
    1.68 +//
    1.69 +// Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64
    1.70 +// byte buckets with single cache miss.
    1.71 +// Or 2-byte key and 6-byte value, allowing 5 languages instead  of three.
    1.72 +
    1.73 +
    1.74 +//----------------------------------------------------------------------------//
    1.75 +// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores      //
    1.76 +//----------------------------------------------------------------------------//
    1.77 +
    1.78 +// Design principles for these hash functions
    1.79 +// - Few operations
    1.80 +// - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in
    1.81 +//   Latin script expect 1- and 2-byte mixtures.
    1.82 +// - Last byte of each character has about 5 bits of information
    1.83 +// - Spread good bits around so they can interact in at least two ways
    1.84 +//   with other characters
    1.85 +// - Use add for additional mixing thorugh carries
    1.86 +
    1.87 +// CJK Three-byte bigram
    1.88 +//   ....dddd..cccccc..bbbbbb....aaaa
    1.89 +//   ..................ffffff..eeeeee
    1.90 +// make
    1.91 +//   ....dddd..cccccc..bbbbbb....aaaa
    1.92 +//   000....dddd..cccccc..bbbbbb....a
    1.93 +//   ..................ffffff..eeeeee
    1.94 +//   ffffff..eeeeee000000000000000000
    1.95 +//
    1.96 +// CJK Four-byte bigram
    1.97 +//   ..dddddd..cccccc....bbbb....aaaa
    1.98 +//   ..hhhhhh..gggggg....ffff....eeee
    1.99 +// make
   1.100 +//   ..dddddd..cccccc....bbbb....aaaa
   1.101 +//   000..dddddd..cccccc....bbbb....a
   1.102 +//   ..hhhhhh..gggggg....ffff....eeee
   1.103 +//   ..ffff....eeee000000000000000000
   1.104 +
   1.105 +// BIGRAM
   1.106 +// Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
   1.107 +// OVERSHOOTS up to 3 bytes
   1.108 +// For runtime use of tables
   1.109 +// Does X86 unaligned loads
   1.110 +uint32 BiHashV2(const char* word_ptr, int bytecount) {
   1.111 +  if (bytecount == 0) {return 0;}
   1.112 +  const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
   1.113 +  uint32 word0, word1;
   1.114 +  if (bytecount <= 4) {
   1.115 +    word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3];
   1.116 +    word0 = word0 ^ (word0 >> 3);
   1.117 +    return word0;
   1.118 +  }
   1.119 +  // Else do 8 bytes
   1.120 +  word0 = UNALIGNED_LOAD32(word_ptr32);
   1.121 +  word0 = word0 ^ (word0 >> 3);
   1.122 +  word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3];
   1.123 +  word1 = word1 ^ (word1 << 18);
   1.124 +  return word0 + word1;
   1.125 +}
   1.126 +
   1.127 +//
   1.128 +// Ascii-7 One-byte chars
   1.129 +//   ...ddddd...ccccc...bbbbb...aaaaa
   1.130 +// make
   1.131 +//   ...ddddd...ccccc...bbbbb...aaaaa
   1.132 +//   000...ddddd...ccccc...bbbbb...aa
   1.133 +//
   1.134 +// Latin 1- and 2-byte chars
   1.135 +//   ...ddddd...ccccc...bbbbb...aaaaa
   1.136 +//   ...................fffff...eeeee
   1.137 +// make
   1.138 +//   ...ddddd...ccccc...bbbbb...aaaaa
   1.139 +//   000...ddddd...ccccc...bbbbb...aa
   1.140 +//   ...................fffff...eeeee
   1.141 +//   ...............fffff...eeeee0000
   1.142 +//
   1.143 +// Non-CJK Two-byte chars
   1.144 +//   ...ddddd...........bbbbb........
   1.145 +//   ...hhhhh...........fffff........
   1.146 +// make
   1.147 +//   ...ddddd...........bbbbb........
   1.148 +//   000...ddddd...........bbbbb.....
   1.149 +//   ...hhhhh...........fffff........
   1.150 +//   hhhh...........fffff........0000
   1.151 +//
   1.152 +// Non-CJK Three-byte chars
   1.153 +//   ...........ccccc................
   1.154 +//   ...................fffff........
   1.155 +//   ...lllll...................iiiii
   1.156 +// make
   1.157 +//   ...........ccccc................
   1.158 +//   000...........ccccc.............
   1.159 +//   ...................fffff........
   1.160 +//   ...............fffff........0000
   1.161 +//   ...lllll...................iiiii
   1.162 +//   .lllll...................iiiii00
   1.163 +//
   1.164 +
   1.165 +// QUADGRAM
   1.166 +// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
   1.167 +// OVERSHOOTS up to 3 bytes
   1.168 +// For runtime use of tables
   1.169 +// Does X86 unaligned loads
   1.170 +uint32 QuadHashV2Mix(const char* word_ptr, int bytecount, uint32 prepost) {
   1.171 +  const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
   1.172 +  uint32 word0, word1, word2;
   1.173 +  if (bytecount <= 4) {
   1.174 +    word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3];
   1.175 +    word0 = word0 ^ (word0 >> 3);
   1.176 +    return word0 ^ prepost;
   1.177 +  } else if (bytecount <= 8) {
   1.178 +    word0 = UNALIGNED_LOAD32(word_ptr32);
   1.179 +    word0 = word0 ^ (word0 >> 3);
   1.180 +    word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3];
   1.181 +    word1 = word1 ^ (word1 << 4);
   1.182 +    return (word0 ^ prepost) + word1;
   1.183 +  }
   1.184 +  // else do 12 bytes
   1.185 +  word0 = UNALIGNED_LOAD32(word_ptr32);
   1.186 +  word0 = word0 ^ (word0 >> 3);
   1.187 +  word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
   1.188 +  word1 = word1 ^ (word1 << 4);
   1.189 +  word2 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3];
   1.190 +  word2 = word2 ^ (word2 << 2);
   1.191 +  return (word0 ^ prepost) + word1 + word2;
   1.192 +}
   1.193 +
   1.194 +
   1.195 +// QUADGRAM wrapper with surrounding spaces
   1.196 +// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
   1.197 +// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
   1.198 +// For runtime use of tables
   1.199 +uint32 QuadHashV2(const char* word_ptr, int bytecount) {
   1.200 +  if (bytecount == 0) {return 0;}
   1.201 +  uint32 prepost = 0;
   1.202 +  if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
   1.203 +  if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
   1.204 +  return QuadHashV2Mix(word_ptr, bytecount, prepost);
   1.205 +}
   1.206 +
   1.207 +// QUADGRAM wrapper with surrounding underscores (offline use)
   1.208 +// Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
   1.209 +// OVERSHOOTS up to 3 bytes
   1.210 +// For offline construction of tables
   1.211 +uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount) {
   1.212 +  if (bytecount == 0) {return 0;}
   1.213 +  const char* local_word_ptr = word_ptr;
   1.214 +  int local_bytecount = bytecount;
   1.215 +  uint32 prepost = 0;
   1.216 +  if (local_word_ptr[0] == '_') {
   1.217 +    prepost |= kPreSpaceIndicator;
   1.218 +    ++local_word_ptr;
   1.219 +    --local_bytecount;
   1.220 +  }
   1.221 +  if (local_word_ptr[local_bytecount - 1] == '_') {
   1.222 +    prepost |= kPostSpaceIndicator;
   1.223 +    --local_bytecount;
   1.224 +  }
   1.225 +  return QuadHashV2Mix(local_word_ptr, local_bytecount, prepost);
   1.226 +}
   1.227 +
   1.228 +
   1.229 +// OCTAGRAM
   1.230 +// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
   1.231 +// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
   1.232 +//
   1.233 +// The low 32 bits follow the pattern from above, tuned to different scripts
   1.234 +// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
   1.235 +// For runtime use of tables V3
   1.236 +// Does X86 unaligned loads
   1.237 +uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) {
   1.238 +  const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
   1.239 +  uint64 word0;
   1.240 +  uint64 word1;
   1.241 +  uint64 sum;
   1.242 +
   1.243 +  if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
   1.244 +  if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
   1.245 +  switch ((bytecount - 1) >> 2) {
   1.246 +  case 0:       // 1..4 bytes
   1.247 +    word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3];
   1.248 +    sum = word0;
   1.249 +    word0 = word0 ^ (word0 >> 3);
   1.250 +    break;
   1.251 +  case 1:       // 5..8 bytes
   1.252 +    word0 = UNALIGNED_LOAD32(word_ptr32);
   1.253 +    sum = word0;
   1.254 +    word0 = word0 ^ (word0 >> 3);
   1.255 +    word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3];
   1.256 +    sum += word1;
   1.257 +    word1 = word1 ^ (word1 << 4);
   1.258 +    word0 += word1;
   1.259 +    break;
   1.260 +  case 2:       // 9..12 bytes
   1.261 +    word0 = UNALIGNED_LOAD32(word_ptr32);
   1.262 +    sum = word0;
   1.263 +    word0 = word0 ^ (word0 >> 3);
   1.264 +    word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
   1.265 +    sum += word1;
   1.266 +    word1 = word1 ^ (word1 << 4);
   1.267 +    word0 += word1;
   1.268 +    word1 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3];
   1.269 +    sum += word1;
   1.270 +    word1 = word1 ^ (word1 << 2);
   1.271 +    word0 += word1;
   1.272 +    break;
   1.273 +  case 3:       // 13..16 bytes
   1.274 +    word0 =UNALIGNED_LOAD32(word_ptr32);
   1.275 +    sum = word0;
   1.276 +    word0 = word0 ^ (word0 >> 3);
   1.277 +    word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
   1.278 +    sum += word1;
   1.279 +    word1 = word1 ^ (word1 << 4);
   1.280 +    word0 += word1;
   1.281 +    word1 = UNALIGNED_LOAD32(word_ptr32 + 2);
   1.282 +    sum += word1;
   1.283 +    word1 = word1 ^ (word1 << 2);
   1.284 +    word0 += word1;
   1.285 +    word1 = UNALIGNED_LOAD32(word_ptr32 + 3) & kWordMask0[bytecount & 3];
   1.286 +    sum += word1;
   1.287 +    word1 = word1 ^ (word1 >> 8);
   1.288 +    word0 += word1;
   1.289 +    break;
   1.290 +  case 4:       // 17..20 bytes
   1.291 +    word0 = UNALIGNED_LOAD32(word_ptr32);
   1.292 +    sum = word0;
   1.293 +    word0 = word0 ^ (word0 >> 3);
   1.294 +    word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
   1.295 +    sum += word1;
   1.296 +    word1 = word1 ^ (word1 << 4);
   1.297 +    word0 += word1;
   1.298 +    word1 = UNALIGNED_LOAD32(word_ptr32 + 2);
   1.299 +    sum += word1;
   1.300 +    word1 = word1 ^ (word1 << 2);
   1.301 +    word0 += word1;
   1.302 +    word1 = UNALIGNED_LOAD32(word_ptr32 + 3);
   1.303 +    sum += word1;
   1.304 +    word1 = word1 ^ (word1 >> 8);
   1.305 +    word0 += word1;
   1.306 +    word1 = UNALIGNED_LOAD32(word_ptr32 + 4) & kWordMask0[bytecount & 3];
   1.307 +    sum += word1;
   1.308 +    word1 = word1 ^ (word1 >> 4);
   1.309 +    word0 += word1;
   1.310 +    break;
   1.311 +  default:      // 21..24 bytes and higher (ignores beyond 24)
   1.312 +    word0 = UNALIGNED_LOAD32(word_ptr32);
   1.313 +    sum = word0;
   1.314 +    word0 = word0 ^ (word0 >> 3);
   1.315 +    word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
   1.316 +    sum += word1;
   1.317 +    word1 = word1 ^ (word1 << 4);
   1.318 +    word0 += word1;
   1.319 +    word1 = UNALIGNED_LOAD32(word_ptr32 + 2);
   1.320 +    sum += word1;
   1.321 +    word1 = word1 ^ (word1 << 2);
   1.322 +    word0 += word1;
   1.323 +    word1 = UNALIGNED_LOAD32(word_ptr32 + 3);
   1.324 +    sum += word1;
   1.325 +    word1 = word1 ^ (word1 >> 8);
   1.326 +    word0 += word1;
   1.327 +    word1 = UNALIGNED_LOAD32(word_ptr32 + 4);
   1.328 +    sum += word1;
   1.329 +    word1 = word1 ^ (word1 >> 4);
   1.330 +    word0 += word1;
   1.331 +    word1 = UNALIGNED_LOAD32(word_ptr32 + 5) & kWordMask0[bytecount & 3];
   1.332 +    sum += word1;
   1.333 +    word1 = word1 ^ (word1 >> 6);
   1.334 +    word0 += word1;
   1.335 +    break;
   1.336 +  }
   1.337 +
   1.338 +  sum += (sum >> 17);             // extra 1-bit shift for bytes 2 & 3
   1.339 +  sum += (sum >> 9);              // extra 1-bit shift for bytes 1 & 3
   1.340 +  sum = (sum & 0xff) << 32;
   1.341 +  return (word0 ^ prepost) + sum;
   1.342 +}
   1.343 +
   1.344 +// OCTAGRAM wrapper with surrounding spaces
   1.345 +// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
   1.346 +// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
   1.347 +//
   1.348 +// The low 32 bits follow the pattern from above, tuned to different scripts
   1.349 +// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
   1.350 +// For runtime use of tables V3
   1.351 +uint64 OctaHash40(const char* word_ptr, int bytecount) {
   1.352 +  if (bytecount == 0) {return 0;}
   1.353 +  uint64 prepost = 0;
   1.354 +  if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
   1.355 +  if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
   1.356 +  return OctaHash40Mix(word_ptr, bytecount, prepost);
   1.357 +}
   1.358 +
   1.359 +
   1.360 +// OCTAGRAM wrapper with surrounding underscores (offline use)
   1.361 +// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
   1.362 +// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
   1.363 +//
   1.364 +// The low 32 bits follow the pattern from above, tuned to different scripts
   1.365 +// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
   1.366 +// For offline construction of tables
   1.367 +uint64 OctaHash40underscore(const char* word_ptr, int bytecount) {
   1.368 +  if (bytecount == 0) {return 0;}
   1.369 +  const char* local_word_ptr = word_ptr;
   1.370 +  int local_bytecount = bytecount;
   1.371 +  uint64 prepost = 0;
   1.372 +  if (local_word_ptr[0] == '_') {
   1.373 +    prepost |= kPreSpaceIndicator;
   1.374 +    ++local_word_ptr;
   1.375 +    --local_bytecount;
   1.376 +  }
   1.377 +  if (local_word_ptr[local_bytecount - 1] == '_') {
   1.378 +    prepost |= kPostSpaceIndicator;
   1.379 +    --local_bytecount;
   1.380 +  }
   1.381 +  return OctaHash40Mix(local_word_ptr, local_bytecount, prepost);
   1.382 +}
   1.383 +
   1.384 +// Hash a consecutive pair of tokens/words A B
   1.385 +// Old: hash is B - A, which gives too many false hits on one-char diffs
   1.386 +// Now: rotate(A,13) + B
   1.387 +uint64 PairHash(uint64 worda_hash, uint64 wordb_hash) {
   1.388 +   return ((worda_hash >> 13) | (worda_hash << (64 - 13))) + wordb_hash;
   1.389 +}
   1.390 +
   1.391 +
   1.392 +
   1.393 +
   1.394 +//----------------------------------------------------------------------------//
   1.395 +// Finding groups of 1/2/4/8 letters                                          //
   1.396 +//----------------------------------------------------------------------------//
   1.397 +
   1.398 +// src points to a letter. Find the byte length of a unigram starting there.
   1.399 +int UniLen(const char* src) {
   1.400 +  const char* src_end = src;
   1.401 +  src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
   1.402 +  return src_end - src;
   1.403 +}
   1.404 +
   1.405 +// src points to a letter. Find the byte length of a bigram starting there.
   1.406 +int BiLen(const char* src) {
   1.407 +  const char* src_end = src;
   1.408 +  src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
   1.409 +  src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
   1.410 +  return src_end - src;
   1.411 +}
   1.412 +
   1.413 +// src points to a letter. Find the byte length of a quadgram starting there.
   1.414 +int QuadLen(const char* src) {
   1.415 +  const char* src_end = src;
   1.416 +  src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
   1.417 +  src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
   1.418 +  src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
   1.419 +  src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
   1.420 +  return src_end - src;
   1.421 +}
   1.422 +
   1.423 +// src points to a letter. Find the byte length of an octagram starting there.
   1.424 +int OctaLen(const char* src) {
   1.425 +  const char* src_end = src;
   1.426 +  int charcount = 0;
   1.427 +  while (src_end[0] != ' ') {
   1.428 +    src_end += UTF8OneCharLen(src);
   1.429 +    ++charcount;
   1.430 +    if (charcount == 8) {break;}
   1.431 +  }
   1.432 +  return src_end - src;
   1.433 +}
   1.434 +
   1.435 +}       // End namespace CLD2
   1.436 +
   1.437 +
   1.438 +
   1.439 +
   1.440 +

mercurial