1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/browser/components/translation/cld2/internal/cldutil_shared.cc Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,437 @@ 1.4 +// Copyright 2013 Google Inc. All Rights Reserved. 1.5 +// 1.6 +// Licensed under the Apache License, Version 2.0 (the "License"); 1.7 +// you may not use this file except in compliance with the License. 1.8 +// You may obtain a copy of the License at 1.9 +// 1.10 +// http://www.apache.org/licenses/LICENSE-2.0 1.11 +// 1.12 +// Unless required by applicable law or agreed to in writing, software 1.13 +// distributed under the License is distributed on an "AS IS" BASIS, 1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1.15 +// See the License for the specific language governing permissions and 1.16 +// limitations under the License. 1.17 + 1.18 +// 1.19 +// Author: dsites@google.com (Dick Sites) 1.20 +// 1.21 + 1.22 +#include "cldutil_shared.h" 1.23 +#include <string> 1.24 + 1.25 +#include "cld2tablesummary.h" 1.26 +#include "integral_types.h" 1.27 +#include "port.h" 1.28 +#include "utf8statetable.h" 1.29 + 1.30 +namespace CLD2 { 1.31 + 1.32 +// Runtime routines for hashing, looking up, and scoring 1.33 +// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams. 1.34 +// Unigrams and bigrams are for CJK languages only, including simplified/ 1.35 +// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and 1.36 +// Zhuang Han characters. Surrounding spaces are not considered. 1.37 +// Quadgrams and octagrams for for non-CJK and include two bits indicating 1.38 +// preceding and trailing spaces (word boundaries). 1.39 + 1.40 + 1.41 +// Indicator bits for leading/trailing space around quad/octagram 1.42 +// NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of 1.43 +// 1-, 2-, or 3-bytes each. 1.44 +static const uint32 kPreSpaceIndicator = 0x00004444; 1.45 +static const uint32 kPostSpaceIndicator = 0x44440000; 1.46 + 1.47 +// Little-endian masks for 0..24 bytes picked up as uint32's 1.48 +static const uint32 kWordMask0[4] = { 1.49 + 0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF 1.50 +}; 1.51 + 1.52 +static const int kMinCJKUTF8CharBytes = 3; 1.53 + 1.54 +static const int kMinGramCount = 3; 1.55 +static const int kMaxGramCount = 16; 1.56 + 1.57 +static const int UTFmax = 4; // Max number of bytes in a UTF-8 character 1.58 + 1.59 + 1.60 +// Routines to access a hash table of <key:wordhash, value:probs> pairs 1.61 +// Buckets have 4-byte wordhash for sizes < 32K buckets, but only 1.62 +// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as 1.63 +// bucket subscript. 1.64 +// Probs is a packed: three languages plus a subscript for probability table 1.65 +// Buckets have all the keys together, then all the values.Key array never 1.66 +// crosses a cache-line boundary, so no-match case takes exactly one cache miss. 1.67 +// Match case may sometimes take an additional cache miss on value access. 1.68 +// 1.69 +// Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64 1.70 +// byte buckets with single cache miss. 1.71 +// Or 2-byte key and 6-byte value, allowing 5 languages instead of three. 1.72 + 1.73 + 1.74 +//----------------------------------------------------------------------------// 1.75 +// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores // 1.76 +//----------------------------------------------------------------------------// 1.77 + 1.78 +// Design principles for these hash functions 1.79 +// - Few operations 1.80 +// - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in 1.81 +// Latin script expect 1- and 2-byte mixtures. 1.82 +// - Last byte of each character has about 5 bits of information 1.83 +// - Spread good bits around so they can interact in at least two ways 1.84 +// with other characters 1.85 +// - Use add for additional mixing thorugh carries 1.86 + 1.87 +// CJK Three-byte bigram 1.88 +// ....dddd..cccccc..bbbbbb....aaaa 1.89 +// ..................ffffff..eeeeee 1.90 +// make 1.91 +// ....dddd..cccccc..bbbbbb....aaaa 1.92 +// 000....dddd..cccccc..bbbbbb....a 1.93 +// ..................ffffff..eeeeee 1.94 +// ffffff..eeeeee000000000000000000 1.95 +// 1.96 +// CJK Four-byte bigram 1.97 +// ..dddddd..cccccc....bbbb....aaaa 1.98 +// ..hhhhhh..gggggg....ffff....eeee 1.99 +// make 1.100 +// ..dddddd..cccccc....bbbb....aaaa 1.101 +// 000..dddddd..cccccc....bbbb....a 1.102 +// ..hhhhhh..gggggg....ffff....eeee 1.103 +// ..ffff....eeee000000000000000000 1.104 + 1.105 +// BIGRAM 1.106 +// Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post 1.107 +// OVERSHOOTS up to 3 bytes 1.108 +// For runtime use of tables 1.109 +// Does X86 unaligned loads 1.110 +uint32 BiHashV2(const char* word_ptr, int bytecount) { 1.111 + if (bytecount == 0) {return 0;} 1.112 + const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr); 1.113 + uint32 word0, word1; 1.114 + if (bytecount <= 4) { 1.115 + word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3]; 1.116 + word0 = word0 ^ (word0 >> 3); 1.117 + return word0; 1.118 + } 1.119 + // Else do 8 bytes 1.120 + word0 = UNALIGNED_LOAD32(word_ptr32); 1.121 + word0 = word0 ^ (word0 >> 3); 1.122 + word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3]; 1.123 + word1 = word1 ^ (word1 << 18); 1.124 + return word0 + word1; 1.125 +} 1.126 + 1.127 +// 1.128 +// Ascii-7 One-byte chars 1.129 +// ...ddddd...ccccc...bbbbb...aaaaa 1.130 +// make 1.131 +// ...ddddd...ccccc...bbbbb...aaaaa 1.132 +// 000...ddddd...ccccc...bbbbb...aa 1.133 +// 1.134 +// Latin 1- and 2-byte chars 1.135 +// ...ddddd...ccccc...bbbbb...aaaaa 1.136 +// ...................fffff...eeeee 1.137 +// make 1.138 +// ...ddddd...ccccc...bbbbb...aaaaa 1.139 +// 000...ddddd...ccccc...bbbbb...aa 1.140 +// ...................fffff...eeeee 1.141 +// ...............fffff...eeeee0000 1.142 +// 1.143 +// Non-CJK Two-byte chars 1.144 +// ...ddddd...........bbbbb........ 1.145 +// ...hhhhh...........fffff........ 1.146 +// make 1.147 +// ...ddddd...........bbbbb........ 1.148 +// 000...ddddd...........bbbbb..... 1.149 +// ...hhhhh...........fffff........ 1.150 +// hhhh...........fffff........0000 1.151 +// 1.152 +// Non-CJK Three-byte chars 1.153 +// ...........ccccc................ 1.154 +// ...................fffff........ 1.155 +// ...lllll...................iiiii 1.156 +// make 1.157 +// ...........ccccc................ 1.158 +// 000...........ccccc............. 1.159 +// ...................fffff........ 1.160 +// ...............fffff........0000 1.161 +// ...lllll...................iiiii 1.162 +// .lllll...................iiiii00 1.163 +// 1.164 + 1.165 +// QUADGRAM 1.166 +// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add 1.167 +// OVERSHOOTS up to 3 bytes 1.168 +// For runtime use of tables 1.169 +// Does X86 unaligned loads 1.170 +uint32 QuadHashV2Mix(const char* word_ptr, int bytecount, uint32 prepost) { 1.171 + const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr); 1.172 + uint32 word0, word1, word2; 1.173 + if (bytecount <= 4) { 1.174 + word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3]; 1.175 + word0 = word0 ^ (word0 >> 3); 1.176 + return word0 ^ prepost; 1.177 + } else if (bytecount <= 8) { 1.178 + word0 = UNALIGNED_LOAD32(word_ptr32); 1.179 + word0 = word0 ^ (word0 >> 3); 1.180 + word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3]; 1.181 + word1 = word1 ^ (word1 << 4); 1.182 + return (word0 ^ prepost) + word1; 1.183 + } 1.184 + // else do 12 bytes 1.185 + word0 = UNALIGNED_LOAD32(word_ptr32); 1.186 + word0 = word0 ^ (word0 >> 3); 1.187 + word1 = UNALIGNED_LOAD32(word_ptr32 + 1); 1.188 + word1 = word1 ^ (word1 << 4); 1.189 + word2 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3]; 1.190 + word2 = word2 ^ (word2 << 2); 1.191 + return (word0 ^ prepost) + word1 + word2; 1.192 +} 1.193 + 1.194 + 1.195 +// QUADGRAM wrapper with surrounding spaces 1.196 +// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add 1.197 +// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes 1.198 +// For runtime use of tables 1.199 +uint32 QuadHashV2(const char* word_ptr, int bytecount) { 1.200 + if (bytecount == 0) {return 0;} 1.201 + uint32 prepost = 0; 1.202 + if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;} 1.203 + if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;} 1.204 + return QuadHashV2Mix(word_ptr, bytecount, prepost); 1.205 +} 1.206 + 1.207 +// QUADGRAM wrapper with surrounding underscores (offline use) 1.208 +// Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add 1.209 +// OVERSHOOTS up to 3 bytes 1.210 +// For offline construction of tables 1.211 +uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount) { 1.212 + if (bytecount == 0) {return 0;} 1.213 + const char* local_word_ptr = word_ptr; 1.214 + int local_bytecount = bytecount; 1.215 + uint32 prepost = 0; 1.216 + if (local_word_ptr[0] == '_') { 1.217 + prepost |= kPreSpaceIndicator; 1.218 + ++local_word_ptr; 1.219 + --local_bytecount; 1.220 + } 1.221 + if (local_word_ptr[local_bytecount - 1] == '_') { 1.222 + prepost |= kPostSpaceIndicator; 1.223 + --local_bytecount; 1.224 + } 1.225 + return QuadHashV2Mix(local_word_ptr, local_bytecount, prepost); 1.226 +} 1.227 + 1.228 + 1.229 +// OCTAGRAM 1.230 +// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add 1.231 +// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes 1.232 +// 1.233 +// The low 32 bits follow the pattern from above, tuned to different scripts 1.234 +// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each 1.235 +// For runtime use of tables V3 1.236 +// Does X86 unaligned loads 1.237 +uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) { 1.238 + const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr); 1.239 + uint64 word0; 1.240 + uint64 word1; 1.241 + uint64 sum; 1.242 + 1.243 + if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;} 1.244 + if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;} 1.245 + switch ((bytecount - 1) >> 2) { 1.246 + case 0: // 1..4 bytes 1.247 + word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3]; 1.248 + sum = word0; 1.249 + word0 = word0 ^ (word0 >> 3); 1.250 + break; 1.251 + case 1: // 5..8 bytes 1.252 + word0 = UNALIGNED_LOAD32(word_ptr32); 1.253 + sum = word0; 1.254 + word0 = word0 ^ (word0 >> 3); 1.255 + word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3]; 1.256 + sum += word1; 1.257 + word1 = word1 ^ (word1 << 4); 1.258 + word0 += word1; 1.259 + break; 1.260 + case 2: // 9..12 bytes 1.261 + word0 = UNALIGNED_LOAD32(word_ptr32); 1.262 + sum = word0; 1.263 + word0 = word0 ^ (word0 >> 3); 1.264 + word1 = UNALIGNED_LOAD32(word_ptr32 + 1); 1.265 + sum += word1; 1.266 + word1 = word1 ^ (word1 << 4); 1.267 + word0 += word1; 1.268 + word1 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3]; 1.269 + sum += word1; 1.270 + word1 = word1 ^ (word1 << 2); 1.271 + word0 += word1; 1.272 + break; 1.273 + case 3: // 13..16 bytes 1.274 + word0 =UNALIGNED_LOAD32(word_ptr32); 1.275 + sum = word0; 1.276 + word0 = word0 ^ (word0 >> 3); 1.277 + word1 = UNALIGNED_LOAD32(word_ptr32 + 1); 1.278 + sum += word1; 1.279 + word1 = word1 ^ (word1 << 4); 1.280 + word0 += word1; 1.281 + word1 = UNALIGNED_LOAD32(word_ptr32 + 2); 1.282 + sum += word1; 1.283 + word1 = word1 ^ (word1 << 2); 1.284 + word0 += word1; 1.285 + word1 = UNALIGNED_LOAD32(word_ptr32 + 3) & kWordMask0[bytecount & 3]; 1.286 + sum += word1; 1.287 + word1 = word1 ^ (word1 >> 8); 1.288 + word0 += word1; 1.289 + break; 1.290 + case 4: // 17..20 bytes 1.291 + word0 = UNALIGNED_LOAD32(word_ptr32); 1.292 + sum = word0; 1.293 + word0 = word0 ^ (word0 >> 3); 1.294 + word1 = UNALIGNED_LOAD32(word_ptr32 + 1); 1.295 + sum += word1; 1.296 + word1 = word1 ^ (word1 << 4); 1.297 + word0 += word1; 1.298 + word1 = UNALIGNED_LOAD32(word_ptr32 + 2); 1.299 + sum += word1; 1.300 + word1 = word1 ^ (word1 << 2); 1.301 + word0 += word1; 1.302 + word1 = UNALIGNED_LOAD32(word_ptr32 + 3); 1.303 + sum += word1; 1.304 + word1 = word1 ^ (word1 >> 8); 1.305 + word0 += word1; 1.306 + word1 = UNALIGNED_LOAD32(word_ptr32 + 4) & kWordMask0[bytecount & 3]; 1.307 + sum += word1; 1.308 + word1 = word1 ^ (word1 >> 4); 1.309 + word0 += word1; 1.310 + break; 1.311 + default: // 21..24 bytes and higher (ignores beyond 24) 1.312 + word0 = UNALIGNED_LOAD32(word_ptr32); 1.313 + sum = word0; 1.314 + word0 = word0 ^ (word0 >> 3); 1.315 + word1 = UNALIGNED_LOAD32(word_ptr32 + 1); 1.316 + sum += word1; 1.317 + word1 = word1 ^ (word1 << 4); 1.318 + word0 += word1; 1.319 + word1 = UNALIGNED_LOAD32(word_ptr32 + 2); 1.320 + sum += word1; 1.321 + word1 = word1 ^ (word1 << 2); 1.322 + word0 += word1; 1.323 + word1 = UNALIGNED_LOAD32(word_ptr32 + 3); 1.324 + sum += word1; 1.325 + word1 = word1 ^ (word1 >> 8); 1.326 + word0 += word1; 1.327 + word1 = UNALIGNED_LOAD32(word_ptr32 + 4); 1.328 + sum += word1; 1.329 + word1 = word1 ^ (word1 >> 4); 1.330 + word0 += word1; 1.331 + word1 = UNALIGNED_LOAD32(word_ptr32 + 5) & kWordMask0[bytecount & 3]; 1.332 + sum += word1; 1.333 + word1 = word1 ^ (word1 >> 6); 1.334 + word0 += word1; 1.335 + break; 1.336 + } 1.337 + 1.338 + sum += (sum >> 17); // extra 1-bit shift for bytes 2 & 3 1.339 + sum += (sum >> 9); // extra 1-bit shift for bytes 1 & 3 1.340 + sum = (sum & 0xff) << 32; 1.341 + return (word0 ^ prepost) + sum; 1.342 +} 1.343 + 1.344 +// OCTAGRAM wrapper with surrounding spaces 1.345 +// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add 1.346 +// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes 1.347 +// 1.348 +// The low 32 bits follow the pattern from above, tuned to different scripts 1.349 +// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each 1.350 +// For runtime use of tables V3 1.351 +uint64 OctaHash40(const char* word_ptr, int bytecount) { 1.352 + if (bytecount == 0) {return 0;} 1.353 + uint64 prepost = 0; 1.354 + if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;} 1.355 + if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;} 1.356 + return OctaHash40Mix(word_ptr, bytecount, prepost); 1.357 +} 1.358 + 1.359 + 1.360 +// OCTAGRAM wrapper with surrounding underscores (offline use) 1.361 +// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add 1.362 +// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes 1.363 +// 1.364 +// The low 32 bits follow the pattern from above, tuned to different scripts 1.365 +// The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each 1.366 +// For offline construction of tables 1.367 +uint64 OctaHash40underscore(const char* word_ptr, int bytecount) { 1.368 + if (bytecount == 0) {return 0;} 1.369 + const char* local_word_ptr = word_ptr; 1.370 + int local_bytecount = bytecount; 1.371 + uint64 prepost = 0; 1.372 + if (local_word_ptr[0] == '_') { 1.373 + prepost |= kPreSpaceIndicator; 1.374 + ++local_word_ptr; 1.375 + --local_bytecount; 1.376 + } 1.377 + if (local_word_ptr[local_bytecount - 1] == '_') { 1.378 + prepost |= kPostSpaceIndicator; 1.379 + --local_bytecount; 1.380 + } 1.381 + return OctaHash40Mix(local_word_ptr, local_bytecount, prepost); 1.382 +} 1.383 + 1.384 +// Hash a consecutive pair of tokens/words A B 1.385 +// Old: hash is B - A, which gives too many false hits on one-char diffs 1.386 +// Now: rotate(A,13) + B 1.387 +uint64 PairHash(uint64 worda_hash, uint64 wordb_hash) { 1.388 + return ((worda_hash >> 13) | (worda_hash << (64 - 13))) + wordb_hash; 1.389 +} 1.390 + 1.391 + 1.392 + 1.393 + 1.394 +//----------------------------------------------------------------------------// 1.395 +// Finding groups of 1/2/4/8 letters // 1.396 +//----------------------------------------------------------------------------// 1.397 + 1.398 +// src points to a letter. Find the byte length of a unigram starting there. 1.399 +int UniLen(const char* src) { 1.400 + const char* src_end = src; 1.401 + src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; 1.402 + return src_end - src; 1.403 +} 1.404 + 1.405 +// src points to a letter. Find the byte length of a bigram starting there. 1.406 +int BiLen(const char* src) { 1.407 + const char* src_end = src; 1.408 + src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; 1.409 + src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; 1.410 + return src_end - src; 1.411 +} 1.412 + 1.413 +// src points to a letter. Find the byte length of a quadgram starting there. 1.414 +int QuadLen(const char* src) { 1.415 + const char* src_end = src; 1.416 + src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; 1.417 + src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; 1.418 + src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; 1.419 + src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; 1.420 + return src_end - src; 1.421 +} 1.422 + 1.423 +// src points to a letter. Find the byte length of an octagram starting there. 1.424 +int OctaLen(const char* src) { 1.425 + const char* src_end = src; 1.426 + int charcount = 0; 1.427 + while (src_end[0] != ' ') { 1.428 + src_end += UTF8OneCharLen(src); 1.429 + ++charcount; 1.430 + if (charcount == 8) {break;} 1.431 + } 1.432 + return src_end - src; 1.433 +} 1.434 + 1.435 +} // End namespace CLD2 1.436 + 1.437 + 1.438 + 1.439 + 1.440 +