1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/browser/components/translation/cld2/internal/cldutil_shared.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,509 @@ 1.4 +// Copyright 2013 Google Inc. All Rights Reserved. 1.5 +// 1.6 +// Licensed under the Apache License, Version 2.0 (the "License"); 1.7 +// you may not use this file except in compliance with the License. 1.8 +// You may obtain a copy of the License at 1.9 +// 1.10 +// http://www.apache.org/licenses/LICENSE-2.0 1.11 +// 1.12 +// Unless required by applicable law or agreed to in writing, software 1.13 +// distributed under the License is distributed on an "AS IS" BASIS, 1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1.15 +// See the License for the specific language governing permissions and 1.16 +// limitations under the License. 1.17 + 1.18 +// 1.19 +// Author: dsites@google.com (Dick Sites) 1.20 +// 1.21 +// Just the stuff shared between offline table builder and online detector 1.22 +// 1.23 + 1.24 +#ifndef I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__ 1.25 +#define I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__ 1.26 + 1.27 +#include "integral_types.h" 1.28 +#include "cld2tablesummary.h" 1.29 + 1.30 +namespace CLD2 { 1.31 + 1.32 +// Runtime routines for hashing, looking up, and scoring 1.33 +// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams. 1.34 +// Unigrams and bigrams are for CJK languages only, including simplified/ 1.35 +// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and 1.36 +// Zhuang Han characters. Surrounding spaces are not considered. 1.37 +// Quadgrams and octagrams are for non-CJK and include two bits indicating 1.38 +// preceding and trailing spaces (word boundaries). 
1.39 + 1.40 + 1.41 +//----------------------------------------------------------------------------// 1.42 +// Main quantized probability table // 1.43 +//----------------------------------------------------------------------------// 1.44 + 1.45 + // Table has 240 eight-byte entries. Each entry has a five-byte array and 1.46 + // a three-byte array of log base 2 probabilities in the range 1..12. 1.47 + // The intended use is to express five or three probabilities in a single-byte 1.48 + // subscript, then decode via this table. These probabilities are 1.49 + // intended to go with an array of five or three language numbers. 1.50 + // 1.51 + // The corresponding language numbers will have to be sorted by descending 1.52 + // probability, then the actual probability subscript chosen to match the 1.53 + // closest available entry in this table. 1.54 + // 1.55 + // Pattern of probability values: 1.56 + // hi 3/4 1/2 1/4 lo hi mid lo 1.57 + // where "3/4" is (hi*3+lo)/4, "1/2" is (hi+lo)/2, and "1/4" is (hi+lo*3)/4 1.58 + // and mid is one of 3/4 1/2 or 1/4. 1.59 + // There are three groups of 78 (=12*13/2) entries, with hi running 1..12 and 1.60 + // lo running 1..hi. Only the first group is used for five-entry lookups. 1.61 + // The mid value in the first group is 1/2, the second group 3/4, and the 1.62 + // third group 1/4. For three-entry lookups, this allows the mid entry to be 1.63 + // somewhat higher or lower than the midpoint, to allow a better match to the 1.64 + // original probabilities. 
// Number of 8-byte entries in kLgProbV2Tbl: three groups of 78 entries plus
// the six CJK-compatibility entries appended at the end (3 * 78 + 6 = 240).
static const int kLgProbV2TblSize = 240;
// Flat byte array, 8 bytes per entry: bytes 0..4 hold the five-probability
// pattern and bytes 5..7 hold the three-probability pattern.
// The "// [N]" tags mark the entry index of the first row for each new "hi"
// value in the first group; those indices are what kLgProbV2TblBackmap holds.
static const uint8 kLgProbV2Tbl[kLgProbV2TblSize * 8] = {
  // First group, entries 0..77
  1,1,1,1,1, 1,1,1, // [0]
  2,2,2,1,1, 2,2,1, // [1]
  2,2,2,2,2, 2,2,2,
  3,3,2,2,1, 3,2,1, // [3]
  3,3,3,2,2, 3,3,2,
  3,3,3,3,3, 3,3,3,
  4,3,3,2,1, 4,3,1, // [6]
  4,4,3,3,2, 4,3,2,
  4,4,4,3,3, 4,4,3,
  4,4,4,4,4, 4,4,4,
  5,4,3,2,1, 5,3,1, // [10]
  5,4,4,3,2, 5,4,2,
  5,5,4,4,3, 5,4,3,
  5,5,5,4,4, 5,5,4,
  5,5,5,5,5, 5,5,5,
  6,5,4,2,1, 6,4,1, // [15]
  6,5,4,3,2, 6,4,2,
  6,5,5,4,3, 6,5,3,
  6,6,5,5,4, 6,5,4,
  6,6,6,5,5, 6,6,5,
  6,6,6,6,6, 6,6,6,
  7,6,4,3,1, 7,4,1, // [21]
  7,6,5,3,2, 7,5,2,
  7,6,5,4,3, 7,5,3,
  7,6,6,5,4, 7,6,4,
  7,7,6,6,5, 7,6,5,
  7,7,7,6,6, 7,7,6,
  7,7,7,7,7, 7,7,7,
  8,6,5,3,1, 8,5,1, // [28]
  8,7,5,4,2, 8,5,2,
  8,7,6,4,3, 8,6,3,
  8,7,6,5,4, 8,6,4,
  8,7,7,6,5, 8,7,5,
  8,8,7,7,6, 8,7,6,
  8,8,8,7,7, 8,8,7,
  8,8,8,8,8, 8,8,8,
  9,7,5,3,1, 9,5,1, // [36]
  9,7,6,4,2, 9,6,2,
  9,8,6,5,3, 9,6,3,
  9,8,7,5,4, 9,7,4,
  9,8,7,6,5, 9,7,5,
  9,8,8,7,6, 9,8,6,
  9,9,8,8,7, 9,8,7,
  9,9,9,8,8, 9,9,8,
  9,9,9,9,9, 9,9,9,
  10,8,6,3,1, 10,6,1, // [45]
  10,8,6,4,2, 10,6,2,
  10,8,7,5,3, 10,7,3,
  10,9,7,6,4, 10,7,4,
  10,9,8,6,5, 10,8,5,
  10,9,8,7,6, 10,8,6,
  10,9,9,8,7, 10,9,7,
  10,10,9,9,8, 10,9,8,
  10,10,10,9,9, 10,10,9,
  10,10,10,10,10, 10,10,10,
  11,9,6,4,1, 11,6,1, // [55]
  11,9,7,4,2, 11,7,2,
  11,9,7,5,3, 11,7,3,
  11,9,8,6,4, 11,8,4,
  11,10,8,7,5, 11,8,5,
  11,10,9,7,6, 11,9,6,
  11,10,9,8,7, 11,9,7,
  11,10,10,9,8, 11,10,8,
  11,11,10,10,9, 11,10,9,
  11,11,11,10,10, 11,11,10,
  11,11,11,11,11, 11,11,11,
  12,9,7,4,1, 12,7,1, // [66]
  12,10,7,5,2, 12,7,2,
  12,10,8,5,3, 12,8,3,
  12,10,8,6,4, 12,8,4,
  12,10,9,7,5, 12,9,5,
  12,11,9,8,6, 12,9,6,
  12,11,10,8,7, 12,10,7,
  12,11,10,9,8, 12,10,8,
  12,11,11,10,9, 12,11,9,
  12,12,11,11,10, 12,11,10,
  12,12,12,11,11, 12,12,11,
  12,12,12,12,12, 12,12,12,

  // Second group, entries 78..155 (same five-byte patterns, different mid
  // value in the three-byte pattern)
  1,1,1,1,1, 1,1,1,
  2,2,2,1,1, 2,2,1,
  2,2,2,2,2, 2,2,2,
  3,3,2,2,1, 3,3,1,
  3,3,3,2,2, 3,3,2,
  3,3,3,3,3, 3,3,3,
  4,3,3,2,1, 4,3,1,
  4,4,3,3,2, 4,4,2,
  4,4,4,3,3, 4,4,3,
  4,4,4,4,4, 4,4,4,
  5,4,3,2,1, 5,4,1,
  5,4,4,3,2, 5,4,2,
  5,5,4,4,3, 5,5,3,
  5,5,5,4,4, 5,5,4,
  5,5,5,5,5, 5,5,5,
  6,5,4,2,1, 6,5,1,
  6,5,4,3,2, 6,5,2,
  6,5,5,4,3, 6,5,3,
  6,6,5,5,4, 6,6,4,
  6,6,6,5,5, 6,6,5,
  6,6,6,6,6, 6,6,6,
  7,6,4,3,1, 7,6,1,
  7,6,5,3,2, 7,6,2,
  7,6,5,4,3, 7,6,3,
  7,6,6,5,4, 7,6,4,
  7,7,6,6,5, 7,7,5,
  7,7,7,6,6, 7,7,6,
  7,7,7,7,7, 7,7,7,
  8,6,5,3,1, 8,6,1,
  8,7,5,4,2, 8,7,2,
  8,7,6,4,3, 8,7,3,
  8,7,6,5,4, 8,7,4,
  8,7,7,6,5, 8,7,5,
  8,8,7,7,6, 8,8,6,
  8,8,8,7,7, 8,8,7,
  8,8,8,8,8, 8,8,8,
  9,7,5,3,1, 9,7,1,
  9,7,6,4,2, 9,7,2,
  9,8,6,5,3, 9,8,3,
  9,8,7,5,4, 9,8,4,
  9,8,7,6,5, 9,8,5,
  9,8,8,7,6, 9,8,6,
  9,9,8,8,7, 9,9,7,
  9,9,9,8,8, 9,9,8,
  9,9,9,9,9, 9,9,9,
  10,8,6,3,1, 10,8,1,
  10,8,6,4,2, 10,8,2,
  10,8,7,5,3, 10,8,3,
  10,9,7,6,4, 10,9,4,
  10,9,8,6,5, 10,9,5,
  10,9,8,7,6, 10,9,6,
  10,9,9,8,7, 10,9,7,
  10,10,9,9,8, 10,10,8,
  10,10,10,9,9, 10,10,9,
  10,10,10,10,10, 10,10,10,
  11,9,6,4,1, 11,9,1,
  11,9,7,4,2, 11,9,2,
  11,9,7,5,3, 11,9,3,
  11,9,8,6,4, 11,9,4,
  11,10,8,7,5, 11,10,5,
  11,10,9,7,6, 11,10,6,
  11,10,9,8,7, 11,10,7,
  11,10,10,9,8, 11,10,8,
  11,11,10,10,9, 11,11,9,
  11,11,11,10,10, 11,11,10,
  11,11,11,11,11, 11,11,11,
  12,9,7,4,1, 12,9,1,
  12,10,7,5,2, 12,10,2,
  12,10,8,5,3, 12,10,3,
  12,10,8,6,4, 12,10,4,
  12,10,9,7,5, 12,10,5,
  12,11,9,8,6, 12,11,6,
  12,11,10,8,7, 12,11,7,
  12,11,10,9,8, 12,11,8,
  12,11,11,10,9, 12,11,9,
  12,12,11,11,10, 12,12,10,
  12,12,12,11,11, 12,12,11,
  12,12,12,12,12, 12,12,12,

  // Third group, entries 156..233 (same five-byte patterns, different mid
  // value in the three-byte pattern)
  1,1,1,1,1, 1,1,1,
  2,2,2,1,1, 2,1,1,
  2,2,2,2,2, 2,2,2,
  3,3,2,2,1, 3,2,1,
  3,3,3,2,2, 3,2,2,
  3,3,3,3,3, 3,3,3,
  4,3,3,2,1, 4,2,1,
  4,4,3,3,2, 4,3,2,
  4,4,4,3,3, 4,3,3,
  4,4,4,4,4, 4,4,4,
  5,4,3,2,1, 5,2,1,
  5,4,4,3,2, 5,3,2,
  5,5,4,4,3, 5,4,3,
  5,5,5,4,4, 5,4,4,
  5,5,5,5,5, 5,5,5,
  6,5,4,2,1, 6,2,1,
  6,5,4,3,2, 6,3,2,
  6,5,5,4,3, 6,4,3,
  6,6,5,5,4, 6,5,4,
  6,6,6,5,5, 6,5,5,
  6,6,6,6,6, 6,6,6,
  7,6,4,3,1, 7,3,1,
  7,6,5,3,2, 7,3,2,
  7,6,5,4,3, 7,4,3,
  7,6,6,5,4, 7,5,4,
  7,7,6,6,5, 7,6,5,
  7,7,7,6,6, 7,6,6,
  7,7,7,7,7, 7,7,7,
  8,6,5,3,1, 8,3,1,
  8,7,5,4,2, 8,4,2,
  8,7,6,4,3, 8,4,3,
  8,7,6,5,4, 8,5,4,
  8,7,7,6,5, 8,6,5,
  8,8,7,7,6, 8,7,6,
  8,8,8,7,7, 8,7,7,
  8,8,8,8,8, 8,8,8,
  9,7,5,3,1, 9,3,1,
  9,7,6,4,2, 9,4,2,
  9,8,6,5,3, 9,5,3,
  9,8,7,5,4, 9,5,4,
  9,8,7,6,5, 9,6,5,
  9,8,8,7,6, 9,7,6,
  9,9,8,8,7, 9,8,7,
  9,9,9,8,8, 9,8,8,
  9,9,9,9,9, 9,9,9,
  10,8,6,3,1, 10,3,1,
  10,8,6,4,2, 10,4,2,
  10,8,7,5,3, 10,5,3,
  10,9,7,6,4, 10,6,4,
  10,9,8,6,5, 10,6,5,
  10,9,8,7,6, 10,7,6,
  10,9,9,8,7, 10,8,7,
  10,10,9,9,8, 10,9,8,
  10,10,10,9,9, 10,9,9,
  10,10,10,10,10, 10,10,10,
  11,9,6,4,1, 11,4,1,
  11,9,7,4,2, 11,4,2,
  11,9,7,5,3, 11,5,3,
  11,9,8,6,4, 11,6,4,
  11,10,8,7,5, 11,7,5,
  11,10,9,7,6, 11,7,6,
  11,10,9,8,7, 11,8,7,
  11,10,10,9,8, 11,9,8,
  11,11,10,10,9, 11,10,9,
  11,11,11,10,10, 11,10,10,
  11,11,11,11,11, 11,11,11,
  12,9,7,4,1, 12,4,1,
  12,10,7,5,2, 12,5,2,
  12,10,8,5,3, 12,5,3,
  12,10,8,6,4, 12,6,4,
  12,10,9,7,5, 12,7,5,
  12,11,9,8,6, 12,8,6,
  12,11,10,8,7, 12,8,7,
  12,11,10,9,8, 12,9,8,
  12,11,11,10,9, 12,10,9,
  12,12,11,11,10, 12,11,10,
  12,12,12,11,11, 12,11,11,
  12,12,12,12,12, 12,12,12,

  // Added 2013.01.28 for CJK compatible mapping (entries 234..239)
  8,5,2,2,2, 8,2,2,
  6,6,6,4,2, 6,6,2,
  6,5,4,4,4, 6,4,4,
  6,4,2,2,2, 6,2,2,
  4,3,2,2,2, 4,2,2,
  2,2,2,2,2, 2,2,2,
};

// Backmap a single desired probability into an entry in kLgProbV2Tbl.
// For p in 1..12, kLgProbV2TblBackmap[p] is the index of the first-group
// entry whose hi value is p and whose lo value is 1 (the rows tagged "// [N]"
// in the table). Slot 0 maps to entry 0.
static const uint8 kLgProbV2TblBackmap[13] = {
  0,
  0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66,
};

// Return address of 8-byte entry[i].
// No range check is performed on i; the caller must pass an index that is
// valid for kLgProbV2Tbl.
inline const uint8* LgProb2TblEntry(int i) {
  return &kLgProbV2Tbl[i * 8];
}

// Return one of three probabilities in an entry.
// entry points at an 8-byte table entry; j selects within the three-byte
// pattern stored at bytes 5..7, so j is expected to be 0, 1, or 2 (not checked).
inline uint8 LgProb3(const uint8* entry, int j) {
  return entry[j + 5];
}


// Routines to access a hash table of <key:wordhash, value:probs> pairs
// Buckets have 4-byte wordhash for sizes < 32K buckets, but only
// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
// bucket subscript.
// Probs is packed: three languages plus a subscript for probability table
// Buckets have all the keys together, then all the values. Key array never
// crosses a cache-line boundary, so no-match case takes exactly one cache miss.
// Match case may sometimes take an additional cache miss on value access.
//
// Other possibilities include 5 or 10 6-byte entries plus pad to make 32 or 64
// byte buckets with single cache miss.
// Or 2-byte key and 6-byte value, allowing 5 languages instead of three.


//----------------------------------------------------------------------------//
// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores      //
//----------------------------------------------------------------------------//

// In every hash routine below, word_ptr is the first byte of the gram and
// bytecount is how many bytes of it to hash. "UNDERSHOOTS"/"OVERSHOOTS"
// presumably mean the routine may read that many bytes before/after the
// gram, so the caller's buffer needs that much readable slack -- confirm
// against the implementation, which is not in this header.

// BIGRAM
// Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
// OVERSHOOTS up to 3 bytes
// For runtime use of tables
// Does X86 unaligned loads, UNALIGNED_LOAD32(_p), if
// !defined(NEED_ALIGNED_LOADS)
uint32 BiHashV2(const char* word_ptr, int bytecount);

// QUADGRAM wrapper with surrounding spaces
// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
// For runtime use of tables
uint32 QuadHashV2(const char* word_ptr, int bytecount);

// QUADGRAM wrapper with surrounding underscores (offline use)
// Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
// OVERSHOOTS up to 3 bytes
// For offline construction of tables
uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount);

// OCTAGRAM wrapper with surrounding spaces
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
uint64 OctaHash40(const char* word_ptr, int bytecount);


// OCTAGRAM wrapper with surrounding underscores (offline use)
// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
// NOTE(review): the heading says underscores but the line above says
// "pre/post space"; by analogy with QuadHashV2Underscore this is probably
// pre/post '_' -- confirm against the implementation.
+uint64 OctaHash40underscore(const char* word_ptr, int bytecount); 1.377 + 1.378 +// Hash a consecutive pair of tokens/words A B 1.379 +uint64 PairHash(uint64 worda_hash, uint64 wordb_hash); 1.380 + 1.381 + 1.382 +// From 32-bit gram FP, return hash table subscript and remaining key 1.383 +inline void QuadFPJustHash(uint32 quadhash, 1.384 + uint32 keymask, 1.385 + int bucketcount, 1.386 + uint32* subscr, uint32* hashkey) { 1.387 + *subscr = (quadhash + (quadhash >> 12)) & (bucketcount - 1); 1.388 + *hashkey = quadhash & keymask; 1.389 +} 1.390 + 1.391 +// From 40-bit gram FP, return hash table subscript and remaining key 1.392 +inline void OctaFPJustHash(uint64 longwordhash, 1.393 + uint32 keymask, 1.394 + int bucketcount, 1.395 + uint32* subscr, uint32* hashkey) { 1.396 + uint32 temp = (longwordhash + (longwordhash >> 12)) & (bucketcount - 1); 1.397 + *subscr = temp; 1.398 + temp = longwordhash >> 4; 1.399 + *hashkey = temp & keymask; 1.400 +} 1.401 + 1.402 + 1.403 +// Look up 32-bit gram FP in caller-passed table 1.404 +// Typical size 256K entries (1.5MB) 1.405 +// Two-byte hashkey 1.406 +inline const uint32 QuadHashV3Lookup4(const CLD2TableSummary* gram_obj, 1.407 + uint32 quadhash) { 1.408 + uint32 subscr, hashkey; 1.409 + const IndirectProbBucket4* quadtable = gram_obj->kCLDTable; 1.410 + uint32 keymask = gram_obj->kCLDTableKeyMask; 1.411 + int bucketcount = gram_obj->kCLDTableSize; 1.412 + QuadFPJustHash(quadhash, keymask, bucketcount, &subscr, &hashkey); 1.413 + const IndirectProbBucket4* bucket_ptr = &quadtable[subscr]; 1.414 + // Four-way associative, 4 compares 1.415 + if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) { 1.416 + return bucket_ptr->keyvalue[0]; 1.417 + } 1.418 + if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) { 1.419 + return bucket_ptr->keyvalue[1]; 1.420 + } 1.421 + if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) { 1.422 + return bucket_ptr->keyvalue[2]; 1.423 + } 1.424 + if (((hashkey ^ 
bucket_ptr->keyvalue[3]) & keymask) == 0) { 1.425 + return bucket_ptr->keyvalue[3]; 1.426 + } 1.427 + return 0; 1.428 +} 1.429 + 1.430 +// Look up 40-bit gram FP in caller-passed table 1.431 +// Typical size 256K-4M entries (1-16MB) 1.432 +// 24-12 bit hashkey packed with 8-20 bit indirect lang/probs 1.433 +// keymask is 0xfffff000 for 20-bit hashkey and 12-bit indirect 1.434 +inline const uint32 OctaHashV3Lookup4(const CLD2TableSummary* gram_obj, 1.435 + uint64 longwordhash) { 1.436 + uint32 subscr, hashkey; 1.437 + const IndirectProbBucket4* octatable = gram_obj->kCLDTable; 1.438 + uint32 keymask = gram_obj->kCLDTableKeyMask; 1.439 + int bucketcount = gram_obj->kCLDTableSize; 1.440 + OctaFPJustHash(longwordhash, keymask, bucketcount, 1.441 + &subscr, &hashkey); 1.442 + const IndirectProbBucket4* bucket_ptr = &octatable[subscr]; 1.443 + // Four-way associative, 4 compares 1.444 + if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) { 1.445 + return bucket_ptr->keyvalue[0]; 1.446 + } 1.447 + if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) { 1.448 + return bucket_ptr->keyvalue[1]; 1.449 + } 1.450 + if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) { 1.451 + return bucket_ptr->keyvalue[2]; 1.452 + } 1.453 + if (((hashkey ^ bucket_ptr->keyvalue[3]) & keymask) == 0) { 1.454 + return bucket_ptr->keyvalue[3]; 1.455 + } 1.456 + return 0; 1.457 +} 1.458 + 1.459 + 1.460 +//----------------------------------------------------------------------------// 1.461 +// Finding groups of 1/2/4/8 letters // 1.462 +//----------------------------------------------------------------------------// 1.463 + 1.464 +// Does not advance past space or tab/cr/lf/nul 1.465 +static const uint8 kAdvanceOneCharButSpace[256] = { 1.466 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.467 + 0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.468 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.469 + 1,1,1,1,1,1,1,1, 
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.470 + 1.471 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.472 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.473 + 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 1.474 + 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 1.475 +}; 1.476 + 1.477 + 1.478 +// Advances *only* on space or ASCII vowel (or illegal byte) 1.479 +static const uint8 kAdvanceOneCharSpaceVowel[256] = { 1.480 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.481 + 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.482 + 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, 1.483 + 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, 1.484 + 1.485 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.486 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.487 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.488 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.489 +}; 1.490 + 1.491 + 1.492 +// src points to a letter. Find the byte length of a unigram starting there. 1.493 +int UniLen(const char* src); 1.494 + 1.495 +// src points to a letter. Find the byte length of a bigram starting there. 1.496 +int BiLen(const char* src); 1.497 + 1.498 +// src points to a letter. Find the byte length of a quadgram starting there. 1.499 +int QuadLen(const char* src); 1.500 + 1.501 +// src points to a letter. Find the byte length of an octagram starting there. 1.502 +int OctaLen(const char* src); 1.503 + 1.504 +} // End namespace CLD2 1.505 + 1.506 +#endif // I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__ 1.507 + 1.508 + 1.509 + 1.510 + 1.511 + 1.512 +