browser/components/translation/cld2/internal/cldutil_shared.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 // Copyright 2013 Google Inc. All Rights Reserved.
michael@0 2 //
michael@0 3 // Licensed under the Apache License, Version 2.0 (the "License");
michael@0 4 // you may not use this file except in compliance with the License.
michael@0 5 // You may obtain a copy of the License at
michael@0 6 //
michael@0 7 // http://www.apache.org/licenses/LICENSE-2.0
michael@0 8 //
michael@0 9 // Unless required by applicable law or agreed to in writing, software
michael@0 10 // distributed under the License is distributed on an "AS IS" BASIS,
michael@0 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
michael@0 12 // See the License for the specific language governing permissions and
michael@0 13 // limitations under the License.
michael@0 14
michael@0 15 //
michael@0 16 // Author: dsites@google.com (Dick Sites)
michael@0 17 //
michael@0 18 // Just the stuff shared between offline table builder and online detector
michael@0 19 //
michael@0 20
michael@0 21 #ifndef I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__
michael@0 22 #define I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__
michael@0 23
michael@0 24 #include "integral_types.h"
michael@0 25 #include "cld2tablesummary.h"
michael@0 26
michael@0 27 namespace CLD2 {
michael@0 28
michael@0 29 // Runtime routines for hashing, looking up, and scoring
michael@0 30 // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
michael@0 31 // Unigrams and bigrams are for CJK languages only, including simplified/
michael@0 32 // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
michael@0 33 // Zhuang Han characters. Surrounding spaces are not considered.
michael@0 34 // Quadgrams and octagrams are for non-CJK and include two bits indicating
michael@0 35 // preceding and trailing spaces (word boundaries).
michael@0 36
michael@0 37
michael@0 38 //----------------------------------------------------------------------------//
michael@0 39 // Main quantized probability table //
michael@0 40 //----------------------------------------------------------------------------//
michael@0 41
michael@0 42 // Table has 240 eight-byte entries. Each entry has a five-byte array and
michael@0 43 // a three-byte array of log base 2 probabilities in the range 1..12.
michael@0 44 // The intended use is to express five or three probabilities in a single-byte
michael@0 45 // subscript, then decode via this table. These probabilities are
michael@0 46 // intended to go with an array of five or three language numbers.
michael@0 47 //
michael@0 48 // The corresponding language numbers will have to be sorted by descending
michael@0 49 // probability, then the actual probability subscript chosen to match the
michael@0 50 // closest available entry in this table.
michael@0 51 //
michael@0 52 // Pattern of probability values:
michael@0 53 // hi 3/4 1/2 1/4 lo hi mid lo
michael@0 54 // where "3/4" is (hi*3+lo)/4, "1/2" is (hi+lo)/2, and "1/4" is (hi+lo*3)/4
michael@0 55 // and mid is one of 3/4 1/2 or 1/4.
michael@0 56 // There are three groups of 78 (=12*13/2) entries, with hi running 1..12 and
michael@0 57 // lo running 1..hi. Only the first group is used for five-entry lookups.
michael@0 58 // The mid value in the first group is 1/2, the second group 3/4, and the
michael@0 59 // third group 1/4. For three-entry lookups, this allows the mid entry to be
michael@0 60 // somewhat higher or lower than the midpoint, to allow a better match to the
michael@0 61 // original probabilities.
michael@0 62 static const int kLgProbV2TblSize = 240; // entry count: 3 groups of 78 plus 6 CJK extras
// Flat byte array: entry i occupies bytes [i*8 .. i*8+7]; bytes 0..4 are the
// five-probability pattern, bytes 5..7 the three-probability pattern.
// Bracketed numbers in trailing comments are entry subscripts at which the
// leading (hi) value steps up by one.
michael@0 63 static const uint8 kLgProbV2Tbl[kLgProbV2TblSize * 8] = {
// ---- Group 1, entries [0]..[77]: three-byte mid value is the 1/2 point ----
michael@0 64 1,1,1,1,1, 1,1,1, // [0]
michael@0 65 2,2,2,1,1, 2,2,1, // [1]
michael@0 66 2,2,2,2,2, 2,2,2,
michael@0 67 3,3,2,2,1, 3,2,1, // [3]
michael@0 68 3,3,3,2,2, 3,3,2,
michael@0 69 3,3,3,3,3, 3,3,3,
michael@0 70 4,3,3,2,1, 4,3,1, // [6]
michael@0 71 4,4,3,3,2, 4,3,2,
michael@0 72 4,4,4,3,3, 4,4,3,
michael@0 73 4,4,4,4,4, 4,4,4,
michael@0 74 5,4,3,2,1, 5,3,1, // [10]
michael@0 75 5,4,4,3,2, 5,4,2,
michael@0 76 5,5,4,4,3, 5,4,3,
michael@0 77 5,5,5,4,4, 5,5,4,
michael@0 78 5,5,5,5,5, 5,5,5,
michael@0 79 6,5,4,2,1, 6,4,1, // [15]
michael@0 80 6,5,4,3,2, 6,4,2,
michael@0 81 6,5,5,4,3, 6,5,3,
michael@0 82 6,6,5,5,4, 6,5,4,
michael@0 83 6,6,6,5,5, 6,6,5,
michael@0 84 6,6,6,6,6, 6,6,6,
michael@0 85 7,6,4,3,1, 7,4,1, // [21]
michael@0 86 7,6,5,3,2, 7,5,2,
michael@0 87 7,6,5,4,3, 7,5,3,
michael@0 88 7,6,6,5,4, 7,6,4,
michael@0 89 7,7,6,6,5, 7,6,5,
michael@0 90 7,7,7,6,6, 7,7,6,
michael@0 91 7,7,7,7,7, 7,7,7,
michael@0 92 8,6,5,3,1, 8,5,1, // [28]
michael@0 93 8,7,5,4,2, 8,5,2,
michael@0 94 8,7,6,4,3, 8,6,3,
michael@0 95 8,7,6,5,4, 8,6,4,
michael@0 96 8,7,7,6,5, 8,7,5,
michael@0 97 8,8,7,7,6, 8,7,6,
michael@0 98 8,8,8,7,7, 8,8,7,
michael@0 99 8,8,8,8,8, 8,8,8,
michael@0 100 9,7,5,3,1, 9,5,1, // [36]
michael@0 101 9,7,6,4,2, 9,6,2,
michael@0 102 9,8,6,5,3, 9,6,3,
michael@0 103 9,8,7,5,4, 9,7,4,
michael@0 104 9,8,7,6,5, 9,7,5,
michael@0 105 9,8,8,7,6, 9,8,6,
michael@0 106 9,9,8,8,7, 9,8,7,
michael@0 107 9,9,9,8,8, 9,9,8,
michael@0 108 9,9,9,9,9, 9,9,9,
michael@0 109 10,8,6,3,1, 10,6,1, // [45]
michael@0 110 10,8,6,4,2, 10,6,2,
michael@0 111 10,8,7,5,3, 10,7,3,
michael@0 112 10,9,7,6,4, 10,7,4,
michael@0 113 10,9,8,6,5, 10,8,5,
michael@0 114 10,9,8,7,6, 10,8,6,
michael@0 115 10,9,9,8,7, 10,9,7,
michael@0 116 10,10,9,9,8, 10,9,8,
michael@0 117 10,10,10,9,9, 10,10,9,
michael@0 118 10,10,10,10,10, 10,10,10,
michael@0 119 11,9,6,4,1, 11,6,1, // [55]
michael@0 120 11,9,7,4,2, 11,7,2,
michael@0 121 11,9,7,5,3, 11,7,3,
michael@0 122 11,9,8,6,4, 11,8,4,
michael@0 123 11,10,8,7,5, 11,8,5,
michael@0 124 11,10,9,7,6, 11,9,6,
michael@0 125 11,10,9,8,7, 11,9,7,
michael@0 126 11,10,10,9,8, 11,10,8,
michael@0 127 11,11,10,10,9, 11,10,9,
michael@0 128 11,11,11,10,10, 11,11,10,
michael@0 129 11,11,11,11,11, 11,11,11,
michael@0 130 12,9,7,4,1, 12,7,1, // [66]
michael@0 131 12,10,7,5,2, 12,7,2,
michael@0 132 12,10,8,5,3, 12,8,3,
michael@0 133 12,10,8,6,4, 12,8,4,
michael@0 134 12,10,9,7,5, 12,9,5,
michael@0 135 12,11,9,8,6, 12,9,6,
michael@0 136 12,11,10,8,7, 12,10,7,
michael@0 137 12,11,10,9,8, 12,10,8,
michael@0 138 12,11,11,10,9, 12,11,9,
michael@0 139 12,12,11,11,10, 12,11,10,
michael@0 140 12,12,12,11,11, 12,12,11,
michael@0 141 12,12,12,12,12, 12,12,12,
michael@0 142
// ---- Group 2, entries [78]..[155]: three-byte mid value is the 3/4 point ----
michael@0 143 1,1,1,1,1, 1,1,1,
michael@0 144 2,2,2,1,1, 2,2,1,
michael@0 145 2,2,2,2,2, 2,2,2,
michael@0 146 3,3,2,2,1, 3,3,1,
michael@0 147 3,3,3,2,2, 3,3,2,
michael@0 148 3,3,3,3,3, 3,3,3,
michael@0 149 4,3,3,2,1, 4,3,1,
michael@0 150 4,4,3,3,2, 4,4,2,
michael@0 151 4,4,4,3,3, 4,4,3,
michael@0 152 4,4,4,4,4, 4,4,4,
michael@0 153 5,4,3,2,1, 5,4,1,
michael@0 154 5,4,4,3,2, 5,4,2,
michael@0 155 5,5,4,4,3, 5,5,3,
michael@0 156 5,5,5,4,4, 5,5,4,
michael@0 157 5,5,5,5,5, 5,5,5,
michael@0 158 6,5,4,2,1, 6,5,1,
michael@0 159 6,5,4,3,2, 6,5,2,
michael@0 160 6,5,5,4,3, 6,5,3,
michael@0 161 6,6,5,5,4, 6,6,4,
michael@0 162 6,6,6,5,5, 6,6,5,
michael@0 163 6,6,6,6,6, 6,6,6,
michael@0 164 7,6,4,3,1, 7,6,1,
michael@0 165 7,6,5,3,2, 7,6,2,
michael@0 166 7,6,5,4,3, 7,6,3,
michael@0 167 7,6,6,5,4, 7,6,4,
michael@0 168 7,7,6,6,5, 7,7,5,
michael@0 169 7,7,7,6,6, 7,7,6,
michael@0 170 7,7,7,7,7, 7,7,7,
michael@0 171 8,6,5,3,1, 8,6,1,
michael@0 172 8,7,5,4,2, 8,7,2,
michael@0 173 8,7,6,4,3, 8,7,3,
michael@0 174 8,7,6,5,4, 8,7,4,
michael@0 175 8,7,7,6,5, 8,7,5,
michael@0 176 8,8,7,7,6, 8,8,6,
michael@0 177 8,8,8,7,7, 8,8,7,
michael@0 178 8,8,8,8,8, 8,8,8,
michael@0 179 9,7,5,3,1, 9,7,1,
michael@0 180 9,7,6,4,2, 9,7,2,
michael@0 181 9,8,6,5,3, 9,8,3,
michael@0 182 9,8,7,5,4, 9,8,4,
michael@0 183 9,8,7,6,5, 9,8,5,
michael@0 184 9,8,8,7,6, 9,8,6,
michael@0 185 9,9,8,8,7, 9,9,7,
michael@0 186 9,9,9,8,8, 9,9,8,
michael@0 187 9,9,9,9,9, 9,9,9,
michael@0 188 10,8,6,3,1, 10,8,1,
michael@0 189 10,8,6,4,2, 10,8,2,
michael@0 190 10,8,7,5,3, 10,8,3,
michael@0 191 10,9,7,6,4, 10,9,4,
michael@0 192 10,9,8,6,5, 10,9,5,
michael@0 193 10,9,8,7,6, 10,9,6,
michael@0 194 10,9,9,8,7, 10,9,7,
michael@0 195 10,10,9,9,8, 10,10,8,
michael@0 196 10,10,10,9,9, 10,10,9,
michael@0 197 10,10,10,10,10, 10,10,10,
michael@0 198 11,9,6,4,1, 11,9,1,
michael@0 199 11,9,7,4,2, 11,9,2,
michael@0 200 11,9,7,5,3, 11,9,3,
michael@0 201 11,9,8,6,4, 11,9,4,
michael@0 202 11,10,8,7,5, 11,10,5,
michael@0 203 11,10,9,7,6, 11,10,6,
michael@0 204 11,10,9,8,7, 11,10,7,
michael@0 205 11,10,10,9,8, 11,10,8,
michael@0 206 11,11,10,10,9, 11,11,9,
michael@0 207 11,11,11,10,10, 11,11,10,
michael@0 208 11,11,11,11,11, 11,11,11,
michael@0 209 12,9,7,4,1, 12,9,1,
michael@0 210 12,10,7,5,2, 12,10,2,
michael@0 211 12,10,8,5,3, 12,10,3,
michael@0 212 12,10,8,6,4, 12,10,4,
michael@0 213 12,10,9,7,5, 12,10,5,
michael@0 214 12,11,9,8,6, 12,11,6,
michael@0 215 12,11,10,8,7, 12,11,7,
michael@0 216 12,11,10,9,8, 12,11,8,
michael@0 217 12,11,11,10,9, 12,11,9,
michael@0 218 12,12,11,11,10, 12,12,10,
michael@0 219 12,12,12,11,11, 12,12,11,
michael@0 220 12,12,12,12,12, 12,12,12,
michael@0 221
// ---- Group 3, entries [156]..[233]: three-byte mid value is the 1/4 point ----
michael@0 222 1,1,1,1,1, 1,1,1,
michael@0 223 2,2,2,1,1, 2,1,1,
michael@0 224 2,2,2,2,2, 2,2,2,
michael@0 225 3,3,2,2,1, 3,2,1,
michael@0 226 3,3,3,2,2, 3,2,2,
michael@0 227 3,3,3,3,3, 3,3,3,
michael@0 228 4,3,3,2,1, 4,2,1,
michael@0 229 4,4,3,3,2, 4,3,2,
michael@0 230 4,4,4,3,3, 4,3,3,
michael@0 231 4,4,4,4,4, 4,4,4,
michael@0 232 5,4,3,2,1, 5,2,1,
michael@0 233 5,4,4,3,2, 5,3,2,
michael@0 234 5,5,4,4,3, 5,4,3,
michael@0 235 5,5,5,4,4, 5,4,4,
michael@0 236 5,5,5,5,5, 5,5,5,
michael@0 237 6,5,4,2,1, 6,2,1,
michael@0 238 6,5,4,3,2, 6,3,2,
michael@0 239 6,5,5,4,3, 6,4,3,
michael@0 240 6,6,5,5,4, 6,5,4,
michael@0 241 6,6,6,5,5, 6,5,5,
michael@0 242 6,6,6,6,6, 6,6,6,
michael@0 243 7,6,4,3,1, 7,3,1,
michael@0 244 7,6,5,3,2, 7,3,2,
michael@0 245 7,6,5,4,3, 7,4,3,
michael@0 246 7,6,6,5,4, 7,5,4,
michael@0 247 7,7,6,6,5, 7,6,5,
michael@0 248 7,7,7,6,6, 7,6,6,
michael@0 249 7,7,7,7,7, 7,7,7,
michael@0 250 8,6,5,3,1, 8,3,1,
michael@0 251 8,7,5,4,2, 8,4,2,
michael@0 252 8,7,6,4,3, 8,4,3,
michael@0 253 8,7,6,5,4, 8,5,4,
michael@0 254 8,7,7,6,5, 8,6,5,
michael@0 255 8,8,7,7,6, 8,7,6,
michael@0 256 8,8,8,7,7, 8,7,7,
michael@0 257 8,8,8,8,8, 8,8,8,
michael@0 258 9,7,5,3,1, 9,3,1,
michael@0 259 9,7,6,4,2, 9,4,2,
michael@0 260 9,8,6,5,3, 9,5,3,
michael@0 261 9,8,7,5,4, 9,5,4,
michael@0 262 9,8,7,6,5, 9,6,5,
michael@0 263 9,8,8,7,6, 9,7,6,
michael@0 264 9,9,8,8,7, 9,8,7,
michael@0 265 9,9,9,8,8, 9,8,8,
michael@0 266 9,9,9,9,9, 9,9,9,
michael@0 267 10,8,6,3,1, 10,3,1,
michael@0 268 10,8,6,4,2, 10,4,2,
michael@0 269 10,8,7,5,3, 10,5,3,
michael@0 270 10,9,7,6,4, 10,6,4,
michael@0 271 10,9,8,6,5, 10,6,5,
michael@0 272 10,9,8,7,6, 10,7,6,
michael@0 273 10,9,9,8,7, 10,8,7,
michael@0 274 10,10,9,9,8, 10,9,8,
michael@0 275 10,10,10,9,9, 10,9,9,
michael@0 276 10,10,10,10,10, 10,10,10,
michael@0 277 11,9,6,4,1, 11,4,1,
michael@0 278 11,9,7,4,2, 11,4,2,
michael@0 279 11,9,7,5,3, 11,5,3,
michael@0 280 11,9,8,6,4, 11,6,4,
michael@0 281 11,10,8,7,5, 11,7,5,
michael@0 282 11,10,9,7,6, 11,7,6,
michael@0 283 11,10,9,8,7, 11,8,7,
michael@0 284 11,10,10,9,8, 11,9,8,
michael@0 285 11,11,10,10,9, 11,10,9,
michael@0 286 11,11,11,10,10, 11,10,10,
michael@0 287 11,11,11,11,11, 11,11,11,
michael@0 288 12,9,7,4,1, 12,4,1,
michael@0 289 12,10,7,5,2, 12,5,2,
michael@0 290 12,10,8,5,3, 12,5,3,
michael@0 291 12,10,8,6,4, 12,6,4,
michael@0 292 12,10,9,7,5, 12,7,5,
michael@0 293 12,11,9,8,6, 12,8,6,
michael@0 294 12,11,10,8,7, 12,8,7,
michael@0 295 12,11,10,9,8, 12,9,8,
michael@0 296 12,11,11,10,9, 12,10,9,
michael@0 297 12,12,11,11,10, 12,11,10,
michael@0 298 12,12,12,11,11, 12,11,11,
michael@0 299 12,12,12,12,12, 12,12,12,
michael@0 300
michael@0 301 // Added 2013.01.28 for CJK compatible mapping (entries [234]..[239])
michael@0 302 8,5,2,2,2, 8,2,2,
michael@0 303 6,6,6,4,2, 6,6,2,
michael@0 304 6,5,4,4,4, 6,4,4,
michael@0 305 6,4,2,2,2, 6,2,2,
michael@0 306 4,3,2,2,2, 4,2,2,
michael@0 307 2,2,2,2,2, 2,2,2,
michael@0 308 };
michael@0 309
michael@0 310 // Backmap a single desired probability into an entry in kLgProbV2Tbl
michael@0 311 static const uint8 kLgProbV2TblBackmap[13] = {
michael@0 312 0,
michael@0 313 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66,
michael@0 314 };
michael@0 315
michael@0 316 // Return address of 8-byte entry[i]
michael@0 317 inline const uint8* LgProb2TblEntry(int i) {
michael@0 318 return &kLgProbV2Tbl[i * 8];
michael@0 319 }
michael@0 320
michael@0 321 // Return one of three probabilities in an entry
michael@0 322 inline uint8 LgProb3(const uint8* entry, int j) {
michael@0 323 return entry[j + 5];
michael@0 324 }
michael@0 325
michael@0 326
michael@0 327 // Routines to access a hash table of <key:wordhash, value:probs> pairs
michael@0 328 // Buckets have 4-byte wordhash for sizes < 32K buckets, but only
michael@0 329 // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
michael@0 330 // bucket subscript.
michael@0 331 // Probs is packed: three languages plus a subscript for the probability table.
michael@0 332 // Buckets have all the keys together, then all the values. The key array never
michael@0 333 // crosses a cache-line boundary, so no-match case takes exactly one cache miss.
michael@0 334 // Match case may sometimes take an additional cache miss on value access.
michael@0 335 //
michael@0 336 // Other possibilities include 5 or 10 6-byte entries plus pad to make 32 or 64
michael@0 337 // byte buckets with single cache miss.
michael@0 338 // Or 2-byte key and 6-byte value, allowing 5 languages instead of three.
michael@0 339
michael@0 340
michael@0 341 //----------------------------------------------------------------------------//
michael@0 342 // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores //
michael@0 343 //----------------------------------------------------------------------------//
michael@0 344
// Hash routine declarations; the definitions are not in this header.
// NOTE(review): "OVERSHOOTS n bytes" / "UNDERSHOOTS n bytes" presumably mean
// the routine may read up to n bytes past the end of the gram / before
// word_ptr, so callers need correspondingly padded buffers -- confirm against
// the definitions.
michael@0 345 // BIGRAM
michael@0 346 // Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
michael@0 347 // OVERSHOOTS up to 3 bytes
michael@0 348 // For runtime use of tables
michael@0 349 // Does X86 unaligned loads (UNALIGNED_LOAD32(_p)) if !defined(NEED_ALIGNED_LOADS)
michael@0 350 uint32 BiHashV2(const char* word_ptr, int bytecount);
michael@0 351
michael@0 352 // QUADGRAM wrapper with surrounding spaces
michael@0 353 // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
michael@0 354 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
michael@0 355 // For runtime use of tables
michael@0 356 uint32 QuadHashV2(const char* word_ptr, int bytecount);
michael@0 357
michael@0 358 // QUADGRAM wrapper with surrounding underscores (offline use)
michael@0 359 // Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
michael@0 360 // OVERSHOOTS up to 3 bytes
michael@0 361 // For offline construction of tables
michael@0 362 uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount);
michael@0 363
michael@0 364 // OCTAGRAM wrapper with surrounding spaces
michael@0 365 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
michael@0 366 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
michael@0 367 uint64 OctaHash40(const char* word_ptr, int bytecount);
michael@0 368
michael@0 369
michael@0 370 // OCTAGRAM wrapper with surrounding underscores (offline use)
michael@0 371 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
michael@0 372 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
// NOTE(review): the two lines above match the OctaHash40 comment word for
// word ("space", "UNDERSHOOTS") although this is the underscore variant;
// compare QuadHashV2Underscore, which says '_' and no undershoot. Verify
// against the definition before relying on them.
michael@0 373 uint64 OctaHash40underscore(const char* word_ptr, int bytecount);
michael@0 374
michael@0 375 // Hash a consecutive pair of tokens/words A B
// Takes the two already-computed word hashes and returns the combined hash.
michael@0 376 uint64 PairHash(uint64 worda_hash, uint64 wordb_hash);
michael@0 377
michael@0 378
michael@0 379 // From 32-bit gram FP, return hash table subscript and remaining key
michael@0 380 inline void QuadFPJustHash(uint32 quadhash,
michael@0 381 uint32 keymask,
michael@0 382 int bucketcount,
michael@0 383 uint32* subscr, uint32* hashkey) {
michael@0 384 *subscr = (quadhash + (quadhash >> 12)) & (bucketcount - 1);
michael@0 385 *hashkey = quadhash & keymask;
michael@0 386 }
michael@0 387
michael@0 388 // From 40-bit gram FP, return hash table subscript and remaining key
michael@0 389 inline void OctaFPJustHash(uint64 longwordhash,
michael@0 390 uint32 keymask,
michael@0 391 int bucketcount,
michael@0 392 uint32* subscr, uint32* hashkey) {
michael@0 393 uint32 temp = (longwordhash + (longwordhash >> 12)) & (bucketcount - 1);
michael@0 394 *subscr = temp;
michael@0 395 temp = longwordhash >> 4;
michael@0 396 *hashkey = temp & keymask;
michael@0 397 }
michael@0 398
michael@0 399
michael@0 400 // Look up 32-bit gram FP in caller-passed table
michael@0 401 // Typical size 256K entries (1.5MB)
michael@0 402 // Two-byte hashkey
michael@0 403 inline const uint32 QuadHashV3Lookup4(const CLD2TableSummary* gram_obj,
michael@0 404 uint32 quadhash) {
michael@0 405 uint32 subscr, hashkey;
michael@0 406 const IndirectProbBucket4* quadtable = gram_obj->kCLDTable;
michael@0 407 uint32 keymask = gram_obj->kCLDTableKeyMask;
michael@0 408 int bucketcount = gram_obj->kCLDTableSize;
michael@0 409 QuadFPJustHash(quadhash, keymask, bucketcount, &subscr, &hashkey);
michael@0 410 const IndirectProbBucket4* bucket_ptr = &quadtable[subscr];
michael@0 411 // Four-way associative, 4 compares
michael@0 412 if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) {
michael@0 413 return bucket_ptr->keyvalue[0];
michael@0 414 }
michael@0 415 if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) {
michael@0 416 return bucket_ptr->keyvalue[1];
michael@0 417 }
michael@0 418 if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) {
michael@0 419 return bucket_ptr->keyvalue[2];
michael@0 420 }
michael@0 421 if (((hashkey ^ bucket_ptr->keyvalue[3]) & keymask) == 0) {
michael@0 422 return bucket_ptr->keyvalue[3];
michael@0 423 }
michael@0 424 return 0;
michael@0 425 }
michael@0 426
michael@0 427 // Look up 40-bit gram FP in caller-passed table
michael@0 428 // Typical size 256K-4M entries (1-16MB)
michael@0 429 // 24-12 bit hashkey packed with 8-20 bit indirect lang/probs
michael@0 430 // keymask is 0xfffff000 for 20-bit hashkey and 12-bit indirect
michael@0 431 inline const uint32 OctaHashV3Lookup4(const CLD2TableSummary* gram_obj,
michael@0 432 uint64 longwordhash) {
michael@0 433 uint32 subscr, hashkey;
michael@0 434 const IndirectProbBucket4* octatable = gram_obj->kCLDTable;
michael@0 435 uint32 keymask = gram_obj->kCLDTableKeyMask;
michael@0 436 int bucketcount = gram_obj->kCLDTableSize;
michael@0 437 OctaFPJustHash(longwordhash, keymask, bucketcount,
michael@0 438 &subscr, &hashkey);
michael@0 439 const IndirectProbBucket4* bucket_ptr = &octatable[subscr];
michael@0 440 // Four-way associative, 4 compares
michael@0 441 if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) {
michael@0 442 return bucket_ptr->keyvalue[0];
michael@0 443 }
michael@0 444 if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) {
michael@0 445 return bucket_ptr->keyvalue[1];
michael@0 446 }
michael@0 447 if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) {
michael@0 448 return bucket_ptr->keyvalue[2];
michael@0 449 }
michael@0 450 if (((hashkey ^ bucket_ptr->keyvalue[3]) & keymask) == 0) {
michael@0 451 return bucket_ptr->keyvalue[3];
michael@0 452 }
michael@0 453 return 0;
michael@0 454 }
michael@0 455
michael@0 456
michael@0 457 //----------------------------------------------------------------------------//
michael@0 458 // Finding groups of 1/2/4/8 letters //
michael@0 459 //----------------------------------------------------------------------------//
michael@0 460
michael@0 461 // Does not advance past space or tab/cr/lf/nul
// 256-entry table, one entry per byte value, 32 entries per row.
// All bytes 0x00..0x20 map to 0; the other values match the UTF-8 sequence
// length implied by a lead byte (0x80..0xBF continuation bytes map to 1).
// NOTE(review): presumably indexed by the first byte of the current
// character to get the number of bytes to step -- confirm at the use sites.
michael@0 462 static const uint8 kAdvanceOneCharButSpace[256] = {
michael@0 463 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x00-0x1F
michael@0 464 0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x20-0x3F (0x20 space is 0)
michael@0 465 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x40-0x5F
michael@0 466 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x60-0x7F
michael@0 467
michael@0 468 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x80-0x9F
michael@0 469 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0xA0-0xBF
michael@0 470 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, // 0xC0-0xDF
michael@0 471 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, // 0xE0-0xEF: 3, 0xF0-0xFF: 4
michael@0 472 };
michael@0 473
michael@0 474
michael@0 475 // Advances *only* on space or ASCII vowel (or illegal byte)
// 256-entry table, one entry per byte value, 32 entries per row.
// Entries are 1 for 0x00..0x20, for A E I O U in both cases, and for
// 0x80..0xBF; every other byte, including 0xC0..0xFF, maps to 0.
// NOTE(review): presumably indexed by the current byte to get the number of
// bytes to step -- confirm at the use sites.
michael@0 476 static const uint8 kAdvanceOneCharSpaceVowel[256] = {
michael@0 477 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x00-0x1F
michael@0 478 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x20-0x3F (only 0x20 space is 1)
michael@0 479 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, // 0x40-0x5F: A E I O U
michael@0 480 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, // 0x60-0x7F: a e i o u
michael@0 481
michael@0 482 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0x80-0x9F
michael@0 483 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, // 0xA0-0xBF
michael@0 484 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0xC0-0xDF
michael@0 485 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0xE0-0xFF
michael@0 486 };
michael@0 487
michael@0 488
// Gram-length routine declarations; the definitions are not in this header.
// Each takes a pointer to the first byte of a letter and returns a byte
// length as int.
michael@0 489 // src points to a letter. Find the byte length of a unigram starting there.
michael@0 490 int UniLen(const char* src);
michael@0 491
michael@0 492 // src points to a letter. Find the byte length of a bigram starting there.
michael@0 493 int BiLen(const char* src);
michael@0 494
michael@0 495 // src points to a letter. Find the byte length of a quadgram starting there.
michael@0 496 int QuadLen(const char* src);
michael@0 497
michael@0 498 // src points to a letter. Find the byte length of an octagram starting there.
michael@0 499 int OctaLen(const char* src);
michael@0 500
michael@0 501 } // End namespace CLD2
michael@0 502
michael@0 503 #endif // I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__
michael@0 504
michael@0 505
michael@0 506
michael@0 507
michael@0 508
michael@0 509

mercurial