browser/components/translation/cld2/internal/cldutil_shared.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/browser/components/translation/cld2/internal/cldutil_shared.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,509 @@
     1.4 +// Copyright 2013 Google Inc. All Rights Reserved.
     1.5 +//
     1.6 +// Licensed under the Apache License, Version 2.0 (the "License");
     1.7 +// you may not use this file except in compliance with the License.
     1.8 +// You may obtain a copy of the License at
     1.9 +//
    1.10 +//     http://www.apache.org/licenses/LICENSE-2.0
    1.11 +//
    1.12 +// Unless required by applicable law or agreed to in writing, software
    1.13 +// distributed under the License is distributed on an "AS IS" BASIS,
    1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    1.15 +// See the License for the specific language governing permissions and
    1.16 +// limitations under the License.
    1.17 +
    1.18 +//
    1.19 +// Author: dsites@google.com (Dick Sites)
    1.20 +//
    1.21 +// Just the stuff shared between offline table builder and online detector
    1.22 +//
    1.23 +
    1.24 +#ifndef I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__
    1.25 +#define I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__
    1.26 +
    1.27 +#include "integral_types.h"
    1.28 +#include "cld2tablesummary.h"
    1.29 +
    1.30 +namespace CLD2 {
    1.31 +
    1.32 +// Runtime routines for hashing, looking up, and scoring
    1.33 +// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
    1.34 +// Unigrams and bigrams are for CJK languages only, including simplified/
    1.35 +// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
    1.36 +// Zhuang Han characters. Surrounding spaces are not considered.
    1.37 +// Quadgrams and octagrams are for non-CJK and include two bits indicating
    1.38 +// preceding and trailing spaces (word boundaries).
    1.39 +
    1.40 +
    1.41 +//----------------------------------------------------------------------------//
    1.42 +// Main quantized probability table                                           //
    1.43 +//----------------------------------------------------------------------------//
    1.44 +
    1.45 +  // Table has 240 eight-byte entries. Each entry has a five-byte array and
    1.46 +  // a three-byte array of log base 2 probabilities in the range 1..12.
    1.47 +  // The intended use is to express five or three probabilities in a single-byte
    1.48 +  // subscript, then decode via this table. These probabilities are
    1.49 +  // intended to go with an array of five or three language numbers.
    1.50 +  //
    1.51 +  // The corresponding language numbers will have to be sorted by descending
    1.52 +  // probability, then the actual probability subscript chosen to match the
    1.53 +  // closest available entry in this table.
    1.54 +  //
    1.55 +  // Pattern of probability values:
    1.56 +  // hi 3/4 1/2 1/4 lo    hi mid lo
    1.57 +  // where "3/4" is (hi*3+lo)/4, "1/2" is (hi+lo)/2, and "1/4" is (hi+lo*3)/4
    1.58 +  // and mid is one of 3/4 1/2 or 1/4.
    1.59 +  // There are three groups of 78 (=12*13/2) entries, with hi running 1..12 and
    1.60 +  // lo running 1..hi. Only the first group is used for five-entry lookups.
    1.61 +  // The mid value in the first group is 1/2, the second group 3/4, and the
    1.62 +  // third group 1/4. For three-entry lookups, this allows the mid entry to be
    1.63 +  // somewhat higher or lower than the midpoint, to allow a better match to the
    1.64 +  // original probabilities.
    1.65 +  static const int kLgProbV2TblSize = 240;  // 3 groups of 78 entries plus 6 added CJK entries
    1.66 +  static const uint8 kLgProbV2Tbl[kLgProbV2TblSize * 8] = {  // 8 bytes per entry
    1.67 +    1,1,1,1,1, 1,1,1,     // [0] first group: mid = 1/2
    1.68 +    2,2,2,1,1, 2,2,1,     // [1]
    1.69 +    2,2,2,2,2, 2,2,2,
    1.70 +    3,3,2,2,1, 3,2,1,     // [3]
    1.71 +    3,3,3,2,2, 3,3,2,
    1.72 +    3,3,3,3,3, 3,3,3,
    1.73 +    4,3,3,2,1, 4,3,1,     // [6]
    1.74 +    4,4,3,3,2, 4,3,2,
    1.75 +    4,4,4,3,3, 4,4,3,
    1.76 +    4,4,4,4,4, 4,4,4,
    1.77 +    5,4,3,2,1, 5,3,1,     // [10]
    1.78 +    5,4,4,3,2, 5,4,2,
    1.79 +    5,5,4,4,3, 5,4,3,
    1.80 +    5,5,5,4,4, 5,5,4,
    1.81 +    5,5,5,5,5, 5,5,5,
    1.82 +    6,5,4,2,1, 6,4,1,     // [15]
    1.83 +    6,5,4,3,2, 6,4,2,
    1.84 +    6,5,5,4,3, 6,5,3,
    1.85 +    6,6,5,5,4, 6,5,4,
    1.86 +    6,6,6,5,5, 6,6,5,
    1.87 +    6,6,6,6,6, 6,6,6,
    1.88 +    7,6,4,3,1, 7,4,1,     // [21]
    1.89 +    7,6,5,3,2, 7,5,2,
    1.90 +    7,6,5,4,3, 7,5,3,
    1.91 +    7,6,6,5,4, 7,6,4,
    1.92 +    7,7,6,6,5, 7,6,5,
    1.93 +    7,7,7,6,6, 7,7,6,
    1.94 +    7,7,7,7,7, 7,7,7,
    1.95 +    8,6,5,3,1, 8,5,1,     // [28]
    1.96 +    8,7,5,4,2, 8,5,2,
    1.97 +    8,7,6,4,3, 8,6,3,
    1.98 +    8,7,6,5,4, 8,6,4,
    1.99 +    8,7,7,6,5, 8,7,5,
   1.100 +    8,8,7,7,6, 8,7,6,
   1.101 +    8,8,8,7,7, 8,8,7,
   1.102 +    8,8,8,8,8, 8,8,8,
   1.103 +    9,7,5,3,1, 9,5,1,     // [36]
   1.104 +    9,7,6,4,2, 9,6,2,
   1.105 +    9,8,6,5,3, 9,6,3,
   1.106 +    9,8,7,5,4, 9,7,4,
   1.107 +    9,8,7,6,5, 9,7,5,
   1.108 +    9,8,8,7,6, 9,8,6,
   1.109 +    9,9,8,8,7, 9,8,7,
   1.110 +    9,9,9,8,8, 9,9,8,
   1.111 +    9,9,9,9,9, 9,9,9,
   1.112 +    10,8,6,3,1, 10,6,1,   // [45]
   1.113 +    10,8,6,4,2, 10,6,2,
   1.114 +    10,8,7,5,3, 10,7,3,
   1.115 +    10,9,7,6,4, 10,7,4,
   1.116 +    10,9,8,6,5, 10,8,5,
   1.117 +    10,9,8,7,6, 10,8,6,
   1.118 +    10,9,9,8,7, 10,9,7,
   1.119 +    10,10,9,9,8, 10,9,8,
   1.120 +    10,10,10,9,9, 10,10,9,
   1.121 +    10,10,10,10,10, 10,10,10,
   1.122 +    11,9,6,4,1, 11,6,1,   // [55]
   1.123 +    11,9,7,4,2, 11,7,2,
   1.124 +    11,9,7,5,3, 11,7,3,
   1.125 +    11,9,8,6,4, 11,8,4,
   1.126 +    11,10,8,7,5, 11,8,5,
   1.127 +    11,10,9,7,6, 11,9,6,
   1.128 +    11,10,9,8,7, 11,9,7,
   1.129 +    11,10,10,9,8, 11,10,8,
   1.130 +    11,11,10,10,9, 11,10,9,
   1.131 +    11,11,11,10,10, 11,11,10,
   1.132 +    11,11,11,11,11, 11,11,11,
   1.133 +    12,9,7,4,1, 12,7,1,   // [66]
   1.134 +    12,10,7,5,2, 12,7,2,
   1.135 +    12,10,8,5,3, 12,8,3,
   1.136 +    12,10,8,6,4, 12,8,4,
   1.137 +    12,10,9,7,5, 12,9,5,
   1.138 +    12,11,9,8,6, 12,9,6,
   1.139 +    12,11,10,8,7, 12,10,7,
   1.140 +    12,11,10,9,8, 12,10,8,
   1.141 +    12,11,11,10,9, 12,11,9,
   1.142 +    12,12,11,11,10, 12,11,10,
   1.143 +    12,12,12,11,11, 12,12,11,
   1.144 +    12,12,12,12,12, 12,12,12,
   1.145 +
   1.146 +    1,1,1,1,1, 1,1,1,     // [78] second group: mid = 3/4
   1.147 +    2,2,2,1,1, 2,2,1,
   1.148 +    2,2,2,2,2, 2,2,2,
   1.149 +    3,3,2,2,1, 3,3,1,
   1.150 +    3,3,3,2,2, 3,3,2,
   1.151 +    3,3,3,3,3, 3,3,3,
   1.152 +    4,3,3,2,1, 4,3,1,
   1.153 +    4,4,3,3,2, 4,4,2,
   1.154 +    4,4,4,3,3, 4,4,3,
   1.155 +    4,4,4,4,4, 4,4,4,
   1.156 +    5,4,3,2,1, 5,4,1,
   1.157 +    5,4,4,3,2, 5,4,2,
   1.158 +    5,5,4,4,3, 5,5,3,
   1.159 +    5,5,5,4,4, 5,5,4,
   1.160 +    5,5,5,5,5, 5,5,5,
   1.161 +    6,5,4,2,1, 6,5,1,
   1.162 +    6,5,4,3,2, 6,5,2,
   1.163 +    6,5,5,4,3, 6,5,3,
   1.164 +    6,6,5,5,4, 6,6,4,
   1.165 +    6,6,6,5,5, 6,6,5,
   1.166 +    6,6,6,6,6, 6,6,6,
   1.167 +    7,6,4,3,1, 7,6,1,
   1.168 +    7,6,5,3,2, 7,6,2,
   1.169 +    7,6,5,4,3, 7,6,3,
   1.170 +    7,6,6,5,4, 7,6,4,
   1.171 +    7,7,6,6,5, 7,7,5,
   1.172 +    7,7,7,6,6, 7,7,6,
   1.173 +    7,7,7,7,7, 7,7,7,
   1.174 +    8,6,5,3,1, 8,6,1,
   1.175 +    8,7,5,4,2, 8,7,2,
   1.176 +    8,7,6,4,3, 8,7,3,
   1.177 +    8,7,6,5,4, 8,7,4,
   1.178 +    8,7,7,6,5, 8,7,5,
   1.179 +    8,8,7,7,6, 8,8,6,
   1.180 +    8,8,8,7,7, 8,8,7,
   1.181 +    8,8,8,8,8, 8,8,8,
   1.182 +    9,7,5,3,1, 9,7,1,
   1.183 +    9,7,6,4,2, 9,7,2,
   1.184 +    9,8,6,5,3, 9,8,3,
   1.185 +    9,8,7,5,4, 9,8,4,
   1.186 +    9,8,7,6,5, 9,8,5,
   1.187 +    9,8,8,7,6, 9,8,6,
   1.188 +    9,9,8,8,7, 9,9,7,
   1.189 +    9,9,9,8,8, 9,9,8,
   1.190 +    9,9,9,9,9, 9,9,9,
   1.191 +    10,8,6,3,1, 10,8,1,
   1.192 +    10,8,6,4,2, 10,8,2,
   1.193 +    10,8,7,5,3, 10,8,3,
   1.194 +    10,9,7,6,4, 10,9,4,
   1.195 +    10,9,8,6,5, 10,9,5,
   1.196 +    10,9,8,7,6, 10,9,6,
   1.197 +    10,9,9,8,7, 10,9,7,
   1.198 +    10,10,9,9,8, 10,10,8,
   1.199 +    10,10,10,9,9, 10,10,9,
   1.200 +    10,10,10,10,10, 10,10,10,
   1.201 +    11,9,6,4,1, 11,9,1,
   1.202 +    11,9,7,4,2, 11,9,2,
   1.203 +    11,9,7,5,3, 11,9,3,
   1.204 +    11,9,8,6,4, 11,9,4,
   1.205 +    11,10,8,7,5, 11,10,5,
   1.206 +    11,10,9,7,6, 11,10,6,
   1.207 +    11,10,9,8,7, 11,10,7,
   1.208 +    11,10,10,9,8, 11,10,8,
   1.209 +    11,11,10,10,9, 11,11,9,
   1.210 +    11,11,11,10,10, 11,11,10,
   1.211 +    11,11,11,11,11, 11,11,11,
   1.212 +    12,9,7,4,1, 12,9,1,
   1.213 +    12,10,7,5,2, 12,10,2,
   1.214 +    12,10,8,5,3, 12,10,3,
   1.215 +    12,10,8,6,4, 12,10,4,
   1.216 +    12,10,9,7,5, 12,10,5,
   1.217 +    12,11,9,8,6, 12,11,6,
   1.218 +    12,11,10,8,7, 12,11,7,
   1.219 +    12,11,10,9,8, 12,11,8,
   1.220 +    12,11,11,10,9, 12,11,9,
   1.221 +    12,12,11,11,10, 12,12,10,
   1.222 +    12,12,12,11,11, 12,12,11,
   1.223 +    12,12,12,12,12, 12,12,12,
   1.224 +
   1.225 +    1,1,1,1,1, 1,1,1,     // [156] third group: mid = 1/4
   1.226 +    2,2,2,1,1, 2,1,1,
   1.227 +    2,2,2,2,2, 2,2,2,
   1.228 +    3,3,2,2,1, 3,2,1,
   1.229 +    3,3,3,2,2, 3,2,2,
   1.230 +    3,3,3,3,3, 3,3,3,
   1.231 +    4,3,3,2,1, 4,2,1,
   1.232 +    4,4,3,3,2, 4,3,2,
   1.233 +    4,4,4,3,3, 4,3,3,
   1.234 +    4,4,4,4,4, 4,4,4,
   1.235 +    5,4,3,2,1, 5,2,1,
   1.236 +    5,4,4,3,2, 5,3,2,
   1.237 +    5,5,4,4,3, 5,4,3,
   1.238 +    5,5,5,4,4, 5,4,4,
   1.239 +    5,5,5,5,5, 5,5,5,
   1.240 +    6,5,4,2,1, 6,2,1,
   1.241 +    6,5,4,3,2, 6,3,2,
   1.242 +    6,5,5,4,3, 6,4,3,
   1.243 +    6,6,5,5,4, 6,5,4,
   1.244 +    6,6,6,5,5, 6,5,5,
   1.245 +    6,6,6,6,6, 6,6,6,
   1.246 +    7,6,4,3,1, 7,3,1,
   1.247 +    7,6,5,3,2, 7,3,2,
   1.248 +    7,6,5,4,3, 7,4,3,
   1.249 +    7,6,6,5,4, 7,5,4,
   1.250 +    7,7,6,6,5, 7,6,5,
   1.251 +    7,7,7,6,6, 7,6,6,
   1.252 +    7,7,7,7,7, 7,7,7,
   1.253 +    8,6,5,3,1, 8,3,1,
   1.254 +    8,7,5,4,2, 8,4,2,
   1.255 +    8,7,6,4,3, 8,4,3,
   1.256 +    8,7,6,5,4, 8,5,4,
   1.257 +    8,7,7,6,5, 8,6,5,
   1.258 +    8,8,7,7,6, 8,7,6,
   1.259 +    8,8,8,7,7, 8,7,7,
   1.260 +    8,8,8,8,8, 8,8,8,
   1.261 +    9,7,5,3,1, 9,3,1,
   1.262 +    9,7,6,4,2, 9,4,2,
   1.263 +    9,8,6,5,3, 9,5,3,
   1.264 +    9,8,7,5,4, 9,5,4,
   1.265 +    9,8,7,6,5, 9,6,5,
   1.266 +    9,8,8,7,6, 9,7,6,
   1.267 +    9,9,8,8,7, 9,8,7,
   1.268 +    9,9,9,8,8, 9,8,8,
   1.269 +    9,9,9,9,9, 9,9,9,
   1.270 +    10,8,6,3,1, 10,3,1,
   1.271 +    10,8,6,4,2, 10,4,2,
   1.272 +    10,8,7,5,3, 10,5,3,
   1.273 +    10,9,7,6,4, 10,6,4,
   1.274 +    10,9,8,6,5, 10,6,5,
   1.275 +    10,9,8,7,6, 10,7,6,
   1.276 +    10,9,9,8,7, 10,8,7,
   1.277 +    10,10,9,9,8, 10,9,8,
   1.278 +    10,10,10,9,9, 10,9,9,
   1.279 +    10,10,10,10,10, 10,10,10,
   1.280 +    11,9,6,4,1, 11,4,1,
   1.281 +    11,9,7,4,2, 11,4,2,
   1.282 +    11,9,7,5,3, 11,5,3,
   1.283 +    11,9,8,6,4, 11,6,4,
   1.284 +    11,10,8,7,5, 11,7,5,
   1.285 +    11,10,9,7,6, 11,7,6,
   1.286 +    11,10,9,8,7, 11,8,7,
   1.287 +    11,10,10,9,8, 11,9,8,
   1.288 +    11,11,10,10,9, 11,10,9,
   1.289 +    11,11,11,10,10, 11,10,10,
   1.290 +    11,11,11,11,11, 11,11,11,
   1.291 +    12,9,7,4,1, 12,4,1,
   1.292 +    12,10,7,5,2, 12,5,2,
   1.293 +    12,10,8,5,3, 12,5,3,
   1.294 +    12,10,8,6,4, 12,6,4,
   1.295 +    12,10,9,7,5, 12,7,5,
   1.296 +    12,11,9,8,6, 12,8,6,
   1.297 +    12,11,10,8,7, 12,8,7,
   1.298 +    12,11,10,9,8, 12,9,8,
   1.299 +    12,11,11,10,9, 12,10,9,
   1.300 +    12,12,11,11,10, 12,11,10,
   1.301 +    12,12,12,11,11, 12,11,11,
   1.302 +    12,12,12,12,12, 12,12,12,
   1.303 +
   1.304 +    // Added 2013.01.28 for CJK compatible mapping; these are entries [234]..[239]
   1.305 +    8,5,2,2,2, 8,2,2,
   1.306 +    6,6,6,4,2, 6,6,2,
   1.307 +    6,5,4,4,4, 6,4,4,
   1.308 +    6,4,2,2,2, 6,2,2,
   1.309 +    4,3,2,2,2, 4,2,2,
   1.310 +    2,2,2,2,2, 2,2,2,
   1.311 +  };
   1.312 +
   1.313 +  // Backmap a single desired probability (subscript 0..12) into an entry in kLgProbV2Tbl (first group)
   1.314 +  static const uint8 kLgProbV2TblBackmap[13] = {
   1.315 +    0,                                               // probability 0 maps to entry [0], same as probability 1
   1.316 +    0, 1, 3, 6,   10, 15, 21, 28,   36, 45, 55, 66,  // probability p = 1..12 maps to the entry with hi = p, lo = 1
   1.317 +  };
   1.318 +
   1.319 +  // Return address of 8-byte entry[i] within kLgProbV2Tbl; i is not range-checked here
   1.320 +  inline const uint8* LgProb2TblEntry(int i) {
   1.321 +    return &kLgProbV2Tbl[i * 8];  // consecutive entries are 8 bytes apart
   1.322 +  }
   1.323 +
   1.324 +  // Return one of three probabilities in an entry: byte j of the three-byte array that follows the five-byte array
   1.325 +  inline uint8 LgProb3(const uint8* entry, int j) {
   1.326 +    return entry[j + 5];  // +5 skips the five-byte array; j is not range-checked here
   1.327 +  }
   1.328 +
   1.329 +
   1.330 +// Routines to access a hash table of <key:wordhash, value:probs> pairs
   1.331 +// Buckets have 4-byte wordhash for sizes < 32K buckets, but only
   1.332 +// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
   1.333 +// bucket subscript.
   1.334 +// Probs is packed: three languages plus a subscript for the probability table.
   1.335 +// Buckets have all the keys together, then all the values. Key array never
   1.336 +// crosses a cache-line boundary, so no-match case takes exactly one cache miss.
   1.337 +// Match case may sometimes take an additional cache miss on value access.
   1.338 +//
   1.339 +// Other possibilities include 5 or 10 6-byte entries plus pad to make 32 or 64
   1.340 +// byte buckets with single cache miss.
   1.341 +// Or 2-byte key and 6-byte value, allowing 5 languages instead of three.
   1.342 +
   1.343 +
   1.344 +//----------------------------------------------------------------------------//
   1.345 +// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores      //
   1.346 +//----------------------------------------------------------------------------//
   1.347 +
   1.348 +// BIGRAM
   1.349 +// Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
   1.350 +// OVERSHOOTS up to 3 bytes
   1.351 +// For runtime use of tables
   1.352 +// Does X86 unaligned loads (UNALIGNED_LOAD32(_p)) if !defined(NEED_ALIGNED_LOADS)
   1.353 +uint32 BiHashV2(const char* word_ptr, int bytecount);
   1.354 +
   1.355 +// QUADGRAM wrapper with surrounding spaces
   1.356 +// Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
   1.357 +// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
   1.358 +// For runtime use of tables
   1.359 +uint32 QuadHashV2(const char* word_ptr, int bytecount);
   1.360 +
   1.361 +// QUADGRAM wrapper with surrounding underscores (offline use)
   1.362 +// Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
   1.363 +// OVERSHOOTS up to 3 bytes
   1.364 +// For offline construction of tables
   1.365 +uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount);
   1.366 +
   1.367 +// OCTAGRAM wrapper with surrounding spaces
   1.368 +// Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
   1.369 +// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
   1.370 +uint64 OctaHash40(const char* word_ptr, int bytecount);
   1.371 +
   1.372 +
   1.373 +// OCTAGRAM wrapper with surrounding underscores (offline use)
   1.374 +// Pick up 1..24 bytes plus pre/post '_' and hash them via mask/shift/add
   1.375 +// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes (NOTE(review): QuadHashV2Underscore lists no undershoot -- confirm)
   1.376 +uint64 OctaHash40underscore(const char* word_ptr, int bytecount);
   1.377 +
   1.378 +// Hash a consecutive pair of tokens/words A B, given the hash of each
   1.379 +uint64 PairHash(uint64 worda_hash, uint64 wordb_hash);
   1.380 +
   1.381 +
   1.382 +// From 32-bit gram FP, return hash table subscript (via *subscr) and remaining key (via *hashkey)
   1.383 +inline void QuadFPJustHash(uint32 quadhash,
   1.384 +                                uint32 keymask,
   1.385 +                                int bucketcount,
   1.386 +                                uint32* subscr, uint32* hashkey) {
   1.387 +  *subscr = (quadhash + (quadhash >> 12)) & (bucketcount - 1);  // masking assumes bucketcount is a power of two -- TODO confirm
   1.388 +  *hashkey = quadhash & keymask;  // keep only the bits selected by keymask
   1.389 +}
   1.390 +
   1.391 +// From 40-bit gram FP, return hash table subscript (via *subscr) and remaining key (via *hashkey)
   1.392 +inline void OctaFPJustHash(uint64 longwordhash,
   1.393 +                                  uint32 keymask,
   1.394 +                                  int bucketcount,
   1.395 +                                  uint32* subscr, uint32* hashkey) {
   1.396 +  uint32 temp = (longwordhash + (longwordhash >> 12)) & (bucketcount - 1);  // masking assumes bucketcount is a power of two -- TODO confirm
   1.397 +  *subscr = temp;
   1.398 +  temp = longwordhash >> 4;  // drop the low 4 bits; the assignment narrows the result to uint32
   1.399 +  *hashkey = temp & keymask;  // keep only the bits selected by keymask
   1.400 +}
   1.401 +
   1.402 +
   1.403 +// Look up 32-bit gram FP in caller-passed table; returns the matching keyvalue word, or 0 when no slot matches
   1.404 +// Typical size 256K entries (1.5MB)
   1.405 +// Two-byte hashkey. NOTE(review): the const on the by-value return type has no effect
   1.406 +inline const uint32 QuadHashV3Lookup4(const CLD2TableSummary* gram_obj,
   1.407 +                                      uint32 quadhash) {
   1.408 +  uint32 subscr, hashkey;
   1.409 +  const IndirectProbBucket4* quadtable = gram_obj->kCLDTable;
   1.410 +  uint32 keymask = gram_obj->kCLDTableKeyMask;
   1.411 +  int bucketcount = gram_obj->kCLDTableSize;
   1.412 +  QuadFPJustHash(quadhash, keymask, bucketcount, &subscr, &hashkey);
   1.413 +  const IndirectProbBucket4* bucket_ptr = &quadtable[subscr];
   1.414 +  // Four-way associative, 4 compares; only the bits under keymask are compared
   1.415 +  if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) {
   1.416 +    return bucket_ptr->keyvalue[0];  // the whole word is returned, key bits included
   1.417 +  }
   1.418 +  if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) {
   1.419 +    return bucket_ptr->keyvalue[1];
   1.420 +  }
   1.421 +  if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) {
   1.422 +    return bucket_ptr->keyvalue[2];
   1.423 +  }
   1.424 +  if (((hashkey ^ bucket_ptr->keyvalue[3]) & keymask) == 0) {
   1.425 +    return bucket_ptr->keyvalue[3];
   1.426 +  }
   1.427 +  return 0;  // no slot in this bucket matched
   1.428 +}
   1.429 +
   1.430 +// Look up 40-bit gram FP in caller-passed table; returns the matching keyvalue word, or 0 when no slot matches
   1.431 +// Typical size 256K-4M entries (1-16MB)
   1.432 +// 24-12 bit hashkey packed with 8-20 bit indirect lang/probs
   1.433 +// keymask is 0xfffff000 for 20-bit hashkey and 12-bit indirect
   1.434 +inline const uint32 OctaHashV3Lookup4(const CLD2TableSummary* gram_obj,
   1.435 +                                          uint64 longwordhash) {
   1.436 +  uint32 subscr, hashkey;
   1.437 +  const IndirectProbBucket4* octatable = gram_obj->kCLDTable;
   1.438 +  uint32 keymask = gram_obj->kCLDTableKeyMask;
   1.439 +  int bucketcount = gram_obj->kCLDTableSize;
   1.440 +  OctaFPJustHash(longwordhash, keymask, bucketcount,
   1.441 +                        &subscr, &hashkey);
   1.442 +  const IndirectProbBucket4* bucket_ptr = &octatable[subscr];
   1.443 +  // Four-way associative, 4 compares; only the bits under keymask are compared
   1.444 +  if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) {
   1.445 +    return bucket_ptr->keyvalue[0];  // the whole word is returned, key bits included
   1.446 +  }
   1.447 +  if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) {
   1.448 +    return bucket_ptr->keyvalue[1];
   1.449 +  }
   1.450 +  if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) {
   1.451 +    return bucket_ptr->keyvalue[2];
   1.452 +  }
   1.453 +  if (((hashkey ^ bucket_ptr->keyvalue[3]) & keymask) == 0) {
   1.454 +    return bucket_ptr->keyvalue[3];
   1.455 +  }
   1.456 +  return 0;  // no slot in this bucket matched
   1.457 +}
   1.458 +
   1.459 +
   1.460 +//----------------------------------------------------------------------------//
   1.461 +// Finding groups of 1/2/4/8 letters                                          //
   1.462 +//----------------------------------------------------------------------------//
   1.463 +
   1.464 +// Does not advance past space or tab/cr/lf/nul (those entries are 0); one entry per byte value 0x00..0xFF
   1.465 +static const uint8 kAdvanceOneCharButSpace[256] = {
   1.466 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x00-0x1F: 0
   1.467 + 0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x20-0x3F: 0 for space (0x20), else 1
   1.468 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x40-0x5F: 1
   1.469 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x60-0x7F: 1
   1.470 +
   1.471 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x80-0x9F: 1
   1.472 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0xA0-0xBF: 1
   1.473 + 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0xC0-0xDF: 2
   1.474 + 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4,  // 0xE0-0xEF: 3, 0xF0-0xFF: 4 (looks like UTF-8 lead-byte lengths -- confirm)
   1.475 +};
   1.476 +
   1.477 +
   1.478 +// Advances *only* on space or ASCII vowel (or illegal byte); entry is 1 to advance, 0 otherwise, one entry per byte value
   1.479 +static const uint8 kAdvanceOneCharSpaceVowel[256] = {
   1.480 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x00-0x1F: 1
   1.481 + 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x20-0x3F: 1 for space (0x20), else 0
   1.482 + 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,  // 0x40-0x5F: 1 for A E I O U
   1.483 + 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,  // 0x60-0x7F: 1 for a e i o u
   1.484 +
   1.485 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0x80-0x9F: 1
   1.486 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 0xA0-0xBF: 1
   1.487 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xC0-0xDF: 0
   1.488 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xE0-0xFF: 0
   1.489 +};
   1.490 +
   1.491 +
   1.492 +// src points to a letter. Find the byte length of a unigram starting there. (Declared only; defined elsewhere.)
   1.493 +int UniLen(const char* src);
   1.494 +
   1.495 +// src points to a letter. Find the byte length of a bigram starting there. (Declared only; defined elsewhere.)
   1.496 +int BiLen(const char* src);
   1.497 +
   1.498 +// src points to a letter. Find the byte length of a quadgram starting there. (Declared only; defined elsewhere.)
   1.499 +int QuadLen(const char* src);
   1.500 +
   1.501 +// src points to a letter. Find the byte length of an octagram starting there. (Declared only; defined elsewhere.)
   1.502 +int OctaLen(const char* src);
   1.503 +
   1.504 +}       // End namespace CLD2
   1.505 +
   1.506 +#endif  // I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__
   1.507 +
   1.508 +
   1.509 +
   1.510 +
   1.511 +
   1.512 +

mercurial