Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
michael@0 | 1 | // Copyright 2013 Google Inc. All Rights Reserved. |
michael@0 | 2 | // |
michael@0 | 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
michael@0 | 4 | // you may not use this file except in compliance with the License. |
michael@0 | 5 | // You may obtain a copy of the License at |
michael@0 | 6 | // |
michael@0 | 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
michael@0 | 8 | // |
michael@0 | 9 | // Unless required by applicable law or agreed to in writing, software |
michael@0 | 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
michael@0 | 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
michael@0 | 12 | // See the License for the specific language governing permissions and |
michael@0 | 13 | // limitations under the License. |
michael@0 | 14 | |
michael@0 | 15 | // |
michael@0 | 16 | // Author: dsites@google.com (Dick Sites) |
michael@0 | 17 | // |
michael@0 | 18 | // Just the stuff shared between offline table builder and online detector |
michael@0 | 19 | // |
michael@0 | 20 | |
michael@0 | 21 | #ifndef I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__ |
michael@0 | 22 | #define I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__ |
michael@0 | 23 | |
michael@0 | 24 | #include "integral_types.h" |
michael@0 | 25 | #include "cld2tablesummary.h" |
michael@0 | 26 | |
michael@0 | 27 | namespace CLD2 { |
michael@0 | 28 | |
michael@0 | 29 | // Runtime routines for hashing, looking up, and scoring |
michael@0 | 30 | // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams. |
michael@0 | 31 | // Unigrams and bigrams are for CJK languages only, including simplified/ |
michael@0 | 32 | // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and |
michael@0 | 33 | // Zhuang Han characters. Surrounding spaces are not considered. |
michael@0 | 34 | // Quadgrams and octagrams are for non-CJK text and include two bits indicating |
michael@0 | 35 | // preceding and trailing spaces (word boundaries). |
michael@0 | 36 | |
michael@0 | 37 | |
michael@0 | 38 | //----------------------------------------------------------------------------// |
michael@0 | 39 | // Main quantized probability table // |
michael@0 | 40 | //----------------------------------------------------------------------------// |
michael@0 | 41 | |
michael@0 | 42 | // Table has 240 eight-byte entries. Each entry has a five-byte array and |
michael@0 | 43 | // a three-byte array of log base 2 probabilities in the range 1..12. |
michael@0 | 44 | // The intended use is to express five or three probabilities in a single-byte |
michael@0 | 45 | // subscript, then decode via this table. These probabilities are |
michael@0 | 46 | // intended to go with an array of five or three language numbers. |
michael@0 | 47 | // |
michael@0 | 48 | // The corresponding language numbers will have to be sorted by descending |
michael@0 | 49 | // probability, then the actual probability subscript chosen to match the |
michael@0 | 50 | // closest available entry in this table. |
michael@0 | 51 | // |
michael@0 | 52 | // Pattern of probability values: |
michael@0 | 53 | // hi 3/4 1/2 1/4 lo hi mid lo |
michael@0 | 54 | // where "3/4" is (hi*3+lo)/4, "1/2" is (hi+lo)/2, and "1/4" is (hi+lo*3)/4 |
michael@0 | 55 | // and mid is one of 3/4 1/2 or 1/4. |
michael@0 | 56 | // There are three groups of 78 (=12*13/2) entries, with hi running 1..12 and |
michael@0 | 57 | // lo running 1..hi. Only the first group is used for five-entry lookups. |
michael@0 | 58 | // The mid value in the first group is 1/2, the second group 3/4, and the |
michael@0 | 59 | // third group 1/4. For three-entry lookups, this allows the mid entry to be |
michael@0 | 60 | // somewhat higher or lower than the midpoint, to allow a better match to the |
michael@0 | 61 | // original probabilities. |
michael@0 | 62 | static const int kLgProbV2TblSize = 240; |
michael@0 | 63 | static const uint8 kLgProbV2Tbl[kLgProbV2TblSize * 8] = { |
michael@0 | 64 | 1,1,1,1,1, 1,1,1, // [0] hi=1; the three groups of 78 entries start at [0], [78], [156] |
michael@0 | 65 | 2,2,2,1,1, 2,2,1, // [1] hi=2 |
michael@0 | 66 | 2,2,2,2,2, 2,2,2, |
michael@0 | 67 | 3,3,2,2,1, 3,2,1, // [3] hi=3 |
michael@0 | 68 | 3,3,3,2,2, 3,3,2, |
michael@0 | 69 | 3,3,3,3,3, 3,3,3, |
michael@0 | 70 | 4,3,3,2,1, 4,3,1, // [6] hi=4 |
michael@0 | 71 | 4,4,3,3,2, 4,3,2, |
michael@0 | 72 | 4,4,4,3,3, 4,4,3, |
michael@0 | 73 | 4,4,4,4,4, 4,4,4, |
michael@0 | 74 | 5,4,3,2,1, 5,3,1, // [10] hi=5 |
michael@0 | 75 | 5,4,4,3,2, 5,4,2, |
michael@0 | 76 | 5,5,4,4,3, 5,4,3, |
michael@0 | 77 | 5,5,5,4,4, 5,5,4, |
michael@0 | 78 | 5,5,5,5,5, 5,5,5, |
michael@0 | 79 | 6,5,4,2,1, 6,4,1, // [15] hi=6 |
michael@0 | 80 | 6,5,4,3,2, 6,4,2, |
michael@0 | 81 | 6,5,5,4,3, 6,5,3, |
michael@0 | 82 | 6,6,5,5,4, 6,5,4, |
michael@0 | 83 | 6,6,6,5,5, 6,6,5, |
michael@0 | 84 | 6,6,6,6,6, 6,6,6, |
michael@0 | 85 | 7,6,4,3,1, 7,4,1, // [21] hi=7 |
michael@0 | 86 | 7,6,5,3,2, 7,5,2, |
michael@0 | 87 | 7,6,5,4,3, 7,5,3, |
michael@0 | 88 | 7,6,6,5,4, 7,6,4, |
michael@0 | 89 | 7,7,6,6,5, 7,6,5, |
michael@0 | 90 | 7,7,7,6,6, 7,7,6, |
michael@0 | 91 | 7,7,7,7,7, 7,7,7, |
michael@0 | 92 | 8,6,5,3,1, 8,5,1, // [28] hi=8 |
michael@0 | 93 | 8,7,5,4,2, 8,5,2, |
michael@0 | 94 | 8,7,6,4,3, 8,6,3, |
michael@0 | 95 | 8,7,6,5,4, 8,6,4, |
michael@0 | 96 | 8,7,7,6,5, 8,7,5, |
michael@0 | 97 | 8,8,7,7,6, 8,7,6, |
michael@0 | 98 | 8,8,8,7,7, 8,8,7, |
michael@0 | 99 | 8,8,8,8,8, 8,8,8, |
michael@0 | 100 | 9,7,5,3,1, 9,5,1, // [36] hi=9 |
michael@0 | 101 | 9,7,6,4,2, 9,6,2, |
michael@0 | 102 | 9,8,6,5,3, 9,6,3, |
michael@0 | 103 | 9,8,7,5,4, 9,7,4, |
michael@0 | 104 | 9,8,7,6,5, 9,7,5, |
michael@0 | 105 | 9,8,8,7,6, 9,8,6, |
michael@0 | 106 | 9,9,8,8,7, 9,8,7, |
michael@0 | 107 | 9,9,9,8,8, 9,9,8, |
michael@0 | 108 | 9,9,9,9,9, 9,9,9, |
michael@0 | 109 | 10,8,6,3,1, 10,6,1, // [45] hi=10 |
michael@0 | 110 | 10,8,6,4,2, 10,6,2, |
michael@0 | 111 | 10,8,7,5,3, 10,7,3, |
michael@0 | 112 | 10,9,7,6,4, 10,7,4, |
michael@0 | 113 | 10,9,8,6,5, 10,8,5, |
michael@0 | 114 | 10,9,8,7,6, 10,8,6, |
michael@0 | 115 | 10,9,9,8,7, 10,9,7, |
michael@0 | 116 | 10,10,9,9,8, 10,9,8, |
michael@0 | 117 | 10,10,10,9,9, 10,10,9, |
michael@0 | 118 | 10,10,10,10,10, 10,10,10, |
michael@0 | 119 | 11,9,6,4,1, 11,6,1, // [55] hi=11 |
michael@0 | 120 | 11,9,7,4,2, 11,7,2, |
michael@0 | 121 | 11,9,7,5,3, 11,7,3, |
michael@0 | 122 | 11,9,8,6,4, 11,8,4, |
michael@0 | 123 | 11,10,8,7,5, 11,8,5, |
michael@0 | 124 | 11,10,9,7,6, 11,9,6, |
michael@0 | 125 | 11,10,9,8,7, 11,9,7, |
michael@0 | 126 | 11,10,10,9,8, 11,10,8, |
michael@0 | 127 | 11,11,10,10,9, 11,10,9, |
michael@0 | 128 | 11,11,11,10,10, 11,11,10, |
michael@0 | 129 | 11,11,11,11,11, 11,11,11, |
michael@0 | 130 | 12,9,7,4,1, 12,7,1, // [66] hi=12 |
michael@0 | 131 | 12,10,7,5,2, 12,7,2, |
michael@0 | 132 | 12,10,8,5,3, 12,8,3, |
michael@0 | 133 | 12,10,8,6,4, 12,8,4, |
michael@0 | 134 | 12,10,9,7,5, 12,9,5, |
michael@0 | 135 | 12,11,9,8,6, 12,9,6, |
michael@0 | 136 | 12,11,10,8,7, 12,10,7, |
michael@0 | 137 | 12,11,10,9,8, 12,10,8, |
michael@0 | 138 | 12,11,11,10,9, 12,11,9, |
michael@0 | 139 | 12,12,11,11,10, 12,11,10, |
michael@0 | 140 | 12,12,12,11,11, 12,12,11, |
michael@0 | 141 | 12,12,12,12,12, 12,12,12, |
michael@0 | 142 | |
michael@0 | 143 | 1,1,1,1,1, 1,1,1, |
michael@0 | 144 | 2,2,2,1,1, 2,2,1, |
michael@0 | 145 | 2,2,2,2,2, 2,2,2, |
michael@0 | 146 | 3,3,2,2,1, 3,3,1, |
michael@0 | 147 | 3,3,3,2,2, 3,3,2, |
michael@0 | 148 | 3,3,3,3,3, 3,3,3, |
michael@0 | 149 | 4,3,3,2,1, 4,3,1, |
michael@0 | 150 | 4,4,3,3,2, 4,4,2, |
michael@0 | 151 | 4,4,4,3,3, 4,4,3, |
michael@0 | 152 | 4,4,4,4,4, 4,4,4, |
michael@0 | 153 | 5,4,3,2,1, 5,4,1, |
michael@0 | 154 | 5,4,4,3,2, 5,4,2, |
michael@0 | 155 | 5,5,4,4,3, 5,5,3, |
michael@0 | 156 | 5,5,5,4,4, 5,5,4, |
michael@0 | 157 | 5,5,5,5,5, 5,5,5, |
michael@0 | 158 | 6,5,4,2,1, 6,5,1, |
michael@0 | 159 | 6,5,4,3,2, 6,5,2, |
michael@0 | 160 | 6,5,5,4,3, 6,5,3, |
michael@0 | 161 | 6,6,5,5,4, 6,6,4, |
michael@0 | 162 | 6,6,6,5,5, 6,6,5, |
michael@0 | 163 | 6,6,6,6,6, 6,6,6, |
michael@0 | 164 | 7,6,4,3,1, 7,6,1, |
michael@0 | 165 | 7,6,5,3,2, 7,6,2, |
michael@0 | 166 | 7,6,5,4,3, 7,6,3, |
michael@0 | 167 | 7,6,6,5,4, 7,6,4, |
michael@0 | 168 | 7,7,6,6,5, 7,7,5, |
michael@0 | 169 | 7,7,7,6,6, 7,7,6, |
michael@0 | 170 | 7,7,7,7,7, 7,7,7, |
michael@0 | 171 | 8,6,5,3,1, 8,6,1, |
michael@0 | 172 | 8,7,5,4,2, 8,7,2, |
michael@0 | 173 | 8,7,6,4,3, 8,7,3, |
michael@0 | 174 | 8,7,6,5,4, 8,7,4, |
michael@0 | 175 | 8,7,7,6,5, 8,7,5, |
michael@0 | 176 | 8,8,7,7,6, 8,8,6, |
michael@0 | 177 | 8,8,8,7,7, 8,8,7, |
michael@0 | 178 | 8,8,8,8,8, 8,8,8, |
michael@0 | 179 | 9,7,5,3,1, 9,7,1, |
michael@0 | 180 | 9,7,6,4,2, 9,7,2, |
michael@0 | 181 | 9,8,6,5,3, 9,8,3, |
michael@0 | 182 | 9,8,7,5,4, 9,8,4, |
michael@0 | 183 | 9,8,7,6,5, 9,8,5, |
michael@0 | 184 | 9,8,8,7,6, 9,8,6, |
michael@0 | 185 | 9,9,8,8,7, 9,9,7, |
michael@0 | 186 | 9,9,9,8,8, 9,9,8, |
michael@0 | 187 | 9,9,9,9,9, 9,9,9, |
michael@0 | 188 | 10,8,6,3,1, 10,8,1, |
michael@0 | 189 | 10,8,6,4,2, 10,8,2, |
michael@0 | 190 | 10,8,7,5,3, 10,8,3, |
michael@0 | 191 | 10,9,7,6,4, 10,9,4, |
michael@0 | 192 | 10,9,8,6,5, 10,9,5, |
michael@0 | 193 | 10,9,8,7,6, 10,9,6, |
michael@0 | 194 | 10,9,9,8,7, 10,9,7, |
michael@0 | 195 | 10,10,9,9,8, 10,10,8, |
michael@0 | 196 | 10,10,10,9,9, 10,10,9, |
michael@0 | 197 | 10,10,10,10,10, 10,10,10, |
michael@0 | 198 | 11,9,6,4,1, 11,9,1, |
michael@0 | 199 | 11,9,7,4,2, 11,9,2, |
michael@0 | 200 | 11,9,7,5,3, 11,9,3, |
michael@0 | 201 | 11,9,8,6,4, 11,9,4, |
michael@0 | 202 | 11,10,8,7,5, 11,10,5, |
michael@0 | 203 | 11,10,9,7,6, 11,10,6, |
michael@0 | 204 | 11,10,9,8,7, 11,10,7, |
michael@0 | 205 | 11,10,10,9,8, 11,10,8, |
michael@0 | 206 | 11,11,10,10,9, 11,11,9, |
michael@0 | 207 | 11,11,11,10,10, 11,11,10, |
michael@0 | 208 | 11,11,11,11,11, 11,11,11, |
michael@0 | 209 | 12,9,7,4,1, 12,9,1, |
michael@0 | 210 | 12,10,7,5,2, 12,10,2, |
michael@0 | 211 | 12,10,8,5,3, 12,10,3, |
michael@0 | 212 | 12,10,8,6,4, 12,10,4, |
michael@0 | 213 | 12,10,9,7,5, 12,10,5, |
michael@0 | 214 | 12,11,9,8,6, 12,11,6, |
michael@0 | 215 | 12,11,10,8,7, 12,11,7, |
michael@0 | 216 | 12,11,10,9,8, 12,11,8, |
michael@0 | 217 | 12,11,11,10,9, 12,11,9, |
michael@0 | 218 | 12,12,11,11,10, 12,12,10, |
michael@0 | 219 | 12,12,12,11,11, 12,12,11, |
michael@0 | 220 | 12,12,12,12,12, 12,12,12, |
michael@0 | 221 | |
michael@0 | 222 | 1,1,1,1,1, 1,1,1, |
michael@0 | 223 | 2,2,2,1,1, 2,1,1, |
michael@0 | 224 | 2,2,2,2,2, 2,2,2, |
michael@0 | 225 | 3,3,2,2,1, 3,2,1, |
michael@0 | 226 | 3,3,3,2,2, 3,2,2, |
michael@0 | 227 | 3,3,3,3,3, 3,3,3, |
michael@0 | 228 | 4,3,3,2,1, 4,2,1, |
michael@0 | 229 | 4,4,3,3,2, 4,3,2, |
michael@0 | 230 | 4,4,4,3,3, 4,3,3, |
michael@0 | 231 | 4,4,4,4,4, 4,4,4, |
michael@0 | 232 | 5,4,3,2,1, 5,2,1, |
michael@0 | 233 | 5,4,4,3,2, 5,3,2, |
michael@0 | 234 | 5,5,4,4,3, 5,4,3, |
michael@0 | 235 | 5,5,5,4,4, 5,4,4, |
michael@0 | 236 | 5,5,5,5,5, 5,5,5, |
michael@0 | 237 | 6,5,4,2,1, 6,2,1, |
michael@0 | 238 | 6,5,4,3,2, 6,3,2, |
michael@0 | 239 | 6,5,5,4,3, 6,4,3, |
michael@0 | 240 | 6,6,5,5,4, 6,5,4, |
michael@0 | 241 | 6,6,6,5,5, 6,5,5, |
michael@0 | 242 | 6,6,6,6,6, 6,6,6, |
michael@0 | 243 | 7,6,4,3,1, 7,3,1, |
michael@0 | 244 | 7,6,5,3,2, 7,3,2, |
michael@0 | 245 | 7,6,5,4,3, 7,4,3, |
michael@0 | 246 | 7,6,6,5,4, 7,5,4, |
michael@0 | 247 | 7,7,6,6,5, 7,6,5, |
michael@0 | 248 | 7,7,7,6,6, 7,6,6, |
michael@0 | 249 | 7,7,7,7,7, 7,7,7, |
michael@0 | 250 | 8,6,5,3,1, 8,3,1, |
michael@0 | 251 | 8,7,5,4,2, 8,4,2, |
michael@0 | 252 | 8,7,6,4,3, 8,4,3, |
michael@0 | 253 | 8,7,6,5,4, 8,5,4, |
michael@0 | 254 | 8,7,7,6,5, 8,6,5, |
michael@0 | 255 | 8,8,7,7,6, 8,7,6, |
michael@0 | 256 | 8,8,8,7,7, 8,7,7, |
michael@0 | 257 | 8,8,8,8,8, 8,8,8, |
michael@0 | 258 | 9,7,5,3,1, 9,3,1, |
michael@0 | 259 | 9,7,6,4,2, 9,4,2, |
michael@0 | 260 | 9,8,6,5,3, 9,5,3, |
michael@0 | 261 | 9,8,7,5,4, 9,5,4, |
michael@0 | 262 | 9,8,7,6,5, 9,6,5, |
michael@0 | 263 | 9,8,8,7,6, 9,7,6, |
michael@0 | 264 | 9,9,8,8,7, 9,8,7, |
michael@0 | 265 | 9,9,9,8,8, 9,8,8, |
michael@0 | 266 | 9,9,9,9,9, 9,9,9, |
michael@0 | 267 | 10,8,6,3,1, 10,3,1, |
michael@0 | 268 | 10,8,6,4,2, 10,4,2, |
michael@0 | 269 | 10,8,7,5,3, 10,5,3, |
michael@0 | 270 | 10,9,7,6,4, 10,6,4, |
michael@0 | 271 | 10,9,8,6,5, 10,6,5, |
michael@0 | 272 | 10,9,8,7,6, 10,7,6, |
michael@0 | 273 | 10,9,9,8,7, 10,8,7, |
michael@0 | 274 | 10,10,9,9,8, 10,9,8, |
michael@0 | 275 | 10,10,10,9,9, 10,9,9, |
michael@0 | 276 | 10,10,10,10,10, 10,10,10, |
michael@0 | 277 | 11,9,6,4,1, 11,4,1, |
michael@0 | 278 | 11,9,7,4,2, 11,4,2, |
michael@0 | 279 | 11,9,7,5,3, 11,5,3, |
michael@0 | 280 | 11,9,8,6,4, 11,6,4, |
michael@0 | 281 | 11,10,8,7,5, 11,7,5, |
michael@0 | 282 | 11,10,9,7,6, 11,7,6, |
michael@0 | 283 | 11,10,9,8,7, 11,8,7, |
michael@0 | 284 | 11,10,10,9,8, 11,9,8, |
michael@0 | 285 | 11,11,10,10,9, 11,10,9, |
michael@0 | 286 | 11,11,11,10,10, 11,10,10, |
michael@0 | 287 | 11,11,11,11,11, 11,11,11, |
michael@0 | 288 | 12,9,7,4,1, 12,4,1, |
michael@0 | 289 | 12,10,7,5,2, 12,5,2, |
michael@0 | 290 | 12,10,8,5,3, 12,5,3, |
michael@0 | 291 | 12,10,8,6,4, 12,6,4, |
michael@0 | 292 | 12,10,9,7,5, 12,7,5, |
michael@0 | 293 | 12,11,9,8,6, 12,8,6, |
michael@0 | 294 | 12,11,10,8,7, 12,8,7, |
michael@0 | 295 | 12,11,10,9,8, 12,9,8, |
michael@0 | 296 | 12,11,11,10,9, 12,10,9, |
michael@0 | 297 | 12,12,11,11,10, 12,11,10, |
michael@0 | 298 | 12,12,12,11,11, 12,11,11, |
michael@0 | 299 | 12,12,12,12,12, 12,12,12, |
michael@0 | 300 | |
michael@0 | 301 | // [234..239] Added 2013.01.28 for CJK compatible mapping; these follow the three 78-entry groups |
michael@0 | 302 | 8,5,2,2,2, 8,2,2, |
michael@0 | 303 | 6,6,6,4,2, 6,6,2, |
michael@0 | 304 | 6,5,4,4,4, 6,4,4, |
michael@0 | 305 | 6,4,2,2,2, 6,2,2, |
michael@0 | 306 | 4,3,2,2,2, 4,2,2, |
michael@0 | 307 | 2,2,2,2,2, 2,2,2, |
michael@0 | 308 | }; |
michael@0 | 309 | |
michael@0 | 310 | // Backmap a single desired probability p (0..12) into the kLgProbV2Tbl subscript of the first entry whose hi is p (lo is 1); p == 0 shares entry [0] with p == 1 |
michael@0 | 311 | static const uint8 kLgProbV2TblBackmap[13] = { |
michael@0 | 312 | 0, |
michael@0 | 313 | 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, |
michael@0 | 314 | }; |
michael@0 | 315 | |
michael@0 | 316 | // Return address of 8-byte entry[i] in kLgProbV2Tbl (byte offset i * 8); i is not range-checked here |
michael@0 | 317 | inline const uint8* LgProb2TblEntry(int i) { |
michael@0 | 318 | return &kLgProbV2Tbl[i * 8]; |
michael@0 | 319 | } |
michael@0 | 320 | |
michael@0 | 321 | // Return one of three probabilities in an entry: byte entry[j + 5], i.e. element j of the three-byte array after the five-byte array; j is not range-checked |
michael@0 | 322 | inline uint8 LgProb3(const uint8* entry, int j) { |
michael@0 | 323 | return entry[j + 5]; |
michael@0 | 324 | } |
michael@0 | 325 | |
michael@0 | 326 | |
michael@0 | 327 | // Routines to access a hash table of <key:wordhash, value:probs> pairs |
michael@0 | 328 | // Buckets have 4-byte wordhash for sizes < 32K buckets, but only |
michael@0 | 329 | // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as |
michael@0 | 330 | // bucket subscript. |
michael@0 | 331 | // Probs is packed: three languages plus a subscript for the probability table. |
michael@0 | 332 | // Buckets have all the keys together, then all the values. Key array never |
michael@0 | 333 | // crosses a cache-line boundary, so no-match case takes exactly one cache miss. |
michael@0 | 334 | // Match case may sometimes take an additional cache miss on value access. |
michael@0 | 335 | // |
michael@0 | 336 | // Other possibilities include 5 or 10 6-byte entries plus pad to make 32 or 64 |
michael@0 | 337 | // byte buckets with single cache miss. |
michael@0 | 338 | // Or 2-byte key and 6-byte value, allowing 5 languages instead of three. |
michael@0 | 339 | |
michael@0 | 340 | |
michael@0 | 341 | //----------------------------------------------------------------------------// |
michael@0 | 342 | // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores // |
michael@0 | 343 | //----------------------------------------------------------------------------// |
michael@0 | 344 | |
michael@0 | 345 | // BIGRAM |
michael@0 | 346 | // Pick up 1..8 bytes at word_ptr and hash them via mask/shift/add. NO pre/post space |
michael@0 | 347 | // OVERSHOOTS up to 3 bytes (presumably reads past word_ptr + bytecount; confirm in the implementation) |
michael@0 | 348 | // For runtime use of tables; returns the hash as a uint32 |
michael@0 | 349 | // Does X86 unaligned loads, UNALIGNED_LOAD32(_p), if !defined(NEED_ALIGNED_LOADS) |
michael@0 | 350 | uint32 BiHashV2(const char* word_ptr, int bytecount); |
michael@0 | 351 | |
michael@0 | 352 | // QUADGRAM wrapper with surrounding spaces |
michael@0 | 353 | // Pick up 1..12 bytes at word_ptr plus pre/post space and hash them via mask/shift/add |
michael@0 | 354 | // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes |
michael@0 | 355 | // For runtime use of tables; returns the hash as a uint32 |
michael@0 | 356 | uint32 QuadHashV2(const char* word_ptr, int bytecount); |
michael@0 | 357 | |
michael@0 | 358 | // QUADGRAM wrapper with surrounding underscores (offline use) |
michael@0 | 359 | // Pick up 1..12 bytes at word_ptr plus pre/post '_' and hash them via mask/shift/add |
michael@0 | 360 | // OVERSHOOTS up to 3 bytes |
michael@0 | 361 | // For offline construction of tables; returns the hash as a uint32 |
michael@0 | 362 | uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount); |
michael@0 | 363 | |
michael@0 | 364 | // OCTAGRAM wrapper with surrounding spaces |
michael@0 | 365 | // Pick up 1..24 bytes at word_ptr plus pre/post space and hash them via mask/shift/add |
michael@0 | 366 | // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes; the hash is returned in a uint64 |
michael@0 | 367 | uint64 OctaHash40(const char* word_ptr, int bytecount); |
michael@0 | 368 | |
michael@0 | 369 | |
michael@0 | 370 | // OCTAGRAM wrapper with surrounding underscores (offline use) |
michael@0 | 371 | // Pick up 1..24 bytes at word_ptr plus pre/post '_' and hash them via mask/shift/add |
michael@0 | 372 | // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes; the hash is returned in a uint64 |
michael@0 | 373 | uint64 OctaHash40underscore(const char* word_ptr, int bytecount); |
michael@0 | 374 | |
michael@0 | 375 | // Hash a consecutive pair of tokens/words A B, given each word's own hash; returns the combined hash as a uint64 |
michael@0 | 376 | uint64 PairHash(uint64 worda_hash, uint64 wordb_hash); |
michael@0 | 377 | |
michael@0 | 378 | |
michael@0 | 379 | // From 32-bit gram FP, store hash table subscript in *subscr and remaining key (FP & keymask) in *hashkey; bucketcount is presumably a power of two -- confirm |
michael@0 | 380 | inline void QuadFPJustHash(uint32 quadhash, |
michael@0 | 381 | uint32 keymask, |
michael@0 | 382 | int bucketcount, |
michael@0 | 383 | uint32* subscr, uint32* hashkey) { |
michael@0 | 384 | *subscr = (quadhash + (quadhash >> 12)) & (bucketcount - 1); |
michael@0 | 385 | *hashkey = quadhash & keymask; |
michael@0 | 386 | } |
michael@0 | 387 | |
michael@0 | 388 | // From 40-bit gram FP, store hash table subscript in *subscr and remaining key (low 32 bits of FP >> 4, then & keymask) in *hashkey; bucketcount is presumably a power of two -- confirm |
michael@0 | 389 | inline void OctaFPJustHash(uint64 longwordhash, |
michael@0 | 390 | uint32 keymask, |
michael@0 | 391 | int bucketcount, |
michael@0 | 392 | uint32* subscr, uint32* hashkey) { |
michael@0 | 393 | uint32 temp = (longwordhash + (longwordhash >> 12)) & (bucketcount - 1); |
michael@0 | 394 | *subscr = temp; |
michael@0 | 395 | temp = longwordhash >> 4; |
michael@0 | 396 | *hashkey = temp & keymask; |
michael@0 | 397 | } |
michael@0 | 398 | |
michael@0 | 399 | |
michael@0 | 400 | // Look up 32-bit gram FP in caller-passed table; returns the matching bucket keyvalue word, or 0 if none of the four slots match |
michael@0 | 401 | // Typical size 256K entries (1.5MB) |
michael@0 | 402 | // Two-byte hashkey |
michael@0 | 403 | inline const uint32 QuadHashV3Lookup4(const CLD2TableSummary* gram_obj, |
michael@0 | 404 | uint32 quadhash) { |
michael@0 | 405 | uint32 subscr, hashkey; |
michael@0 | 406 | const IndirectProbBucket4* quadtable = gram_obj->kCLDTable; |
michael@0 | 407 | uint32 keymask = gram_obj->kCLDTableKeyMask; |
michael@0 | 408 | int bucketcount = gram_obj->kCLDTableSize; |
michael@0 | 409 | QuadFPJustHash(quadhash, keymask, bucketcount, &subscr, &hashkey); |
michael@0 | 410 | const IndirectProbBucket4* bucket_ptr = &quadtable[subscr]; |
michael@0 | 411 | // Four-way associative, 4 compares; a slot matches when its bits under keymask equal hashkey, first match wins |
michael@0 | 412 | if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) { |
michael@0 | 413 | return bucket_ptr->keyvalue[0]; |
michael@0 | 414 | } |
michael@0 | 415 | if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) { |
michael@0 | 416 | return bucket_ptr->keyvalue[1]; |
michael@0 | 417 | } |
michael@0 | 418 | if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) { |
michael@0 | 419 | return bucket_ptr->keyvalue[2]; |
michael@0 | 420 | } |
michael@0 | 421 | if (((hashkey ^ bucket_ptr->keyvalue[3]) & keymask) == 0) { |
michael@0 | 422 | return bucket_ptr->keyvalue[3]; |
michael@0 | 423 | } |
michael@0 | 424 | return 0; |
michael@0 | 425 | } |
michael@0 | 426 | |
michael@0 | 427 | // Look up 40-bit gram FP in caller-passed table; returns the matching bucket keyvalue word, or 0 if none of the four slots match |
michael@0 | 428 | // Typical size 256K-4M entries (1-16MB) |
michael@0 | 429 | // 24-12 bit hashkey packed with 8-20 bit indirect lang/probs |
michael@0 | 430 | // keymask is 0xfffff000 for 20-bit hashkey and 12-bit indirect |
michael@0 | 431 | inline const uint32 OctaHashV3Lookup4(const CLD2TableSummary* gram_obj, |
michael@0 | 432 | uint64 longwordhash) { |
michael@0 | 433 | uint32 subscr, hashkey; |
michael@0 | 434 | const IndirectProbBucket4* octatable = gram_obj->kCLDTable; |
michael@0 | 435 | uint32 keymask = gram_obj->kCLDTableKeyMask; |
michael@0 | 436 | int bucketcount = gram_obj->kCLDTableSize; |
michael@0 | 437 | OctaFPJustHash(longwordhash, keymask, bucketcount, |
michael@0 | 438 | &subscr, &hashkey); |
michael@0 | 439 | const IndirectProbBucket4* bucket_ptr = &octatable[subscr]; |
michael@0 | 440 | // Four-way associative, 4 compares; a slot matches when its bits under keymask equal hashkey, first match wins |
michael@0 | 441 | if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) { |
michael@0 | 442 | return bucket_ptr->keyvalue[0]; |
michael@0 | 443 | } |
michael@0 | 444 | if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) { |
michael@0 | 445 | return bucket_ptr->keyvalue[1]; |
michael@0 | 446 | } |
michael@0 | 447 | if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) { |
michael@0 | 448 | return bucket_ptr->keyvalue[2]; |
michael@0 | 449 | } |
michael@0 | 450 | if (((hashkey ^ bucket_ptr->keyvalue[3]) & keymask) == 0) { |
michael@0 | 451 | return bucket_ptr->keyvalue[3]; |
michael@0 | 452 | } |
michael@0 | 453 | return 0; |
michael@0 | 454 | } |
michael@0 | 455 | |
michael@0 | 456 | |
michael@0 | 457 | //----------------------------------------------------------------------------// |
michael@0 | 458 | // Finding groups of 1/2/4/8 letters // |
michael@0 | 459 | //----------------------------------------------------------------------------// |
michael@0 | 460 | |
michael@0 | 461 | // Indexed by byte value: 0 for 0x00..0x20, so does not advance past space or tab/cr/lf/nul; otherwise 1 (0x21..0xBF), 2 (0xC0..0xDF), 3 (0xE0..0xEF), 4 (0xF0..0xFF) |
michael@0 | 462 | static const uint8 kAdvanceOneCharButSpace[256] = { |
michael@0 | 463 | 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
michael@0 | 464 | 0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
michael@0 | 465 | 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
michael@0 | 466 | 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
michael@0 | 467 | |
michael@0 | 468 | 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
michael@0 | 469 | 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
michael@0 | 470 | 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, |
michael@0 | 471 | 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, |
michael@0 | 472 | }; |
michael@0 | 473 | |
michael@0 | 474 | |
michael@0 | 475 | // Indexed by byte value: 1 for 0x00..0x20, AEIOU/aeiou, and 0x80..0xBF, else 0; i.e. advances *only* on space or ASCII vowel (or illegal byte) |
michael@0 | 476 | static const uint8 kAdvanceOneCharSpaceVowel[256] = { |
michael@0 | 477 | 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
michael@0 | 478 | 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
michael@0 | 479 | 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, |
michael@0 | 480 | 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, |
michael@0 | 481 | |
michael@0 | 482 | 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
michael@0 | 483 | 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
michael@0 | 484 | 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
michael@0 | 485 | 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
michael@0 | 486 | }; |
michael@0 | 487 | |
michael@0 | 488 | |
michael@0 | 489 | // src points to a letter. Returns the byte length of the unigram starting there. |
michael@0 | 490 | int UniLen(const char* src); |
michael@0 | 491 | |
michael@0 | 492 | // src points to a letter. Returns the byte length of the bigram starting there. |
michael@0 | 493 | int BiLen(const char* src); |
michael@0 | 494 | |
michael@0 | 495 | // src points to a letter. Returns the byte length of the quadgram starting there. |
michael@0 | 496 | int QuadLen(const char* src); |
michael@0 | 497 | |
michael@0 | 498 | // src points to a letter. Returns the byte length of the octagram starting there. |
michael@0 | 499 | int OctaLen(const char* src); |
michael@0 | 500 | |
michael@0 | 501 | } // End namespace CLD2 |
michael@0 | 502 | |
michael@0 | 503 | #endif // I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__ |
michael@0 | 504 | |
michael@0 | 505 | |
michael@0 | 506 | |
michael@0 | 507 | |
michael@0 | 508 | |
michael@0 | 509 |