Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
michael@0 | 1 | // Copyright 2013 Google Inc. All Rights Reserved. |
michael@0 | 2 | // |
michael@0 | 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
michael@0 | 4 | // you may not use this file except in compliance with the License. |
michael@0 | 5 | // You may obtain a copy of the License at |
michael@0 | 6 | // |
michael@0 | 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
michael@0 | 8 | // |
michael@0 | 9 | // Unless required by applicable law or agreed to in writing, software |
michael@0 | 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
michael@0 | 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
michael@0 | 12 | // See the License for the specific language governing permissions and |
michael@0 | 13 | // limitations under the License. |
michael@0 | 14 | |
michael@0 | 15 | // |
michael@0 | 16 | // Author: dsites@google.com (Dick Sites) |
michael@0 | 17 | // |
michael@0 | 18 | |
michael@0 | 19 | #include "cldutil_shared.h" |
michael@0 | 20 | #include <string> |
michael@0 | 21 | |
michael@0 | 22 | #include "cld2tablesummary.h" |
michael@0 | 23 | #include "integral_types.h" |
michael@0 | 24 | #include "port.h" |
michael@0 | 25 | #include "utf8statetable.h" |
michael@0 | 26 | |
michael@0 | 27 | namespace CLD2 { |
michael@0 | 28 | |
michael@0 | 29 | // Runtime routines for hashing, looking up, and scoring |
michael@0 | 30 | // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams. |
michael@0 | 31 | // Unigrams and bigrams are for CJK languages only, including simplified/ |
michael@0 | 32 | // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and |
michael@0 | 33 | // Zhuang Han characters. Surrounding spaces are not considered. |
michael@0 | 34 | // Quadgrams and octagrams for for non-CJK and include two bits indicating |
michael@0 | 35 | // preceding and trailing spaces (word boundaries). |
michael@0 | 36 | |
michael@0 | 37 | |
michael@0 | 38 | // Indicator bits for leading/trailing space around quad/octagram |
michael@0 | 39 | // NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of |
michael@0 | 40 | // 1-, 2-, or 3-bytes each. |
michael@0 | 41 | static const uint32 kPreSpaceIndicator = 0x00004444; |
michael@0 | 42 | static const uint32 kPostSpaceIndicator = 0x44440000; |
michael@0 | 43 | |
michael@0 | 44 | // Little-endian masks for 0..24 bytes picked up as uint32's |
michael@0 | 45 | static const uint32 kWordMask0[4] = { |
michael@0 | 46 | 0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF |
michael@0 | 47 | }; |
michael@0 | 48 | |
michael@0 | 49 | static const int kMinCJKUTF8CharBytes = 3; |
michael@0 | 50 | |
michael@0 | 51 | static const int kMinGramCount = 3; |
michael@0 | 52 | static const int kMaxGramCount = 16; |
michael@0 | 53 | |
michael@0 | 54 | static const int UTFmax = 4; // Max number of bytes in a UTF-8 character |
michael@0 | 55 | |
michael@0 | 56 | |
michael@0 | 57 | // Routines to access a hash table of <key:wordhash, value:probs> pairs |
michael@0 | 58 | // Buckets have 4-byte wordhash for sizes < 32K buckets, but only |
michael@0 | 59 | // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as |
michael@0 | 60 | // bucket subscript. |
michael@0 | 61 | // Probs is a packed: three languages plus a subscript for probability table |
michael@0 | 62 | // Buckets have all the keys together, then all the values.Key array never |
michael@0 | 63 | // crosses a cache-line boundary, so no-match case takes exactly one cache miss. |
michael@0 | 64 | // Match case may sometimes take an additional cache miss on value access. |
michael@0 | 65 | // |
michael@0 | 66 | // Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64 |
michael@0 | 67 | // byte buckets with single cache miss. |
michael@0 | 68 | // Or 2-byte key and 6-byte value, allowing 5 languages instead of three. |
michael@0 | 69 | |
michael@0 | 70 | |
michael@0 | 71 | //----------------------------------------------------------------------------// |
michael@0 | 72 | // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores // |
michael@0 | 73 | //----------------------------------------------------------------------------// |
michael@0 | 74 | |
michael@0 | 75 | // Design principles for these hash functions |
michael@0 | 76 | // - Few operations |
michael@0 | 77 | // - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in |
michael@0 | 78 | // Latin script expect 1- and 2-byte mixtures. |
michael@0 | 79 | // - Last byte of each character has about 5 bits of information |
michael@0 | 80 | // - Spread good bits around so they can interact in at least two ways |
michael@0 | 81 | // with other characters |
michael@0 | 82 | // - Use add for additional mixing thorugh carries |
michael@0 | 83 | |
michael@0 | 84 | // CJK Three-byte bigram |
michael@0 | 85 | // ....dddd..cccccc..bbbbbb....aaaa |
michael@0 | 86 | // ..................ffffff..eeeeee |
michael@0 | 87 | // make |
michael@0 | 88 | // ....dddd..cccccc..bbbbbb....aaaa |
michael@0 | 89 | // 000....dddd..cccccc..bbbbbb....a |
michael@0 | 90 | // ..................ffffff..eeeeee |
michael@0 | 91 | // ffffff..eeeeee000000000000000000 |
michael@0 | 92 | // |
michael@0 | 93 | // CJK Four-byte bigram |
michael@0 | 94 | // ..dddddd..cccccc....bbbb....aaaa |
michael@0 | 95 | // ..hhhhhh..gggggg....ffff....eeee |
michael@0 | 96 | // make |
michael@0 | 97 | // ..dddddd..cccccc....bbbb....aaaa |
michael@0 | 98 | // 000..dddddd..cccccc....bbbb....a |
michael@0 | 99 | // ..hhhhhh..gggggg....ffff....eeee |
michael@0 | 100 | // ..ffff....eeee000000000000000000 |
michael@0 | 101 | |
michael@0 | 102 | // BIGRAM |
michael@0 | 103 | // Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post |
michael@0 | 104 | // OVERSHOOTS up to 3 bytes |
michael@0 | 105 | // For runtime use of tables |
michael@0 | 106 | // Does X86 unaligned loads |
michael@0 | 107 | uint32 BiHashV2(const char* word_ptr, int bytecount) { |
michael@0 | 108 | if (bytecount == 0) {return 0;} |
michael@0 | 109 | const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr); |
michael@0 | 110 | uint32 word0, word1; |
michael@0 | 111 | if (bytecount <= 4) { |
michael@0 | 112 | word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3]; |
michael@0 | 113 | word0 = word0 ^ (word0 >> 3); |
michael@0 | 114 | return word0; |
michael@0 | 115 | } |
michael@0 | 116 | // Else do 8 bytes |
michael@0 | 117 | word0 = UNALIGNED_LOAD32(word_ptr32); |
michael@0 | 118 | word0 = word0 ^ (word0 >> 3); |
michael@0 | 119 | word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3]; |
michael@0 | 120 | word1 = word1 ^ (word1 << 18); |
michael@0 | 121 | return word0 + word1; |
michael@0 | 122 | } |
michael@0 | 123 | |
michael@0 | 124 | // |
michael@0 | 125 | // Ascii-7 One-byte chars |
michael@0 | 126 | // ...ddddd...ccccc...bbbbb...aaaaa |
michael@0 | 127 | // make |
michael@0 | 128 | // ...ddddd...ccccc...bbbbb...aaaaa |
michael@0 | 129 | // 000...ddddd...ccccc...bbbbb...aa |
michael@0 | 130 | // |
michael@0 | 131 | // Latin 1- and 2-byte chars |
michael@0 | 132 | // ...ddddd...ccccc...bbbbb...aaaaa |
michael@0 | 133 | // ...................fffff...eeeee |
michael@0 | 134 | // make |
michael@0 | 135 | // ...ddddd...ccccc...bbbbb...aaaaa |
michael@0 | 136 | // 000...ddddd...ccccc...bbbbb...aa |
michael@0 | 137 | // ...................fffff...eeeee |
michael@0 | 138 | // ...............fffff...eeeee0000 |
michael@0 | 139 | // |
michael@0 | 140 | // Non-CJK Two-byte chars |
michael@0 | 141 | // ...ddddd...........bbbbb........ |
michael@0 | 142 | // ...hhhhh...........fffff........ |
michael@0 | 143 | // make |
michael@0 | 144 | // ...ddddd...........bbbbb........ |
michael@0 | 145 | // 000...ddddd...........bbbbb..... |
michael@0 | 146 | // ...hhhhh...........fffff........ |
michael@0 | 147 | // hhhh...........fffff........0000 |
michael@0 | 148 | // |
michael@0 | 149 | // Non-CJK Three-byte chars |
michael@0 | 150 | // ...........ccccc................ |
michael@0 | 151 | // ...................fffff........ |
michael@0 | 152 | // ...lllll...................iiiii |
michael@0 | 153 | // make |
michael@0 | 154 | // ...........ccccc................ |
michael@0 | 155 | // 000...........ccccc............. |
michael@0 | 156 | // ...................fffff........ |
michael@0 | 157 | // ...............fffff........0000 |
michael@0 | 158 | // ...lllll...................iiiii |
michael@0 | 159 | // .lllll...................iiiii00 |
michael@0 | 160 | // |
michael@0 | 161 | |
michael@0 | 162 | // QUADGRAM |
michael@0 | 163 | // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add |
michael@0 | 164 | // OVERSHOOTS up to 3 bytes |
michael@0 | 165 | // For runtime use of tables |
michael@0 | 166 | // Does X86 unaligned loads |
michael@0 | 167 | uint32 QuadHashV2Mix(const char* word_ptr, int bytecount, uint32 prepost) { |
michael@0 | 168 | const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr); |
michael@0 | 169 | uint32 word0, word1, word2; |
michael@0 | 170 | if (bytecount <= 4) { |
michael@0 | 171 | word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3]; |
michael@0 | 172 | word0 = word0 ^ (word0 >> 3); |
michael@0 | 173 | return word0 ^ prepost; |
michael@0 | 174 | } else if (bytecount <= 8) { |
michael@0 | 175 | word0 = UNALIGNED_LOAD32(word_ptr32); |
michael@0 | 176 | word0 = word0 ^ (word0 >> 3); |
michael@0 | 177 | word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3]; |
michael@0 | 178 | word1 = word1 ^ (word1 << 4); |
michael@0 | 179 | return (word0 ^ prepost) + word1; |
michael@0 | 180 | } |
michael@0 | 181 | // else do 12 bytes |
michael@0 | 182 | word0 = UNALIGNED_LOAD32(word_ptr32); |
michael@0 | 183 | word0 = word0 ^ (word0 >> 3); |
michael@0 | 184 | word1 = UNALIGNED_LOAD32(word_ptr32 + 1); |
michael@0 | 185 | word1 = word1 ^ (word1 << 4); |
michael@0 | 186 | word2 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3]; |
michael@0 | 187 | word2 = word2 ^ (word2 << 2); |
michael@0 | 188 | return (word0 ^ prepost) + word1 + word2; |
michael@0 | 189 | } |
michael@0 | 190 | |
michael@0 | 191 | |
michael@0 | 192 | // QUADGRAM wrapper with surrounding spaces |
michael@0 | 193 | // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add |
michael@0 | 194 | // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes |
michael@0 | 195 | // For runtime use of tables |
michael@0 | 196 | uint32 QuadHashV2(const char* word_ptr, int bytecount) { |
michael@0 | 197 | if (bytecount == 0) {return 0;} |
michael@0 | 198 | uint32 prepost = 0; |
michael@0 | 199 | if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;} |
michael@0 | 200 | if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;} |
michael@0 | 201 | return QuadHashV2Mix(word_ptr, bytecount, prepost); |
michael@0 | 202 | } |
michael@0 | 203 | |
michael@0 | 204 | // QUADGRAM wrapper with surrounding underscores (offline use) |
michael@0 | 205 | // Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add |
michael@0 | 206 | // OVERSHOOTS up to 3 bytes |
michael@0 | 207 | // For offline construction of tables |
michael@0 | 208 | uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount) { |
michael@0 | 209 | if (bytecount == 0) {return 0;} |
michael@0 | 210 | const char* local_word_ptr = word_ptr; |
michael@0 | 211 | int local_bytecount = bytecount; |
michael@0 | 212 | uint32 prepost = 0; |
michael@0 | 213 | if (local_word_ptr[0] == '_') { |
michael@0 | 214 | prepost |= kPreSpaceIndicator; |
michael@0 | 215 | ++local_word_ptr; |
michael@0 | 216 | --local_bytecount; |
michael@0 | 217 | } |
michael@0 | 218 | if (local_word_ptr[local_bytecount - 1] == '_') { |
michael@0 | 219 | prepost |= kPostSpaceIndicator; |
michael@0 | 220 | --local_bytecount; |
michael@0 | 221 | } |
michael@0 | 222 | return QuadHashV2Mix(local_word_ptr, local_bytecount, prepost); |
michael@0 | 223 | } |
michael@0 | 224 | |
michael@0 | 225 | |
michael@0 | 226 | // OCTAGRAM |
michael@0 | 227 | // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add |
michael@0 | 228 | // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes |
michael@0 | 229 | // |
michael@0 | 230 | // The low 32 bits follow the pattern from above, tuned to different scripts |
michael@0 | 231 | // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each |
michael@0 | 232 | // For runtime use of tables V3 |
michael@0 | 233 | // Does X86 unaligned loads |
michael@0 | 234 | uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) { |
michael@0 | 235 | const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr); |
michael@0 | 236 | uint64 word0; |
michael@0 | 237 | uint64 word1; |
michael@0 | 238 | uint64 sum; |
michael@0 | 239 | |
michael@0 | 240 | if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;} |
michael@0 | 241 | if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;} |
michael@0 | 242 | switch ((bytecount - 1) >> 2) { |
michael@0 | 243 | case 0: // 1..4 bytes |
michael@0 | 244 | word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3]; |
michael@0 | 245 | sum = word0; |
michael@0 | 246 | word0 = word0 ^ (word0 >> 3); |
michael@0 | 247 | break; |
michael@0 | 248 | case 1: // 5..8 bytes |
michael@0 | 249 | word0 = UNALIGNED_LOAD32(word_ptr32); |
michael@0 | 250 | sum = word0; |
michael@0 | 251 | word0 = word0 ^ (word0 >> 3); |
michael@0 | 252 | word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3]; |
michael@0 | 253 | sum += word1; |
michael@0 | 254 | word1 = word1 ^ (word1 << 4); |
michael@0 | 255 | word0 += word1; |
michael@0 | 256 | break; |
michael@0 | 257 | case 2: // 9..12 bytes |
michael@0 | 258 | word0 = UNALIGNED_LOAD32(word_ptr32); |
michael@0 | 259 | sum = word0; |
michael@0 | 260 | word0 = word0 ^ (word0 >> 3); |
michael@0 | 261 | word1 = UNALIGNED_LOAD32(word_ptr32 + 1); |
michael@0 | 262 | sum += word1; |
michael@0 | 263 | word1 = word1 ^ (word1 << 4); |
michael@0 | 264 | word0 += word1; |
michael@0 | 265 | word1 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3]; |
michael@0 | 266 | sum += word1; |
michael@0 | 267 | word1 = word1 ^ (word1 << 2); |
michael@0 | 268 | word0 += word1; |
michael@0 | 269 | break; |
michael@0 | 270 | case 3: // 13..16 bytes |
michael@0 | 271 | word0 =UNALIGNED_LOAD32(word_ptr32); |
michael@0 | 272 | sum = word0; |
michael@0 | 273 | word0 = word0 ^ (word0 >> 3); |
michael@0 | 274 | word1 = UNALIGNED_LOAD32(word_ptr32 + 1); |
michael@0 | 275 | sum += word1; |
michael@0 | 276 | word1 = word1 ^ (word1 << 4); |
michael@0 | 277 | word0 += word1; |
michael@0 | 278 | word1 = UNALIGNED_LOAD32(word_ptr32 + 2); |
michael@0 | 279 | sum += word1; |
michael@0 | 280 | word1 = word1 ^ (word1 << 2); |
michael@0 | 281 | word0 += word1; |
michael@0 | 282 | word1 = UNALIGNED_LOAD32(word_ptr32 + 3) & kWordMask0[bytecount & 3]; |
michael@0 | 283 | sum += word1; |
michael@0 | 284 | word1 = word1 ^ (word1 >> 8); |
michael@0 | 285 | word0 += word1; |
michael@0 | 286 | break; |
michael@0 | 287 | case 4: // 17..20 bytes |
michael@0 | 288 | word0 = UNALIGNED_LOAD32(word_ptr32); |
michael@0 | 289 | sum = word0; |
michael@0 | 290 | word0 = word0 ^ (word0 >> 3); |
michael@0 | 291 | word1 = UNALIGNED_LOAD32(word_ptr32 + 1); |
michael@0 | 292 | sum += word1; |
michael@0 | 293 | word1 = word1 ^ (word1 << 4); |
michael@0 | 294 | word0 += word1; |
michael@0 | 295 | word1 = UNALIGNED_LOAD32(word_ptr32 + 2); |
michael@0 | 296 | sum += word1; |
michael@0 | 297 | word1 = word1 ^ (word1 << 2); |
michael@0 | 298 | word0 += word1; |
michael@0 | 299 | word1 = UNALIGNED_LOAD32(word_ptr32 + 3); |
michael@0 | 300 | sum += word1; |
michael@0 | 301 | word1 = word1 ^ (word1 >> 8); |
michael@0 | 302 | word0 += word1; |
michael@0 | 303 | word1 = UNALIGNED_LOAD32(word_ptr32 + 4) & kWordMask0[bytecount & 3]; |
michael@0 | 304 | sum += word1; |
michael@0 | 305 | word1 = word1 ^ (word1 >> 4); |
michael@0 | 306 | word0 += word1; |
michael@0 | 307 | break; |
michael@0 | 308 | default: // 21..24 bytes and higher (ignores beyond 24) |
michael@0 | 309 | word0 = UNALIGNED_LOAD32(word_ptr32); |
michael@0 | 310 | sum = word0; |
michael@0 | 311 | word0 = word0 ^ (word0 >> 3); |
michael@0 | 312 | word1 = UNALIGNED_LOAD32(word_ptr32 + 1); |
michael@0 | 313 | sum += word1; |
michael@0 | 314 | word1 = word1 ^ (word1 << 4); |
michael@0 | 315 | word0 += word1; |
michael@0 | 316 | word1 = UNALIGNED_LOAD32(word_ptr32 + 2); |
michael@0 | 317 | sum += word1; |
michael@0 | 318 | word1 = word1 ^ (word1 << 2); |
michael@0 | 319 | word0 += word1; |
michael@0 | 320 | word1 = UNALIGNED_LOAD32(word_ptr32 + 3); |
michael@0 | 321 | sum += word1; |
michael@0 | 322 | word1 = word1 ^ (word1 >> 8); |
michael@0 | 323 | word0 += word1; |
michael@0 | 324 | word1 = UNALIGNED_LOAD32(word_ptr32 + 4); |
michael@0 | 325 | sum += word1; |
michael@0 | 326 | word1 = word1 ^ (word1 >> 4); |
michael@0 | 327 | word0 += word1; |
michael@0 | 328 | word1 = UNALIGNED_LOAD32(word_ptr32 + 5) & kWordMask0[bytecount & 3]; |
michael@0 | 329 | sum += word1; |
michael@0 | 330 | word1 = word1 ^ (word1 >> 6); |
michael@0 | 331 | word0 += word1; |
michael@0 | 332 | break; |
michael@0 | 333 | } |
michael@0 | 334 | |
michael@0 | 335 | sum += (sum >> 17); // extra 1-bit shift for bytes 2 & 3 |
michael@0 | 336 | sum += (sum >> 9); // extra 1-bit shift for bytes 1 & 3 |
michael@0 | 337 | sum = (sum & 0xff) << 32; |
michael@0 | 338 | return (word0 ^ prepost) + sum; |
michael@0 | 339 | } |
michael@0 | 340 | |
michael@0 | 341 | // OCTAGRAM wrapper with surrounding spaces |
michael@0 | 342 | // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add |
michael@0 | 343 | // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes |
michael@0 | 344 | // |
michael@0 | 345 | // The low 32 bits follow the pattern from above, tuned to different scripts |
michael@0 | 346 | // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each |
michael@0 | 347 | // For runtime use of tables V3 |
michael@0 | 348 | uint64 OctaHash40(const char* word_ptr, int bytecount) { |
michael@0 | 349 | if (bytecount == 0) {return 0;} |
michael@0 | 350 | uint64 prepost = 0; |
michael@0 | 351 | if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;} |
michael@0 | 352 | if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;} |
michael@0 | 353 | return OctaHash40Mix(word_ptr, bytecount, prepost); |
michael@0 | 354 | } |
michael@0 | 355 | |
michael@0 | 356 | |
michael@0 | 357 | // OCTAGRAM wrapper with surrounding underscores (offline use) |
michael@0 | 358 | // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add |
michael@0 | 359 | // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes |
michael@0 | 360 | // |
michael@0 | 361 | // The low 32 bits follow the pattern from above, tuned to different scripts |
michael@0 | 362 | // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each |
michael@0 | 363 | // For offline construction of tables |
michael@0 | 364 | uint64 OctaHash40underscore(const char* word_ptr, int bytecount) { |
michael@0 | 365 | if (bytecount == 0) {return 0;} |
michael@0 | 366 | const char* local_word_ptr = word_ptr; |
michael@0 | 367 | int local_bytecount = bytecount; |
michael@0 | 368 | uint64 prepost = 0; |
michael@0 | 369 | if (local_word_ptr[0] == '_') { |
michael@0 | 370 | prepost |= kPreSpaceIndicator; |
michael@0 | 371 | ++local_word_ptr; |
michael@0 | 372 | --local_bytecount; |
michael@0 | 373 | } |
michael@0 | 374 | if (local_word_ptr[local_bytecount - 1] == '_') { |
michael@0 | 375 | prepost |= kPostSpaceIndicator; |
michael@0 | 376 | --local_bytecount; |
michael@0 | 377 | } |
michael@0 | 378 | return OctaHash40Mix(local_word_ptr, local_bytecount, prepost); |
michael@0 | 379 | } |
michael@0 | 380 | |
michael@0 | 381 | // Hash a consecutive pair of tokens/words A B |
michael@0 | 382 | // Old: hash is B - A, which gives too many false hits on one-char diffs |
michael@0 | 383 | // Now: rotate(A,13) + B |
michael@0 | 384 | uint64 PairHash(uint64 worda_hash, uint64 wordb_hash) { |
michael@0 | 385 | return ((worda_hash >> 13) | (worda_hash << (64 - 13))) + wordb_hash; |
michael@0 | 386 | } |
michael@0 | 387 | |
michael@0 | 388 | |
michael@0 | 389 | |
michael@0 | 390 | |
michael@0 | 391 | //----------------------------------------------------------------------------// |
michael@0 | 392 | // Finding groups of 1/2/4/8 letters // |
michael@0 | 393 | //----------------------------------------------------------------------------// |
michael@0 | 394 | |
michael@0 | 395 | // src points to a letter. Find the byte length of a unigram starting there. |
michael@0 | 396 | int UniLen(const char* src) { |
michael@0 | 397 | const char* src_end = src; |
michael@0 | 398 | src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; |
michael@0 | 399 | return src_end - src; |
michael@0 | 400 | } |
michael@0 | 401 | |
michael@0 | 402 | // src points to a letter. Find the byte length of a bigram starting there. |
michael@0 | 403 | int BiLen(const char* src) { |
michael@0 | 404 | const char* src_end = src; |
michael@0 | 405 | src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; |
michael@0 | 406 | src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; |
michael@0 | 407 | return src_end - src; |
michael@0 | 408 | } |
michael@0 | 409 | |
michael@0 | 410 | // src points to a letter. Find the byte length of a quadgram starting there. |
michael@0 | 411 | int QuadLen(const char* src) { |
michael@0 | 412 | const char* src_end = src; |
michael@0 | 413 | src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; |
michael@0 | 414 | src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; |
michael@0 | 415 | src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; |
michael@0 | 416 | src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; |
michael@0 | 417 | return src_end - src; |
michael@0 | 418 | } |
michael@0 | 419 | |
michael@0 | 420 | // src points to a letter. Find the byte length of an octagram starting there. |
michael@0 | 421 | int OctaLen(const char* src) { |
michael@0 | 422 | const char* src_end = src; |
michael@0 | 423 | int charcount = 0; |
michael@0 | 424 | while (src_end[0] != ' ') { |
michael@0 | 425 | src_end += UTF8OneCharLen(src); |
michael@0 | 426 | ++charcount; |
michael@0 | 427 | if (charcount == 8) {break;} |
michael@0 | 428 | } |
michael@0 | 429 | return src_end - src; |
michael@0 | 430 | } |
michael@0 | 431 | |
michael@0 | 432 | } // End namespace CLD2 |
michael@0 | 433 | |
michael@0 | 434 | |
michael@0 | 435 | |
michael@0 | 436 | |
michael@0 | 437 |