Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
1 // Copyright 2013 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 //
16 // Author: dsites@google.com (Dick Sites)
17 //
19 #include "cldutil_shared.h"
20 #include <string>
22 #include "cld2tablesummary.h"
23 #include "integral_types.h"
24 #include "port.h"
25 #include "utf8statetable.h"
27 namespace CLD2 {
29 // Runtime routines for hashing, looking up, and scoring
30 // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
31 // Unigrams and bigrams are for CJK languages only, including simplified/
32 // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
33 // Zhuang Han characters. Surrounding spaces are not considered.
34 // Quadgrams and octagrams for for non-CJK and include two bits indicating
35 // preceding and trailing spaces (word boundaries).
38 // Indicator bits for leading/trailing space around quad/octagram
39 // NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of
40 // 1-, 2-, or 3-bytes each.
41 static const uint32 kPreSpaceIndicator = 0x00004444;
42 static const uint32 kPostSpaceIndicator = 0x44440000;
44 // Little-endian masks for 0..24 bytes picked up as uint32's
45 static const uint32 kWordMask0[4] = {
46 0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF
47 };
49 static const int kMinCJKUTF8CharBytes = 3;
51 static const int kMinGramCount = 3;
52 static const int kMaxGramCount = 16;
54 static const int UTFmax = 4; // Max number of bytes in a UTF-8 character
57 // Routines to access a hash table of <key:wordhash, value:probs> pairs
58 // Buckets have 4-byte wordhash for sizes < 32K buckets, but only
59 // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
60 // bucket subscript.
61 // Probs is a packed: three languages plus a subscript for probability table
62 // Buckets have all the keys together, then all the values.Key array never
63 // crosses a cache-line boundary, so no-match case takes exactly one cache miss.
64 // Match case may sometimes take an additional cache miss on value access.
65 //
66 // Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64
67 // byte buckets with single cache miss.
68 // Or 2-byte key and 6-byte value, allowing 5 languages instead of three.
71 //----------------------------------------------------------------------------//
72 // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores //
73 //----------------------------------------------------------------------------//
75 // Design principles for these hash functions
76 // - Few operations
77 // - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in
78 // Latin script expect 1- and 2-byte mixtures.
79 // - Last byte of each character has about 5 bits of information
80 // - Spread good bits around so they can interact in at least two ways
81 // with other characters
82 // - Use add for additional mixing thorugh carries
84 // CJK Three-byte bigram
85 // ....dddd..cccccc..bbbbbb....aaaa
86 // ..................ffffff..eeeeee
87 // make
88 // ....dddd..cccccc..bbbbbb....aaaa
89 // 000....dddd..cccccc..bbbbbb....a
90 // ..................ffffff..eeeeee
91 // ffffff..eeeeee000000000000000000
92 //
93 // CJK Four-byte bigram
94 // ..dddddd..cccccc....bbbb....aaaa
95 // ..hhhhhh..gggggg....ffff....eeee
96 // make
97 // ..dddddd..cccccc....bbbb....aaaa
98 // 000..dddddd..cccccc....bbbb....a
99 // ..hhhhhh..gggggg....ffff....eeee
100 // ..ffff....eeee000000000000000000
102 // BIGRAM
103 // Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
104 // OVERSHOOTS up to 3 bytes
105 // For runtime use of tables
106 // Does X86 unaligned loads
107 uint32 BiHashV2(const char* word_ptr, int bytecount) {
108 if (bytecount == 0) {return 0;}
109 const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
110 uint32 word0, word1;
111 if (bytecount <= 4) {
112 word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3];
113 word0 = word0 ^ (word0 >> 3);
114 return word0;
115 }
116 // Else do 8 bytes
117 word0 = UNALIGNED_LOAD32(word_ptr32);
118 word0 = word0 ^ (word0 >> 3);
119 word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3];
120 word1 = word1 ^ (word1 << 18);
121 return word0 + word1;
122 }
124 //
125 // Ascii-7 One-byte chars
126 // ...ddddd...ccccc...bbbbb...aaaaa
127 // make
128 // ...ddddd...ccccc...bbbbb...aaaaa
129 // 000...ddddd...ccccc...bbbbb...aa
130 //
131 // Latin 1- and 2-byte chars
132 // ...ddddd...ccccc...bbbbb...aaaaa
133 // ...................fffff...eeeee
134 // make
135 // ...ddddd...ccccc...bbbbb...aaaaa
136 // 000...ddddd...ccccc...bbbbb...aa
137 // ...................fffff...eeeee
138 // ...............fffff...eeeee0000
139 //
140 // Non-CJK Two-byte chars
141 // ...ddddd...........bbbbb........
142 // ...hhhhh...........fffff........
143 // make
144 // ...ddddd...........bbbbb........
145 // 000...ddddd...........bbbbb.....
146 // ...hhhhh...........fffff........
147 // hhhh...........fffff........0000
148 //
149 // Non-CJK Three-byte chars
150 // ...........ccccc................
151 // ...................fffff........
152 // ...lllll...................iiiii
153 // make
154 // ...........ccccc................
155 // 000...........ccccc.............
156 // ...................fffff........
157 // ...............fffff........0000
158 // ...lllll...................iiiii
159 // .lllll...................iiiii00
160 //
162 // QUADGRAM
163 // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
164 // OVERSHOOTS up to 3 bytes
165 // For runtime use of tables
166 // Does X86 unaligned loads
167 uint32 QuadHashV2Mix(const char* word_ptr, int bytecount, uint32 prepost) {
168 const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
169 uint32 word0, word1, word2;
170 if (bytecount <= 4) {
171 word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3];
172 word0 = word0 ^ (word0 >> 3);
173 return word0 ^ prepost;
174 } else if (bytecount <= 8) {
175 word0 = UNALIGNED_LOAD32(word_ptr32);
176 word0 = word0 ^ (word0 >> 3);
177 word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3];
178 word1 = word1 ^ (word1 << 4);
179 return (word0 ^ prepost) + word1;
180 }
181 // else do 12 bytes
182 word0 = UNALIGNED_LOAD32(word_ptr32);
183 word0 = word0 ^ (word0 >> 3);
184 word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
185 word1 = word1 ^ (word1 << 4);
186 word2 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3];
187 word2 = word2 ^ (word2 << 2);
188 return (word0 ^ prepost) + word1 + word2;
189 }
192 // QUADGRAM wrapper with surrounding spaces
193 // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
194 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
195 // For runtime use of tables
196 uint32 QuadHashV2(const char* word_ptr, int bytecount) {
197 if (bytecount == 0) {return 0;}
198 uint32 prepost = 0;
199 if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
200 if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
201 return QuadHashV2Mix(word_ptr, bytecount, prepost);
202 }
204 // QUADGRAM wrapper with surrounding underscores (offline use)
205 // Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
206 // OVERSHOOTS up to 3 bytes
207 // For offline construction of tables
208 uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount) {
209 if (bytecount == 0) {return 0;}
210 const char* local_word_ptr = word_ptr;
211 int local_bytecount = bytecount;
212 uint32 prepost = 0;
213 if (local_word_ptr[0] == '_') {
214 prepost |= kPreSpaceIndicator;
215 ++local_word_ptr;
216 --local_bytecount;
217 }
218 if (local_word_ptr[local_bytecount - 1] == '_') {
219 prepost |= kPostSpaceIndicator;
220 --local_bytecount;
221 }
222 return QuadHashV2Mix(local_word_ptr, local_bytecount, prepost);
223 }
226 // OCTAGRAM
227 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
228 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
229 //
230 // The low 32 bits follow the pattern from above, tuned to different scripts
231 // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
232 // For runtime use of tables V3
233 // Does X86 unaligned loads
234 uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) {
235 const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
236 uint64 word0;
237 uint64 word1;
238 uint64 sum;
240 if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
241 if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
242 switch ((bytecount - 1) >> 2) {
243 case 0: // 1..4 bytes
244 word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3];
245 sum = word0;
246 word0 = word0 ^ (word0 >> 3);
247 break;
248 case 1: // 5..8 bytes
249 word0 = UNALIGNED_LOAD32(word_ptr32);
250 sum = word0;
251 word0 = word0 ^ (word0 >> 3);
252 word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3];
253 sum += word1;
254 word1 = word1 ^ (word1 << 4);
255 word0 += word1;
256 break;
257 case 2: // 9..12 bytes
258 word0 = UNALIGNED_LOAD32(word_ptr32);
259 sum = word0;
260 word0 = word0 ^ (word0 >> 3);
261 word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
262 sum += word1;
263 word1 = word1 ^ (word1 << 4);
264 word0 += word1;
265 word1 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3];
266 sum += word1;
267 word1 = word1 ^ (word1 << 2);
268 word0 += word1;
269 break;
270 case 3: // 13..16 bytes
271 word0 =UNALIGNED_LOAD32(word_ptr32);
272 sum = word0;
273 word0 = word0 ^ (word0 >> 3);
274 word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
275 sum += word1;
276 word1 = word1 ^ (word1 << 4);
277 word0 += word1;
278 word1 = UNALIGNED_LOAD32(word_ptr32 + 2);
279 sum += word1;
280 word1 = word1 ^ (word1 << 2);
281 word0 += word1;
282 word1 = UNALIGNED_LOAD32(word_ptr32 + 3) & kWordMask0[bytecount & 3];
283 sum += word1;
284 word1 = word1 ^ (word1 >> 8);
285 word0 += word1;
286 break;
287 case 4: // 17..20 bytes
288 word0 = UNALIGNED_LOAD32(word_ptr32);
289 sum = word0;
290 word0 = word0 ^ (word0 >> 3);
291 word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
292 sum += word1;
293 word1 = word1 ^ (word1 << 4);
294 word0 += word1;
295 word1 = UNALIGNED_LOAD32(word_ptr32 + 2);
296 sum += word1;
297 word1 = word1 ^ (word1 << 2);
298 word0 += word1;
299 word1 = UNALIGNED_LOAD32(word_ptr32 + 3);
300 sum += word1;
301 word1 = word1 ^ (word1 >> 8);
302 word0 += word1;
303 word1 = UNALIGNED_LOAD32(word_ptr32 + 4) & kWordMask0[bytecount & 3];
304 sum += word1;
305 word1 = word1 ^ (word1 >> 4);
306 word0 += word1;
307 break;
308 default: // 21..24 bytes and higher (ignores beyond 24)
309 word0 = UNALIGNED_LOAD32(word_ptr32);
310 sum = word0;
311 word0 = word0 ^ (word0 >> 3);
312 word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
313 sum += word1;
314 word1 = word1 ^ (word1 << 4);
315 word0 += word1;
316 word1 = UNALIGNED_LOAD32(word_ptr32 + 2);
317 sum += word1;
318 word1 = word1 ^ (word1 << 2);
319 word0 += word1;
320 word1 = UNALIGNED_LOAD32(word_ptr32 + 3);
321 sum += word1;
322 word1 = word1 ^ (word1 >> 8);
323 word0 += word1;
324 word1 = UNALIGNED_LOAD32(word_ptr32 + 4);
325 sum += word1;
326 word1 = word1 ^ (word1 >> 4);
327 word0 += word1;
328 word1 = UNALIGNED_LOAD32(word_ptr32 + 5) & kWordMask0[bytecount & 3];
329 sum += word1;
330 word1 = word1 ^ (word1 >> 6);
331 word0 += word1;
332 break;
333 }
335 sum += (sum >> 17); // extra 1-bit shift for bytes 2 & 3
336 sum += (sum >> 9); // extra 1-bit shift for bytes 1 & 3
337 sum = (sum & 0xff) << 32;
338 return (word0 ^ prepost) + sum;
339 }
341 // OCTAGRAM wrapper with surrounding spaces
342 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
343 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
344 //
345 // The low 32 bits follow the pattern from above, tuned to different scripts
346 // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
347 // For runtime use of tables V3
348 uint64 OctaHash40(const char* word_ptr, int bytecount) {
349 if (bytecount == 0) {return 0;}
350 uint64 prepost = 0;
351 if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
352 if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
353 return OctaHash40Mix(word_ptr, bytecount, prepost);
354 }
357 // OCTAGRAM wrapper with surrounding underscores (offline use)
358 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
359 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
360 //
361 // The low 32 bits follow the pattern from above, tuned to different scripts
362 // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
363 // For offline construction of tables
364 uint64 OctaHash40underscore(const char* word_ptr, int bytecount) {
365 if (bytecount == 0) {return 0;}
366 const char* local_word_ptr = word_ptr;
367 int local_bytecount = bytecount;
368 uint64 prepost = 0;
369 if (local_word_ptr[0] == '_') {
370 prepost |= kPreSpaceIndicator;
371 ++local_word_ptr;
372 --local_bytecount;
373 }
374 if (local_word_ptr[local_bytecount - 1] == '_') {
375 prepost |= kPostSpaceIndicator;
376 --local_bytecount;
377 }
378 return OctaHash40Mix(local_word_ptr, local_bytecount, prepost);
379 }
381 // Hash a consecutive pair of tokens/words A B
382 // Old: hash is B - A, which gives too many false hits on one-char diffs
383 // Now: rotate(A,13) + B
384 uint64 PairHash(uint64 worda_hash, uint64 wordb_hash) {
385 return ((worda_hash >> 13) | (worda_hash << (64 - 13))) + wordb_hash;
386 }
391 //----------------------------------------------------------------------------//
392 // Finding groups of 1/2/4/8 letters //
393 //----------------------------------------------------------------------------//
395 // src points to a letter. Find the byte length of a unigram starting there.
396 int UniLen(const char* src) {
397 const char* src_end = src;
398 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
399 return src_end - src;
400 }
402 // src points to a letter. Find the byte length of a bigram starting there.
403 int BiLen(const char* src) {
404 const char* src_end = src;
405 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
406 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
407 return src_end - src;
408 }
410 // src points to a letter. Find the byte length of a quadgram starting there.
411 int QuadLen(const char* src) {
412 const char* src_end = src;
413 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
414 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
415 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
416 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
417 return src_end - src;
418 }
420 // src points to a letter. Find the byte length of an octagram starting there.
421 int OctaLen(const char* src) {
422 const char* src_end = src;
423 int charcount = 0;
424 while (src_end[0] != ' ') {
425 src_end += UTF8OneCharLen(src);
426 ++charcount;
427 if (charcount == 8) {break;}
428 }
429 return src_end - src;
430 }
432 } // End namespace CLD2