|
1 // Copyright 2013 Google Inc. All Rights Reserved. |
|
2 // |
|
3 // Licensed under the Apache License, Version 2.0 (the "License"); |
|
4 // you may not use this file except in compliance with the License. |
|
5 // You may obtain a copy of the License at |
|
6 // |
|
7 // http://www.apache.org/licenses/LICENSE-2.0 |
|
8 // |
|
9 // Unless required by applicable law or agreed to in writing, software |
|
10 // distributed under the License is distributed on an "AS IS" BASIS, |
|
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
12 // See the License for the specific language governing permissions and |
|
13 // limitations under the License. |
|
14 |
|
15 // |
|
16 // Author: dsites@google.com (Dick Sites) |
|
17 // |
|
18 |
|
19 #include "cldutil_shared.h" |
|
20 #include <string> |
|
21 |
|
22 #include "cld2tablesummary.h" |
|
23 #include "integral_types.h" |
|
24 #include "port.h" |
|
25 #include "utf8statetable.h" |
|
26 |
|
27 namespace CLD2 { |
|
28 |
|
29 // Runtime routines for hashing, looking up, and scoring |
|
30 // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams. |
|
31 // Unigrams and bigrams are for CJK languages only, including simplified/ |
|
32 // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and |
|
33 // Zhuang Han characters. Surrounding spaces are not considered. |
|
34 // Quadgrams and octagrams for for non-CJK and include two bits indicating |
|
35 // preceding and trailing spaces (word boundaries). |
|
36 |
|
37 |
|
38 // Indicator bits for leading/trailing space around quad/octagram |
|
39 // NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of |
|
40 // 1-, 2-, or 3-bytes each. |
|
41 static const uint32 kPreSpaceIndicator = 0x00004444; |
|
42 static const uint32 kPostSpaceIndicator = 0x44440000; |
|
43 |
|
44 // Little-endian masks for 0..24 bytes picked up as uint32's |
|
45 static const uint32 kWordMask0[4] = { |
|
46 0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF |
|
47 }; |
|
48 |
|
49 static const int kMinCJKUTF8CharBytes = 3; |
|
50 |
|
51 static const int kMinGramCount = 3; |
|
52 static const int kMaxGramCount = 16; |
|
53 |
|
54 static const int UTFmax = 4; // Max number of bytes in a UTF-8 character |
|
55 |
|
56 |
|
57 // Routines to access a hash table of <key:wordhash, value:probs> pairs |
|
58 // Buckets have 4-byte wordhash for sizes < 32K buckets, but only |
|
59 // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as |
|
60 // bucket subscript. |
|
61 // Probs is a packed: three languages plus a subscript for probability table |
|
62 // Buckets have all the keys together, then all the values.Key array never |
|
63 // crosses a cache-line boundary, so no-match case takes exactly one cache miss. |
|
64 // Match case may sometimes take an additional cache miss on value access. |
|
65 // |
|
66 // Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64 |
|
67 // byte buckets with single cache miss. |
|
68 // Or 2-byte key and 6-byte value, allowing 5 languages instead of three. |
|
69 |
|
70 |
|
71 //----------------------------------------------------------------------------// |
|
72 // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores // |
|
73 //----------------------------------------------------------------------------// |
|
74 |
|
75 // Design principles for these hash functions |
|
76 // - Few operations |
|
77 // - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in |
|
78 // Latin script expect 1- and 2-byte mixtures. |
|
79 // - Last byte of each character has about 5 bits of information |
|
80 // - Spread good bits around so they can interact in at least two ways |
|
81 // with other characters |
|
82 // - Use add for additional mixing thorugh carries |
|
83 |
|
84 // CJK Three-byte bigram |
|
85 // ....dddd..cccccc..bbbbbb....aaaa |
|
86 // ..................ffffff..eeeeee |
|
87 // make |
|
88 // ....dddd..cccccc..bbbbbb....aaaa |
|
89 // 000....dddd..cccccc..bbbbbb....a |
|
90 // ..................ffffff..eeeeee |
|
91 // ffffff..eeeeee000000000000000000 |
|
92 // |
|
93 // CJK Four-byte bigram |
|
94 // ..dddddd..cccccc....bbbb....aaaa |
|
95 // ..hhhhhh..gggggg....ffff....eeee |
|
96 // make |
|
97 // ..dddddd..cccccc....bbbb....aaaa |
|
98 // 000..dddddd..cccccc....bbbb....a |
|
99 // ..hhhhhh..gggggg....ffff....eeee |
|
100 // ..ffff....eeee000000000000000000 |
|
101 |
|
102 // BIGRAM |
|
103 // Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post |
|
104 // OVERSHOOTS up to 3 bytes |
|
105 // For runtime use of tables |
|
106 // Does X86 unaligned loads |
|
107 uint32 BiHashV2(const char* word_ptr, int bytecount) { |
|
108 if (bytecount == 0) {return 0;} |
|
109 const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr); |
|
110 uint32 word0, word1; |
|
111 if (bytecount <= 4) { |
|
112 word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3]; |
|
113 word0 = word0 ^ (word0 >> 3); |
|
114 return word0; |
|
115 } |
|
116 // Else do 8 bytes |
|
117 word0 = UNALIGNED_LOAD32(word_ptr32); |
|
118 word0 = word0 ^ (word0 >> 3); |
|
119 word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3]; |
|
120 word1 = word1 ^ (word1 << 18); |
|
121 return word0 + word1; |
|
122 } |
|
123 |
|
124 // |
|
125 // Ascii-7 One-byte chars |
|
126 // ...ddddd...ccccc...bbbbb...aaaaa |
|
127 // make |
|
128 // ...ddddd...ccccc...bbbbb...aaaaa |
|
129 // 000...ddddd...ccccc...bbbbb...aa |
|
130 // |
|
131 // Latin 1- and 2-byte chars |
|
132 // ...ddddd...ccccc...bbbbb...aaaaa |
|
133 // ...................fffff...eeeee |
|
134 // make |
|
135 // ...ddddd...ccccc...bbbbb...aaaaa |
|
136 // 000...ddddd...ccccc...bbbbb...aa |
|
137 // ...................fffff...eeeee |
|
138 // ...............fffff...eeeee0000 |
|
139 // |
|
140 // Non-CJK Two-byte chars |
|
141 // ...ddddd...........bbbbb........ |
|
142 // ...hhhhh...........fffff........ |
|
143 // make |
|
144 // ...ddddd...........bbbbb........ |
|
145 // 000...ddddd...........bbbbb..... |
|
146 // ...hhhhh...........fffff........ |
|
147 // hhhh...........fffff........0000 |
|
148 // |
|
149 // Non-CJK Three-byte chars |
|
150 // ...........ccccc................ |
|
151 // ...................fffff........ |
|
152 // ...lllll...................iiiii |
|
153 // make |
|
154 // ...........ccccc................ |
|
155 // 000...........ccccc............. |
|
156 // ...................fffff........ |
|
157 // ...............fffff........0000 |
|
158 // ...lllll...................iiiii |
|
159 // .lllll...................iiiii00 |
|
160 // |
|
161 |
|
162 // QUADGRAM |
|
163 // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add |
|
164 // OVERSHOOTS up to 3 bytes |
|
165 // For runtime use of tables |
|
166 // Does X86 unaligned loads |
|
167 uint32 QuadHashV2Mix(const char* word_ptr, int bytecount, uint32 prepost) { |
|
168 const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr); |
|
169 uint32 word0, word1, word2; |
|
170 if (bytecount <= 4) { |
|
171 word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3]; |
|
172 word0 = word0 ^ (word0 >> 3); |
|
173 return word0 ^ prepost; |
|
174 } else if (bytecount <= 8) { |
|
175 word0 = UNALIGNED_LOAD32(word_ptr32); |
|
176 word0 = word0 ^ (word0 >> 3); |
|
177 word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3]; |
|
178 word1 = word1 ^ (word1 << 4); |
|
179 return (word0 ^ prepost) + word1; |
|
180 } |
|
181 // else do 12 bytes |
|
182 word0 = UNALIGNED_LOAD32(word_ptr32); |
|
183 word0 = word0 ^ (word0 >> 3); |
|
184 word1 = UNALIGNED_LOAD32(word_ptr32 + 1); |
|
185 word1 = word1 ^ (word1 << 4); |
|
186 word2 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3]; |
|
187 word2 = word2 ^ (word2 << 2); |
|
188 return (word0 ^ prepost) + word1 + word2; |
|
189 } |
|
190 |
|
191 |
|
192 // QUADGRAM wrapper with surrounding spaces |
|
193 // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add |
|
194 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes |
|
195 // For runtime use of tables |
|
196 uint32 QuadHashV2(const char* word_ptr, int bytecount) { |
|
197 if (bytecount == 0) {return 0;} |
|
198 uint32 prepost = 0; |
|
199 if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;} |
|
200 if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;} |
|
201 return QuadHashV2Mix(word_ptr, bytecount, prepost); |
|
202 } |
|
203 |
|
204 // QUADGRAM wrapper with surrounding underscores (offline use) |
|
205 // Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add |
|
206 // OVERSHOOTS up to 3 bytes |
|
207 // For offline construction of tables |
|
208 uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount) { |
|
209 if (bytecount == 0) {return 0;} |
|
210 const char* local_word_ptr = word_ptr; |
|
211 int local_bytecount = bytecount; |
|
212 uint32 prepost = 0; |
|
213 if (local_word_ptr[0] == '_') { |
|
214 prepost |= kPreSpaceIndicator; |
|
215 ++local_word_ptr; |
|
216 --local_bytecount; |
|
217 } |
|
218 if (local_word_ptr[local_bytecount - 1] == '_') { |
|
219 prepost |= kPostSpaceIndicator; |
|
220 --local_bytecount; |
|
221 } |
|
222 return QuadHashV2Mix(local_word_ptr, local_bytecount, prepost); |
|
223 } |
|
224 |
|
225 |
|
226 // OCTAGRAM |
|
227 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add |
|
228 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes |
|
229 // |
|
230 // The low 32 bits follow the pattern from above, tuned to different scripts |
|
231 // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each |
|
232 // For runtime use of tables V3 |
|
233 // Does X86 unaligned loads |
|
234 uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) { |
|
235 const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr); |
|
236 uint64 word0; |
|
237 uint64 word1; |
|
238 uint64 sum; |
|
239 |
|
240 if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;} |
|
241 if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;} |
|
242 switch ((bytecount - 1) >> 2) { |
|
243 case 0: // 1..4 bytes |
|
244 word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3]; |
|
245 sum = word0; |
|
246 word0 = word0 ^ (word0 >> 3); |
|
247 break; |
|
248 case 1: // 5..8 bytes |
|
249 word0 = UNALIGNED_LOAD32(word_ptr32); |
|
250 sum = word0; |
|
251 word0 = word0 ^ (word0 >> 3); |
|
252 word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3]; |
|
253 sum += word1; |
|
254 word1 = word1 ^ (word1 << 4); |
|
255 word0 += word1; |
|
256 break; |
|
257 case 2: // 9..12 bytes |
|
258 word0 = UNALIGNED_LOAD32(word_ptr32); |
|
259 sum = word0; |
|
260 word0 = word0 ^ (word0 >> 3); |
|
261 word1 = UNALIGNED_LOAD32(word_ptr32 + 1); |
|
262 sum += word1; |
|
263 word1 = word1 ^ (word1 << 4); |
|
264 word0 += word1; |
|
265 word1 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3]; |
|
266 sum += word1; |
|
267 word1 = word1 ^ (word1 << 2); |
|
268 word0 += word1; |
|
269 break; |
|
270 case 3: // 13..16 bytes |
|
271 word0 =UNALIGNED_LOAD32(word_ptr32); |
|
272 sum = word0; |
|
273 word0 = word0 ^ (word0 >> 3); |
|
274 word1 = UNALIGNED_LOAD32(word_ptr32 + 1); |
|
275 sum += word1; |
|
276 word1 = word1 ^ (word1 << 4); |
|
277 word0 += word1; |
|
278 word1 = UNALIGNED_LOAD32(word_ptr32 + 2); |
|
279 sum += word1; |
|
280 word1 = word1 ^ (word1 << 2); |
|
281 word0 += word1; |
|
282 word1 = UNALIGNED_LOAD32(word_ptr32 + 3) & kWordMask0[bytecount & 3]; |
|
283 sum += word1; |
|
284 word1 = word1 ^ (word1 >> 8); |
|
285 word0 += word1; |
|
286 break; |
|
287 case 4: // 17..20 bytes |
|
288 word0 = UNALIGNED_LOAD32(word_ptr32); |
|
289 sum = word0; |
|
290 word0 = word0 ^ (word0 >> 3); |
|
291 word1 = UNALIGNED_LOAD32(word_ptr32 + 1); |
|
292 sum += word1; |
|
293 word1 = word1 ^ (word1 << 4); |
|
294 word0 += word1; |
|
295 word1 = UNALIGNED_LOAD32(word_ptr32 + 2); |
|
296 sum += word1; |
|
297 word1 = word1 ^ (word1 << 2); |
|
298 word0 += word1; |
|
299 word1 = UNALIGNED_LOAD32(word_ptr32 + 3); |
|
300 sum += word1; |
|
301 word1 = word1 ^ (word1 >> 8); |
|
302 word0 += word1; |
|
303 word1 = UNALIGNED_LOAD32(word_ptr32 + 4) & kWordMask0[bytecount & 3]; |
|
304 sum += word1; |
|
305 word1 = word1 ^ (word1 >> 4); |
|
306 word0 += word1; |
|
307 break; |
|
308 default: // 21..24 bytes and higher (ignores beyond 24) |
|
309 word0 = UNALIGNED_LOAD32(word_ptr32); |
|
310 sum = word0; |
|
311 word0 = word0 ^ (word0 >> 3); |
|
312 word1 = UNALIGNED_LOAD32(word_ptr32 + 1); |
|
313 sum += word1; |
|
314 word1 = word1 ^ (word1 << 4); |
|
315 word0 += word1; |
|
316 word1 = UNALIGNED_LOAD32(word_ptr32 + 2); |
|
317 sum += word1; |
|
318 word1 = word1 ^ (word1 << 2); |
|
319 word0 += word1; |
|
320 word1 = UNALIGNED_LOAD32(word_ptr32 + 3); |
|
321 sum += word1; |
|
322 word1 = word1 ^ (word1 >> 8); |
|
323 word0 += word1; |
|
324 word1 = UNALIGNED_LOAD32(word_ptr32 + 4); |
|
325 sum += word1; |
|
326 word1 = word1 ^ (word1 >> 4); |
|
327 word0 += word1; |
|
328 word1 = UNALIGNED_LOAD32(word_ptr32 + 5) & kWordMask0[bytecount & 3]; |
|
329 sum += word1; |
|
330 word1 = word1 ^ (word1 >> 6); |
|
331 word0 += word1; |
|
332 break; |
|
333 } |
|
334 |
|
335 sum += (sum >> 17); // extra 1-bit shift for bytes 2 & 3 |
|
336 sum += (sum >> 9); // extra 1-bit shift for bytes 1 & 3 |
|
337 sum = (sum & 0xff) << 32; |
|
338 return (word0 ^ prepost) + sum; |
|
339 } |
|
340 |
|
341 // OCTAGRAM wrapper with surrounding spaces |
|
342 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add |
|
343 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes |
|
344 // |
|
345 // The low 32 bits follow the pattern from above, tuned to different scripts |
|
346 // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each |
|
347 // For runtime use of tables V3 |
|
348 uint64 OctaHash40(const char* word_ptr, int bytecount) { |
|
349 if (bytecount == 0) {return 0;} |
|
350 uint64 prepost = 0; |
|
351 if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;} |
|
352 if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;} |
|
353 return OctaHash40Mix(word_ptr, bytecount, prepost); |
|
354 } |
|
355 |
|
356 |
|
357 // OCTAGRAM wrapper with surrounding underscores (offline use) |
|
358 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add |
|
359 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes |
|
360 // |
|
361 // The low 32 bits follow the pattern from above, tuned to different scripts |
|
362 // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each |
|
363 // For offline construction of tables |
|
364 uint64 OctaHash40underscore(const char* word_ptr, int bytecount) { |
|
365 if (bytecount == 0) {return 0;} |
|
366 const char* local_word_ptr = word_ptr; |
|
367 int local_bytecount = bytecount; |
|
368 uint64 prepost = 0; |
|
369 if (local_word_ptr[0] == '_') { |
|
370 prepost |= kPreSpaceIndicator; |
|
371 ++local_word_ptr; |
|
372 --local_bytecount; |
|
373 } |
|
374 if (local_word_ptr[local_bytecount - 1] == '_') { |
|
375 prepost |= kPostSpaceIndicator; |
|
376 --local_bytecount; |
|
377 } |
|
378 return OctaHash40Mix(local_word_ptr, local_bytecount, prepost); |
|
379 } |
|
380 |
|
381 // Hash a consecutive pair of tokens/words A B |
|
382 // Old: hash is B - A, which gives too many false hits on one-char diffs |
|
383 // Now: rotate(A,13) + B |
|
384 uint64 PairHash(uint64 worda_hash, uint64 wordb_hash) { |
|
385 return ((worda_hash >> 13) | (worda_hash << (64 - 13))) + wordb_hash; |
|
386 } |
|
387 |
|
388 |
|
389 |
|
390 |
|
391 //----------------------------------------------------------------------------// |
|
392 // Finding groups of 1/2/4/8 letters // |
|
393 //----------------------------------------------------------------------------// |
|
394 |
|
395 // src points to a letter. Find the byte length of a unigram starting there. |
|
396 int UniLen(const char* src) { |
|
397 const char* src_end = src; |
|
398 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; |
|
399 return src_end - src; |
|
400 } |
|
401 |
|
402 // src points to a letter. Find the byte length of a bigram starting there. |
|
403 int BiLen(const char* src) { |
|
404 const char* src_end = src; |
|
405 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; |
|
406 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; |
|
407 return src_end - src; |
|
408 } |
|
409 |
|
410 // src points to a letter. Find the byte length of a quadgram starting there. |
|
411 int QuadLen(const char* src) { |
|
412 const char* src_end = src; |
|
413 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; |
|
414 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; |
|
415 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; |
|
416 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; |
|
417 return src_end - src; |
|
418 } |
|
419 |
|
420 // src points to a letter. Find the byte length of an octagram starting there. |
|
421 int OctaLen(const char* src) { |
|
422 const char* src_end = src; |
|
423 int charcount = 0; |
|
424 while (src_end[0] != ' ') { |
|
425 src_end += UTF8OneCharLen(src); |
|
426 ++charcount; |
|
427 if (charcount == 8) {break;} |
|
428 } |
|
429 return src_end - src; |
|
430 } |
|
431 |
|
432 } // End namespace CLD2 |
|
433 |
|
434 |
|
435 |
|
436 |
|
437 |