|
1 // Copyright 2013 Google Inc. All Rights Reserved. |
|
2 // |
|
3 // Licensed under the Apache License, Version 2.0 (the "License"); |
|
4 // you may not use this file except in compliance with the License. |
|
5 // You may obtain a copy of the License at |
|
6 // |
|
7 // http://www.apache.org/licenses/LICENSE-2.0 |
|
8 // |
|
9 // Unless required by applicable law or agreed to in writing, software |
|
10 // distributed under the License is distributed on an "AS IS" BASIS, |
|
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
12 // See the License for the specific language governing permissions and |
|
13 // limitations under the License. |
|
14 |
|
15 // |
|
16 // Author: dsites@google.com (Dick Sites) |
|
17 // |
|
18 // Just the stuff shared between offline table builder and online detector |
|
19 // |
|
20 |
|
21 #ifndef I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__ |
|
22 #define I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__ |
|
23 |
|
24 #include "integral_types.h" |
|
25 #include "cld2tablesummary.h" |
|
26 |
|
27 namespace CLD2 { |
|
28 |
|
29 // Runtime routines for hashing, looking up, and scoring |
|
30 // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams. |
|
31 // Unigrams and bigrams are for CJK languages only, including simplified/ |
|
32 // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and |
|
33 // Zhuang Han characters. Surrounding spaces are not considered. |
|
34 // Quadgrams and octagrams are for non-CJK and include two bits indicating |
|
35 // preceding and trailing spaces (word boundaries). |
|
36 |
|
37 |
|
38 //----------------------------------------------------------------------------// |
|
39 // Main quantized probability table // |
|
40 //----------------------------------------------------------------------------// |
|
41 |
|
42 // Table has 240 eight-byte entries. Each entry has a five-byte array and |
|
43 // a three-byte array of log base 2 probabilities in the range 1..12. |
|
44 // The intended use is to express five or three probabilities in a single-byte |
|
45 // subscript, then decode via this table. These probabilities are |
|
46 // intended to go with an array of five or three language numbers. |
|
47 // |
|
48 // The corresponding language numbers will have to be sorted by descending |
|
49 // probability, then the actual probability subscript chosen to match the |
|
50 // closest available entry in this table. |
|
51 // |
|
52 // Pattern of probability values: |
|
53 // hi 3/4 1/2 1/4 lo hi mid lo |
|
54 // where "3/4" is (hi*3+lo)/4, "1/2" is (hi+lo)/2, and "1/4" is (hi+lo*3)/4 |
|
55 // and mid is one of 3/4 1/2 or 1/4. |
|
56 // There are three groups of 78 (=12*13/2) entries, with hi running 1..12 and |
|
57 // lo running 1..hi. Only the first group is used for five-entry lookups. |
|
58 // The mid value in the first group is 1/2, the second group 3/4, and the |
|
59 // third group 1/4. For three-entry lookups, this allows the mid entry to be |
|
60 // somewhat higher or lower than the midpoint, to allow a better match to the |
|
61 // original probabilities. |
|
// Number of 8-byte entries in kLgProbV2Tbl: three groups of 78 entries plus
// six extra entries appended at the end.
62 static const int kLgProbV2TblSize = 240; |
|
// Flat byte array; entry i occupies bytes [i*8, i*8+8): a five-value run
// followed by a three-value run. Bracketed numbers in the trailing comments of
// the first group are entry subscripts.
63 static const uint8 kLgProbV2Tbl[kLgProbV2TblSize * 8] = { |
|
// Group 1, entries 0..77: the middle of the three-value run is the 1/2 point.
64 1,1,1,1,1, 1,1,1, // [0] |
|
65 2,2,2,1,1, 2,2,1, // [1] |
|
66 2,2,2,2,2, 2,2,2, |
|
67 3,3,2,2,1, 3,2,1, // [3] |
|
68 3,3,3,2,2, 3,3,2, |
|
69 3,3,3,3,3, 3,3,3, |
|
70 4,3,3,2,1, 4,3,1, // [6] |
|
71 4,4,3,3,2, 4,3,2, |
|
72 4,4,4,3,3, 4,4,3, |
|
73 4,4,4,4,4, 4,4,4, |
|
74 5,4,3,2,1, 5,3,1, // [10] |
|
75 5,4,4,3,2, 5,4,2, |
|
76 5,5,4,4,3, 5,4,3, |
|
77 5,5,5,4,4, 5,5,4, |
|
78 5,5,5,5,5, 5,5,5, |
|
79 6,5,4,2,1, 6,4,1, // [15] |
|
80 6,5,4,3,2, 6,4,2, |
|
81 6,5,5,4,3, 6,5,3, |
|
82 6,6,5,5,4, 6,5,4, |
|
83 6,6,6,5,5, 6,6,5, |
|
84 6,6,6,6,6, 6,6,6, |
|
85 7,6,4,3,1, 7,4,1, // [21] |
|
86 7,6,5,3,2, 7,5,2, |
|
87 7,6,5,4,3, 7,5,3, |
|
88 7,6,6,5,4, 7,6,4, |
|
89 7,7,6,6,5, 7,6,5, |
|
90 7,7,7,6,6, 7,7,6, |
|
91 7,7,7,7,7, 7,7,7, |
|
92 8,6,5,3,1, 8,5,1, // [28] |
|
93 8,7,5,4,2, 8,5,2, |
|
94 8,7,6,4,3, 8,6,3, |
|
95 8,7,6,5,4, 8,6,4, |
|
96 8,7,7,6,5, 8,7,5, |
|
97 8,8,7,7,6, 8,7,6, |
|
98 8,8,8,7,7, 8,8,7, |
|
99 8,8,8,8,8, 8,8,8, |
|
100 9,7,5,3,1, 9,5,1, // [36] |
|
101 9,7,6,4,2, 9,6,2, |
|
102 9,8,6,5,3, 9,6,3, |
|
103 9,8,7,5,4, 9,7,4, |
|
104 9,8,7,6,5, 9,7,5, |
|
105 9,8,8,7,6, 9,8,6, |
|
106 9,9,8,8,7, 9,8,7, |
|
107 9,9,9,8,8, 9,9,8, |
|
108 9,9,9,9,9, 9,9,9, |
|
109 10,8,6,3,1, 10,6,1, // [45] |
|
110 10,8,6,4,2, 10,6,2, |
|
111 10,8,7,5,3, 10,7,3, |
|
112 10,9,7,6,4, 10,7,4, |
|
113 10,9,8,6,5, 10,8,5, |
|
114 10,9,8,7,6, 10,8,6, |
|
115 10,9,9,8,7, 10,9,7, |
|
116 10,10,9,9,8, 10,9,8, |
|
117 10,10,10,9,9, 10,10,9, |
|
118 10,10,10,10,10, 10,10,10, |
|
119 11,9,6,4,1, 11,6,1, // [55] |
|
120 11,9,7,4,2, 11,7,2, |
|
121 11,9,7,5,3, 11,7,3, |
|
122 11,9,8,6,4, 11,8,4, |
|
123 11,10,8,7,5, 11,8,5, |
|
124 11,10,9,7,6, 11,9,6, |
|
125 11,10,9,8,7, 11,9,7, |
|
126 11,10,10,9,8, 11,10,8, |
|
127 11,11,10,10,9, 11,10,9, |
|
128 11,11,11,10,10, 11,11,10, |
|
129 11,11,11,11,11, 11,11,11, |
|
130 12,9,7,4,1, 12,7,1, // [66] |
|
131 12,10,7,5,2, 12,7,2, |
|
132 12,10,8,5,3, 12,8,3, |
|
133 12,10,8,6,4, 12,8,4, |
|
134 12,10,9,7,5, 12,9,5, |
|
135 12,11,9,8,6, 12,9,6, |
|
136 12,11,10,8,7, 12,10,7, |
|
137 12,11,10,9,8, 12,10,8, |
|
138 12,11,11,10,9, 12,11,9, |
|
139 12,12,11,11,10, 12,11,10, |
|
140 12,12,12,11,11, 12,12,11, |
|
141 12,12,12,12,12, 12,12,12, |
|
142 |
|
// Group 2, entries 78..155: same five-value runs as group 1; the middle of the
// three-value run is the 3/4 point.
143 1,1,1,1,1, 1,1,1, |
|
144 2,2,2,1,1, 2,2,1, |
|
145 2,2,2,2,2, 2,2,2, |
|
146 3,3,2,2,1, 3,3,1, |
|
147 3,3,3,2,2, 3,3,2, |
|
148 3,3,3,3,3, 3,3,3, |
|
149 4,3,3,2,1, 4,3,1, |
|
150 4,4,3,3,2, 4,4,2, |
|
151 4,4,4,3,3, 4,4,3, |
|
152 4,4,4,4,4, 4,4,4, |
|
153 5,4,3,2,1, 5,4,1, |
|
154 5,4,4,3,2, 5,4,2, |
|
155 5,5,4,4,3, 5,5,3, |
|
156 5,5,5,4,4, 5,5,4, |
|
157 5,5,5,5,5, 5,5,5, |
|
158 6,5,4,2,1, 6,5,1, |
|
159 6,5,4,3,2, 6,5,2, |
|
160 6,5,5,4,3, 6,5,3, |
|
161 6,6,5,5,4, 6,6,4, |
|
162 6,6,6,5,5, 6,6,5, |
|
163 6,6,6,6,6, 6,6,6, |
|
164 7,6,4,3,1, 7,6,1, |
|
165 7,6,5,3,2, 7,6,2, |
|
166 7,6,5,4,3, 7,6,3, |
|
167 7,6,6,5,4, 7,6,4, |
|
168 7,7,6,6,5, 7,7,5, |
|
169 7,7,7,6,6, 7,7,6, |
|
170 7,7,7,7,7, 7,7,7, |
|
171 8,6,5,3,1, 8,6,1, |
|
172 8,7,5,4,2, 8,7,2, |
|
173 8,7,6,4,3, 8,7,3, |
|
174 8,7,6,5,4, 8,7,4, |
|
175 8,7,7,6,5, 8,7,5, |
|
176 8,8,7,7,6, 8,8,6, |
|
177 8,8,8,7,7, 8,8,7, |
|
178 8,8,8,8,8, 8,8,8, |
|
179 9,7,5,3,1, 9,7,1, |
|
180 9,7,6,4,2, 9,7,2, |
|
181 9,8,6,5,3, 9,8,3, |
|
182 9,8,7,5,4, 9,8,4, |
|
183 9,8,7,6,5, 9,8,5, |
|
184 9,8,8,7,6, 9,8,6, |
|
185 9,9,8,8,7, 9,9,7, |
|
186 9,9,9,8,8, 9,9,8, |
|
187 9,9,9,9,9, 9,9,9, |
|
188 10,8,6,3,1, 10,8,1, |
|
189 10,8,6,4,2, 10,8,2, |
|
190 10,8,7,5,3, 10,8,3, |
|
191 10,9,7,6,4, 10,9,4, |
|
192 10,9,8,6,5, 10,9,5, |
|
193 10,9,8,7,6, 10,9,6, |
|
194 10,9,9,8,7, 10,9,7, |
|
195 10,10,9,9,8, 10,10,8, |
|
196 10,10,10,9,9, 10,10,9, |
|
197 10,10,10,10,10, 10,10,10, |
|
198 11,9,6,4,1, 11,9,1, |
|
199 11,9,7,4,2, 11,9,2, |
|
200 11,9,7,5,3, 11,9,3, |
|
201 11,9,8,6,4, 11,9,4, |
|
202 11,10,8,7,5, 11,10,5, |
|
203 11,10,9,7,6, 11,10,6, |
|
204 11,10,9,8,7, 11,10,7, |
|
205 11,10,10,9,8, 11,10,8, |
|
206 11,11,10,10,9, 11,11,9, |
|
207 11,11,11,10,10, 11,11,10, |
|
208 11,11,11,11,11, 11,11,11, |
|
209 12,9,7,4,1, 12,9,1, |
|
210 12,10,7,5,2, 12,10,2, |
|
211 12,10,8,5,3, 12,10,3, |
|
212 12,10,8,6,4, 12,10,4, |
|
213 12,10,9,7,5, 12,10,5, |
|
214 12,11,9,8,6, 12,11,6, |
|
215 12,11,10,8,7, 12,11,7, |
|
216 12,11,10,9,8, 12,11,8, |
|
217 12,11,11,10,9, 12,11,9, |
|
218 12,12,11,11,10, 12,12,10, |
|
219 12,12,12,11,11, 12,12,11, |
|
220 12,12,12,12,12, 12,12,12, |
|
221 |
|
// Group 3, entries 156..233: same five-value runs as group 1; the middle of the
// three-value run is the 1/4 point.
222 1,1,1,1,1, 1,1,1, |
|
223 2,2,2,1,1, 2,1,1, |
|
224 2,2,2,2,2, 2,2,2, |
|
225 3,3,2,2,1, 3,2,1, |
|
226 3,3,3,2,2, 3,2,2, |
|
227 3,3,3,3,3, 3,3,3, |
|
228 4,3,3,2,1, 4,2,1, |
|
229 4,4,3,3,2, 4,3,2, |
|
230 4,4,4,3,3, 4,3,3, |
|
231 4,4,4,4,4, 4,4,4, |
|
232 5,4,3,2,1, 5,2,1, |
|
233 5,4,4,3,2, 5,3,2, |
|
234 5,5,4,4,3, 5,4,3, |
|
235 5,5,5,4,4, 5,4,4, |
|
236 5,5,5,5,5, 5,5,5, |
|
237 6,5,4,2,1, 6,2,1, |
|
238 6,5,4,3,2, 6,3,2, |
|
239 6,5,5,4,3, 6,4,3, |
|
240 6,6,5,5,4, 6,5,4, |
|
241 6,6,6,5,5, 6,5,5, |
|
242 6,6,6,6,6, 6,6,6, |
|
243 7,6,4,3,1, 7,3,1, |
|
244 7,6,5,3,2, 7,3,2, |
|
245 7,6,5,4,3, 7,4,3, |
|
246 7,6,6,5,4, 7,5,4, |
|
247 7,7,6,6,5, 7,6,5, |
|
248 7,7,7,6,6, 7,6,6, |
|
249 7,7,7,7,7, 7,7,7, |
|
250 8,6,5,3,1, 8,3,1, |
|
251 8,7,5,4,2, 8,4,2, |
|
252 8,7,6,4,3, 8,4,3, |
|
253 8,7,6,5,4, 8,5,4, |
|
254 8,7,7,6,5, 8,6,5, |
|
255 8,8,7,7,6, 8,7,6, |
|
256 8,8,8,7,7, 8,7,7, |
|
257 8,8,8,8,8, 8,8,8, |
|
258 9,7,5,3,1, 9,3,1, |
|
259 9,7,6,4,2, 9,4,2, |
|
260 9,8,6,5,3, 9,5,3, |
|
261 9,8,7,5,4, 9,5,4, |
|
262 9,8,7,6,5, 9,6,5, |
|
263 9,8,8,7,6, 9,7,6, |
|
264 9,9,8,8,7, 9,8,7, |
|
265 9,9,9,8,8, 9,8,8, |
|
266 9,9,9,9,9, 9,9,9, |
|
267 10,8,6,3,1, 10,3,1, |
|
268 10,8,6,4,2, 10,4,2, |
|
269 10,8,7,5,3, 10,5,3, |
|
270 10,9,7,6,4, 10,6,4, |
|
271 10,9,8,6,5, 10,6,5, |
|
272 10,9,8,7,6, 10,7,6, |
|
273 10,9,9,8,7, 10,8,7, |
|
274 10,10,9,9,8, 10,9,8, |
|
275 10,10,10,9,9, 10,9,9, |
|
276 10,10,10,10,10, 10,10,10, |
|
277 11,9,6,4,1, 11,4,1, |
|
278 11,9,7,4,2, 11,4,2, |
|
279 11,9,7,5,3, 11,5,3, |
|
280 11,9,8,6,4, 11,6,4, |
|
281 11,10,8,7,5, 11,7,5, |
|
282 11,10,9,7,6, 11,7,6, |
|
283 11,10,9,8,7, 11,8,7, |
|
284 11,10,10,9,8, 11,9,8, |
|
285 11,11,10,10,9, 11,10,9, |
|
286 11,11,11,10,10, 11,10,10, |
|
287 11,11,11,11,11, 11,11,11, |
|
288 12,9,7,4,1, 12,4,1, |
|
289 12,10,7,5,2, 12,5,2, |
|
290 12,10,8,5,3, 12,5,3, |
|
291 12,10,8,6,4, 12,6,4, |
|
292 12,10,9,7,5, 12,7,5, |
|
293 12,11,9,8,6, 12,8,6, |
|
294 12,11,10,8,7, 12,8,7, |
|
295 12,11,10,9,8, 12,9,8, |
|
296 12,11,11,10,9, 12,10,9, |
|
297 12,12,11,11,10, 12,11,10, |
|
298 12,12,12,11,11, 12,11,11, |
|
299 12,12,12,12,12, 12,12,12, |
|
300 |
|
301 // Added 2013.01.28 for CJK compatible mapping |
|
// Entries 234..239: six extra rows that do not follow the hi/lo pattern of the
// three groups above.
302 8,5,2,2,2, 8,2,2, |
|
303 6,6,6,4,2, 6,6,2, |
|
304 6,5,4,4,4, 6,4,4, |
|
305 6,4,2,2,2, 6,2,2, |
|
306 4,3,2,2,2, 4,2,2, |
|
307 2,2,2,2,2, 2,2,2, |
|
308 }; |
|
309 |
|
310 // Backmap a single desired probability into an entry in kLgProbV2Tbl |
|
311 static const uint8 kLgProbV2TblBackmap[13] = { |
|
312 0, |
|
313 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, |
|
314 }; |
|
315 |
|
316 // Return address of 8-byte entry[i] |
|
317 inline const uint8* LgProb2TblEntry(int i) { |
|
318 return &kLgProbV2Tbl[i * 8]; |
|
319 } |
|
320 |
|
321 // Return one of three probabilities in an entry |
|
322 inline uint8 LgProb3(const uint8* entry, int j) { |
|
323 return entry[j + 5]; |
|
324 } |
|
325 |
|
326 |
|
327 // Routines to access a hash table of <key:wordhash, value:probs> pairs |
|
328 // Buckets have 4-byte wordhash for sizes < 32K buckets, but only |
|
329 // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as |
|
330 // bucket subscript. |
|
331 // Probs is packed: three languages plus a subscript for the probability table |
|
332 // Buckets have all the keys together, then all the values. Key array never |
|
333 // crosses a cache-line boundary, so no-match case takes exactly one cache miss. |
|
334 // Match case may sometimes take an additional cache miss on value access. |
|
335 // |
|
336 // Other possibilities include 5 or 10 6-byte entries plus pad to make 32 or 64 |
|
337 // byte buckets with single cache miss. |
|
338 // Or 2-byte key and 6-byte value, allowing 5 languages instead of three. |
|
339 |
|
340 |
|
341 //----------------------------------------------------------------------------// |
|
342 // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores // |
|
343 //----------------------------------------------------------------------------// |
|
344 |
|
345 // BIGRAM |
|
346 // Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post space |
|
347 // OVERSHOOTS up to 3 bytes |
|
348 // For runtime use of tables |
|
349 // Does X86 unaligned loads, UNALIGNED_LOAD32(_p), if !defined(NEED_ALIGNED_LOADS) |
|
// word_ptr is the first byte to hash and bytecount the number of bytes; the
// result is a 32-bit hash.
// NOTE(review): "OVERSHOOTS" presumably means bytes beyond word_ptr+bytecount
// may be read, so callers need readable padding -- confirm in the definition.
350 uint32 BiHashV2(const char* word_ptr, int bytecount); |
|
351 |
|
352 // QUADGRAM wrapper with surrounding spaces |
|
353 // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add |
|
354 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes |
|
355 // For runtime use of tables |
|
// word_ptr is the first byte of the quadgram and bytecount its length in
// bytes; the result is a 32-bit hash.
// NOTE(review): "UNDERSHOOTS 1 byte" presumably means word_ptr[-1] is read to
// detect a preceding space, so word_ptr must not be the first byte of its
// buffer -- confirm in the definition.
356 uint32 QuadHashV2(const char* word_ptr, int bytecount); |
|
357 |
|
358 // QUADGRAM wrapper with surrounding underscores (offline use) |
|
359 // Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add |
|
360 // OVERSHOOTS up to 3 bytes |
|
361 // For offline construction of tables |
|
// word_ptr is the first byte to hash and bytecount the number of bytes; the
// result is a 32-bit hash.
// NOTE(review): presumably produces the same hash QuadHashV2 gives for the
// space-delimited form, so offline tables match runtime lookups -- confirm.
362 uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount); |
|
363 |
|
364 // OCTAGRAM wrapper with surrounding spaces |
|
365 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add |
|
366 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes |
|
// word_ptr is the first byte of the octagram and bytecount its length in
// bytes; the result is returned in a 64-bit integer.
// NOTE(review): the "40" in the name and the "40-bit gram FP" wording used by
// OctaFPJustHash suggest only 40 bits are significant -- confirm.
367 uint64 OctaHash40(const char* word_ptr, int bytecount); |
|
368 |
|
369 |
|
370 // OCTAGRAM wrapper with surrounding underscores (offline use) |
|
371 // Pick up 1..24 bytes plus pre/post '_' and hash them via mask/shift/add |
|
372 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes |
|
// word_ptr is the first byte to hash and bytecount the number of bytes; the
// result is returned in a 64-bit integer.
// NOTE(review): QuadHashV2Underscore documents no undershoot; check whether
// the "UNDERSHOOTS 1 byte" remark really applies to this underscore variant.
373 uint64 OctaHash40underscore(const char* word_ptr, int bytecount); |
|
374 |
|
375 // Hash a consecutive pair of tokens/words A B |
|
// Takes the 64-bit hashes of the two words and returns one 64-bit hash for
// the pair.
// NOTE(review): the "A B" wording suggests the result depends on argument
// order -- confirm in the definition.
376 uint64 PairHash(uint64 worda_hash, uint64 wordb_hash); |
|
377 |
|
378 |
|
379 // From 32-bit gram FP, return hash table subscript and remaining key |
|
380 inline void QuadFPJustHash(uint32 quadhash, |
|
381 uint32 keymask, |
|
382 int bucketcount, |
|
383 uint32* subscr, uint32* hashkey) { |
|
384 *subscr = (quadhash + (quadhash >> 12)) & (bucketcount - 1); |
|
385 *hashkey = quadhash & keymask; |
|
386 } |
|
387 |
|
388 // From 40-bit gram FP, return hash table subscript and remaining key |
|
389 inline void OctaFPJustHash(uint64 longwordhash, |
|
390 uint32 keymask, |
|
391 int bucketcount, |
|
392 uint32* subscr, uint32* hashkey) { |
|
393 uint32 temp = (longwordhash + (longwordhash >> 12)) & (bucketcount - 1); |
|
394 *subscr = temp; |
|
395 temp = longwordhash >> 4; |
|
396 *hashkey = temp & keymask; |
|
397 } |
|
398 |
|
399 |
|
400 // Look up 32-bit gram FP in caller-passed table |
|
401 // Typical size 256K entries (1.5MB) |
|
402 // Two-byte hashkey |
|
403 inline const uint32 QuadHashV3Lookup4(const CLD2TableSummary* gram_obj, |
|
404 uint32 quadhash) { |
|
405 uint32 subscr, hashkey; |
|
406 const IndirectProbBucket4* quadtable = gram_obj->kCLDTable; |
|
407 uint32 keymask = gram_obj->kCLDTableKeyMask; |
|
408 int bucketcount = gram_obj->kCLDTableSize; |
|
409 QuadFPJustHash(quadhash, keymask, bucketcount, &subscr, &hashkey); |
|
410 const IndirectProbBucket4* bucket_ptr = &quadtable[subscr]; |
|
411 // Four-way associative, 4 compares |
|
412 if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) { |
|
413 return bucket_ptr->keyvalue[0]; |
|
414 } |
|
415 if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) { |
|
416 return bucket_ptr->keyvalue[1]; |
|
417 } |
|
418 if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) { |
|
419 return bucket_ptr->keyvalue[2]; |
|
420 } |
|
421 if (((hashkey ^ bucket_ptr->keyvalue[3]) & keymask) == 0) { |
|
422 return bucket_ptr->keyvalue[3]; |
|
423 } |
|
424 return 0; |
|
425 } |
|
426 |
|
427 // Look up 40-bit gram FP in caller-passed table |
|
428 // Typical size 256K-4M entries (1-16MB) |
|
429 // 24-12 bit hashkey packed with 8-20 bit indirect lang/probs |
|
430 // keymask is 0xfffff000 for 20-bit hashkey and 12-bit indirect |
|
431 inline const uint32 OctaHashV3Lookup4(const CLD2TableSummary* gram_obj, |
|
432 uint64 longwordhash) { |
|
433 uint32 subscr, hashkey; |
|
434 const IndirectProbBucket4* octatable = gram_obj->kCLDTable; |
|
435 uint32 keymask = gram_obj->kCLDTableKeyMask; |
|
436 int bucketcount = gram_obj->kCLDTableSize; |
|
437 OctaFPJustHash(longwordhash, keymask, bucketcount, |
|
438 &subscr, &hashkey); |
|
439 const IndirectProbBucket4* bucket_ptr = &octatable[subscr]; |
|
440 // Four-way associative, 4 compares |
|
441 if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) { |
|
442 return bucket_ptr->keyvalue[0]; |
|
443 } |
|
444 if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) { |
|
445 return bucket_ptr->keyvalue[1]; |
|
446 } |
|
447 if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) { |
|
448 return bucket_ptr->keyvalue[2]; |
|
449 } |
|
450 if (((hashkey ^ bucket_ptr->keyvalue[3]) & keymask) == 0) { |
|
451 return bucket_ptr->keyvalue[3]; |
|
452 } |
|
453 return 0; |
|
454 } |
|
455 |
|
456 |
|
457 //----------------------------------------------------------------------------// |
|
458 // Finding groups of 1/2/4/8 letters // |
|
459 //----------------------------------------------------------------------------// |
|
460 |
|
461 // Does not advance past space or tab/cr/lf/nul |
|
// Table indexed by byte value, 32 entries per row below:
//   0x00..0x20 -> 0   (all control bytes and space)
//   0x21..0xBF -> 1
//   0xC0..0xDF -> 2
//   0xE0..0xEF -> 3
//   0xF0..0xFF -> 4
// NOTE(review): the 2/3/4 values presumably are the UTF-8 sequence lengths
// implied by those lead bytes, with the entry used as a byte advance count --
// confirm against the code that indexes this table.
462 static const uint8 kAdvanceOneCharButSpace[256] = { |
|
463 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
|
464 0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
|
465 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
|
466 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
|
467 |
|
468 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
|
469 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
|
470 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, |
|
471 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, |
|
472 }; |
|
473 |
|
474 |
|
475 // Advances *only* on space or ASCII vowel (or illegal byte) |
|
// Table indexed by byte value, 32 entries per row below. Entries are 1 for:
//   0x00..0x20            (all control bytes and space)
//   A E I O U, a e i o u  (0x41 0x45 0x49 0x4F 0x55 and 0x61 0x65 0x69 0x6F 0x75)
//   0x80..0xBF
// and 0 for every other byte, including 0xC0..0xFF.
// NOTE(review): 0x80..0xBF presumably are the "illegal byte" case (UTF-8
// continuation bytes seen where a lead byte is expected) -- confirm.
476 static const uint8 kAdvanceOneCharSpaceVowel[256] = { |
|
477 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
|
478 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
|
479 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, |
|
480 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, |
|
481 |
|
482 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
|
483 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
|
484 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
|
485 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
|
486 }; |
|
487 |
|
488 |
|
// The four declarations below share one contract: src points to a letter and
// the return value is a length in bytes, not a character count.
// NOTE(review): whether src must be NUL- or space-terminated, and what is
// returned when fewer letters remain than the gram size, is not visible in
// this header -- check the definitions.
489 // src points to a letter. Find the byte length of a unigram starting there. |
|
490 int UniLen(const char* src); |
|
491 |
|
492 // src points to a letter. Find the byte length of a bigram starting there. |
|
493 int BiLen(const char* src); |
|
494 |
|
495 // src points to a letter. Find the byte length of a quadgram starting there. |
|
496 int QuadLen(const char* src); |
|
497 |
|
498 // src points to a letter. Find the byte length of an octagram starting there. |
|
499 int OctaLen(const char* src); |
|
500 |
|
501 } // End namespace CLD2 |
|
502 |
|
503 #endif // I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__ |
|
504 |
|
505 |
|
506 |
|
507 |
|
508 |
|
509 |