|
1 // Copyright 2013 Google Inc. All Rights Reserved. |
|
2 // |
|
3 // Licensed under the Apache License, Version 2.0 (the "License"); |
|
4 // you may not use this file except in compliance with the License. |
|
5 // You may obtain a copy of the License at |
|
6 // |
|
7 // http://www.apache.org/licenses/LICENSE-2.0 |
|
8 // |
|
9 // Unless required by applicable law or agreed to in writing, software |
|
10 // distributed under the License is distributed on an "AS IS" BASIS, |
|
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
12 // See the License for the specific language governing permissions and |
|
13 // limitations under the License. |
|
14 |
|
15 // |
|
16 // Author: dsites@google.com (Dick Sites) |
|
17 // Updated 2014.01 for dual table lookup |
|
18 // |
|
19 |
|
20 #include "cldutil.h" |
|
21 #include <string> |
|
22 |
|
23 #include "cld2tablesummary.h" |
|
24 #include "integral_types.h" |
|
25 #include "port.h" |
|
26 #include "utf8statetable.h" |
|
27 |
|
28 namespace CLD2 { |
|
29 |
|
30 // Caller supplies the right tables in scoringcontext |
|
31 |
|
32 // Runtime routines for hashing, looking up, and scoring |
|
33 // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams. |
|
34 // Unigrams and bigrams are for CJK languages only, including simplified/ |
|
35 // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and |
|
36 // Zhuang Han characters. Surrounding spaces are not considered. |
|
37 // Quadgrams and octagrams for for non-CJK and include two bits indicating |
|
38 // preceding and trailing spaces (word boundaries). |
|
39 |
|
40 |
|
41 static const int kMinCJKUTF8CharBytes = 3; |
|
42 |
|
43 static const int kMinGramCount = 3; |
|
44 static const int kMaxGramCount = 16; |
|
45 |
|
46 static const int UTFmax = 4; // Max number of bytes in a UTF-8 character |
|
47 |
|
48 // 1 to skip ASCII space, vowels AEIOU aeiou and UTF-8 continuation bytes 80-BF |
|
49 static const uint8 kSkipSpaceVowelContinue[256] = { |
|
50 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
|
51 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
|
52 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, |
|
53 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, |
|
54 |
|
55 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
|
56 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
|
57 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
|
58 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
|
59 }; |
|
60 |
|
61 // 1 to skip ASCII space, and UTF-8 continuation bytes 80-BF |
|
62 static const uint8 kSkipSpaceContinue[256] = { |
|
63 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
|
64 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
|
65 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
|
66 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
|
67 |
|
68 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
|
69 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
|
70 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
|
71 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
|
72 }; |
|
73 |
|
74 |
|
75 // Always advances one UTF-8 character |
|
76 static const uint8 kAdvanceOneChar[256] = { |
|
77 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
|
78 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
|
79 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
|
80 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
|
81 |
|
82 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
|
83 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
|
84 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, |
|
85 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, |
|
86 }; |
|
87 |
|
88 // Advances *only* on space (or illegal byte) |
|
89 static const uint8 kAdvanceOneCharSpace[256] = { |
|
90 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
|
91 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
|
92 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
|
93 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
|
94 |
|
95 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
|
96 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
|
97 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
|
98 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
|
99 }; |
|
100 |
|
101 |
|
102 // Routines to access a hash table of <key:wordhash, value:probs> pairs |
|
103 // Buckets have 4-byte wordhash for sizes < 32K buckets, but only |
|
104 // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as |
|
105 // bucket subscript. |
|
106 // Probs is a packed: three languages plus a subscript for probability table |
|
107 // Buckets have all the keys together, then all the values.Key array never |
|
108 // crosses a cache-line boundary, so no-match case takes exactly one cache miss. |
|
109 // Match case may sometimes take an additional cache miss on value access. |
|
110 // |
|
111 // Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64 |
|
112 // byte buckets with single cache miss. |
|
113 // Or 2-byte key and 6-byte value, allowing 5 languages instead of three. |
|
114 //------------------------------------------------------------------------------ |
|
115 |
|
116 //----------------------------------------------------------------------------// |
|
117 // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores // |
|
118 //----------------------------------------------------------------------------// |
|
119 |
|
120 //----------------------------------------------------------------------------// |
|
121 // Scoring single groups of letters // |
|
122 //----------------------------------------------------------------------------// |
|
123 |
|
124 // BIGRAM, QUADGRAM, OCTAGRAM score one => tote |
|
125 // Input: 4-byte entry of 3 language numbers and one probability subscript, plus |
|
126 // an accumulator tote. (language 0 means unused entry) |
|
127 // Output: running sums in tote updated |
|
128 void ProcessProbV2Tote(uint32 probs, Tote* tote) { |
|
129 uint8 prob123 = (probs >> 0) & 0xff; |
|
130 const uint8* prob123_entry = LgProb2TblEntry(prob123); |
|
131 |
|
132 uint8 top1 = (probs >> 8) & 0xff; |
|
133 if (top1 > 0) {tote->Add(top1, LgProb3(prob123_entry, 0));} |
|
134 uint8 top2 = (probs >> 16) & 0xff; |
|
135 if (top2 > 0) {tote->Add(top2, LgProb3(prob123_entry, 1));} |
|
136 uint8 top3 = (probs >> 24) & 0xff; |
|
137 if (top3 > 0) {tote->Add(top3, LgProb3(prob123_entry, 2));} |
|
138 } |
|
139 |
|
140 // Return score for a particular per-script language, or zero |
|
141 int GetLangScore(uint32 probs, uint8 pslang) { |
|
142 uint8 prob123 = (probs >> 0) & 0xff; |
|
143 const uint8* prob123_entry = LgProb2TblEntry(prob123); |
|
144 int retval = 0; |
|
145 uint8 top1 = (probs >> 8) & 0xff; |
|
146 if (top1 == pslang) {retval += LgProb3(prob123_entry, 0);} |
|
147 uint8 top2 = (probs >> 16) & 0xff; |
|
148 if (top2 == pslang) {retval += LgProb3(prob123_entry, 1);} |
|
149 uint8 top3 = (probs >> 24) & 0xff; |
|
150 if (top3 == pslang) {retval += LgProb3(prob123_entry, 2);} |
|
151 return retval; |
|
152 } |
|
153 |
|
154 //----------------------------------------------------------------------------// |
|
155 // Routines to accumulate probabilities // |
|
156 //----------------------------------------------------------------------------// |
|
157 |
|
158 |
|
159 // BIGRAM, using hash table, always advancing by 1 char |
|
160 // Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj |
|
161 // Score all bigrams in isrc, using languages that have bigrams (CJK) |
|
162 // Return number of bigrams that hit in the hash table |
|
163 int DoBigramScoreV3(const CLD2TableSummary* bigram_obj, |
|
164 const char* isrc, int srclen, Tote* chunk_tote) { |
|
165 int hit_count = 0; |
|
166 const char* src = isrc; |
|
167 |
|
168 // Hashtable-based CJK bigram lookup |
|
169 const uint8* usrc = reinterpret_cast<const uint8*>(src); |
|
170 const uint8* usrclimit1 = usrc + srclen - UTFmax; |
|
171 |
|
172 while (usrc < usrclimit1) { |
|
173 int len = kAdvanceOneChar[usrc[0]]; |
|
174 int len2 = kAdvanceOneChar[usrc[len]] + len; |
|
175 |
|
176 if ((kMinCJKUTF8CharBytes * 2) <= len2) { // Two CJK chars possible |
|
177 // Lookup and score this bigram |
|
178 // Always ignore pre/post spaces |
|
179 uint32 bihash = BiHashV2(reinterpret_cast<const char*>(usrc), len2); |
|
180 uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash); |
|
181 // Now go indirect on the subscript |
|
182 probs = bigram_obj->kCLDTableInd[probs & |
|
183 ~bigram_obj->kCLDTableKeyMask]; |
|
184 |
|
185 // Process the bigram |
|
186 if (probs != 0) { |
|
187 ProcessProbV2Tote(probs, chunk_tote); |
|
188 ++hit_count; |
|
189 } |
|
190 } |
|
191 usrc += len; // Advance by one char |
|
192 } |
|
193 |
|
194 return hit_count; |
|
195 } |
|
196 |
|
197 |
|
198 // Score up to 64KB of a single script span in one pass |
|
199 // Make a dummy entry off the end to calc length of last span |
|
200 // Return offset of first unused input byte |
|
201 int GetUniHits(const char* text, |
|
202 int letter_offset, int letter_limit, |
|
203 ScoringContext* scoringcontext, |
|
204 ScoringHitBuffer* hitbuffer) { |
|
205 const char* isrc = &text[letter_offset]; |
|
206 const char* src = isrc; |
|
207 // Limit is end, which has extra 20 20 20 00 past len |
|
208 const char* srclimit = &text[letter_limit]; |
|
209 |
|
210 // Local copies |
|
211 const UTF8PropObj* unigram_obj = |
|
212 scoringcontext->scoringtables->unigram_obj; |
|
213 int next_base = hitbuffer->next_base; |
|
214 int next_base_limit = hitbuffer->maxscoringhits; |
|
215 |
|
216 // Visit all unigrams |
|
217 if (src[0] == ' ') {++src;} // skip any initial space |
|
218 while (src < srclimit) { |
|
219 const uint8* usrc = reinterpret_cast<const uint8*>(src); |
|
220 int len = kAdvanceOneChar[usrc[0]]; |
|
221 src += len; |
|
222 // Look up property of one UTF-8 character and advance over it. |
|
223 // Updates usrc and len (bad interface design), hence increment above |
|
224 int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &len); |
|
225 if (propval > 0) { |
|
226 // Save indirect subscript for later scoring; 1 or 2 langprobs |
|
227 int indirect_subscr = propval; |
|
228 hitbuffer->base[next_base].offset = src - text; // Offset in text |
|
229 hitbuffer->base[next_base].indirect = indirect_subscr; |
|
230 ++next_base; |
|
231 } |
|
232 |
|
233 if (next_base >= next_base_limit) {break;} |
|
234 } |
|
235 |
|
236 hitbuffer->next_base = next_base; |
|
237 |
|
238 // Make a dummy entry off the end to calc length of last span |
|
239 int dummy_offset = src - text; |
|
240 hitbuffer->base[hitbuffer->next_base].offset = dummy_offset; |
|
241 hitbuffer->base[hitbuffer->next_base].indirect = 0; |
|
242 |
|
243 return src - text; |
|
244 } |
|
245 |
|
246 // Score up to 64KB of a single script span, doing both delta-bi and |
|
247 // distinct bis in one pass |
|
248 void GetBiHits(const char* text, |
|
249 int letter_offset, int letter_limit, |
|
250 ScoringContext* scoringcontext, |
|
251 ScoringHitBuffer* hitbuffer) { |
|
252 const char* isrc = &text[letter_offset]; |
|
253 const char* src = isrc; |
|
254 // Limit is end |
|
255 const char* srclimit1 = &text[letter_limit]; |
|
256 |
|
257 // Local copies |
|
258 const CLD2TableSummary* deltabi_obj = |
|
259 scoringcontext->scoringtables->deltabi_obj; |
|
260 const CLD2TableSummary* distinctbi_obj = |
|
261 scoringcontext->scoringtables->distinctbi_obj; |
|
262 int next_delta = hitbuffer->next_delta; |
|
263 int next_delta_limit = hitbuffer->maxscoringhits; |
|
264 int next_distinct = hitbuffer->next_distinct; |
|
265 // We can do 2 inserts per loop, so -1 |
|
266 int next_distinct_limit = hitbuffer->maxscoringhits - 1; |
|
267 |
|
268 while (src < srclimit1) { |
|
269 const uint8* usrc = reinterpret_cast<const uint8*>(src); |
|
270 int len = kAdvanceOneChar[usrc[0]]; |
|
271 int len2 = kAdvanceOneChar[usrc[len]] + len; |
|
272 |
|
273 if ((kMinCJKUTF8CharBytes * 2) <= len2) { // Two CJK chars possible |
|
274 // Lookup and this bigram and save <offset, indirect> |
|
275 uint32 bihash = BiHashV2(src, len2); |
|
276 uint32 probs = QuadHashV3Lookup4(deltabi_obj, bihash); |
|
277 // Now go indirect on the subscript |
|
278 if (probs != 0) { |
|
279 // Save indirect subscript for later scoring; 1 langprob |
|
280 int indirect_subscr = probs & ~deltabi_obj->kCLDTableKeyMask; |
|
281 hitbuffer->delta[next_delta].offset = src - text; |
|
282 hitbuffer->delta[next_delta].indirect = indirect_subscr; |
|
283 ++next_delta; |
|
284 } |
|
285 // Lookup this distinct bigram and save <offset, indirect> |
|
286 probs = QuadHashV3Lookup4(distinctbi_obj, bihash); |
|
287 if (probs != 0) { |
|
288 int indirect_subscr = probs & ~distinctbi_obj->kCLDTableKeyMask; |
|
289 hitbuffer->distinct[next_distinct].offset = src - text; |
|
290 hitbuffer->distinct[next_distinct].indirect = indirect_subscr; |
|
291 ++next_distinct; |
|
292 } |
|
293 } |
|
294 src += len; // Advance by one char (not two) |
|
295 |
|
296 // Almost always srclimit hit first |
|
297 if (next_delta >= next_delta_limit) {break;} |
|
298 if (next_distinct >= next_distinct_limit) {break;} |
|
299 } |
|
300 |
|
301 hitbuffer->next_delta = next_delta; |
|
302 hitbuffer->next_distinct = next_distinct; |
|
303 |
|
304 // Make a dummy entry off the end to calc length of last span |
|
305 int dummy_offset = src - text; |
|
306 hitbuffer->delta[hitbuffer->next_delta].offset = dummy_offset; |
|
307 hitbuffer->delta[hitbuffer->next_delta].indirect = 0; |
|
308 hitbuffer->distinct[hitbuffer->next_distinct].offset = dummy_offset; |
|
309 hitbuffer->distinct[hitbuffer->next_distinct].indirect = 0; |
|
310 } |
|
311 |
|
312 // Score up to 64KB of a single script span in one pass |
|
313 // Make a dummy entry off the end to calc length of last span |
|
314 // Return offset of first unused input byte |
|
315 int GetQuadHits(const char* text, |
|
316 int letter_offset, int letter_limit, |
|
317 ScoringContext* scoringcontext, |
|
318 ScoringHitBuffer* hitbuffer) { |
|
319 const char* isrc = &text[letter_offset]; |
|
320 const char* src = isrc; |
|
321 // Limit is end, which has extra 20 20 20 00 past len |
|
322 const char* srclimit = &text[letter_limit]; |
|
323 |
|
324 // Local copies |
|
325 const CLD2TableSummary* quadgram_obj = |
|
326 scoringcontext->scoringtables->quadgram_obj; |
|
327 const CLD2TableSummary* quadgram_obj2 = |
|
328 scoringcontext->scoringtables->quadgram_obj2; |
|
329 int next_base = hitbuffer->next_base; |
|
330 int next_base_limit = hitbuffer->maxscoringhits; |
|
331 |
|
332 // Run a little cache of last quad hits to catch overly-repetitive "text" |
|
333 // We don't care if we miss a couple repetitions at scriptspan boundaries |
|
334 int next_prior_quadhash = 0; |
|
335 uint32 prior_quadhash[2] = {0, 0}; |
|
336 |
|
337 // Visit all quadgrams |
|
338 if (src[0] == ' ') {++src;} // skip any initial space |
|
339 while (src < srclimit) { |
|
340 // Find one quadgram |
|
341 const char* src_end = src; |
|
342 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; |
|
343 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; |
|
344 const char* src_mid = src_end; |
|
345 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; |
|
346 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; |
|
347 int len = src_end - src; |
|
348 // Hash the quadgram |
|
349 uint32 quadhash = QuadHashV2(src, len); |
|
350 |
|
351 // Filter out recent repeats |
|
352 if ((quadhash != prior_quadhash[0]) && (quadhash != prior_quadhash[1])) { |
|
353 // Look up this quadgram and save <offset, indirect> |
|
354 uint32 indirect_flag = 0; // For dual tables |
|
355 const CLD2TableSummary* hit_obj = quadgram_obj; |
|
356 uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash); |
|
357 if ((probs == 0) && (quadgram_obj2->kCLDTableSize != 0)) { |
|
358 // Try lookup in dual table if not found in first one |
|
359 // Note: we need to know later which of two indirect tables to use. |
|
360 indirect_flag = 0x80000000u; |
|
361 hit_obj = quadgram_obj2; |
|
362 probs = QuadHashV3Lookup4(quadgram_obj2, quadhash); |
|
363 } |
|
364 if (probs != 0) { |
|
365 // Round-robin two entries of actual hits |
|
366 prior_quadhash[next_prior_quadhash] = quadhash; |
|
367 next_prior_quadhash = (next_prior_quadhash + 1) & 1; |
|
368 |
|
369 // Save indirect subscript for later scoring; 1 or 2 langprobs |
|
370 int indirect_subscr = probs & ~hit_obj->kCLDTableKeyMask; |
|
371 hitbuffer->base[next_base].offset = src - text; // Offset in text |
|
372 // Flip the high bit for table2 |
|
373 hitbuffer->base[next_base].indirect = indirect_subscr | indirect_flag; |
|
374 ++next_base; |
|
375 } |
|
376 } |
|
377 |
|
378 // Advance: all the way past word if at end-of-word, else 2 chars |
|
379 if (src_end[0] == ' ') { |
|
380 src = src_end; |
|
381 } else { |
|
382 src = src_mid; |
|
383 } |
|
384 |
|
385 // Skip over space at end of word, or ASCII vowel in middle of word |
|
386 // Use kAdvanceOneCharSpace instead to get rid of vowel hack |
|
387 if (src < srclimit) { |
|
388 src += kAdvanceOneCharSpaceVowel[(uint8)src[0]]; |
|
389 } else { |
|
390 // Advancing by 4/8/16 can overshoot, but we are about to exit anyway |
|
391 src = srclimit; |
|
392 } |
|
393 |
|
394 if (next_base >= next_base_limit) {break;} |
|
395 } |
|
396 |
|
397 hitbuffer->next_base = next_base; |
|
398 |
|
399 // Make a dummy entry off the end to calc length of last span |
|
400 int dummy_offset = src - text; |
|
401 hitbuffer->base[hitbuffer->next_base].offset = dummy_offset; |
|
402 hitbuffer->base[hitbuffer->next_base].indirect = 0; |
|
403 |
|
404 return src - text; |
|
405 } |
|
406 |
|
407 // inputs: |
|
408 // const tables |
|
409 // const char* isrc, int srclen (in sscriptbuffer) |
|
410 // intermediates: |
|
411 // vector of octa <offset, probs> (which need indirect table to decode) |
|
412 // vector of distinct <offset, probs> (which need indirect table to decode) |
|
413 |
|
414 // Score up to 64KB of a single script span, doing both delta-octa and |
|
415 // distinct words in one pass |
|
416 void GetOctaHits(const char* text, |
|
417 int letter_offset, int letter_limit, |
|
418 ScoringContext* scoringcontext, |
|
419 ScoringHitBuffer* hitbuffer) { |
|
420 const char* isrc = &text[letter_offset]; |
|
421 const char* src = isrc; |
|
422 // Limit is end+1, to include extra space char (0x20) off the end |
|
423 const char* srclimit = &text[letter_limit + 1]; |
|
424 |
|
425 // Local copies |
|
426 const CLD2TableSummary* deltaocta_obj = |
|
427 scoringcontext->scoringtables->deltaocta_obj; |
|
428 int next_delta = hitbuffer->next_delta; |
|
429 int next_delta_limit = hitbuffer->maxscoringhits; |
|
430 |
|
431 const CLD2TableSummary* distinctocta_obj = |
|
432 scoringcontext->scoringtables->distinctocta_obj; |
|
433 int next_distinct = hitbuffer->next_distinct; |
|
434 // We can do 2 inserts per loop, so -1 |
|
435 int next_distinct_limit = hitbuffer->maxscoringhits - 1; |
|
436 |
|
437 // Run a little cache of last octa hits to catch overly-repetitive "text" |
|
438 // We don't care if we miss a couple repetitions at scriptspan boundaries |
|
439 int next_prior_octahash = 0; |
|
440 uint64 prior_octahash[2] = {0, 0}; |
|
441 |
|
442 // Score all words truncated to 8 characters |
|
443 int charcount = 0; |
|
444 // Skip any initial space |
|
445 if (src[0] == ' ') {++src;} |
|
446 |
|
447 // Begin the first word |
|
448 const char* prior_word_start = src; |
|
449 const char* word_start = src; |
|
450 const char* word_end = word_start; |
|
451 while (src < srclimit) { |
|
452 // Terminate previous word or continue current word |
|
453 if (src[0] == ' ') { |
|
454 int len = word_end - word_start; |
|
455 // Hash the word |
|
456 uint64 wordhash40 = OctaHash40(word_start, len); |
|
457 uint32 probs; |
|
458 |
|
459 // Filter out recent repeats. Unlike quads, we update even if no hit, |
|
460 // so we can get hits on same word if separated by non-hit words |
|
461 if ((wordhash40 != prior_octahash[0]) && |
|
462 (wordhash40 != prior_octahash[1])) { |
|
463 // Round-robin two entries of words |
|
464 prior_octahash[next_prior_octahash] = wordhash40; |
|
465 next_prior_octahash = 1 - next_prior_octahash; // Alternates 0,1,0,1 |
|
466 |
|
467 // (1) Lookup distinct word PAIR. For a pair, we want an asymmetrical |
|
468 // function of the two word hashs. For words A B C, B-A and C-B are good |
|
469 // enough and fast. We use the same table as distinct single words |
|
470 // Do not look up a pair of identical words -- all pairs hash to zero |
|
471 // Both 1- and 2-word distinct lookups are in distinctocta_obj now |
|
472 // Do this first, because it has the lowest offset |
|
473 uint64 tmp_prior_hash = prior_octahash[next_prior_octahash]; |
|
474 if ((tmp_prior_hash != 0) && (tmp_prior_hash != wordhash40)) { |
|
475 uint64 pair_hash = PairHash(tmp_prior_hash, wordhash40); |
|
476 probs = OctaHashV3Lookup4(distinctocta_obj, pair_hash); |
|
477 if (probs != 0) { |
|
478 int indirect_subscr = probs & ~distinctocta_obj->kCLDTableKeyMask; |
|
479 hitbuffer->distinct[next_distinct].offset = prior_word_start - text; |
|
480 hitbuffer->distinct[next_distinct].indirect = indirect_subscr; |
|
481 ++next_distinct; |
|
482 } |
|
483 } |
|
484 |
|
485 // (2) Lookup this distinct word and save <offset, indirect> |
|
486 probs = OctaHashV3Lookup4(distinctocta_obj, wordhash40); |
|
487 if (probs != 0) { |
|
488 int indirect_subscr = probs & ~distinctocta_obj->kCLDTableKeyMask; |
|
489 hitbuffer->distinct[next_distinct].offset = word_start - text; |
|
490 hitbuffer->distinct[next_distinct].indirect = indirect_subscr; |
|
491 ++next_distinct; |
|
492 } |
|
493 |
|
494 // (3) Lookup this word and save <offset, indirect> |
|
495 probs = OctaHashV3Lookup4(deltaocta_obj, wordhash40); |
|
496 if (probs != 0) { |
|
497 // Save indirect subscript for later scoring; 1 langprob |
|
498 int indirect_subscr = probs & ~deltaocta_obj->kCLDTableKeyMask; |
|
499 hitbuffer->delta[next_delta].offset = word_start - text; |
|
500 hitbuffer->delta[next_delta].indirect = indirect_subscr; |
|
501 ++next_delta; |
|
502 } |
|
503 } |
|
504 |
|
505 // Begin the next word |
|
506 charcount = 0; |
|
507 prior_word_start = word_start; |
|
508 word_start = src + 1; // Over the space |
|
509 word_end = word_start; |
|
510 } else { |
|
511 ++charcount; |
|
512 } |
|
513 |
|
514 // Advance to next char |
|
515 src += UTF8OneCharLen(src); |
|
516 if (charcount <= 8) { |
|
517 word_end = src; |
|
518 } |
|
519 // Almost always srclimit hit first |
|
520 if (next_delta >= next_delta_limit) {break;} |
|
521 if (next_distinct >= next_distinct_limit) {break;} |
|
522 } |
|
523 |
|
524 hitbuffer->next_delta = next_delta; |
|
525 hitbuffer->next_distinct = next_distinct; |
|
526 |
|
527 // Make a dummy entry off the end to calc length of last span |
|
528 int dummy_offset = src - text; |
|
529 hitbuffer->delta[hitbuffer->next_delta].offset = dummy_offset; |
|
530 hitbuffer->delta[hitbuffer->next_delta].indirect = 0; |
|
531 hitbuffer->distinct[hitbuffer->next_distinct].offset = dummy_offset; |
|
532 hitbuffer->distinct[hitbuffer->next_distinct].indirect = 0; |
|
533 } |
|
534 |
|
535 |
|
536 //----------------------------------------------------------------------------// |
|
537 // Reliability calculations, for single language and between languages // |
|
538 //----------------------------------------------------------------------------// |
|
539 |
|
540 // Return reliablity of result 0..100 for top two scores |
|
541 // delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable |
|
542 // (on a scale where +1 is a factor of 2 ** 1.6 = 3.02) |
|
543 // Threshold is uni/quadgram increment count, bounded above and below. |
|
544 // |
|
545 // Requiring a factor of 3 improvement (e.g. +1 log base 3) |
|
546 // for each scored quadgram is too stringent, so I've backed this off to a |
|
547 // factor of 2 (e.g. +5/8 log base 3). |
|
548 // |
|
549 // I also somewhat lowered the Min/MaxGramCount limits above |
|
550 // |
|
551 // Added: if fewer than 8 quads/unis, max reliability is 12*n percent |
|
552 // |
|
553 int ReliabilityDelta(int value1, int value2, int gramcount) { |
|
554 int max_reliability_percent = 100; |
|
555 if (gramcount < 8) { |
|
556 max_reliability_percent = 12 * gramcount; |
|
557 } |
|
558 int fully_reliable_thresh = (gramcount * 5) >> 3; // see note above |
|
559 if (fully_reliable_thresh < kMinGramCount) { // Fully = 3..16 |
|
560 fully_reliable_thresh = kMinGramCount; |
|
561 } else if (fully_reliable_thresh > kMaxGramCount) { |
|
562 fully_reliable_thresh = kMaxGramCount; |
|
563 } |
|
564 |
|
565 int delta = value1 - value2; |
|
566 if (delta >= fully_reliable_thresh) {return max_reliability_percent;} |
|
567 if (delta <= 0) {return 0;} |
|
568 return minint(max_reliability_percent, |
|
569 (100 * delta) / fully_reliable_thresh); |
|
570 } |
|
571 |
|
572 // Return reliablity of result 0..100 for top score vs. expected mainsteam score |
|
573 // Values are score per 1024 bytes of input |
|
574 // ratio = max(top/mainstream, mainstream/top) |
|
575 // ratio > 4.0 is 0% reliable, <= 2.0 is 100% reliable |
|
576 // Change: short-text word scoring can give unusually good results. |
|
577 // Let top exceed mainstream by 4x at 50% reliable |
|
578 // |
|
579 // dsites April 2010: These could be tightened up. It would be |
|
580 // reasonable with newer data and round-robin table allocation to start ramping |
|
581 // down at mean * 1.5 and mean/1.5, while letting mean*2 and mean/2 pass, |
|
582 // but just barely. |
|
583 // |
|
584 // dsites March 2013: Tightened up a bit. |
|
585 static const double kRatio100 = 1.5; |
|
586 static const double kRatio0 = 4.0; |
|
587 int ReliabilityExpected(int actual_score_1kb, int expected_score_1kb) { |
|
588 if (expected_score_1kb == 0) {return 100;} // No reliability data available yet |
|
589 if (actual_score_1kb == 0) {return 0;} // zero score = unreliable |
|
590 double ratio; |
|
591 if (expected_score_1kb > actual_score_1kb) { |
|
592 ratio = (1.0 * expected_score_1kb) / actual_score_1kb; |
|
593 } else { |
|
594 ratio = (1.0 * actual_score_1kb) / expected_score_1kb; |
|
595 } |
|
596 // Ratio 1.0 .. 1.5 scores 100% |
|
597 // Ratio 2.0 scores 80% |
|
598 // Linear decline, to ratio 4.0 scores 0% |
|
599 if (ratio <= kRatio100) {return 100;} |
|
600 if (ratio > kRatio0) {return 0;} |
|
601 |
|
602 int percent_good = 100.0 * (kRatio0 - ratio) / (kRatio0 - kRatio100); |
|
603 return percent_good; |
|
604 } |
|
605 |
|
606 // Create a langprob packed value from its parts. |
|
607 // qprob is quantized [0..12] |
|
608 // We use Latn script to represent any RTypeMany language |
|
609 uint32 MakeLangProb(Language lang, int qprob) { |
|
610 uint32 pslang = PerScriptNumber(ULScript_Latin, lang); |
|
611 uint32 retval = (pslang << 8) | kLgProbV2TblBackmap[qprob]; |
|
612 return retval; |
|
613 } |
|
614 |
|
615 } // End namespace CLD2 |
|
616 |
|
617 |
|
618 |
|
619 |
|
620 |