// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//
// Author: dsites@google.com (Dick Sites)
// Updated 2014.01 for dual table lookup
//

#include "scoreonescriptspan.h"

#include "cldutil.h"
#include "debug.h"
#include "lang_script.h"

#include <stdio.h>

using namespace std;

namespace CLD2 {

static const int kUnreliablePercentThreshold = 75;

void AddLangProb(uint32 langprob, Tote* chunk_tote) {
  ProcessProbV2Tote(langprob, chunk_tote);
}

void ZeroPSLang(uint32 langprob, Tote* chunk_tote) {
  uint8 top1 = (langprob >> 8) & 0xff;
  chunk_tote->SetScore(top1, 0);
}

bool SameCloseSet(uint16 lang1, uint16 lang2) {
  int lang1_close_set = LanguageCloseSet(static_cast<Language>(lang1));
  if (lang1_close_set == 0) {return false;}
  int lang2_close_set = LanguageCloseSet(static_cast<Language>(lang2));
  return (lang1_close_set == lang2_close_set);
}

bool SameCloseSet(Language lang1, Language lang2) {
  int lang1_close_set = LanguageCloseSet(lang1);
  if (lang1_close_set == 0) {return false;}
  int lang2_close_set = LanguageCloseSet(lang2);
  return (lang1_close_set == lang2_close_set);
}
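// Close-set note for the two overloads above: LanguageCloseSet() groups
// languages that are statistically hard to tell apart (pairs in the style of
// Indonesian/Malay or Czech/Slovak are the kind of grouping intended, though
// the exact membership lives in the tables). A set value of 0 means "not in
// any close set", so such a language never merges with anything here.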


// Needs expected score per 1KB in scoring context
void SetChunkSummary(ULScript ulscript, int first_linear_in_chunk,
                     int offset, int len,
                     const ScoringContext* scoringcontext,
                     const Tote* chunk_tote,
                     ChunkSummary* chunksummary) {
  int key3[3];
  chunk_tote->CurrentTopThreeKeys(key3);
  Language lang1 = FromPerScriptNumber(ulscript, key3[0]);
  Language lang2 = FromPerScriptNumber(ulscript, key3[1]);

  int actual_score_per_kb = 0;
  if (len > 0) {
    actual_score_per_kb = (chunk_tote->GetScore(key3[0]) << 10) / len;
  }
  int expected_subscr = lang1 * 4 + LScript4(ulscript);
  int expected_score_per_kb =
      scoringcontext->scoringtables->kExpectedScore[expected_subscr];

  chunksummary->offset = offset;
  chunksummary->chunk_start = first_linear_in_chunk;
  chunksummary->lang1 = lang1;
  chunksummary->lang2 = lang2;
  chunksummary->score1 = chunk_tote->GetScore(key3[0]);
  chunksummary->score2 = chunk_tote->GetScore(key3[1]);
  chunksummary->bytes = len;
  chunksummary->grams = chunk_tote->GetScoreCount();
  chunksummary->ulscript = ulscript;
  chunksummary->reliability_delta = ReliabilityDelta(chunksummary->score1,
                                                     chunksummary->score2,
                                                     chunksummary->grams);
  // If lang1/lang2 in same close set, set delta reliability to 100%
  if (SameCloseSet(lang1, lang2)) {
    chunksummary->reliability_delta = 100;
  }
  chunksummary->reliability_score =
      ReliabilityExpected(actual_score_per_kb, expected_score_per_kb);
}
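// Worked example for the normalization above: a top-language chunk score of
// 600 over len = 300 bytes gives actual_score_per_kb = (600 << 10) / 300
// = 2048, i.e., 2048 score units per 1024 bytes, which ReliabilityExpected
// then compares against kExpectedScore for that language and script.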

// Return true if just lang1 is there: lang2=0 and lang3=0
bool IsSingleLang(uint32 langprob) {
  // Probably a bug -- which end is lang1? But only used to call empty Boost1
  return ((langprob & 0x00ffff00) == 0);
}

// Update scoring context distinct_boost for single language quad
void AddDistinctBoost1(uint32 langprob, ScoringContext* scoringcontext) {
  // Probably keep this empty -- not a good enough signal
}

// Update scoring context distinct_boost for distinct octagram
// Keep last 4 used. Since these are mostly (except at splices) in
// hitbuffer, we might be able to just use a subscript and splice
void AddDistinctBoost2(uint32 langprob, ScoringContext* scoringcontext) {
  // this is called 0..n times per chunk with decoded hitbuffer->distinct...
  LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
  if (scoringcontext->ulscript != ULScript_Latin) {
    distinct_boost = &scoringcontext->distinct_boost.othr;
  }
  int n = distinct_boost->n;
  distinct_boost->langprob[n] = langprob;
  distinct_boost->n = distinct_boost->wrap(n + 1);
}
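// Note: distinct_boost->langprob[] behaves as a small ring buffer (wrap()
// keeps the index in range), so only the most recent distinct-octagram
// langprobs survive; ScoreBoosts below feeds whatever is currently in the
// buffer into each chunk's tote.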

// For each chunk, add extra weight for language priors (from content-lang and
// meta lang=xx) and distinctive tokens
void ScoreBoosts(const ScoringContext* scoringcontext, Tote* chunk_tote) {
  // Get boosts for current script
  const LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
  const LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
  const LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
  if (scoringcontext->ulscript != ULScript_Latin) {
    langprior_boost = &scoringcontext->langprior_boost.othr;
    langprior_whack = &scoringcontext->langprior_whack.othr;
    distinct_boost = &scoringcontext->distinct_boost.othr;
  }

  for (int k = 0; k < kMaxBoosts; ++k) {
    uint32 langprob = langprior_boost->langprob[k];
    if (langprob > 0) {AddLangProb(langprob, chunk_tote);}
  }
  for (int k = 0; k < kMaxBoosts; ++k) {
    uint32 langprob = distinct_boost->langprob[k];
    if (langprob > 0) {AddLangProb(langprob, chunk_tote);}
  }
  // boost has a packed set of per-script langs and probabilities
  // whack has a packed set of per-script langs to be suppressed (zeroed)
  // When a language in a close set is given as an explicit hint, others in
  // that set will be whacked here.
  for (int k = 0; k < kMaxBoosts; ++k) {
    uint32 langprob = langprior_whack->langprob[k];
    if (langprob > 0) {ZeroPSLang(langprob, chunk_tote);}
  }
}



// At this point, the chunk is described by
//  hitbuffer->base[cspan->chunk_base .. cspan->chunk_base + cspan->base_len)
//  hitbuffer->delta[cspan->chunk_delta ... )
//  hitbuffer->distinct[cspan->chunk_distinct ... )
// Scored text is in text[lo..hi) where
//  lo is 0 or the min of the first base/delta/distinct hitbuffer offset, and
//  hi is the min of the next base/delta/distinct hitbuffer offset after
//  base_len, etc.
void GetTextSpanOffsets(const ScoringHitBuffer* hitbuffer,
                        const ChunkSpan* cspan, int* lo, int* hi) {
  // Front of this span
  int lo_base = hitbuffer->base[cspan->chunk_base].offset;
  int lo_delta = hitbuffer->delta[cspan->chunk_delta].offset;
  int lo_distinct = hitbuffer->distinct[cspan->chunk_distinct].offset;
  // Front of next span
  int hi_base = hitbuffer->base[cspan->chunk_base +
                                cspan->base_len].offset;
  int hi_delta = hitbuffer->delta[cspan->chunk_delta +
                                  cspan->delta_len].offset;
  int hi_distinct = hitbuffer->distinct[cspan->chunk_distinct +
                                        cspan->distinct_len].offset;

  *lo = 0;
  // if (cspan->chunk_base > 0) {
  //   *lo = minint(minint(lo_base, lo_delta), lo_distinct);
  // }
  *lo = minint(minint(lo_base, lo_delta), lo_distinct);
  *hi = minint(minint(hi_base, hi_delta), hi_distinct);
}


int DiffScore(const CLD2TableSummary* obj, int indirect,
              uint16 lang1, uint16 lang2) {
  if (indirect < static_cast<int>(obj->kCLDTableSizeOne)) {
    // Up to three languages at indirect
    uint32 langprob = obj->kCLDTableInd[indirect];
    return GetLangScore(langprob, lang1) - GetLangScore(langprob, lang2);
  } else {
    // Up to six languages at start + 2 * (indirect - start)
    indirect += (indirect - obj->kCLDTableSizeOne);
    uint32 langprob = obj->kCLDTableInd[indirect];
    uint32 langprob2 = obj->kCLDTableInd[indirect + 1];
    return (GetLangScore(langprob, lang1) + GetLangScore(langprob2, lang1)) -
           (GetLangScore(langprob, lang2) + GetLangScore(langprob2, lang2));
  }
}
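// Worked example of the two-region table layout used above (the size is
// hypothetical): with kCLDTableSizeOne == 1000, indirect == 700 reads the
// single entry kCLDTableInd[700]; indirect == 1200 lies in the second
// region, so it reads the pair kCLDTableInd[1400] and kCLDTableInd[1401]
// (1200 + (1200 - 1000) == 1400).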

// Score all the bases, deltas, distincts, boosts for one chunk into chunk_tote
// After the last chunk there is always a hitbuffer entry with an offset just
// off the end of the text.
// Sets delta_len and distinct_len
void ScoreOneChunk(const char* text, ULScript ulscript,
                   const ScoringHitBuffer* hitbuffer,
                   int chunk_i,
                   ScoringContext* scoringcontext,
                   ChunkSpan* cspan, Tote* chunk_tote,
                   ChunkSummary* chunksummary) {
  int first_linear_in_chunk = hitbuffer->chunk_start[chunk_i];
  int first_linear_in_next_chunk = hitbuffer->chunk_start[chunk_i + 1];

  chunk_tote->Reinit();
  cspan->delta_len = 0;
  cspan->distinct_len = 0;
  if (scoringcontext->flags_cld2_verbose) {
    fprintf(scoringcontext->debug_file, "<br>ScoreOneChunk[%d..%d) ",
            first_linear_in_chunk, first_linear_in_next_chunk);
  }

  // 2013.02.05 linear design: just use base and base_len for the span
  cspan->chunk_base = first_linear_in_chunk;
  cspan->base_len = first_linear_in_next_chunk - first_linear_in_chunk;
  for (int i = first_linear_in_chunk; i < first_linear_in_next_chunk; ++i) {
    uint32 langprob = hitbuffer->linear[i].langprob;
    AddLangProb(langprob, chunk_tote);
    if (hitbuffer->linear[i].type <= QUADHIT) {
      chunk_tote->AddScoreCount();      // Just count quads, not octas
    }
    if (hitbuffer->linear[i].type == DISTINCTHIT) {
      AddDistinctBoost2(langprob, scoringcontext);
    }
  }

  // Score language prior boosts
  // Score distinct word boost
  ScoreBoosts(scoringcontext, chunk_tote);

  int lo = hitbuffer->linear[first_linear_in_chunk].offset;
  int hi = hitbuffer->linear[first_linear_in_next_chunk].offset;

  // Chunk_tote: get top langs, scores, etc. and fill in the chunk summary
  SetChunkSummary(ulscript, first_linear_in_chunk, lo, hi - lo,
                  scoringcontext, chunk_tote, chunksummary);

  bool more_to_come = false;
  bool score_cjk = false;
  if (scoringcontext->flags_cld2_html) {
    // Show one chunk in readable output
    CLD2_Debug(text, lo, hi, more_to_come, score_cjk, hitbuffer,
               scoringcontext, cspan, chunksummary);
  }

  scoringcontext->prior_chunk_lang = static_cast<Language>(chunksummary->lang1);
}


// Score chunks of text described by hitbuffer, allowing each to be in a
// different language, and optionally adjusting the boundaries in between.
// Set last_cspan to the last chunkspan used
void ScoreAllHits(const char* text, ULScript ulscript,
                  bool more_to_come, bool score_cjk,
                  const ScoringHitBuffer* hitbuffer,
                  ScoringContext* scoringcontext,
                  SummaryBuffer* summarybuffer, ChunkSpan* last_cspan) {
  ChunkSpan prior_cspan = {0, 0, 0, 0, 0, 0};
  ChunkSpan cspan = {0, 0, 0, 0, 0, 0};

  for (int i = 0; i < hitbuffer->next_chunk_start; ++i) {
    // Score one chunk
    // Sets delta_len and distinct_len
    Tote chunk_tote;
    ChunkSummary chunksummary;
    ScoreOneChunk(text, ulscript,
                  hitbuffer, i,
                  scoringcontext, &cspan, &chunk_tote, &chunksummary);

    // Put result in summarybuffer
    if (summarybuffer->n < kMaxSummaries) {
      summarybuffer->chunksummary[summarybuffer->n] = chunksummary;
      summarybuffer->n += 1;
    }

    prior_cspan = cspan;
    cspan.chunk_base += cspan.base_len;
    cspan.chunk_delta += cspan.delta_len;
    cspan.chunk_distinct += cspan.distinct_len;
  }

  // Add one dummy off the end to hold the first unused linear_in_chunk
  int linear_off_end = hitbuffer->next_linear;
  int offset_off_end = hitbuffer->linear[linear_off_end].offset;
  ChunkSummary* cs = &summarybuffer->chunksummary[summarybuffer->n];
  memset(cs, 0, sizeof(ChunkSummary));
  cs->offset = offset_off_end;
  cs->chunk_start = linear_off_end;
  *last_cspan = prior_cspan;
}


void SummaryBufferToDocTote(const SummaryBuffer* summarybuffer,
                            bool more_to_come, DocTote* doc_tote) {
  int cs_bytes_sum = 0;
  for (int i = 0; i < summarybuffer->n; ++i) {
    const ChunkSummary* cs = &summarybuffer->chunksummary[i];
    int reliability = minint(cs->reliability_delta, cs->reliability_score);
    // doc_tote uses full languages
    doc_tote->Add(cs->lang1, cs->bytes, cs->score1, reliability);
    cs_bytes_sum += cs->bytes;
  }
}

// Turn on for debugging vectors
static const bool kShowLettersOriginal = false;


// If next chunk language matches last vector language, extend last element
// Otherwise add new element to vector
void ItemToVector(ScriptScanner* scanner,
                  ResultChunkVector* vec, Language new_lang,
                  int mapped_offset, int mapped_len) {
  uint16 last_vec_lang = static_cast<uint16>(UNKNOWN_LANGUAGE);
  int last_vec_subscr = vec->size() - 1;
  if (last_vec_subscr >= 0) {
    ResultChunk* priorrc = &(*vec)[last_vec_subscr];
    last_vec_lang = priorrc->lang1;
    if (new_lang == last_vec_lang) {
      // Extend prior. Current mapped_offset may be beyond prior end, so do
      // the arithmetic to include any such gap
      priorrc->bytes = minint((mapped_offset + mapped_len) - priorrc->offset,
                              kMaxResultChunkBytes);
      if (kShowLettersOriginal) {
        // Optionally print the new chunk original text
        string temp2(&scanner->GetBufferStart()[priorrc->offset],
                     priorrc->bytes);
        fprintf(stderr, "Item[%d..%d) '%s'<br>\n",
                priorrc->offset, priorrc->offset + priorrc->bytes,
                GetHtmlEscapedText(temp2).c_str());
      }
      return;
    }
  }
  // Add new vector element
  ResultChunk rc;
  rc.offset = mapped_offset;
  rc.bytes = minint(mapped_len, kMaxResultChunkBytes);
  rc.lang1 = static_cast<uint16>(new_lang);
  vec->push_back(rc);
  if (kShowLettersOriginal) {
    // Optionally print the new chunk original text
    string temp2(&scanner->GetBufferStart()[rc.offset], rc.bytes);
    fprintf(stderr, "Item[%d..%d) '%s'<br>\n",
            rc.offset, rc.offset + rc.bytes,
            GetHtmlEscapedText(temp2).c_str());
  }
}
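// Merge example: if the last vector entry is {offset=100, bytes=50} in
// ENGLISH and the next ENGLISH chunk arrives with mapped_offset=160,
// mapped_len=40, the prior entry is extended to bytes = (160 + 40) - 100
// = 100, absorbing the [150..160) gap instead of emitting a new entry.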

uint16 PriorVecLang(const ResultChunkVector* vec) {
  if (vec->empty()) {return static_cast<uint16>(UNKNOWN_LANGUAGE);}
  return (*vec)[vec->size() - 1].lang1;
}

uint16 NextChunkLang(const SummaryBuffer* summarybuffer, int i) {
  if ((i + 1) >= summarybuffer->n) {
    return static_cast<uint16>(UNKNOWN_LANGUAGE);
  }
  return summarybuffer->chunksummary[i + 1].lang1;
}



// Add n elements of summarybuffer to resultchunk vector:
// Each element is letters-only text [offset..offset+bytes)
// This maps back to original[Back(offset)..Back(offset+bytes))
//
// We go out of our way to minimize the variation in the ResultChunkVector,
// so that the caller has fewer but more meaningful spans in different
// languages, for the likely purpose of translation or spell-check.
//
// The language of each chunk is lang1, but it might be unreliable for
// either of two reasons: its score is relatively too close to the score of
// lang2, or its score is too far away from the expected score of real text in
// the given language. Unreliable languages are mapped to Unknown.
//
void SummaryBufferToVector(ScriptScanner* scanner, const char* text,
                           const SummaryBuffer* summarybuffer,
                           bool more_to_come, ResultChunkVector* vec) {
  if (vec == NULL) {return;}

  if (kShowLettersOriginal) {
    fprintf(stderr, "map2original_ ");
    scanner->map2original_.DumpWindow();
    fprintf(stderr, "<br>\n");
    fprintf(stderr, "map2uplow_ ");
    scanner->map2uplow_.DumpWindow();
    fprintf(stderr, "<br>\n");
  }

  for (int i = 0; i < summarybuffer->n; ++i) {
    const ChunkSummary* cs = &summarybuffer->chunksummary[i];
    int unmapped_offset = cs->offset;
    int unmapped_len = cs->bytes;

    if (kShowLettersOriginal) {
      // Optionally print the chunk lowercase letters/marks text
      string temp(&text[unmapped_offset], unmapped_len);
      fprintf(stderr, "Letters [%d..%d) '%s'<br>\n",
              unmapped_offset, unmapped_offset + unmapped_len,
              GetHtmlEscapedText(temp).c_str());
    }

    int mapped_offset = scanner->MapBack(unmapped_offset);

    // Trim back a little to prefer splicing original at word boundaries
    if (mapped_offset > 0) {
      // Size of prior vector entry, if any
      int prior_size = 0;
      if (!vec->empty()) {
        ResultChunk* rc = &(*vec)[vec->size() - 1];
        prior_size = rc->bytes;
      }
      // Maximum backup: leave at least 3 bytes in the prior chunk, never back
      // up past the start of the buffer, and never more than 12 bytes total
      int n_limit = minint(prior_size - 3, mapped_offset);
      n_limit = minint(n_limit, 12);

      // Backscan over letters, stopping if the prior byte is < 0x41
      // (0x41 = 'A'; this also treats UTF-8 bytes >= 0x80 as letters)
      // There is some possibility that we will backscan over a different script
      const char* s = &scanner->GetBufferStart()[mapped_offset];
      const unsigned char* us = reinterpret_cast<const unsigned char*>(s);
      int n = 0;
      while ((n < n_limit) && (us[-n - 1] >= 0x41)) {++n;}
      if (n >= n_limit) {n = 0;}    // New boundary not found within range

      // Also back up exactly one leading punctuation character if '"#@
      if (n < n_limit) {
        unsigned char c = us[-n - 1];
        if ((c == '\'') || (c == '"') || (c == '#') || (c == '@')) {++n;}
      }
      // Shrink the previous chunk slightly
      if (n > 0) {
        ResultChunk* rc = &(*vec)[vec->size() - 1];
        rc->bytes -= n;
        mapped_offset -= n;
        if (kShowLettersOriginal) {
          fprintf(stderr, "Back up %d bytes<br>\n", n);
          // Optionally print the prior chunk original text
          string temp2(&scanner->GetBufferStart()[rc->offset], rc->bytes);
          fprintf(stderr, "Prior [%d..%d) '%s'<br>\n",
                  rc->offset, rc->offset + rc->bytes,
                  GetHtmlEscapedText(temp2).c_str());
        }
      }
    }

    int mapped_len =
        scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset;

    if (kShowLettersOriginal) {
      // Optionally print the chunk original text
      string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len);
      fprintf(stderr, "Original[%d..%d) '%s'<br>\n",
              mapped_offset, mapped_offset + mapped_len,
              GetHtmlEscapedText(temp2).c_str());
    }

    Language new_lang = static_cast<Language>(cs->lang1);
    bool reliability_delta_bad =
        (cs->reliability_delta < kUnreliablePercentThreshold);
    bool reliability_score_bad =
        (cs->reliability_score < kUnreliablePercentThreshold);

    // If the top language matches the last vector entry, ignore
    // reliability_delta
    uint16 prior_lang = PriorVecLang(vec);
    if (prior_lang == cs->lang1) {
      reliability_delta_bad = false;
    }
    // If the top language is in the same close set as the last vector entry,
    // set up to merge
    if (SameCloseSet(cs->lang1, prior_lang)) {
      new_lang = static_cast<Language>(prior_lang);
      reliability_delta_bad = false;
    }
    // If the top two languages are in the same close set and the last vector
    // language is the second language, set up to merge
    if (SameCloseSet(cs->lang1, cs->lang2) &&
        (prior_lang == cs->lang2)) {
      new_lang = static_cast<Language>(prior_lang);
      reliability_delta_bad = false;
    }
    // If unreliable and the last and next vector languages are both
    // the second language, set up to merge
    uint16 next_lang = NextChunkLang(summarybuffer, i);
    if (reliability_delta_bad &&
        (prior_lang == cs->lang2) && (next_lang == cs->lang2)) {
      new_lang = static_cast<Language>(prior_lang);
      reliability_delta_bad = false;
    }

    if (reliability_delta_bad || reliability_score_bad) {
      new_lang = UNKNOWN_LANGUAGE;
    }
    ItemToVector(scanner, vec, new_lang, mapped_offset, mapped_len);
  }
}

// Add just one element to resultchunk vector:
// For RTypeNone or RTypeOne
void JustOneItemToVector(ScriptScanner* scanner, const char* text,
                         Language lang1, int unmapped_offset, int unmapped_len,
                         ResultChunkVector* vec) {
  if (vec == NULL) {return;}

  if (kShowLettersOriginal) {
    fprintf(stderr, "map2original_ ");
    scanner->map2original_.DumpWindow();
    fprintf(stderr, "<br>\n");
    fprintf(stderr, "map2uplow_ ");
    scanner->map2uplow_.DumpWindow();
    fprintf(stderr, "<br>\n");
  }

  if (kShowLettersOriginal) {
    // Optionally print the chunk lowercase letters/marks text
    string temp(&text[unmapped_offset], unmapped_len);
    fprintf(stderr, "Letters1 [%d..%d) '%s'<br>\n",
            unmapped_offset, unmapped_offset + unmapped_len,
            GetHtmlEscapedText(temp).c_str());
  }

  int mapped_offset = scanner->MapBack(unmapped_offset);
  int mapped_len =
      scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset;

  if (kShowLettersOriginal) {
    // Optionally print the chunk original text
    string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len);
    fprintf(stderr, "Original1[%d..%d) '%s'<br>\n",
            mapped_offset, mapped_offset + mapped_len,
            GetHtmlEscapedText(temp2).c_str());
  }

  ItemToVector(scanner, vec, lang1, mapped_offset, mapped_len);
}


// Debugging. Not thread safe. Defined in getonescriptspan
char* DisplayPiece(const char* next_byte_, int byte_length_);

// If the high bit is on, take it out and add 2,000,000,000 so table2 entries
// are easy to spot in the dump
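// Example: an indirect of 0x80000005 (table2, entry 5) prints as
// 2000000005, while plain first-table entries print unchanged.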
inline int PrintableIndirect(int x) {
  if ((x & 0x80000000u) != 0) {
    return (x & ~0x80000000u) + 2000000000;
  }
  return x;
}

void DumpHitBuffer(FILE* df, const char* text,
                   const ScoringHitBuffer* hitbuffer) {
  fprintf(df,
          "<br>DumpHitBuffer[%s, next_base/delta/distinct %d, %d, %d)<br>\n",
          ULScriptCode(hitbuffer->ulscript),
          hitbuffer->next_base, hitbuffer->next_delta,
          hitbuffer->next_distinct);
  for (int i = 0; i < hitbuffer->maxscoringhits; ++i) {
    if (i < hitbuffer->next_base) {
      fprintf(df, "Q[%d]%d,%d,%s ",
              i, hitbuffer->base[i].offset,
              PrintableIndirect(hitbuffer->base[i].indirect),
              DisplayPiece(&text[hitbuffer->base[i].offset], 6));
    }
    if (i < hitbuffer->next_delta) {
      fprintf(df, "DL[%d]%d,%d,%s ",
              i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,
              DisplayPiece(&text[hitbuffer->delta[i].offset], 12));
    }
    if (i < hitbuffer->next_distinct) {
      fprintf(df, "D[%d]%d,%d,%s ",
              i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect,
              DisplayPiece(&text[hitbuffer->distinct[i].offset], 12));
    }
    if (i < hitbuffer->next_base) {
      fprintf(df, "<br>\n");
    }
    if (i > 50) {break;}
  }
  if (hitbuffer->next_base > 50) {
    int i = hitbuffer->next_base;
    fprintf(df, "Q[%d]%d,%d,%s ",
            i, hitbuffer->base[i].offset,
            PrintableIndirect(hitbuffer->base[i].indirect),
            DisplayPiece(&text[hitbuffer->base[i].offset], 6));
  }
  if (hitbuffer->next_delta > 50) {
    int i = hitbuffer->next_delta;
    fprintf(df, "DL[%d]%d,%d,%s ",
            i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,
            DisplayPiece(&text[hitbuffer->delta[i].offset], 12));
  }
  if (hitbuffer->next_distinct > 50) {
    int i = hitbuffer->next_distinct;
    fprintf(df, "D[%d]%d,%d,%s ",
            i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect,
            DisplayPiece(&text[hitbuffer->distinct[i].offset], 12));
  }
  fprintf(df, "<br>\n");
}


void DumpLinearBuffer(FILE* df, const char* text,
                      const ScoringHitBuffer* hitbuffer) {
  fprintf(df, "<br>DumpLinearBuffer[%d)<br>\n",
          hitbuffer->next_linear);
  // Include the dummy entry off the end
  for (int i = 0; i < hitbuffer->next_linear + 1; ++i) {
    if ((50 < i) && (i < (hitbuffer->next_linear - 1))) {continue;}
    fprintf(df, "[%d]%d,%c=%08x,%s<br>\n",
            i, hitbuffer->linear[i].offset,
            "UQLD"[hitbuffer->linear[i].type],
            hitbuffer->linear[i].langprob,
            DisplayPiece(&text[hitbuffer->linear[i].offset], 6));
  }
  fprintf(df, "<br>\n");

  fprintf(df, "DumpChunkStart[%d]<br>\n", hitbuffer->next_chunk_start);
  for (int i = 0; i < hitbuffer->next_chunk_start + 1; ++i) {
    fprintf(df, "[%d]%d\n", i, hitbuffer->chunk_start[i]);
  }
  fprintf(df, "<br>\n");
}

// Move this verbose debugging output to debug.cc eventually
void DumpChunkSummary(FILE* df, const ChunkSummary* cs) {
  // Print chunksummary
  fprintf(df, "%d lin[%d] %s.%d %s.%d %dB %d# %s %dRd %dRs<br>\n",
          cs->offset,
          cs->chunk_start,
          LanguageCode(static_cast<Language>(cs->lang1)),
          cs->score1,
          LanguageCode(static_cast<Language>(cs->lang2)),
          cs->score2,
          cs->bytes,
          cs->grams,
          ULScriptCode(static_cast<ULScript>(cs->ulscript)),
          cs->reliability_delta,
          cs->reliability_score);
}

void DumpSummaryBuffer(FILE* df, const SummaryBuffer* summarybuffer) {
  fprintf(df, "<br>DumpSummaryBuffer[%d]<br>\n", summarybuffer->n);
  fprintf(df, "[i] offset linear[chunk_start] lang.score1 lang.score2 "
              "bytesB ngrams# script rel_delta rel_score<br>\n");
  for (int i = 0; i <= summarybuffer->n; ++i) {
    fprintf(df, "[%d] ", i);
    DumpChunkSummary(df, &summarybuffer->chunksummary[i]);
  }
  fprintf(df, "<br>\n");
}



// Within hitbuffer->linear[]
//      <-- prior chunk --><-- this chunk -->
//      |                  |                 |
//      linear0            linear1           linear2
//      lang0              lang1
// The goal of sharpening is to move linear1 to better separate the two langs
int BetterBoundary(const char* text,
                   ScoringHitBuffer* hitbuffer,
                   ScoringContext* scoringcontext,
                   uint16 pslang0, uint16 pslang1,
                   int linear0, int linear1, int linear2) {
  // Degenerate case, no change
  if ((linear2 - linear0) <= 8) {return linear1;}

  // Each diff gives pslang0 score - pslang1 score
  // The running diff covers four entries + + + + followed by four entries
  // - - - -, so its value is maximal at the sharpest boundary between pslang0
  // (positive diffs) and pslang1 (negative diffs)
  int running_diff = 0;
  int diff[8];    // Ring buffer of pslang0-pslang1 differences
  // Initialize with first 8 diffs
  for (int i = linear0; i < linear0 + 8; ++i) {
    int j = i & 7;
    uint32 langprob = hitbuffer->linear[i].langprob;
    diff[j] = GetLangScore(langprob, pslang0) -
              GetLangScore(langprob, pslang1);
    if (i < linear0 + 4) {
      // First four diffs pslang0 - pslang1
      running_diff += diff[j];
    } else {
      // Second four diffs -(pslang0 - pslang1)
      running_diff -= diff[j];
    }
  }

  // Now scan for the sharpest boundary. j is at the left end of the 8 entries
  // To be a boundary, there must be both >0 and <0 entries in the window
  int better_boundary_value = 0;
  int better_boundary = linear1;
  for (int i = linear0; i < linear2 - 8; ++i) {
    int j = i & 7;
    if (better_boundary_value < running_diff) {
      bool has_plus = false;
      bool has_minus = false;
      for (int kk = 0; kk < 8; ++kk) {
        if (diff[kk] > 0) {has_plus = true;}
        if (diff[kk] < 0) {has_minus = true;}
      }
      if (has_plus && has_minus) {
        better_boundary_value = running_diff;
        better_boundary = i + 4;
      }
    }
    // Shift right one entry
    uint32 langprob = hitbuffer->linear[i + 8].langprob;
    int newdiff = GetLangScore(langprob, pslang0) -
                  GetLangScore(langprob, pslang1);
    int middiff = diff[(i + 4) & 7];
    int olddiff = diff[j];
    diff[j] = newdiff;
    running_diff -= olddiff;        // Remove left
    running_diff += 2 * middiff;    // Convert middle from - to +
    running_diff -= newdiff;        // Insert right
  }

  if (scoringcontext->flags_cld2_verbose && (linear1 != better_boundary)) {
    Language lang0 = FromPerScriptNumber(scoringcontext->ulscript, pslang0);
    Language lang1 = FromPerScriptNumber(scoringcontext->ulscript, pslang1);
    fprintf(scoringcontext->debug_file, "  Better lin[%d=>%d] %s^^%s <br>\n",
            linear1, better_boundary,
            LanguageCode(lang0), LanguageCode(lang1));
    int lin0_off = hitbuffer->linear[linear0].offset;
    int lin1_off = hitbuffer->linear[linear1].offset;
    int lin2_off = hitbuffer->linear[linear2].offset;
    int better_offm1 = hitbuffer->linear[better_boundary - 1].offset;
    int better_off = hitbuffer->linear[better_boundary].offset;
    int better_offp1 = hitbuffer->linear[better_boundary + 1].offset;
    string old0(&text[lin0_off], lin1_off - lin0_off);
    string old1(&text[lin1_off], lin2_off - lin1_off);
    string new0(&text[lin0_off], better_offm1 - lin0_off);
    string new0m1(&text[better_offm1], better_off - better_offm1);
    string new1(&text[better_off], better_offp1 - better_off);
    string new1p1(&text[better_offp1], lin2_off - better_offp1);
    fprintf(scoringcontext->debug_file, "%s^^%s => <br>\n%s^%s^^%s^%s<br>\n",
            GetHtmlEscapedText(old0).c_str(),
            GetHtmlEscapedText(old1).c_str(),
            GetHtmlEscapedText(new0).c_str(),
            GetHtmlEscapedText(new0m1).c_str(),
            GetHtmlEscapedText(new1).c_str(),
            GetHtmlEscapedText(new1p1).c_str());
    // Slow picture of differences per linear entry
    int d;
    for (int i = linear0; i < linear2; ++i) {
      if (i == better_boundary) {
        fprintf(scoringcontext->debug_file, "^^ ");
      }
      uint32 langprob = hitbuffer->linear[i].langprob;
      d = GetLangScore(langprob, pslang0) - GetLangScore(langprob, pslang1);
      const char* s = "=";
      // if (d > 2) {s = "\xc2\xaf";}    // Macron
      if (d > 2) {s = "#";}
      else if (d > 0) {s = "+";}
      else if (d < -2) {s = "_";}
      else if (d < 0) {s = "-";}
      fprintf(scoringcontext->debug_file, "%s ", s);
    }
    fprintf(scoringcontext->debug_file, "  (scale: #+=-_)<br>\n");
  }
  return better_boundary;
}
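// Example of the running_diff window above: with per-entry diffs
// (pslang0 - pslang1) of +2 +1 +2 +1 followed by -1 -2 -1 -2, running_diff
// = (2+1+2+1) - (-1-2-1-2) = 12, a strong peak, so the midpoint of that
// window marks the sharpest available boundary between the two languages.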


// For all but the first summary, if its top language differs from
// the previous chunk, refine the boundary
// Linearized version
void SharpenBoundaries(const char* text,
                       bool more_to_come,
                       ScoringHitBuffer* hitbuffer,
                       ScoringContext* scoringcontext,
                       SummaryBuffer* summarybuffer) {

  int prior_linear = summarybuffer->chunksummary[0].chunk_start;
  uint16 prior_lang = summarybuffer->chunksummary[0].lang1;

  if (scoringcontext->flags_cld2_verbose) {
    fprintf(scoringcontext->debug_file, "<br>SharpenBoundaries<br>\n");
  }
  for (int i = 1; i < summarybuffer->n; ++i) {
    ChunkSummary* cs = &summarybuffer->chunksummary[i];
    uint16 this_lang = cs->lang1;
    if (this_lang == prior_lang) {
      prior_linear = cs->chunk_start;
      continue;
    }

    int this_linear = cs->chunk_start;
    int next_linear = summarybuffer->chunksummary[i + 1].chunk_start;

    // If this/prior in same close set, don't move boundary
    if (SameCloseSet(prior_lang, this_lang)) {
      prior_linear = this_linear;
      prior_lang = this_lang;
      continue;
    }


    // Within hitbuffer->linear[]
    //      <-- prior chunk --><-- this chunk -->
    //      |                  |                 |
    //      prior_linear       this_linear       next_linear
    //      prior_lang         this_lang
    // The goal of sharpening is to move this_linear to better separate langs

    uint8 pslang0 = PerScriptNumber(scoringcontext->ulscript,
                                    static_cast<Language>(prior_lang));
    uint8 pslang1 = PerScriptNumber(scoringcontext->ulscript,
                                    static_cast<Language>(this_lang));
    int better_linear = BetterBoundary(text,
                                       hitbuffer,
                                       scoringcontext,
                                       pslang0, pslang1,
                                       prior_linear, this_linear, next_linear);

    int old_offset = hitbuffer->linear[this_linear].offset;
    int new_offset = hitbuffer->linear[better_linear].offset;
    cs->chunk_start = better_linear;
    cs->offset = new_offset;
    // If this_linear moved right, make bytes smaller for this, larger for prior
    // If this_linear moved left, make bytes larger for this, smaller for prior
    cs->bytes -= (new_offset - old_offset);
    summarybuffer->chunksummary[i - 1].bytes += (new_offset - old_offset);

    this_linear = better_linear;  // Update so that next chunk doesn't intrude

    // Consider rescoring the two chunks

    // Update for next round (note: using pre-updated boundary)
    prior_linear = this_linear;
    prior_lang = this_lang;
  }
}

// Make a langprob that gives small weight to the default language for ulscript
uint32 DefaultLangProb(ULScript ulscript) {
  Language default_lang = DefaultLanguage(ulscript);
  return MakeLangProb(default_lang, 1);
}
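// Design note: LinearizeAll seeds every buffer with this small
// default-language hit, so a span with no real table hits still resolves to
// the script's default language instead of scoring empty.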

// Effectively, do a merge-sort based on text offsets
// Look up each indirect value in the appropriate scoring table and keep
// just the resulting langprobs
void LinearizeAll(ScoringContext* scoringcontext, bool score_cjk,
                  ScoringHitBuffer* hitbuffer) {
  const CLD2TableSummary* base_obj;       // unigram or quadgram
  const CLD2TableSummary* base_obj2;      // quadgram dual table
  const CLD2TableSummary* delta_obj;      // bigram or octagram
  const CLD2TableSummary* distinct_obj;   // bigram or octagram
  uint16 base_hit;
  if (score_cjk) {
    base_obj = scoringcontext->scoringtables->unigram_compat_obj;
    base_obj2 = scoringcontext->scoringtables->unigram_compat_obj;
    delta_obj = scoringcontext->scoringtables->deltabi_obj;
    distinct_obj = scoringcontext->scoringtables->distinctbi_obj;
    base_hit = UNIHIT;
  } else {
    base_obj = scoringcontext->scoringtables->quadgram_obj;
    base_obj2 = scoringcontext->scoringtables->quadgram_obj2;
    delta_obj = scoringcontext->scoringtables->deltaocta_obj;
    distinct_obj = scoringcontext->scoringtables->distinctocta_obj;
    base_hit = QUADHIT;
  }

  int base_limit = hitbuffer->next_base;
  int delta_limit = hitbuffer->next_delta;
  int distinct_limit = hitbuffer->next_distinct;
  int base_i = 0;
  int delta_i = 0;
  int distinct_i = 0;
  int linear_i = 0;

  // Start with an initial base hit for the default language for this script
  // Inserting this avoids edge effects with no hits at all
  hitbuffer->linear[linear_i].offset = hitbuffer->lowest_offset;
  hitbuffer->linear[linear_i].type = base_hit;
  hitbuffer->linear[linear_i].langprob =
      DefaultLangProb(scoringcontext->ulscript);
  ++linear_i;

  while ((base_i < base_limit) || (delta_i < delta_limit) ||
         (distinct_i < distinct_limit)) {
    int base_off = hitbuffer->base[base_i].offset;
    int delta_off = hitbuffer->delta[delta_i].offset;
    int distinct_off = hitbuffer->distinct[distinct_i].offset;

    // Do delta and distinct first, so that they are not lost at base_limit
    if ((delta_i < delta_limit) &&
        (delta_off <= base_off) && (delta_off <= distinct_off)) {
      // Add delta entry
      int indirect = hitbuffer->delta[delta_i].indirect;
      ++delta_i;
      uint32 langprob = delta_obj->kCLDTableInd[indirect];
      if (langprob > 0) {
        hitbuffer->linear[linear_i].offset = delta_off;
        hitbuffer->linear[linear_i].type = DELTAHIT;
        hitbuffer->linear[linear_i].langprob = langprob;
        ++linear_i;
      }
    }
    else if ((distinct_i < distinct_limit) &&
             (distinct_off <= base_off) && (distinct_off <= delta_off)) {
      // Add distinct entry
      int indirect = hitbuffer->distinct[distinct_i].indirect;
      ++distinct_i;
      uint32 langprob = distinct_obj->kCLDTableInd[indirect];
      if (langprob > 0) {
        hitbuffer->linear[linear_i].offset = distinct_off;
        hitbuffer->linear[linear_i].type = DISTINCTHIT;
        hitbuffer->linear[linear_i].langprob = langprob;
        ++linear_i;
      }
    }
    else {
      // Add one or two base entries
      int indirect = hitbuffer->base[base_i].indirect;
      // First, get the right scoring table; the high bit of indirect selects
      // the dual (second) base table
      const CLD2TableSummary* local_base_obj = base_obj;
      if ((indirect & 0x80000000u) != 0) {
        local_base_obj = base_obj2;
        indirect &= ~0x80000000u;
      }
      ++base_i;
      // One langprob in kQuadInd[0..SingleSize),
      // two in kQuadInd[SingleSize..Size)
      if (indirect < static_cast<int>(local_base_obj->kCLDTableSizeOne)) {
        // Up to three languages at indirect
        uint32 langprob = local_base_obj->kCLDTableInd[indirect];
        if (langprob > 0) {
          hitbuffer->linear[linear_i].offset = base_off;
          hitbuffer->linear[linear_i].type = base_hit;
          hitbuffer->linear[linear_i].langprob = langprob;
          ++linear_i;
        }
      } else {
        // Up to six languages at start + 2 * (indirect - start)
        indirect += (indirect - local_base_obj->kCLDTableSizeOne);
        uint32 langprob = local_base_obj->kCLDTableInd[indirect];
        uint32 langprob2 = local_base_obj->kCLDTableInd[indirect + 1];
        if (langprob > 0) {
          hitbuffer->linear[linear_i].offset = base_off;
          hitbuffer->linear[linear_i].type = base_hit;
          hitbuffer->linear[linear_i].langprob = langprob;
          ++linear_i;
        }
        if (langprob2 > 0) {
          hitbuffer->linear[linear_i].offset = base_off;
          hitbuffer->linear[linear_i].type = base_hit;
          hitbuffer->linear[linear_i].langprob = langprob2;
          ++linear_i;
        }
      }
    }
  }

  // Update
  hitbuffer->next_linear = linear_i;

  // Add a dummy entry off the end, just to capture the final offset
  hitbuffer->linear[linear_i].offset =
      hitbuffer->base[hitbuffer->next_base].offset;
  hitbuffer->linear[linear_i].langprob = 0;
}

// Break linear array into chunks of ~20 quadgram hits or ~50 CJK unigram hits
void ChunkAll(int letter_offset, bool score_cjk, ScoringHitBuffer* hitbuffer) {
  int chunksize;
  uint16 base_hit;
  if (score_cjk) {
    chunksize = kChunksizeUnis;
    base_hit = UNIHIT;
  } else {
    chunksize = kChunksizeQuads;
    base_hit = QUADHIT;
  }

  int linear_i = 0;
  int linear_off_end = hitbuffer->next_linear;
  int text_i = letter_offset;           // Next unseen text offset
  int next_chunk_start = 0;
  int bases_left = hitbuffer->next_base;
  while (bases_left > 0) {
    // Linearize one chunk
    int base_len = chunksize;           // Default; may be changed below
    if (bases_left < (chunksize + (chunksize >> 1))) {
      // If within 1.5 chunks of the end, avoid runts by using it all
      base_len = bases_left;
    } else if (bases_left < (2 * chunksize)) {
      // Avoid runts by splitting 1.5 to 2 chunks in half (about 3/4 each)
      base_len = (bases_left + 1) >> 1;
    }

    hitbuffer->chunk_start[next_chunk_start] = linear_i;
    hitbuffer->chunk_offset[next_chunk_start] = text_i;
    ++next_chunk_start;

    int base_count = 0;
    while ((base_count < base_len) && (linear_i < linear_off_end)) {
      if (hitbuffer->linear[linear_i].type == base_hit) {++base_count;}
      ++linear_i;
    }
    text_i = hitbuffer->linear[linear_i].offset;    // Next unseen text offset
    bases_left -= base_len;
  }

  // If no base hits at all, make a single dummy chunk
  if (next_chunk_start == 0) {
    hitbuffer->chunk_start[next_chunk_start] = 0;
    hitbuffer->chunk_offset[next_chunk_start] = hitbuffer->linear[0].offset;
    ++next_chunk_start;
  }

  // Remember the number of chunks; the dummy entry below goes at this subscript
  hitbuffer->next_chunk_start = next_chunk_start;

  // Add a dummy entry off the end, just to capture the final linear subscript
  hitbuffer->chunk_start[next_chunk_start] = hitbuffer->next_linear;
  hitbuffer->chunk_offset[next_chunk_start] = text_i;
}
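// Runt-avoidance example for ChunkAll (taking chunksize = 20 for the
// arithmetic): 35 base hits remaining falls in the [30, 40) range, so the
// split is roughly in half: base_len = (35 + 1) >> 1 = 18 now and 17 on the
// next pass, rather than a 20-hit chunk followed by a 15-hit runt.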


// Merge-sort the individual hit arrays, go indirect on the scoring subscripts,
// break linear array into chunks.
//
// Input:
//  hitbuffer base, delta, distinct arrays
// Output:
//  linear array
//  chunk_start array
//
void LinearizeHitBuffer(int letter_offset,
                        ScoringContext* scoringcontext,
                        bool more_to_come, bool score_cjk,
                        ScoringHitBuffer* hitbuffer) {
  LinearizeAll(scoringcontext, score_cjk, hitbuffer);
  ChunkAll(letter_offset, score_cjk, hitbuffer);
}



// The hitbuffer is in an awkward form -- three sets of base/delta/distinct
// scores, each with an indirect subscript to one of six scoring tables, some
// of which can yield two langprobs for six languages, others one langprob for
// three languages. The only correlation between base/delta/distinct is their
// offsets into the letters-only text buffer.
//
// The SummaryBuffer needs to be built against the linear array, giving the
// linear offset of the start of each chunk.
//
// So we first do all the langprob lookups and merge-sort by offset to make
// a single linear vector, building a side vector of chunk beginnings as we go.
// The sharpening is then simply moving the beginnings, and the scoring is a
// simple linear sweep, etc.

void ProcessHitBuffer(const LangSpan& scriptspan,
                      int letter_offset,
                      ScoringContext* scoringcontext,
                      DocTote* doc_tote,
                      ResultChunkVector* vec,
                      bool more_to_come, bool score_cjk,
                      ScoringHitBuffer* hitbuffer) {
  if (scoringcontext->flags_cld2_verbose) {
    fprintf(scoringcontext->debug_file, "Hitbuffer[) ");
    DumpHitBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer);
  }

  LinearizeHitBuffer(letter_offset, scoringcontext, more_to_come, score_cjk,
                     hitbuffer);

  if (scoringcontext->flags_cld2_verbose) {
    fprintf(scoringcontext->debug_file, "Linear[) ");
    DumpLinearBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer);
  }

  SummaryBuffer summarybuffer;
  summarybuffer.n = 0;
  ChunkSpan last_cspan;
  ScoreAllHits(scriptspan.text, scriptspan.ulscript,
               more_to_come, score_cjk, hitbuffer,
               scoringcontext,
               &summarybuffer, &last_cspan);

  if (scoringcontext->flags_cld2_verbose) {
    DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer);
  }

  if (vec != NULL) {
    // Sharpen boundaries of summarybuffer
    // This is not a high-performance path
    SharpenBoundaries(scriptspan.text, more_to_come, hitbuffer, scoringcontext,
                      &summarybuffer);
    // Show after the sharpening
    // CLD2_Debug2(scriptspan.text, more_to_come, score_cjk,
    //             hitbuffer, scoringcontext, &summarybuffer);

    if (scoringcontext->flags_cld2_verbose) {
      DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer);
    }
  }

  SummaryBufferToDocTote(&summarybuffer, more_to_come, doc_tote);
  SummaryBufferToVector(scoringcontext->scanner, scriptspan.text,
                        &summarybuffer, more_to_come, vec);
}

void SpliceHitBuffer(ScoringHitBuffer* hitbuffer, int next_offset) {
  // Splice hitbuffer and summarybuffer for next round. With big chunks and
  // distinctive-word state carried across chunks, we might not need to do this.
  hitbuffer->next_base = 0;
  hitbuffer->next_delta = 0;
  hitbuffer->next_distinct = 0;
  hitbuffer->next_linear = 0;
  hitbuffer->next_chunk_start = 0;
  hitbuffer->lowest_offset = next_offset;
}


// Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating
// scoringcontext
void ScoreEntireScriptSpan(const LangSpan& scriptspan,
                           ScoringContext* scoringcontext,
                           DocTote* doc_tote,
                           ResultChunkVector* vec) {
  int bytes = scriptspan.text_bytes;
  // Artificially set score to 1024 per 1KB, or 1 per byte
  int score = bytes;
  int reliability = 100;
  // doc_tote uses full languages
  Language one_one_lang = DefaultLanguage(scriptspan.ulscript);
  doc_tote->Add(one_one_lang, bytes, score, reliability);

  if (scoringcontext->flags_cld2_html) {
    ChunkSummary chunksummary = {
      1, 0,
      one_one_lang, UNKNOWN_LANGUAGE, score, 1,
      bytes, 0, scriptspan.ulscript, reliability, reliability
    };
    CLD2_Debug(scriptspan.text, 1, scriptspan.text_bytes,
               false, false, NULL,
               scoringcontext, NULL, &chunksummary);
  }

  // First byte is always a space
  JustOneItemToVector(scoringcontext->scanner, scriptspan.text,
                      one_one_lang, 1, bytes - 1, vec);

  scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
}

// Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext
void ScoreCJKScriptSpan(const LangSpan& scriptspan,
                        ScoringContext* scoringcontext,
                        DocTote* doc_tote,
                        ResultChunkVector* vec) {
  // Allocate three parallel arrays of scoring hits
  ScoringHitBuffer* hitbuffer = new ScoringHitBuffer;
  hitbuffer->init();
  hitbuffer->ulscript = scriptspan.ulscript;

  scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
  scoringcontext->oldest_distinct_boost = 0;

  // Incoming scriptspan has a single leading space at scriptspan.text[0]
  // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3]

  int letter_offset = 1;        // Skip initial space
  hitbuffer->lowest_offset = letter_offset;
  int letter_limit = scriptspan.text_bytes;
  while (letter_offset < letter_limit) {
    if (scoringcontext->flags_cld2_verbose) {
      fprintf(scoringcontext->debug_file, "  ScoreCJKScriptSpan[%d,%d)<br>\n",
              letter_offset, letter_limit);
    }
    //
    // Fill up one hitbuffer, possibly splicing onto the previous fragment
    //
    // NOTE: GetUniHits deals with close repeats
    // NOTE: After the last chunk there is always a hitbuffer entry with an
    // offset just off the end of the text = next_offset.
    int next_offset = GetUniHits(scriptspan.text, letter_offset, letter_limit,
                                 scoringcontext, hitbuffer);
    // NOTE: GetBiHits deals with close repeats,
    // does one hash and two lookups (delta and distinct) per word
    GetBiHits(scriptspan.text, letter_offset, next_offset,
              scoringcontext, hitbuffer);

    //
    // Score one hitbuffer in chunks to summarybuffer
    //
    bool more_to_come = next_offset < letter_limit;
    bool score_cjk = true;
    ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec,
                     more_to_come, score_cjk, hitbuffer);
    SpliceHitBuffer(hitbuffer, next_offset);

    letter_offset = next_offset;
  }

  delete hitbuffer;
  // Context across buffers is not connected yet
  scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
}



// Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext
// We have a scriptspan with all lowercase text in one script. Look up
// quadgrams and octagrams, saving the hits in three parallel vectors.
// Score from those vectors in chunks, toting each chunk to get a single
// language, and combining into the overall document score. The hit vectors
// in general are not big enough to handle an entire scriptspan, so
// repeat until the entire scriptspan is scored.
// Caller deals with minimizing the number of runt scriptspans
// This routine deals with minimizing the number of runt chunks.
//
// Returns updated scoringcontext
// Returns updated doc_tote
// If vec != NULL, appends to that vector of ResultChunk's
void ScoreQuadScriptSpan(const LangSpan& scriptspan,
                         ScoringContext* scoringcontext,
                         DocTote* doc_tote,
                         ResultChunkVector* vec) {
  // Allocate three parallel arrays of scoring hits
  ScoringHitBuffer* hitbuffer = new ScoringHitBuffer;
  hitbuffer->init();
  hitbuffer->ulscript = scriptspan.ulscript;

  scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
  scoringcontext->oldest_distinct_boost = 0;

  // Incoming scriptspan has a single leading space at scriptspan.text[0]
  // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3]

  int letter_offset = 1;        // Skip initial space
  hitbuffer->lowest_offset = letter_offset;
  int letter_limit = scriptspan.text_bytes;
  while (letter_offset < letter_limit) {
    //
    // Fill up one hitbuffer, possibly splicing onto the previous fragment
    //
    // NOTE: GetQuadHits deals with close repeats
    // NOTE: After the last chunk there is always a hitbuffer entry with an
    // offset just off the end of the text = next_offset.
    int next_offset = GetQuadHits(scriptspan.text, letter_offset, letter_limit,
                                  scoringcontext, hitbuffer);
    // next_offset < letter_limit means there is more text to process in this
    // scriptspan
    // NOTE: GetOctaHits deals with close repeats,
    // does one hash and two lookups (delta and distinct) per word
    GetOctaHits(scriptspan.text, letter_offset, next_offset,
                scoringcontext, hitbuffer);

    //
    // Score one hitbuffer in chunks to summarybuffer
    //
    bool more_to_come = next_offset < letter_limit;
    bool score_cjk = false;
    ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec,
                     more_to_come, score_cjk, hitbuffer);
    SpliceHitBuffer(hitbuffer, next_offset);

    letter_offset = next_offset;
  }

  delete hitbuffer;
}


// Score one scriptspan into doc_tote and vec, updating scoringcontext
// Inputs:
//  One scriptspan of perhaps 40-60KB, all same script lower-case letters
//  and single ASCII spaces. First character is a space to allow simple
//  beginning-of-word detect. End of buffer has three spaces and NUL to
//  allow easy scan-to-end-of-word.
//  Scoring context of
//   scoring tables
//   flags
//   running boosts
// Outputs:
//  Updated doc_tote giving overall languages and byte counts
//  Optional updated chunk vector giving offset, length, language
//
// Caller initializes flags, boosts, doc_tote and vec.
// Caller aggregates across multiple scriptspans
// Caller calculates final document result
// Caller deals with detecting and triggering suppression of repeated text.
//
// This top-level routine just chooses the recognition type and calls one of
// the next-level-down routines.
//
void ScoreOneScriptSpan(const LangSpan& scriptspan,
                        ScoringContext* scoringcontext,
                        DocTote* doc_tote,
                        ResultChunkVector* vec) {
  if (scoringcontext->flags_cld2_verbose) {
    fprintf(scoringcontext->debug_file, "<br>ScoreOneScriptSpan(%s,%d) ",
            ULScriptCode(scriptspan.ulscript), scriptspan.text_bytes);
    // Optionally print the chunk lowercase letters/marks text
    string temp(&scriptspan.text[0], scriptspan.text_bytes);
    fprintf(scoringcontext->debug_file, "'%s'",
            GetHtmlEscapedText(temp).c_str());
    fprintf(scoringcontext->debug_file, "<br>\n");
  }
  scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
  scoringcontext->oldest_distinct_boost = 0;
  ULScriptRType rtype = ULScriptRecognitionType(scriptspan.ulscript);
  if (scoringcontext->flags_cld2_score_as_quads && (rtype != RTypeCJK)) {
    rtype = RTypeMany;
  }
  switch (rtype) {
  case RTypeNone:
  case RTypeOne:
    ScoreEntireScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
    break;
  case RTypeCJK:
    ScoreCJKScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
    break;
  case RTypeMany:
    ScoreQuadScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
    break;
  }
}

}  // End namespace CLD2