browser/components/translation/cld2/internal/scoreonescriptspan.cc

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 // Copyright 2013 Google Inc. All Rights Reserved.
michael@0 2 //
michael@0 3 // Licensed under the Apache License, Version 2.0 (the "License");
michael@0 4 // you may not use this file except in compliance with the License.
michael@0 5 // You may obtain a copy of the License at
michael@0 6 //
michael@0 7 // http://www.apache.org/licenses/LICENSE-2.0
michael@0 8 //
michael@0 9 // Unless required by applicable law or agreed to in writing, software
michael@0 10 // distributed under the License is distributed on an "AS IS" BASIS,
michael@0 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
michael@0 12 // See the License for the specific language governing permissions and
michael@0 13 // limitations under the License.
michael@0 14
michael@0 15 //
michael@0 16 // Author: dsites@google.com (Dick Sites)
michael@0 17 // Updated 2014.01 for dual table lookup
michael@0 18 //
michael@0 19
michael@0 20 #include "scoreonescriptspan.h"
michael@0 21
michael@0 22 #include "cldutil.h"
michael@0 23 #include "debug.h"
michael@0 24 #include "lang_script.h"
michael@0 25
michael@0 26 #include <stdio.h>
michael@0 27
michael@0 28 using namespace std;
michael@0 29
michael@0 30 namespace CLD2 {
michael@0 31
// Reliability percentages below this value mark a chunk's reliability_delta
// or reliability_score as bad when building the result vector.
static const int kUnreliablePercentThreshold = 75;
michael@0 33
michael@0 34 void AddLangProb(uint32 langprob, Tote* chunk_tote) {
michael@0 35 ProcessProbV2Tote(langprob, chunk_tote);
michael@0 36 }
michael@0 37
michael@0 38 void ZeroPSLang(uint32 langprob, Tote* chunk_tote) {
michael@0 39 uint8 top1 = (langprob >> 8) & 0xff;
michael@0 40 chunk_tote->SetScore(top1, 0);
michael@0 41 }
michael@0 42
michael@0 43 bool SameCloseSet(uint16 lang1, uint16 lang2) {
michael@0 44 int lang1_close_set = LanguageCloseSet(static_cast<Language>(lang1));
michael@0 45 if (lang1_close_set == 0) {return false;}
michael@0 46 int lang2_close_set = LanguageCloseSet(static_cast<Language>(lang2));
michael@0 47 return (lang1_close_set == lang2_close_set);
michael@0 48 }
michael@0 49
michael@0 50 bool SameCloseSet(Language lang1, Language lang2) {
michael@0 51 int lang1_close_set = LanguageCloseSet(lang1);
michael@0 52 if (lang1_close_set == 0) {return false;}
michael@0 53 int lang2_close_set = LanguageCloseSet(lang2);
michael@0 54 return (lang1_close_set == lang2_close_set);
michael@0 55 }
michael@0 56
michael@0 57
michael@0 58 // Needs expected score per 1KB in scoring context
michael@0 59 void SetChunkSummary(ULScript ulscript, int first_linear_in_chunk,
michael@0 60 int offset, int len,
michael@0 61 const ScoringContext* scoringcontext,
michael@0 62 const Tote* chunk_tote,
michael@0 63 ChunkSummary* chunksummary) {
michael@0 64 int key3[3];
michael@0 65 chunk_tote->CurrentTopThreeKeys(key3);
michael@0 66 Language lang1 = FromPerScriptNumber(ulscript, key3[0]);
michael@0 67 Language lang2 = FromPerScriptNumber(ulscript, key3[1]);
michael@0 68
michael@0 69 int actual_score_per_kb = 0;
michael@0 70 if (len > 0) {
michael@0 71 actual_score_per_kb = (chunk_tote->GetScore(key3[0]) << 10) / len;
michael@0 72 }
michael@0 73 int expected_subscr = lang1 * 4 + LScript4(ulscript);
michael@0 74 int expected_score_per_kb =
michael@0 75 scoringcontext->scoringtables->kExpectedScore[expected_subscr];
michael@0 76
michael@0 77 chunksummary->offset = offset;
michael@0 78 chunksummary->chunk_start = first_linear_in_chunk;
michael@0 79 chunksummary->lang1 = lang1;
michael@0 80 chunksummary->lang2 = lang2;
michael@0 81 chunksummary->score1 = chunk_tote->GetScore(key3[0]);
michael@0 82 chunksummary->score2 = chunk_tote->GetScore(key3[1]);
michael@0 83 chunksummary->bytes = len;
michael@0 84 chunksummary->grams = chunk_tote->GetScoreCount();
michael@0 85 chunksummary->ulscript = ulscript;
michael@0 86 chunksummary->reliability_delta = ReliabilityDelta(chunksummary->score1,
michael@0 87 chunksummary->score2,
michael@0 88 chunksummary->grams);
michael@0 89 // If lang1/lang2 in same close set, set delta reliability to 100%
michael@0 90 if (SameCloseSet(lang1, lang2)) {
michael@0 91 chunksummary->reliability_delta = 100;
michael@0 92 }
michael@0 93 chunksummary->reliability_score =
michael@0 94 ReliabilityExpected(actual_score_per_kb, expected_score_per_kb);
michael@0 95 }
michael@0 96
michael@0 97 // Return true if just lang1 is there: lang2=0 and lang3=0
michael@0 98 bool IsSingleLang(uint32 langprob) {
michael@0 99 // Probably a bug -- which end is lang1? But only used to call empty Boost1
michael@0 100 return ((langprob & 0x00ffff00) == 0);
michael@0 101 }
michael@0 102
michael@0 103 // Update scoring context distinct_boost for single language quad
michael@0 104 void AddDistinctBoost1(uint32 langprob, ScoringContext* scoringcontext) {
michael@0 105 // Probably keep this empty -- not a good enough signal
michael@0 106 }
michael@0 107
michael@0 108 // Update scoring context distinct_boost for distinct octagram
michael@0 109 // Keep last 4 used. Since these are mostly (except at splices) in
michael@0 110 // hitbuffer, we might be able to just use a subscript and splice
michael@0 111 void AddDistinctBoost2(uint32 langprob, ScoringContext* scoringcontext) {
michael@0 112 // this is called 0..n times per chunk with decoded hitbuffer->distinct...
michael@0 113 LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
michael@0 114 if (scoringcontext->ulscript != ULScript_Latin) {
michael@0 115 distinct_boost = &scoringcontext->distinct_boost.othr;
michael@0 116 }
michael@0 117 int n = distinct_boost->n;
michael@0 118 distinct_boost->langprob[n] = langprob;
michael@0 119 distinct_boost->n = distinct_boost->wrap(n + 1);
michael@0 120 }
michael@0 121
michael@0 122 // For each chunk, add extra weight for language priors (from content-lang and
michael@0 123 // meta lang=xx) and distinctive tokens
michael@0 124 void ScoreBoosts(const ScoringContext* scoringcontext, Tote* chunk_tote) {
michael@0 125 // Get boosts for current script
michael@0 126 const LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
michael@0 127 const LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
michael@0 128 const LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
michael@0 129 if (scoringcontext->ulscript != ULScript_Latin) {
michael@0 130 langprior_boost = &scoringcontext->langprior_boost.othr;
michael@0 131 langprior_whack = &scoringcontext->langprior_whack.othr;
michael@0 132 distinct_boost = &scoringcontext->distinct_boost.othr;
michael@0 133 }
michael@0 134
michael@0 135 for (int k = 0; k < kMaxBoosts; ++k) {
michael@0 136 uint32 langprob = langprior_boost->langprob[k];
michael@0 137 if (langprob > 0) {AddLangProb(langprob, chunk_tote);}
michael@0 138 }
michael@0 139 for (int k = 0; k < kMaxBoosts; ++k) {
michael@0 140 uint32 langprob = distinct_boost->langprob[k];
michael@0 141 if (langprob > 0) {AddLangProb(langprob, chunk_tote);}
michael@0 142 }
michael@0 143 // boost has a packed set of per-script langs and probabilites
michael@0 144 // whack has a packed set of per-script lang to be suppressed (zeroed)
michael@0 145 // When a language in a close set is given as an explicit hint, others in
michael@0 146 // that set will be whacked here.
michael@0 147 for (int k = 0; k < kMaxBoosts; ++k) {
michael@0 148 uint32 langprob = langprior_whack->langprob[k];
michael@0 149 if (langprob > 0) {ZeroPSLang(langprob, chunk_tote);}
michael@0 150 }
michael@0 151 }
michael@0 152
michael@0 153
michael@0 154
michael@0 155 // At this point, The chunk is described by
michael@0 156 // hitbuffer->base[cspan->chunk_base .. cspan->chunk_base + cspan->base_len)
michael@0 157 // hitbuffer->delta[cspan->chunk_delta ... )
michael@0 158 // hitbuffer->distinct[cspan->chunk_distinct ... )
michael@0 159 // Scored text is in text[lo..hi) where
michael@0 160 // lo is 0 or the min of first base/delta/distinct hitbuffer offset and
michael@0 161 // hi is the min of next base/delta/distinct hitbuffer offset after
michael@0 162 // base_len, etc.
michael@0 163 void GetTextSpanOffsets(const ScoringHitBuffer* hitbuffer,
michael@0 164 const ChunkSpan* cspan, int* lo, int* hi) {
michael@0 165 // Front of this span
michael@0 166 int lo_base = hitbuffer->base[cspan->chunk_base].offset;
michael@0 167 int lo_delta = hitbuffer->delta[cspan->chunk_delta].offset;
michael@0 168 int lo_distinct = hitbuffer->distinct[cspan->chunk_distinct].offset;
michael@0 169 // Front of next span
michael@0 170 int hi_base = hitbuffer->base[cspan->chunk_base +
michael@0 171 cspan->base_len].offset;
michael@0 172 int hi_delta = hitbuffer->delta[cspan->chunk_delta +
michael@0 173 cspan->delta_len].offset;
michael@0 174 int hi_distinct = hitbuffer->distinct[cspan->chunk_distinct +
michael@0 175 cspan->distinct_len].offset;
michael@0 176
michael@0 177 *lo = 0;
michael@0 178 // if (cspan->chunk_base > 0) {
michael@0 179 // *lo = minint(minint(lo_base, lo_delta), lo_distinct);
michael@0 180 // }
michael@0 181 *lo = minint(minint(lo_base, lo_delta), lo_distinct);
michael@0 182 *hi = minint(minint(hi_base, hi_delta), hi_distinct);
michael@0 183 }
michael@0 184
michael@0 185
michael@0 186 int DiffScore(const CLD2TableSummary* obj, int indirect,
michael@0 187 uint16 lang1, uint16 lang2) {
michael@0 188 if (indirect < static_cast<int>(obj->kCLDTableSizeOne)) {
michael@0 189 // Up to three languages at indirect
michael@0 190 uint32 langprob = obj->kCLDTableInd[indirect];
michael@0 191 return GetLangScore(langprob, lang1) - GetLangScore(langprob, lang2);
michael@0 192 } else {
michael@0 193 // Up to six languages at start + 2 * (indirect - start)
michael@0 194 indirect += (indirect - obj->kCLDTableSizeOne);
michael@0 195 uint32 langprob = obj->kCLDTableInd[indirect];
michael@0 196 uint32 langprob2 = obj->kCLDTableInd[indirect + 1];
michael@0 197 return (GetLangScore(langprob, lang1) + GetLangScore(langprob2, lang1)) -
michael@0 198 (GetLangScore(langprob, lang2) + GetLangScore(langprob2, lang2));
michael@0 199 }
michael@0 200
michael@0 201 }
michael@0 202
michael@0 203 // Score all the bases, deltas, distincts, boosts for one chunk into chunk_tote
michael@0 204 // After last chunk there is always a hitbuffer entry with an offset just off
michael@0 205 // the end of the text.
michael@0 206 // Sets delta_len, and distinct_len
michael@0 207 void ScoreOneChunk(const char* text, ULScript ulscript,
michael@0 208 const ScoringHitBuffer* hitbuffer,
michael@0 209 int chunk_i,
michael@0 210 ScoringContext* scoringcontext,
michael@0 211 ChunkSpan* cspan, Tote* chunk_tote,
michael@0 212 ChunkSummary* chunksummary) {
michael@0 213 int first_linear_in_chunk = hitbuffer->chunk_start[chunk_i];
michael@0 214 int first_linear_in_next_chunk = hitbuffer->chunk_start[chunk_i + 1];
michael@0 215
michael@0 216 chunk_tote->Reinit();
michael@0 217 cspan->delta_len = 0;
michael@0 218 cspan->distinct_len = 0;
michael@0 219 if (scoringcontext->flags_cld2_verbose) {
michael@0 220 fprintf(scoringcontext->debug_file, "<br>ScoreOneChunk[%d..%d) ",
michael@0 221 first_linear_in_chunk, first_linear_in_next_chunk);
michael@0 222 }
michael@0 223
michael@0 224 // 2013.02.05 linear design: just use base and base_len for the span
michael@0 225 cspan->chunk_base = first_linear_in_chunk;
michael@0 226 cspan->base_len = first_linear_in_next_chunk - first_linear_in_chunk;
michael@0 227 for (int i = first_linear_in_chunk; i < first_linear_in_next_chunk; ++i) {
michael@0 228 uint32 langprob = hitbuffer->linear[i].langprob;
michael@0 229 AddLangProb(langprob, chunk_tote);
michael@0 230 if (hitbuffer->linear[i].type <= QUADHIT) {
michael@0 231 chunk_tote->AddScoreCount(); // Just count quads, not octas
michael@0 232 }
michael@0 233 if (hitbuffer->linear[i].type == DISTINCTHIT) {
michael@0 234 AddDistinctBoost2(langprob, scoringcontext);
michael@0 235 }
michael@0 236 }
michael@0 237
michael@0 238 // Score language prior boosts
michael@0 239 // Score distinct word boost
michael@0 240 ScoreBoosts(scoringcontext, chunk_tote);
michael@0 241
michael@0 242 int lo = hitbuffer->linear[first_linear_in_chunk].offset;
michael@0 243 int hi = hitbuffer->linear[first_linear_in_next_chunk].offset;
michael@0 244
michael@0 245 // Chunk_tote: get top langs, scores, etc. and fill in chunk summary
michael@0 246 SetChunkSummary(ulscript, first_linear_in_chunk, lo, hi - lo,
michael@0 247 scoringcontext, chunk_tote, chunksummary);
michael@0 248
michael@0 249 bool more_to_come = false;
michael@0 250 bool score_cjk = false;
michael@0 251 if (scoringcontext->flags_cld2_html) {
michael@0 252 // Show one chunk in readable output
michael@0 253 CLD2_Debug(text, lo, hi, more_to_come, score_cjk, hitbuffer,
michael@0 254 scoringcontext, cspan, chunksummary);
michael@0 255 }
michael@0 256
michael@0 257 scoringcontext->prior_chunk_lang = static_cast<Language>(chunksummary->lang1);
michael@0 258 }
michael@0 259
michael@0 260
michael@0 261 // Score chunks of text described by hitbuffer, allowing each to be in a
michael@0 262 // different language, and optionally adjusting the boundaries inbetween.
michael@0 263 // Set last_cspan to the last chunkspan used
michael@0 264 void ScoreAllHits(const char* text, ULScript ulscript,
michael@0 265 bool more_to_come, bool score_cjk,
michael@0 266 const ScoringHitBuffer* hitbuffer,
michael@0 267 ScoringContext* scoringcontext,
michael@0 268 SummaryBuffer* summarybuffer, ChunkSpan* last_cspan) {
michael@0 269 ChunkSpan prior_cspan = {0, 0, 0, 0, 0, 0};
michael@0 270 ChunkSpan cspan = {0, 0, 0, 0, 0, 0};
michael@0 271
michael@0 272 for (int i = 0; i < hitbuffer->next_chunk_start; ++i) {
michael@0 273 // Score one chunk
michael@0 274 // Sets delta_len, and distinct_len
michael@0 275 Tote chunk_tote;
michael@0 276 ChunkSummary chunksummary;
michael@0 277 ScoreOneChunk(text, ulscript,
michael@0 278 hitbuffer, i,
michael@0 279 scoringcontext, &cspan, &chunk_tote, &chunksummary);
michael@0 280
michael@0 281 // Put result in summarybuffer
michael@0 282 if (summarybuffer->n < kMaxSummaries) {
michael@0 283 summarybuffer->chunksummary[summarybuffer->n] = chunksummary;
michael@0 284 summarybuffer->n += 1;
michael@0 285 }
michael@0 286
michael@0 287 prior_cspan = cspan;
michael@0 288 cspan.chunk_base += cspan.base_len;
michael@0 289 cspan.chunk_delta += cspan.delta_len;
michael@0 290 cspan.chunk_distinct += cspan.distinct_len;
michael@0 291 }
michael@0 292
michael@0 293 // Add one dummy off the end to hold first unused linear_in_chunk
michael@0 294 int linear_off_end = hitbuffer->next_linear;
michael@0 295 int offset_off_end = hitbuffer->linear[linear_off_end].offset;
michael@0 296 ChunkSummary* cs = &summarybuffer->chunksummary[summarybuffer->n];
michael@0 297 memset(cs, 0, sizeof(ChunkSummary));
michael@0 298 cs->offset = offset_off_end;
michael@0 299 cs->chunk_start = linear_off_end;
michael@0 300 *last_cspan = prior_cspan;
michael@0 301 }
michael@0 302
michael@0 303
michael@0 304 void SummaryBufferToDocTote(const SummaryBuffer* summarybuffer,
michael@0 305 bool more_to_come, DocTote* doc_tote) {
michael@0 306 int cs_bytes_sum = 0;
michael@0 307 for (int i = 0; i < summarybuffer->n; ++i) {
michael@0 308 const ChunkSummary* cs = &summarybuffer->chunksummary[i];
michael@0 309 int reliability = minint(cs->reliability_delta, cs->reliability_score);
michael@0 310 // doc_tote uses full languages
michael@0 311 doc_tote->Add(cs->lang1, cs->bytes, cs->score1, reliability);
michael@0 312 cs_bytes_sum += cs->bytes;
michael@0 313 }
michael@0 314 }
michael@0 315
// Turn on for debugging vectors: when true, the result-vector routines below
// print the letters-only and original text of each chunk to stderr.
static const bool kShowLettersOriginal = false;
michael@0 318
michael@0 319
michael@0 320 // If next chunk language matches last vector language, extend last element
michael@0 321 // Otherwise add new element to vector
michael@0 322 void ItemToVector(ScriptScanner* scanner,
michael@0 323 ResultChunkVector* vec, Language new_lang,
michael@0 324 int mapped_offset, int mapped_len) {
michael@0 325 uint16 last_vec_lang = static_cast<uint16>(UNKNOWN_LANGUAGE);
michael@0 326 int last_vec_subscr = vec->size() - 1;
michael@0 327 if (last_vec_subscr >= 0) {
michael@0 328 ResultChunk* priorrc = &(*vec)[last_vec_subscr];
michael@0 329 last_vec_lang = priorrc->lang1;
michael@0 330 if (new_lang == last_vec_lang) {
michael@0 331 // Extend prior. Current mapped_offset may be beyond prior end, so do
michael@0 332 // the arithmetic to include any such gap
michael@0 333 priorrc->bytes = minint((mapped_offset + mapped_len) - priorrc->offset,
michael@0 334 kMaxResultChunkBytes);
michael@0 335 if (kShowLettersOriginal) {
michael@0 336 // Optionally print the new chunk original text
michael@0 337 string temp2(&scanner->GetBufferStart()[priorrc->offset],
michael@0 338 priorrc->bytes);
michael@0 339 fprintf(stderr, "Item[%d..%d) '%s'<br>\n",
michael@0 340 priorrc->offset, priorrc->offset + priorrc->bytes,
michael@0 341 GetHtmlEscapedText(temp2).c_str());
michael@0 342 }
michael@0 343 return;
michael@0 344 }
michael@0 345 }
michael@0 346 // Add new vector element
michael@0 347 ResultChunk rc;
michael@0 348 rc.offset = mapped_offset;
michael@0 349 rc.bytes = minint(mapped_len, kMaxResultChunkBytes);
michael@0 350 rc.lang1 = static_cast<uint16>(new_lang);
michael@0 351 vec->push_back(rc);
michael@0 352 if (kShowLettersOriginal) {
michael@0 353 // Optionally print the new chunk original text
michael@0 354 string temp2(&scanner->GetBufferStart()[rc.offset], rc.bytes);
michael@0 355 fprintf(stderr, "Item[%d..%d) '%s'<br>\n",
michael@0 356 rc.offset, rc.offset + rc.bytes,
michael@0 357 GetHtmlEscapedText(temp2).c_str());
michael@0 358 }
michael@0 359 }
michael@0 360
michael@0 361 uint16 PriorVecLang(const ResultChunkVector* vec) {
michael@0 362 if (vec->empty()) {return static_cast<uint16>(UNKNOWN_LANGUAGE);}
michael@0 363 return (*vec)[vec->size() - 1].lang1;
michael@0 364 }
michael@0 365
michael@0 366 uint16 NextChunkLang(const SummaryBuffer* summarybuffer, int i) {
michael@0 367 if ((i + 1) >= summarybuffer->n) {
michael@0 368 return static_cast<uint16>(UNKNOWN_LANGUAGE);
michael@0 369 }
michael@0 370 return summarybuffer->chunksummary[i + 1].lang1;
michael@0 371 }
michael@0 372
michael@0 373
michael@0 374
michael@0 375 // Add n elements of summarybuffer to resultchunk vector:
michael@0 376 // Each element is letters-only text [offset..offset+bytes)
michael@0 377 // This maps back to original[Back(offset)..Back(offset+bytes))
michael@0 378 //
michael@0 379 // We go out of our way to minimize the variation in the ResultChunkVector,
michael@0 380 // so that the caller has fewer but more meaningful spans in different
michael@0 381 // lanaguges, for the likely purpose of translation or spell-check.
michael@0 382 //
michael@0 383 // The language of each chunk is lang1, but it might be unreliable for
michael@0 384 // either of two reasons: its score is relatively too close to the score of
michael@0 385 // lang2, or its score is too far away from the expected score of real text in
michael@0 386 // the given language. Unreliable languages are mapped to Unknown.
michael@0 387 //
michael@0 388 void SummaryBufferToVector(ScriptScanner* scanner, const char* text,
michael@0 389 const SummaryBuffer* summarybuffer,
michael@0 390 bool more_to_come, ResultChunkVector* vec) {
michael@0 391 if (vec == NULL) {return;}
michael@0 392
michael@0 393 if (kShowLettersOriginal) {
michael@0 394 fprintf(stderr, "map2original_ ");
michael@0 395 scanner->map2original_.DumpWindow();
michael@0 396 fprintf(stderr, "<br>\n");
michael@0 397 fprintf(stderr, "map2uplow_ ");
michael@0 398 scanner->map2uplow_.DumpWindow();
michael@0 399 fprintf(stderr, "<br>\n");
michael@0 400 }
michael@0 401
michael@0 402 for (int i = 0; i < summarybuffer->n; ++i) {
michael@0 403 const ChunkSummary* cs = &summarybuffer->chunksummary[i];
michael@0 404 int unmapped_offset = cs->offset;
michael@0 405 int unmapped_len = cs->bytes;
michael@0 406
michael@0 407 if (kShowLettersOriginal) {
michael@0 408 // Optionally print the chunk lowercase letters/marks text
michael@0 409 string temp(&text[unmapped_offset], unmapped_len);
michael@0 410 fprintf(stderr, "Letters [%d..%d) '%s'<br>\n",
michael@0 411 unmapped_offset, unmapped_offset + unmapped_len,
michael@0 412 GetHtmlEscapedText(temp).c_str());
michael@0 413 }
michael@0 414
michael@0 415 int mapped_offset = scanner->MapBack(unmapped_offset);
michael@0 416
michael@0 417 // Trim back a little to prefer splicing original at word boundaries
michael@0 418 if (mapped_offset > 0) {
michael@0 419 // Size of prior vector entry, if any
michael@0 420 int prior_size = 0;
michael@0 421 if (!vec->empty()) {
michael@0 422 ResultChunk* rc = &(*vec)[vec->size() - 1];
michael@0 423 prior_size = rc->bytes;
michael@0 424 }
michael@0 425 // Maximum back up size to leave at least 3 bytes in prior,
michael@0 426 // and not entire buffer, and no more than 12 bytes total backup
michael@0 427 int n_limit = minint(prior_size - 3, mapped_offset);
michael@0 428 n_limit = minint(n_limit, 12);
michael@0 429
michael@0 430 // Backscan over letters, stopping if prior byte is < 0x41
michael@0 431 // There is some possibility that we will backscan over a different script
michael@0 432 const char* s = &scanner->GetBufferStart()[mapped_offset];
michael@0 433 const unsigned char* us = reinterpret_cast<const unsigned char*>(s);
michael@0 434 int n = 0;
michael@0 435 while ((n < n_limit) && (us[-n - 1] >= 0x41)) {++n;}
michael@0 436 if (n >= n_limit) {n = 0;} // New boundary not found within range
michael@0 437
michael@0 438 // Also back up exactly one leading punctuation character if '"#@
michael@0 439 if (n < n_limit) {
michael@0 440 unsigned char c = us[-n - 1];
michael@0 441 if ((c == '\'') || (c == '"') || (c == '#') || (c == '@')) {++n;}
michael@0 442 }
michael@0 443 // Shrink the previous chunk slightly
michael@0 444 if (n > 0) {
michael@0 445 ResultChunk* rc = &(*vec)[vec->size() - 1];
michael@0 446 rc->bytes -= n;
michael@0 447 mapped_offset -= n;
michael@0 448 if (kShowLettersOriginal) {
michael@0 449 fprintf(stderr, "Back up %d bytes<br>\n", n);
michael@0 450 // Optionally print the prior chunk original text
michael@0 451 string temp2(&scanner->GetBufferStart()[rc->offset], rc->bytes);
michael@0 452 fprintf(stderr, "Prior [%d..%d) '%s'<br>\n",
michael@0 453 rc->offset, rc->offset + rc->bytes,
michael@0 454 GetHtmlEscapedText(temp2).c_str());
michael@0 455 }
michael@0 456 }
michael@0 457 }
michael@0 458
michael@0 459 int mapped_len =
michael@0 460 scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset;
michael@0 461
michael@0 462 if (kShowLettersOriginal) {
michael@0 463 // Optionally print the chunk original text
michael@0 464 string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len);
michael@0 465 fprintf(stderr, "Original[%d..%d) '%s'<br>\n",
michael@0 466 mapped_offset, mapped_offset + mapped_len,
michael@0 467 GetHtmlEscapedText(temp2).c_str());
michael@0 468 }
michael@0 469
michael@0 470 Language new_lang = static_cast<Language>(cs->lang1);
michael@0 471 bool reliability_delta_bad =
michael@0 472 (cs->reliability_delta < kUnreliablePercentThreshold);
michael@0 473 bool reliability_score_bad =
michael@0 474 (cs->reliability_score < kUnreliablePercentThreshold);
michael@0 475
michael@0 476 // If the top language matches last vector, ignore reliability_delta
michael@0 477 uint16 prior_lang = PriorVecLang(vec);
michael@0 478 if (prior_lang == cs->lang1) {
michael@0 479 reliability_delta_bad = false;
michael@0 480 }
michael@0 481 // If the top language is in same close set as last vector, set up to merge
michael@0 482 if (SameCloseSet(cs->lang1, prior_lang)) {
michael@0 483 new_lang = static_cast<Language>(prior_lang);
michael@0 484 reliability_delta_bad = false;
michael@0 485 }
michael@0 486 // If the top two languages are in the same close set and the last vector
michael@0 487 // language is the second language, set up to merge
michael@0 488 if (SameCloseSet(cs->lang1, cs->lang2) &&
michael@0 489 (prior_lang == cs->lang2)) {
michael@0 490 new_lang = static_cast<Language>(prior_lang);
michael@0 491 reliability_delta_bad = false;
michael@0 492 }
michael@0 493 // If unreliable and the last and next vector languages are both
michael@0 494 // the second language, set up to merge
michael@0 495 uint16 next_lang = NextChunkLang(summarybuffer, i);
michael@0 496 if (reliability_delta_bad &&
michael@0 497 (prior_lang == cs->lang2) && (next_lang == cs->lang2)) {
michael@0 498 new_lang = static_cast<Language>(prior_lang);
michael@0 499 reliability_delta_bad = false;
michael@0 500 }
michael@0 501
michael@0 502 if (reliability_delta_bad || reliability_score_bad) {
michael@0 503 new_lang = UNKNOWN_LANGUAGE;
michael@0 504 }
michael@0 505 ItemToVector(scanner, vec, new_lang, mapped_offset, mapped_len);
michael@0 506 }
michael@0 507 }
michael@0 508
michael@0 509 // Add just one element to resultchunk vector:
michael@0 510 // For RTypeNone or RTypeOne
michael@0 511 void JustOneItemToVector(ScriptScanner* scanner, const char* text,
michael@0 512 Language lang1, int unmapped_offset, int unmapped_len,
michael@0 513 ResultChunkVector* vec) {
michael@0 514 if (vec == NULL) {return;}
michael@0 515
michael@0 516 if (kShowLettersOriginal) {
michael@0 517 fprintf(stderr, "map2original_ ");
michael@0 518 scanner->map2original_.DumpWindow();
michael@0 519 fprintf(stderr, "<br>\n");
michael@0 520 fprintf(stderr, "map2uplow_ ");
michael@0 521 scanner->map2uplow_.DumpWindow();
michael@0 522 fprintf(stderr, "<br>\n");
michael@0 523 }
michael@0 524
michael@0 525 if (kShowLettersOriginal) {
michael@0 526 // Optionally print the chunk lowercase letters/marks text
michael@0 527 string temp(&text[unmapped_offset], unmapped_len);
michael@0 528 fprintf(stderr, "Letters1 [%d..%d) '%s'<br>\n",
michael@0 529 unmapped_offset, unmapped_offset + unmapped_len,
michael@0 530 GetHtmlEscapedText(temp).c_str());
michael@0 531 }
michael@0 532
michael@0 533 int mapped_offset = scanner->MapBack(unmapped_offset);
michael@0 534 int mapped_len =
michael@0 535 scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset;
michael@0 536
michael@0 537 if (kShowLettersOriginal) {
michael@0 538 // Optionally print the chunk original text
michael@0 539 string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len);
michael@0 540 fprintf(stderr, "Original1[%d..%d) '%s'<br>\n",
michael@0 541 mapped_offset, mapped_offset + mapped_len,
michael@0 542 GetHtmlEscapedText(temp2).c_str());
michael@0 543 }
michael@0 544
michael@0 545 ItemToVector(scanner, vec, lang1, mapped_offset, mapped_len);
michael@0 546 }
michael@0 547
michael@0 548
// Debugging aid used by the Dump* routines below to render a short piece of
// text for printing. Not thread safe. Defined in getonescriptspan.
char* DisplayPiece(const char* next_byte_, int byte_length_);
michael@0 551
// Make an indirect subscript easier to read in debug output.
// If the high bit (0x80000000) is on, take out the high bit and add 2B
// (2000000000) so that table2 entries are easy to spot; otherwise return x
// unchanged. The sum is formed in unsigned arithmetic and converted back to
// int on return.
inline int PrintableIndirect(int x) {
  const unsigned int kHighBit = 0x80000000u;
  const unsigned int bits = static_cast<unsigned int>(x);
  if ((bits & kHighBit) == 0) {
    return x;
  }
  return (bits & ~kHighBit) + 2000000000;
}
michael@0 559 void DumpHitBuffer(FILE* df, const char* text,
michael@0 560 const ScoringHitBuffer* hitbuffer) {
michael@0 561 fprintf(df,
michael@0 562 "<br>DumpHitBuffer[%s, next_base/delta/distinct %d, %d, %d)<br>\n",
michael@0 563 ULScriptCode(hitbuffer->ulscript),
michael@0 564 hitbuffer->next_base, hitbuffer->next_delta,
michael@0 565 hitbuffer->next_distinct);
michael@0 566 for (int i = 0; i < hitbuffer->maxscoringhits; ++i) {
michael@0 567 if (i < hitbuffer->next_base) {
michael@0 568 fprintf(df, "Q[%d]%d,%d,%s ",
michael@0 569 i, hitbuffer->base[i].offset,
michael@0 570 PrintableIndirect(hitbuffer->base[i].indirect),
michael@0 571 DisplayPiece(&text[hitbuffer->base[i].offset], 6));
michael@0 572 }
michael@0 573 if (i < hitbuffer->next_delta) {
michael@0 574 fprintf(df, "DL[%d]%d,%d,%s ",
michael@0 575 i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,
michael@0 576 DisplayPiece(&text[hitbuffer->delta[i].offset], 12));
michael@0 577 }
michael@0 578 if (i < hitbuffer->next_distinct) {
michael@0 579 fprintf(df, "D[%d]%d,%d,%s ",
michael@0 580 i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect,
michael@0 581 DisplayPiece(&text[hitbuffer->distinct[i].offset], 12));
michael@0 582 }
michael@0 583 if (i < hitbuffer->next_base) {
michael@0 584 fprintf(df, "<br>\n");
michael@0 585 }
michael@0 586 if (i > 50) {break;}
michael@0 587 }
michael@0 588 if (hitbuffer->next_base > 50) {
michael@0 589 int i = hitbuffer->next_base;
michael@0 590 fprintf(df, "Q[%d]%d,%d,%s ",
michael@0 591 i, hitbuffer->base[i].offset,
michael@0 592 PrintableIndirect(hitbuffer->base[i].indirect),
michael@0 593 DisplayPiece(&text[hitbuffer->base[i].offset], 6));
michael@0 594 }
michael@0 595 if (hitbuffer->next_delta > 50) {
michael@0 596 int i = hitbuffer->next_delta;
michael@0 597 fprintf(df, "DL[%d]%d,%d,%s ",
michael@0 598 i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,
michael@0 599 DisplayPiece(&text[hitbuffer->delta[i].offset], 12));
michael@0 600 }
michael@0 601 if (hitbuffer->next_distinct > 50) {
michael@0 602 int i = hitbuffer->next_distinct;
michael@0 603 fprintf(df, "D[%d]%d,%d,%s ",
michael@0 604 i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect,
michael@0 605 DisplayPiece(&text[hitbuffer->distinct[i].offset], 12));
michael@0 606 }
michael@0 607 fprintf(df, "<br>\n");
michael@0 608 }
michael@0 609
michael@0 610
michael@0 611 void DumpLinearBuffer(FILE* df, const char* text,
michael@0 612 const ScoringHitBuffer* hitbuffer) {
michael@0 613 fprintf(df, "<br>DumpLinearBuffer[%d)<br>\n",
michael@0 614 hitbuffer->next_linear);
michael@0 615 // Include the dummy entry off the end
michael@0 616 for (int i = 0; i < hitbuffer->next_linear + 1; ++i) {
michael@0 617 if ((50 < i) && (i < (hitbuffer->next_linear - 1))) {continue;}
michael@0 618 fprintf(df, "[%d]%d,%c=%08x,%s<br>\n",
michael@0 619 i, hitbuffer->linear[i].offset,
michael@0 620 "UQLD"[hitbuffer->linear[i].type],
michael@0 621 hitbuffer->linear[i].langprob,
michael@0 622 DisplayPiece(&text[hitbuffer->linear[i].offset], 6));
michael@0 623 }
michael@0 624 fprintf(df, "<br>\n");
michael@0 625
michael@0 626 fprintf(df, "DumpChunkStart[%d]<br>\n", hitbuffer->next_chunk_start);
michael@0 627 for (int i = 0; i < hitbuffer->next_chunk_start + 1; ++i) {
michael@0 628 fprintf(df, "[%d]%d\n", i, hitbuffer->chunk_start[i]);
michael@0 629 }
michael@0 630 fprintf(df, "<br>\n");
michael@0 631 }
michael@0 632
michael@0 633 // Move this verbose debugging output to debug.cc eventually
michael@0 634 void DumpChunkSummary(FILE* df, const ChunkSummary* cs) {
michael@0 635 // Print chunksummary
michael@0 636 fprintf(df, "%d lin[%d] %s.%d %s.%d %dB %d# %s %dRd %dRs<br>\n",
michael@0 637 cs->offset,
michael@0 638 cs->chunk_start,
michael@0 639 LanguageCode(static_cast<Language>(cs->lang1)),
michael@0 640 cs->score1,
michael@0 641 LanguageCode(static_cast<Language>(cs->lang2)),
michael@0 642 cs->score2,
michael@0 643 cs->bytes,
michael@0 644 cs->grams,
michael@0 645 ULScriptCode(static_cast<ULScript>(cs->ulscript)),
michael@0 646 cs->reliability_delta,
michael@0 647 cs->reliability_score);
michael@0 648 }
michael@0 649
michael@0 650 void DumpSummaryBuffer(FILE* df, const SummaryBuffer* summarybuffer) {
michael@0 651 fprintf(df, "<br>DumpSummaryBuffer[%d]<br>\n", summarybuffer->n);
michael@0 652 fprintf(df, "[i] offset linear[chunk_start] lang.score1 lang.score2 "
michael@0 653 "bytesB ngrams# script rel_delta rel_score<br>\n");
michael@0 654 for (int i = 0; i <= summarybuffer->n; ++i) {
michael@0 655 fprintf(df, "[%d] ", i);
michael@0 656 DumpChunkSummary(df, &summarybuffer->chunksummary[i]);
michael@0 657 }
michael@0 658 fprintf(df, "<br>\n");
michael@0 659 }
michael@0 660
michael@0 661
michael@0 662
michael@0 663 // Within hitbufer->linear[]
michael@0 664 // <-- prior chunk --><-- this chunk -->
michael@0 665 // | | |
michael@0 666 // linear0 linear1 linear2
michael@0 667 // lang0 lang1
michael@0 668 // The goal of sharpening is to move this_linear to better separate langs
michael@0 669 int BetterBoundary(const char* text,
michael@0 670 ScoringHitBuffer* hitbuffer,
michael@0 671 ScoringContext* scoringcontext,
michael@0 672 uint16 pslang0, uint16 pslang1,
michael@0 673 int linear0, int linear1, int linear2) {
michael@0 674 // Degenerate case, no change
michael@0 675 if ((linear2 - linear0) <= 8) {return linear1;}
michael@0 676
michael@0 677 // Each diff gives pslang0 score - pslang1 score
michael@0 678 // Running diff has four entries + + + + followed by four entries - - - -
michael@0 679 // so that this value is maximal at the sharpest boundary between pslang0
michael@0 680 // (positive diffs) and pslang1 (negative diffs)
michael@0 681 int running_diff = 0;
michael@0 682 int diff[8]; // Ring buffer of pslang0-pslang1 differences
michael@0 683 // Initialize with first 8 diffs
michael@0 684 for (int i = linear0; i < linear0 + 8; ++i) {
michael@0 685 int j = i & 7;
michael@0 686 uint32 langprob = hitbuffer->linear[i].langprob;
michael@0 687 diff[j] = GetLangScore(langprob, pslang0) -
michael@0 688 GetLangScore(langprob, pslang1);
michael@0 689 if (i < linear0 + 4) {
michael@0 690 // First four diffs pslang0 - pslang1
michael@0 691 running_diff += diff[j];
michael@0 692 } else {
michael@0 693 // Second four diffs -(pslang0 - pslang1)
michael@0 694 running_diff -= diff[j];
michael@0 695 }
michael@0 696 }
michael@0 697
michael@0 698 // Now scan for sharpest boundary. j is at left end of 8 entries
michael@0 699 // To be a boundary, there must be both >0 and <0 entries in the window
michael@0 700 int better_boundary_value = 0;
michael@0 701 int better_boundary = linear1;
michael@0 702 for (int i = linear0; i < linear2 - 8; ++i) {
michael@0 703 int j = i & 7;
michael@0 704 if (better_boundary_value < running_diff) {
michael@0 705 bool has_plus = false;
michael@0 706 bool has_minus = false;
michael@0 707 for (int kk = 0; kk < 8; ++kk) {
michael@0 708 if (diff[kk] > 0) {has_plus = true;}
michael@0 709 if (diff[kk] < 0) {has_minus = true;}
michael@0 710 }
michael@0 711 if (has_plus && has_minus) {
michael@0 712 better_boundary_value = running_diff;
michael@0 713 better_boundary = i + 4;
michael@0 714 }
michael@0 715 }
michael@0 716 // Shift right one entry
michael@0 717 uint32 langprob = hitbuffer->linear[i + 8].langprob;
michael@0 718 int newdiff = GetLangScore(langprob, pslang0) -
michael@0 719 GetLangScore(langprob, pslang1);
michael@0 720 int middiff = diff[(i + 4) & 7];
michael@0 721 int olddiff = diff[j];
michael@0 722 diff[j] = newdiff;
michael@0 723 running_diff -= olddiff; // Remove left
michael@0 724 running_diff += 2 * middiff; // Convert middle from - to +
michael@0 725 running_diff -= newdiff; // Insert right
michael@0 726 }
michael@0 727
michael@0 728 if (scoringcontext->flags_cld2_verbose && (linear1 != better_boundary)) {
michael@0 729 Language lang0 = FromPerScriptNumber(scoringcontext->ulscript, pslang0);
michael@0 730 Language lang1 = FromPerScriptNumber(scoringcontext->ulscript, pslang1);
michael@0 731 fprintf(scoringcontext->debug_file, " Better lin[%d=>%d] %s^^%s <br>\n",
michael@0 732 linear1, better_boundary,
michael@0 733 LanguageCode(lang0), LanguageCode(lang1));
michael@0 734 int lin0_off = hitbuffer->linear[linear0].offset;
michael@0 735 int lin1_off = hitbuffer->linear[linear1].offset;
michael@0 736 int lin2_off = hitbuffer->linear[linear2].offset;
michael@0 737 int better_offm1 = hitbuffer->linear[better_boundary - 1].offset;
michael@0 738 int better_off = hitbuffer->linear[better_boundary].offset;
michael@0 739 int better_offp1 = hitbuffer->linear[better_boundary + 1].offset;
michael@0 740 string old0(&text[lin0_off], lin1_off - lin0_off);
michael@0 741 string old1(&text[lin1_off], lin2_off - lin1_off);
michael@0 742 string new0(&text[lin0_off], better_offm1 - lin0_off);
michael@0 743 string new0m1(&text[better_offm1], better_off - better_offm1);
michael@0 744 string new1(&text[better_off], better_offp1 - better_off);
michael@0 745 string new1p1(&text[better_offp1], lin2_off - better_offp1);
michael@0 746 fprintf(scoringcontext->debug_file, "%s^^%s => <br>\n%s^%s^^%s^%s<br>\n",
michael@0 747 GetHtmlEscapedText(old0).c_str(),
michael@0 748 GetHtmlEscapedText(old1).c_str(),
michael@0 749 GetHtmlEscapedText(new0).c_str(),
michael@0 750 GetHtmlEscapedText(new0m1).c_str(),
michael@0 751 GetHtmlEscapedText(new1).c_str(),
michael@0 752 GetHtmlEscapedText(new1p1).c_str());
michael@0 753 // Slow picture of differences per linear entry
michael@0 754 int d;
michael@0 755 for (int i = linear0; i < linear2; ++i) {
michael@0 756 if (i == better_boundary) {
michael@0 757 fprintf(scoringcontext->debug_file, "^^ ");
michael@0 758 }
michael@0 759 uint32 langprob = hitbuffer->linear[i].langprob;
michael@0 760 d = GetLangScore(langprob, pslang0) - GetLangScore(langprob, pslang1);
michael@0 761 const char* s = "=";
michael@0 762 //if (d > 2) {s = "\xc2\xaf";} // Macron
michael@0 763 if (d > 2) {s = "#";}
michael@0 764 else if (d > 0) {s = "+";}
michael@0 765 else if (d < -2) {s = "_";}
michael@0 766 else if (d < 0) {s = "-";}
michael@0 767 fprintf(scoringcontext->debug_file, "%s ", s);
michael@0 768 }
michael@0 769 fprintf(scoringcontext->debug_file, " &nbsp;&nbsp;(scale: #+=-_)<br>\n");
michael@0 770 }
michael@0 771 return better_boundary;
michael@0 772 }
michael@0 773
michael@0 774
michael@0 775 // For all but the first summary, if its top language differs from
michael@0 776 // the previous chunk, refine the boundary
michael@0 777 // Linearized version
michael@0 778 void SharpenBoundaries(const char* text,
michael@0 779 bool more_to_come,
michael@0 780 ScoringHitBuffer* hitbuffer,
michael@0 781 ScoringContext* scoringcontext,
michael@0 782 SummaryBuffer* summarybuffer) {
michael@0 783
michael@0 784 int prior_linear = summarybuffer->chunksummary[0].chunk_start;
michael@0 785 uint16 prior_lang = summarybuffer->chunksummary[0].lang1;
michael@0 786
michael@0 787 if (scoringcontext->flags_cld2_verbose) {
michael@0 788 fprintf(scoringcontext->debug_file, "<br>SharpenBoundaries<br>\n");
michael@0 789 }
michael@0 790 for (int i = 1; i < summarybuffer->n; ++i) {
michael@0 791 ChunkSummary* cs = &summarybuffer->chunksummary[i];
michael@0 792 uint16 this_lang = cs->lang1;
michael@0 793 if (this_lang == prior_lang) {
michael@0 794 prior_linear = cs->chunk_start;
michael@0 795 continue;
michael@0 796 }
michael@0 797
michael@0 798 int this_linear = cs->chunk_start;
michael@0 799 int next_linear = summarybuffer->chunksummary[i + 1].chunk_start;
michael@0 800
michael@0 801 // If this/prior in same close set, don't move boundary
michael@0 802 if (SameCloseSet(prior_lang, this_lang)) {
michael@0 803 prior_linear = this_linear;
michael@0 804 prior_lang = this_lang;
michael@0 805 continue;
michael@0 806 }
michael@0 807
michael@0 808
michael@0 809 // Within hitbuffer->linear[]
michael@0 810 // <-- prior chunk --><-- this chunk -->
michael@0 811 // | | |
michael@0 812 // prior_linear this_linear next_linear
michael@0 813 // prior_lang this_lang
michael@0 814 // The goal of sharpening is to move this_linear to better separate langs
michael@0 815
michael@0 816 uint8 pslang0 = PerScriptNumber(scoringcontext->ulscript,
michael@0 817 static_cast<Language>(prior_lang));
michael@0 818 uint8 pslang1 = PerScriptNumber(scoringcontext->ulscript,
michael@0 819 static_cast<Language>(this_lang));
michael@0 820 int better_linear = BetterBoundary(text,
michael@0 821 hitbuffer,
michael@0 822 scoringcontext,
michael@0 823 pslang0, pslang1,
michael@0 824 prior_linear, this_linear, next_linear);
michael@0 825
michael@0 826 int old_offset = hitbuffer->linear[this_linear].offset;
michael@0 827 int new_offset = hitbuffer->linear[better_linear].offset;
michael@0 828 cs->chunk_start = better_linear;
michael@0 829 cs->offset = new_offset;
michael@0 830 // If this_linear moved right, make bytes smaller for this, larger for prior
michael@0 831 // If this_linear moved left, make bytes larger for this, smaller for prior
michael@0 832 cs->bytes -= (new_offset - old_offset);
michael@0 833 summarybuffer->chunksummary[i - 1].bytes += (new_offset - old_offset);
michael@0 834
michael@0 835 this_linear = better_linear; // Update so that next chunk doesn't intrude
michael@0 836
michael@0 837 // Consider rescoring the two chunks
michael@0 838
michael@0 839 // Update for next round (note: using pre-updated boundary)
michael@0 840 prior_linear = this_linear;
michael@0 841 prior_lang = this_lang;
michael@0 842 }
michael@0 843 }
michael@0 844
michael@0 845 // Make a langprob that gives small weight to the default language for ulscript
michael@0 846 uint32 DefaultLangProb(ULScript ulscript) {
michael@0 847 Language default_lang = DefaultLanguage(ulscript);
michael@0 848 return MakeLangProb(default_lang, 1);
michael@0 849 }
michael@0 850
michael@0 851 // Effectively, do a merge-sort based on text offsets
michael@0 852 // Look up each indirect value in appropriate scoring table and keep
michael@0 853 // just the resulting langprobs
michael@0 854 void LinearizeAll(ScoringContext* scoringcontext, bool score_cjk,
michael@0 855 ScoringHitBuffer* hitbuffer) {
michael@0 856 const CLD2TableSummary* base_obj; // unigram or quadgram
michael@0 857 const CLD2TableSummary* base_obj2; // quadgram dual table
michael@0 858 const CLD2TableSummary* delta_obj; // bigram or octagram
michael@0 859 const CLD2TableSummary* distinct_obj; // bigram or octagram
michael@0 860 uint16 base_hit;
michael@0 861 if (score_cjk) {
michael@0 862 base_obj = scoringcontext->scoringtables->unigram_compat_obj;
michael@0 863 base_obj2 = scoringcontext->scoringtables->unigram_compat_obj;
michael@0 864 delta_obj = scoringcontext->scoringtables->deltabi_obj;
michael@0 865 distinct_obj = scoringcontext->scoringtables->distinctbi_obj;
michael@0 866 base_hit = UNIHIT;
michael@0 867 } else {
michael@0 868 base_obj = scoringcontext->scoringtables->quadgram_obj;
michael@0 869 base_obj2 = scoringcontext->scoringtables->quadgram_obj2;
michael@0 870 delta_obj = scoringcontext->scoringtables->deltaocta_obj;
michael@0 871 distinct_obj = scoringcontext->scoringtables->distinctocta_obj;
michael@0 872 base_hit = QUADHIT;
michael@0 873 }
michael@0 874
michael@0 875 int base_limit = hitbuffer->next_base;
michael@0 876 int delta_limit = hitbuffer->next_delta;
michael@0 877 int distinct_limit = hitbuffer->next_distinct;
michael@0 878 int base_i = 0;
michael@0 879 int delta_i = 0;
michael@0 880 int distinct_i = 0;
michael@0 881 int linear_i = 0;
michael@0 882
michael@0 883 // Start with an initial base hit for the default language for this script
michael@0 884 // Inserting this avoids edge effects with no hits at all
michael@0 885 hitbuffer->linear[linear_i].offset = hitbuffer->lowest_offset;
michael@0 886 hitbuffer->linear[linear_i].type = base_hit;
michael@0 887 hitbuffer->linear[linear_i].langprob =
michael@0 888 DefaultLangProb(scoringcontext->ulscript);
michael@0 889 ++linear_i;
michael@0 890
michael@0 891 while ((base_i < base_limit) || (delta_i < delta_limit) ||
michael@0 892 (distinct_i < distinct_limit)) {
michael@0 893 int base_off = hitbuffer->base[base_i].offset;
michael@0 894 int delta_off = hitbuffer->delta[delta_i].offset;
michael@0 895 int distinct_off = hitbuffer->distinct[distinct_i].offset;
michael@0 896
michael@0 897 // Do delta and distinct first, so that they are not lost at base_limit
michael@0 898 if ((delta_i < delta_limit) &&
michael@0 899 (delta_off <= base_off) && (delta_off <= distinct_off)) {
michael@0 900 // Add delta entry
michael@0 901 int indirect = hitbuffer->delta[delta_i].indirect;
michael@0 902 ++delta_i;
michael@0 903 uint32 langprob = delta_obj->kCLDTableInd[indirect];
michael@0 904 if (langprob > 0) {
michael@0 905 hitbuffer->linear[linear_i].offset = delta_off;
michael@0 906 hitbuffer->linear[linear_i].type = DELTAHIT;
michael@0 907 hitbuffer->linear[linear_i].langprob = langprob;
michael@0 908 ++linear_i;
michael@0 909 }
michael@0 910 }
michael@0 911 else if ((distinct_i < distinct_limit) &&
michael@0 912 (distinct_off <= base_off) && (distinct_off <= delta_off)) {
michael@0 913 // Add distinct entry
michael@0 914 int indirect = hitbuffer->distinct[distinct_i].indirect;
michael@0 915 ++distinct_i;
michael@0 916 uint32 langprob = distinct_obj->kCLDTableInd[indirect];
michael@0 917 if (langprob > 0) {
michael@0 918 hitbuffer->linear[linear_i].offset = distinct_off;
michael@0 919 hitbuffer->linear[linear_i].type = DISTINCTHIT;
michael@0 920 hitbuffer->linear[linear_i].langprob = langprob;
michael@0 921 ++linear_i;
michael@0 922 }
michael@0 923 }
michael@0 924 else {
michael@0 925 // Add one or two base entries
michael@0 926 int indirect = hitbuffer->base[base_i].indirect;
michael@0 927 // First, get right scoring table
michael@0 928 const CLD2TableSummary* local_base_obj = base_obj;
michael@0 929 if ((indirect & 0x80000000u) != 0) {
michael@0 930 local_base_obj = base_obj2;
michael@0 931 indirect &= ~0x80000000u;
michael@0 932 }
michael@0 933 ++base_i;
michael@0 934 // One langprob in kQuadInd[0..SingleSize),
michael@0 935 // two in kQuadInd[SingleSize..Size)
michael@0 936 if (indirect < static_cast<int>(local_base_obj->kCLDTableSizeOne)) {
michael@0 937 // Up to three languages at indirect
michael@0 938 uint32 langprob = local_base_obj->kCLDTableInd[indirect];
michael@0 939 if (langprob > 0) {
michael@0 940 hitbuffer->linear[linear_i].offset = base_off;
michael@0 941 hitbuffer->linear[linear_i].type = base_hit;
michael@0 942 hitbuffer->linear[linear_i].langprob = langprob;
michael@0 943 ++linear_i;
michael@0 944 }
michael@0 945 } else {
michael@0 946 // Up to six languages at start + 2 * (indirect - start)
michael@0 947 indirect += (indirect - local_base_obj->kCLDTableSizeOne);
michael@0 948 uint32 langprob = local_base_obj->kCLDTableInd[indirect];
michael@0 949 uint32 langprob2 = local_base_obj->kCLDTableInd[indirect + 1];
michael@0 950 if (langprob > 0) {
michael@0 951 hitbuffer->linear[linear_i].offset = base_off;
michael@0 952 hitbuffer->linear[linear_i].type = base_hit;
michael@0 953 hitbuffer->linear[linear_i].langprob = langprob;
michael@0 954 ++linear_i;
michael@0 955 }
michael@0 956 if (langprob2 > 0) {
michael@0 957 hitbuffer->linear[linear_i].offset = base_off;
michael@0 958 hitbuffer->linear[linear_i].type = base_hit;
michael@0 959 hitbuffer->linear[linear_i].langprob = langprob2;
michael@0 960 ++linear_i;
michael@0 961 }
michael@0 962 }
michael@0 963 }
michael@0 964 }
michael@0 965
michael@0 966 // Update
michael@0 967 hitbuffer->next_linear = linear_i;
michael@0 968
michael@0 969 // Add a dummy entry off the end, just to capture final offset
michael@0 970 hitbuffer->linear[linear_i].offset =
michael@0 971 hitbuffer->base[hitbuffer->next_base].offset;
michael@0 972 hitbuffer->linear[linear_i].langprob = 0;
michael@0 973 }
michael@0 974
michael@0 975 // Break linear array into chunks of ~20 quadgram hits or ~50 CJK unigram hits
michael@0 976 void ChunkAll(int letter_offset, bool score_cjk, ScoringHitBuffer* hitbuffer) {
michael@0 977 int chunksize;
michael@0 978 uint16 base_hit;
michael@0 979 if (score_cjk) {
michael@0 980 chunksize = kChunksizeUnis;
michael@0 981 base_hit = UNIHIT;
michael@0 982 } else {
michael@0 983 chunksize = kChunksizeQuads;
michael@0 984 base_hit = QUADHIT;
michael@0 985 }
michael@0 986
michael@0 987 int linear_i = 0;
michael@0 988 int linear_off_end = hitbuffer->next_linear;
michael@0 989 int text_i = letter_offset; // Next unseen text offset
michael@0 990 int next_chunk_start = 0;
michael@0 991 int bases_left = hitbuffer->next_base;
michael@0 992 while (bases_left > 0) {
michael@0 993 // Linearize one chunk
michael@0 994 int base_len = chunksize; // Default; may be changed below
michael@0 995 if (bases_left < (chunksize + (chunksize >> 1))) {
michael@0 996 // If within 1.5 chunks of the end, avoid runts by using it all
michael@0 997 base_len = bases_left;
michael@0 998 } else if (bases_left < (2 * chunksize)) {
michael@0 999 // Avoid runts by splitting 1.5 to 2 chunks in half (about 3/4 each)
michael@0 1000 base_len = (bases_left + 1) >> 1;
michael@0 1001 }
michael@0 1002
michael@0 1003 hitbuffer->chunk_start[next_chunk_start] = linear_i;
michael@0 1004 hitbuffer->chunk_offset[next_chunk_start] = text_i;
michael@0 1005 ++next_chunk_start;
michael@0 1006
michael@0 1007 int base_count = 0;
michael@0 1008 while ((base_count < base_len) && (linear_i < linear_off_end)) {
michael@0 1009 if (hitbuffer->linear[linear_i].type == base_hit) {++base_count;}
michael@0 1010 ++linear_i;
michael@0 1011 }
michael@0 1012 text_i = hitbuffer->linear[linear_i].offset; // Next unseen text offset
michael@0 1013 bases_left -= base_len;
michael@0 1014 }
michael@0 1015
michael@0 1016 // If no base hits at all, make a single dummy chunk
michael@0 1017 if (next_chunk_start == 0) {
michael@0 1018 hitbuffer->chunk_start[next_chunk_start] = 0;
michael@0 1019 hitbuffer->chunk_offset[next_chunk_start] = hitbuffer->linear[0].offset;
michael@0 1020 ++next_chunk_start;
michael@0 1021 }
michael@0 1022
michael@0 1023 // Remember the linear array start of dummy entry
michael@0 1024 hitbuffer->next_chunk_start = next_chunk_start;
michael@0 1025
michael@0 1026 // Add a dummy entry off the end, just to capture final linear subscr
michael@0 1027 hitbuffer->chunk_start[next_chunk_start] = hitbuffer->next_linear;
michael@0 1028 hitbuffer->chunk_offset[next_chunk_start] = text_i;
michael@0 1029 }
michael@0 1030
michael@0 1031
michael@0 1032 // Merge-sort the individual hit arrays, go indirect on the scoring subscripts,
michael@0 1033 // break linear array into chunks.
michael@0 1034 //
michael@0 1035 // Input:
michael@0 1036 // hitbuffer base, delta, distinct arrays
michael@0 1037 // Output:
michael@0 1038 // linear array
michael@0 1039 // chunk_start array
michael@0 1040 //
michael@0 1041 void LinearizeHitBuffer(int letter_offset,
michael@0 1042 ScoringContext* scoringcontext,
michael@0 1043 bool more_to_come, bool score_cjk,
michael@0 1044 ScoringHitBuffer* hitbuffer) {
michael@0 1045 LinearizeAll(scoringcontext, score_cjk, hitbuffer);
michael@0 1046 ChunkAll(letter_offset, score_cjk, hitbuffer);
michael@0 1047 }
michael@0 1048
michael@0 1049
michael@0 1050
michael@0 1051 // The hitbuffer is in an awkward form -- three sets of base/delta/distinct
michael@0 1052 // scores, each with an indirect subscript to one of six scoring tables, some
michael@0 1053 // of which can yield two langprobs for six languages, others one langprob for
michael@0 1054 // three languages. The only correlation between base/delta/distinct is their
michael@0 1055 // offsets into the letters-only text buffer.
michael@0 1056 //
michael@0 1057 // SummaryBuffer needs to be built to linear, giving linear offset of start of
michael@0 1058 // each chunk
michael@0 1059 //
michael@0 1060 // So we first do all the langprob lookups and merge-sort by offset to make
michael@0 1061 // a single linear vector, building a side vector of chunk beginnings as we go.
michael@0 1062 // The sharpening is simply moving the beginnings, scoring is a simple linear
michael@0 1063 // sweep, etc.
michael@0 1064
michael@0 1065 void ProcessHitBuffer(const LangSpan& scriptspan,
michael@0 1066 int letter_offset,
michael@0 1067 ScoringContext* scoringcontext,
michael@0 1068 DocTote* doc_tote,
michael@0 1069 ResultChunkVector* vec,
michael@0 1070 bool more_to_come, bool score_cjk,
michael@0 1071 ScoringHitBuffer* hitbuffer) {
michael@0 1072 if (scoringcontext->flags_cld2_verbose) {
michael@0 1073 fprintf(scoringcontext->debug_file, "Hitbuffer[) ");
michael@0 1074 DumpHitBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer);
michael@0 1075 }
michael@0 1076
michael@0 1077 LinearizeHitBuffer(letter_offset, scoringcontext, more_to_come, score_cjk,
michael@0 1078 hitbuffer);
michael@0 1079
michael@0 1080 if (scoringcontext->flags_cld2_verbose) {
michael@0 1081 fprintf(scoringcontext->debug_file, "Linear[) ");
michael@0 1082 DumpLinearBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer);
michael@0 1083 }
michael@0 1084
michael@0 1085 SummaryBuffer summarybuffer;
michael@0 1086 summarybuffer.n = 0;
michael@0 1087 ChunkSpan last_cspan;
michael@0 1088 ScoreAllHits(scriptspan.text, scriptspan.ulscript,
michael@0 1089 more_to_come, score_cjk, hitbuffer,
michael@0 1090 scoringcontext,
michael@0 1091 &summarybuffer, &last_cspan);
michael@0 1092
michael@0 1093 if (scoringcontext->flags_cld2_verbose) {
michael@0 1094 DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer);
michael@0 1095 }
michael@0 1096
michael@0 1097 if (vec != NULL) {
michael@0 1098 // Sharpen boundaries of summarybuffer
michael@0 1099 // This is not a high-performance path
michael@0 1100 SharpenBoundaries(scriptspan.text, more_to_come, hitbuffer, scoringcontext,
michael@0 1101 &summarybuffer);
michael@0 1102 // Show after the sharpening
michael@0 1103 // CLD2_Debug2(scriptspan.text, more_to_come, score_cjk,
michael@0 1104 // hitbuffer, scoringcontext, &summarybuffer);
michael@0 1105
michael@0 1106 if (scoringcontext->flags_cld2_verbose) {
michael@0 1107 DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer);
michael@0 1108 }
michael@0 1109 }
michael@0 1110
michael@0 1111 SummaryBufferToDocTote(&summarybuffer, more_to_come, doc_tote);
michael@0 1112 SummaryBufferToVector(scoringcontext->scanner, scriptspan.text,
michael@0 1113 &summarybuffer, more_to_come, vec);
michael@0 1114 }
michael@0 1115
michael@0 1116 void SpliceHitBuffer(ScoringHitBuffer* hitbuffer, int next_offset) {
michael@0 1117 // Splice hitbuffer and summarybuffer for next round. With big chunks and
michael@0 1118 // distinctive-word state carried across chunks, we might not need to do this.
michael@0 1119 hitbuffer->next_base = 0;
michael@0 1120 hitbuffer->next_delta = 0;
michael@0 1121 hitbuffer->next_distinct = 0;
michael@0 1122 hitbuffer->next_linear = 0;
michael@0 1123 hitbuffer->next_chunk_start = 0;
michael@0 1124 hitbuffer->lowest_offset = next_offset;
michael@0 1125 }
michael@0 1126
michael@0 1127
michael@0 1128 // Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating
michael@0 1129 // scoringcontext
michael@0 1130 void ScoreEntireScriptSpan(const LangSpan& scriptspan,
michael@0 1131 ScoringContext* scoringcontext,
michael@0 1132 DocTote* doc_tote,
michael@0 1133 ResultChunkVector* vec) {
michael@0 1134 int bytes = scriptspan.text_bytes;
michael@0 1135 // Artificially set score to 1024 per 1KB, or 1 per byte
michael@0 1136 int score = bytes;
michael@0 1137 int reliability = 100;
michael@0 1138 // doc_tote uses full languages
michael@0 1139 Language one_one_lang = DefaultLanguage(scriptspan.ulscript);
michael@0 1140 doc_tote->Add(one_one_lang, bytes, score, reliability);
michael@0 1141
michael@0 1142 if (scoringcontext->flags_cld2_html) {
michael@0 1143 ChunkSummary chunksummary = {
michael@0 1144 1, 0,
michael@0 1145 one_one_lang, UNKNOWN_LANGUAGE, score, 1,
michael@0 1146 bytes, 0, scriptspan.ulscript, reliability, reliability
michael@0 1147 };
michael@0 1148 CLD2_Debug(scriptspan.text, 1, scriptspan.text_bytes,
michael@0 1149 false, false, NULL,
michael@0 1150 scoringcontext, NULL, &chunksummary);
michael@0 1151 }
michael@0 1152
michael@0 1153 // First byte is always a space
michael@0 1154 JustOneItemToVector(scoringcontext->scanner, scriptspan.text,
michael@0 1155 one_one_lang, 1, bytes - 1, vec);
michael@0 1156
michael@0 1157 scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
michael@0 1158 }
michael@0 1159
michael@0 1160 // Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext
michael@0 1161 void ScoreCJKScriptSpan(const LangSpan& scriptspan,
michael@0 1162 ScoringContext* scoringcontext,
michael@0 1163 DocTote* doc_tote,
michael@0 1164 ResultChunkVector* vec) {
michael@0 1165 // Allocate three parallel arrays of scoring hits
michael@0 1166 ScoringHitBuffer* hitbuffer = new ScoringHitBuffer;
michael@0 1167 hitbuffer->init();
michael@0 1168 hitbuffer->ulscript = scriptspan.ulscript;
michael@0 1169
michael@0 1170 scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
michael@0 1171 scoringcontext->oldest_distinct_boost = 0;
michael@0 1172
michael@0 1173 // Incoming scriptspan has a single leading space at scriptspan.text[0]
michael@0 1174 // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3]
michael@0 1175
michael@0 1176 int letter_offset = 1; // Skip initial space
michael@0 1177 hitbuffer->lowest_offset = letter_offset;
michael@0 1178 int letter_limit = scriptspan.text_bytes;
michael@0 1179 while (letter_offset < letter_limit) {
michael@0 1180 if (scoringcontext->flags_cld2_verbose) {
michael@0 1181 fprintf(scoringcontext->debug_file, " ScoreCJKScriptSpan[%d,%d)<br>\n",
michael@0 1182 letter_offset, letter_limit);
michael@0 1183 }
michael@0 1184 //
michael@0 1185 // Fill up one hitbuffer, possibly splicing onto previous fragment
michael@0 1186 //
michael@0 1187 // NOTE: GetUniHits deals with close repeats
michael@0 1188 // NOTE: After last chunk there is always a hitbuffer entry with an offset
michael@0 1189 // just off the end of the text = next_offset.
michael@0 1190 int next_offset = GetUniHits(scriptspan.text, letter_offset, letter_limit,
michael@0 1191 scoringcontext, hitbuffer);
michael@0 1192 // NOTE: GetBiHitVectors deals with close repeats,
michael@0 1193 // does one hash and two lookups (delta and distinct) per word
michael@0 1194 GetBiHits(scriptspan.text, letter_offset, next_offset,
michael@0 1195 scoringcontext, hitbuffer);
michael@0 1196
michael@0 1197 //
michael@0 1198 // Score one hitbuffer in chunks to summarybuffer
michael@0 1199 //
michael@0 1200 bool more_to_come = next_offset < letter_limit;
michael@0 1201 bool score_cjk = true;
michael@0 1202 ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec,
michael@0 1203 more_to_come, score_cjk, hitbuffer);
michael@0 1204 SpliceHitBuffer(hitbuffer, next_offset);
michael@0 1205
michael@0 1206 letter_offset = next_offset;
michael@0 1207 }
michael@0 1208
michael@0 1209 delete hitbuffer;
michael@0 1210 // Context across buffers is not connected yet
michael@0 1211 scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
michael@0 1212 }
michael@0 1213
michael@0 1214
michael@0 1215
michael@0 1216 // Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext
michael@0 1217 // We have a scriptspan with all lowercase text in one script. Look up
michael@0 1218 // quadgrams and octagrams, saving the hits in three parallel vectors.
michael@0 1219 // Score from those vectors in chunks, toting each chunk to get a single
michael@0 1220 // language, and combining into the overall document score. The hit vectors
michael@0 1221 // in general are not big enough to handle and entire scriptspan, so
michael@0 1222 // repeat until the entire scriptspan is scored.
michael@0 1223 // Caller deals with minimizing numbr of runt scriptspans
michael@0 1224 // This routine deals with minimizing number of runt chunks.
michael@0 1225 //
michael@0 1226 // Returns updated scoringcontext
michael@0 1227 // Returns updated doc_tote
michael@0 1228 // If vec != NULL, appends to that vector of ResultChunk's
michael@0 1229 void ScoreQuadScriptSpan(const LangSpan& scriptspan,
michael@0 1230 ScoringContext* scoringcontext,
michael@0 1231 DocTote* doc_tote,
michael@0 1232 ResultChunkVector* vec) {
michael@0 1233 // Allocate three parallel arrays of scoring hits
michael@0 1234 ScoringHitBuffer* hitbuffer = new ScoringHitBuffer;
michael@0 1235 hitbuffer->init();
michael@0 1236 hitbuffer->ulscript = scriptspan.ulscript;
michael@0 1237
michael@0 1238 scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
michael@0 1239 scoringcontext->oldest_distinct_boost = 0;
michael@0 1240
michael@0 1241 // Incoming scriptspan has a single leading space at scriptspan.text[0]
michael@0 1242 // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3]
michael@0 1243
michael@0 1244 int letter_offset = 1; // Skip initial space
michael@0 1245 hitbuffer->lowest_offset = letter_offset;
michael@0 1246 int letter_limit = scriptspan.text_bytes;
michael@0 1247 while (letter_offset < letter_limit) {
michael@0 1248 //
michael@0 1249 // Fill up one hitbuffer, possibly splicing onto previous fragment
michael@0 1250 //
michael@0 1251 // NOTE: GetQuadHits deals with close repeats
michael@0 1252 // NOTE: After last chunk there is always a hitbuffer entry with an offset
michael@0 1253 // just off the end of the text = next_offset.
michael@0 1254 int next_offset = GetQuadHits(scriptspan.text, letter_offset, letter_limit,
michael@0 1255 scoringcontext, hitbuffer);
michael@0 1256 // If true, there is more text to process in this scriptspan
michael@0 1257 // NOTE: GetOctaHitVectors deals with close repeats,
michael@0 1258 // does one hash and two lookups (delta and distinct) per word
michael@0 1259 GetOctaHits(scriptspan.text, letter_offset, next_offset,
michael@0 1260 scoringcontext, hitbuffer);
michael@0 1261
michael@0 1262 //
michael@0 1263 // Score one hitbuffer in chunks to summarybuffer
michael@0 1264 //
michael@0 1265 bool more_to_come = next_offset < letter_limit;
michael@0 1266 bool score_cjk = false;
michael@0 1267 ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec,
michael@0 1268 more_to_come, score_cjk, hitbuffer);
michael@0 1269 SpliceHitBuffer(hitbuffer, next_offset);
michael@0 1270
michael@0 1271 letter_offset = next_offset;
michael@0 1272 }
michael@0 1273
michael@0 1274 delete hitbuffer;
michael@0 1275 }
michael@0 1276
michael@0 1277
michael@0 1278 // Score one scriptspan into doc_tote and vec, updating scoringcontext
michael@0 1279 // Inputs:
michael@0 1280 // One scriptspan of perhaps 40-60KB, all same script lower-case letters
michael@0 1281 // and single ASCII spaces. First character is a space to allow simple
michael@0 1282 // begining-of-word detect. End of buffer has three spaces and NUL to
michael@0 1283 // allow easy scan-to-end-of-word.
michael@0 1284 // Scoring context of
michael@0 1285 // scoring tables
michael@0 1286 // flags
michael@0 1287 // running boosts
michael@0 1288 // Outputs:
michael@0 1289 // Updated doc_tote giving overall languages and byte counts
michael@0 1290 // Optional updated chunk vector giving offset, length, language
michael@0 1291 //
michael@0 1292 // Caller initializes flags, boosts, doc_tote and vec.
michael@0 1293 // Caller aggregates across multiple scriptspans
michael@0 1294 // Caller calculates final document result
michael@0 1295 // Caller deals with detecting and triggering suppression of repeated text.
michael@0 1296 //
michael@0 1297 // This top-level routine just chooses the recognition type and calls one of
michael@0 1298 // the next-level-down routines.
michael@0 1299 //
michael@0 1300 void ScoreOneScriptSpan(const LangSpan& scriptspan,
michael@0 1301 ScoringContext* scoringcontext,
michael@0 1302 DocTote* doc_tote,
michael@0 1303 ResultChunkVector* vec) {
michael@0 1304 if (scoringcontext->flags_cld2_verbose) {
michael@0 1305 fprintf(scoringcontext->debug_file, "<br>ScoreOneScriptSpan(%s,%d) ",
michael@0 1306 ULScriptCode(scriptspan.ulscript), scriptspan.text_bytes);
michael@0 1307 // Optionally print the chunk lowercase letters/marks text
michael@0 1308 string temp(&scriptspan.text[0], scriptspan.text_bytes);
michael@0 1309 fprintf(scoringcontext->debug_file, "'%s'",
michael@0 1310 GetHtmlEscapedText(temp).c_str());
michael@0 1311 fprintf(scoringcontext->debug_file, "<br>\n");
michael@0 1312 }
michael@0 1313 scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
michael@0 1314 scoringcontext->oldest_distinct_boost = 0;
michael@0 1315 ULScriptRType rtype = ULScriptRecognitionType(scriptspan.ulscript);
michael@0 1316 if (scoringcontext->flags_cld2_score_as_quads && (rtype != RTypeCJK)) {
michael@0 1317 rtype = RTypeMany;
michael@0 1318 }
michael@0 1319 switch (rtype) {
michael@0 1320 case RTypeNone:
michael@0 1321 case RTypeOne:
michael@0 1322 ScoreEntireScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
michael@0 1323 break;
michael@0 1324 case RTypeCJK:
michael@0 1325 ScoreCJKScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
michael@0 1326 break;
michael@0 1327 case RTypeMany:
michael@0 1328 ScoreQuadScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
michael@0 1329 break;
michael@0 1330 }
michael@0 1331 }
michael@0 1332
michael@0 1333 } // End namespace CLD2
michael@0 1334

mercurial