--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/browser/components/translation/cld2/internal/scoreonescriptspan.cc	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,1334 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//
// Author: dsites@google.com (Dick Sites)
// Updated 2014.01 for dual table lookup
//

#include "scoreonescriptspan.h"

#include "cldutil.h"
#include "debug.h"
#include "lang_script.h"

#include <stdio.h>

using namespace std;

namespace CLD2 {

static const int kUnreliablePercentThreshold = 75;

void AddLangProb(uint32 langprob, Tote* chunk_tote) {
  ProcessProbV2Tote(langprob, chunk_tote);
}

void ZeroPSLang(uint32 langprob, Tote* chunk_tote) {
  uint8 top1 = (langprob >> 8) & 0xff;
  chunk_tote->SetScore(top1, 0);
}

bool SameCloseSet(uint16 lang1, uint16 lang2) {
  int lang1_close_set = LanguageCloseSet(static_cast<Language>(lang1));
  if (lang1_close_set == 0) {return false;}
  int lang2_close_set = LanguageCloseSet(static_cast<Language>(lang2));
  return (lang1_close_set == lang2_close_set);
}

bool SameCloseSet(Language lang1, Language lang2) {
  int lang1_close_set = LanguageCloseSet(lang1);
  if (lang1_close_set == 0) {return false;}
  int lang2_close_set = LanguageCloseSet(lang2);
  return (lang1_close_set == lang2_close_set);
}


// Needs expected score per 1KB in scoring context
void SetChunkSummary(ULScript ulscript, int first_linear_in_chunk,
                     int offset, int len,
                     const ScoringContext* scoringcontext,
                     const Tote* chunk_tote,
                     ChunkSummary* chunksummary) {
  int key3[3];
  chunk_tote->CurrentTopThreeKeys(key3);
  Language lang1 = FromPerScriptNumber(ulscript, key3[0]);
  Language lang2 = FromPerScriptNumber(ulscript, key3[1]);

  int actual_score_per_kb = 0;
  if (len > 0) {
    actual_score_per_kb = (chunk_tote->GetScore(key3[0]) << 10) / len;
  }
  int expected_subscr = lang1 * 4 + LScript4(ulscript);
  int expected_score_per_kb =
      scoringcontext->scoringtables->kExpectedScore[expected_subscr];

  chunksummary->offset = offset;
  chunksummary->chunk_start = first_linear_in_chunk;
  chunksummary->lang1 = lang1;
  chunksummary->lang2 = lang2;
  chunksummary->score1 = chunk_tote->GetScore(key3[0]);
  chunksummary->score2 = chunk_tote->GetScore(key3[1]);
  chunksummary->bytes = len;
  chunksummary->grams = chunk_tote->GetScoreCount();
  chunksummary->ulscript = ulscript;
  chunksummary->reliability_delta = ReliabilityDelta(chunksummary->score1,
                                                     chunksummary->score2,
                                                     chunksummary->grams);
  // If lang1/lang2 in same close set, set delta reliability to 100%
  if (SameCloseSet(lang1, lang2)) {
    chunksummary->reliability_delta = 100;
  }
  chunksummary->reliability_score =
      ReliabilityExpected(actual_score_per_kb, expected_score_per_kb);
}

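// Worked example (illustrative only, not part of the upstream sources):
// for a 512-byte chunk whose top language scored 600,
//   actual_score_per_kb = (600 << 10) / 512 = 1200,
// which ReliabilityExpected() (defined elsewhere in CLD2) compares against
// the per-language, per-script expected score from kExpectedScore[],
// presumably giving higher reliability the closer the two values are.
// reliability_delta instead measures how well separated lang1 and lang2
// are; it is forced to 100 above when both belong to the same close set,
// since confusing two closely related languages is not treated as
// unreliable.
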
// Return true if just lang1 is there: lang2=0 and lang3=0
bool IsSingleLang(uint32 langprob) {
  // Probably a bug -- which end is lang1? But only used to call empty Boost1
  return ((langprob & 0x00ffff00) == 0);
}

// Update scoring context distinct_boost for single language quad
void AddDistinctBoost1(uint32 langprob, ScoringContext* scoringcontext) {
  // Probably keep this empty -- not a good enough signal
}

// Update scoring context distinct_boost for distinct octagram
// Keep last 4 used. Since these are mostly (except at splices) in
// hitbuffer, we might be able to just use a subscript and splice
void AddDistinctBoost2(uint32 langprob, ScoringContext* scoringcontext) {
  // this is called 0..n times per chunk with decoded hitbuffer->distinct...
  LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
  if (scoringcontext->ulscript != ULScript_Latin) {
    distinct_boost = &scoringcontext->distinct_boost.othr;
  }
  int n = distinct_boost->n;
  distinct_boost->langprob[n] = langprob;
  distinct_boost->n = distinct_boost->wrap(n + 1);
}

// For each chunk, add extra weight for language priors (from content-lang and
// meta lang=xx) and distinctive tokens
void ScoreBoosts(const ScoringContext* scoringcontext, Tote* chunk_tote) {
  // Get boosts for current script
  const LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
  const LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
  const LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
  if (scoringcontext->ulscript != ULScript_Latin) {
    langprior_boost = &scoringcontext->langprior_boost.othr;
    langprior_whack = &scoringcontext->langprior_whack.othr;
    distinct_boost = &scoringcontext->distinct_boost.othr;
  }

  for (int k = 0; k < kMaxBoosts; ++k) {
    uint32 langprob = langprior_boost->langprob[k];
    if (langprob > 0) {AddLangProb(langprob, chunk_tote);}
  }
  for (int k = 0; k < kMaxBoosts; ++k) {
    uint32 langprob = distinct_boost->langprob[k];
    if (langprob > 0) {AddLangProb(langprob, chunk_tote);}
  }
  // boost has a packed set of per-script langs and probabilities
  // whack has a packed set of per-script lang to be suppressed (zeroed)
  // When a language in a close set is given as an explicit hint, others in
  // that set will be whacked here.
  for (int k = 0; k < kMaxBoosts; ++k) {
    uint32 langprob = langprior_whack->langprob[k];
    if (langprob > 0) {ZeroPSLang(langprob, chunk_tote);}
  }
}

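// Illustrative note (not part of the upstream sources): LangBoosts acts as
// a small ring buffer -- AddDistinctBoost2() above stores each distinct
// octagram's langprob at index n and advances n with wrap(), so only the
// kMaxBoosts most recent distinct-word hits (apparently 4, per the
// "Keep last 4 used" comment) keep boosting the following chunks. Hints
// such as a content-language header or meta lang=xx end up in
// langprior_boost, while close-set siblings of a hinted language land in
// langprior_whack and are zeroed out by ScoreBoosts.
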
// At this point, the chunk is described by
//  hitbuffer->base[cspan->chunk_base .. cspan->chunk_base + cspan->base_len)
//  hitbuffer->delta[cspan->chunk_delta ... )
//  hitbuffer->distinct[cspan->chunk_distinct ... )
// Scored text is in text[lo..hi) where
//  lo is 0 or the min of first base/delta/distinct hitbuffer offset and
//  hi is the min of next base/delta/distinct hitbuffer offset after
//  base_len, etc.
void GetTextSpanOffsets(const ScoringHitBuffer* hitbuffer,
                        const ChunkSpan* cspan, int* lo, int* hi) {
  // Front of this span
  int lo_base = hitbuffer->base[cspan->chunk_base].offset;
  int lo_delta = hitbuffer->delta[cspan->chunk_delta].offset;
  int lo_distinct = hitbuffer->distinct[cspan->chunk_distinct].offset;
  // Front of next span
  int hi_base = hitbuffer->base[cspan->chunk_base +
    cspan->base_len].offset;
  int hi_delta = hitbuffer->delta[cspan->chunk_delta +
    cspan->delta_len].offset;
  int hi_distinct = hitbuffer->distinct[cspan->chunk_distinct +
    cspan->distinct_len].offset;

  *lo = 0;
  // if (cspan->chunk_base > 0) {
  //   *lo = minint(minint(lo_base, lo_delta), lo_distinct);
  // }
  *lo = minint(minint(lo_base, lo_delta), lo_distinct);
  *hi = minint(minint(hi_base, hi_delta), hi_distinct);
}


int DiffScore(const CLD2TableSummary* obj, int indirect,
              uint16 lang1, uint16 lang2) {
  if (indirect < static_cast<int>(obj->kCLDTableSizeOne)) {
    // Up to three languages at indirect
    uint32 langprob = obj->kCLDTableInd[indirect];
    return GetLangScore(langprob, lang1) - GetLangScore(langprob, lang2);
  } else {
    // Up to six languages at start + 2 * (indirect - start)
    indirect += (indirect - obj->kCLDTableSizeOne);
    uint32 langprob = obj->kCLDTableInd[indirect];
    uint32 langprob2 = obj->kCLDTableInd[indirect + 1];
    return (GetLangScore(langprob, lang1) + GetLangScore(langprob2, lang1)) -
           (GetLangScore(langprob, lang2) + GetLangScore(langprob2, lang2));
  }

}

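// Worked example of the indirect layout (illustrative, based on the code
// above): let S = kCLDTableSizeOne. Indirect values below S point at a
// single packed langprob (up to three languages). An indirect value
// S + k points at a two-entry group stored at S + 2*k, which is exactly
// what "indirect += (indirect - S)" computes:
//   indirect = S + k  ==>  (S + k) + ((S + k) - S) = S + 2*k.
// The two consecutive entries together cover up to six languages.
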
// Score all the bases, deltas, distincts, boosts for one chunk into chunk_tote
// After last chunk there is always a hitbuffer entry with an offset just off
// the end of the text.
// Sets delta_len, and distinct_len
void ScoreOneChunk(const char* text, ULScript ulscript,
                   const ScoringHitBuffer* hitbuffer,
                   int chunk_i,
                   ScoringContext* scoringcontext,
                   ChunkSpan* cspan, Tote* chunk_tote,
                   ChunkSummary* chunksummary) {
  int first_linear_in_chunk = hitbuffer->chunk_start[chunk_i];
  int first_linear_in_next_chunk = hitbuffer->chunk_start[chunk_i + 1];

  chunk_tote->Reinit();
  cspan->delta_len = 0;
  cspan->distinct_len = 0;
  if (scoringcontext->flags_cld2_verbose) {
    fprintf(scoringcontext->debug_file, "<br>ScoreOneChunk[%d..%d) ",
            first_linear_in_chunk, first_linear_in_next_chunk);
  }

  // 2013.02.05 linear design: just use base and base_len for the span
  cspan->chunk_base = first_linear_in_chunk;
  cspan->base_len = first_linear_in_next_chunk - first_linear_in_chunk;
  for (int i = first_linear_in_chunk; i < first_linear_in_next_chunk; ++i) {
    uint32 langprob = hitbuffer->linear[i].langprob;
    AddLangProb(langprob, chunk_tote);
    if (hitbuffer->linear[i].type <= QUADHIT) {
      chunk_tote->AddScoreCount();      // Just count quads, not octas
    }
    if (hitbuffer->linear[i].type == DISTINCTHIT) {
      AddDistinctBoost2(langprob, scoringcontext);
    }
  }

  // Score language prior boosts
  // Score distinct word boost
  ScoreBoosts(scoringcontext, chunk_tote);

  int lo = hitbuffer->linear[first_linear_in_chunk].offset;
  int hi = hitbuffer->linear[first_linear_in_next_chunk].offset;

  // Chunk_tote: get top langs, scores, etc. and fill in chunk summary
  SetChunkSummary(ulscript, first_linear_in_chunk, lo, hi - lo,
                  scoringcontext, chunk_tote, chunksummary);

  bool more_to_come = false;
  bool score_cjk = false;
  if (scoringcontext->flags_cld2_html) {
    // Show one chunk in readable output
    CLD2_Debug(text, lo, hi, more_to_come, score_cjk, hitbuffer,
               scoringcontext, cspan, chunksummary);
  }

  scoringcontext->prior_chunk_lang = static_cast<Language>(chunksummary->lang1);
}

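// Note on hit types (descriptive; the enum values themselves are defined
// elsewhere in CLD2): judging from the "UQLD" legend used by
// DumpLinearBuffer() below, linear entries are UNIHIT (CJK unigrams),
// QUADHIT (quadgrams), DELTAHIT (bigrams/octagrams) and DISTINCTHIT
// (distinctive words). The loop above adds every entry's langprob to the
// tote, but only UNIHIT/QUADHIT entries (type <= QUADHIT) bump the gram
// count, and only DISTINCTHIT entries feed the distinct-word boost.
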
// Score chunks of text described by hitbuffer, allowing each to be in a
// different language, and optionally adjusting the boundaries in between.
// Set last_cspan to the last chunkspan used
void ScoreAllHits(const char* text, ULScript ulscript,
                  bool more_to_come, bool score_cjk,
                  const ScoringHitBuffer* hitbuffer,
                  ScoringContext* scoringcontext,
                  SummaryBuffer* summarybuffer, ChunkSpan* last_cspan) {
  ChunkSpan prior_cspan = {0, 0, 0, 0, 0, 0};
  ChunkSpan cspan = {0, 0, 0, 0, 0, 0};

  for (int i = 0; i < hitbuffer->next_chunk_start; ++i) {
    // Score one chunk
    // Sets delta_len, and distinct_len
    Tote chunk_tote;
    ChunkSummary chunksummary;
    ScoreOneChunk(text, ulscript,
                  hitbuffer, i,
                  scoringcontext, &cspan, &chunk_tote, &chunksummary);

    // Put result in summarybuffer
    if (summarybuffer->n < kMaxSummaries) {
      summarybuffer->chunksummary[summarybuffer->n] = chunksummary;
      summarybuffer->n += 1;
    }

    prior_cspan = cspan;
    cspan.chunk_base += cspan.base_len;
    cspan.chunk_delta += cspan.delta_len;
    cspan.chunk_distinct += cspan.distinct_len;
  }

  // Add one dummy off the end to hold first unused linear_in_chunk
  int linear_off_end = hitbuffer->next_linear;
  int offset_off_end = hitbuffer->linear[linear_off_end].offset;
  ChunkSummary* cs = &summarybuffer->chunksummary[summarybuffer->n];
  memset(cs, 0, sizeof(ChunkSummary));
  cs->offset = offset_off_end;
  cs->chunk_start = linear_off_end;
  *last_cspan = prior_cspan;
}


void SummaryBufferToDocTote(const SummaryBuffer* summarybuffer,
                            bool more_to_come, DocTote* doc_tote) {
  int cs_bytes_sum = 0;
  for (int i = 0; i < summarybuffer->n; ++i) {
    const ChunkSummary* cs = &summarybuffer->chunksummary[i];
    int reliability = minint(cs->reliability_delta, cs->reliability_score);
    // doc_tote uses full languages
    doc_tote->Add(cs->lang1, cs->bytes, cs->score1, reliability);
    cs_bytes_sum += cs->bytes;
  }
}

// Turn on for debugging vectors
static const bool kShowLettersOriginal = false;

// If next chunk language matches last vector language, extend last element
// Otherwise add new element to vector
void ItemToVector(ScriptScanner* scanner,
                  ResultChunkVector* vec, Language new_lang,
                  int mapped_offset, int mapped_len) {
  uint16 last_vec_lang = static_cast<uint16>(UNKNOWN_LANGUAGE);
  int last_vec_subscr = vec->size() - 1;
  if (last_vec_subscr >= 0) {
    ResultChunk* priorrc = &(*vec)[last_vec_subscr];
    last_vec_lang = priorrc->lang1;
    if (new_lang == last_vec_lang) {
      // Extend prior. Current mapped_offset may be beyond prior end, so do
      // the arithmetic to include any such gap
      priorrc->bytes = minint((mapped_offset + mapped_len) - priorrc->offset,
                              kMaxResultChunkBytes);
      if (kShowLettersOriginal) {
        // Optionally print the new chunk original text
        string temp2(&scanner->GetBufferStart()[priorrc->offset],
                     priorrc->bytes);
        fprintf(stderr, "Item[%d..%d) '%s'<br>\n",
                priorrc->offset, priorrc->offset + priorrc->bytes,
                GetHtmlEscapedText(temp2).c_str());
      }
      return;
    }
  }
  // Add new vector element
  ResultChunk rc;
  rc.offset = mapped_offset;
  rc.bytes = minint(mapped_len, kMaxResultChunkBytes);
  rc.lang1 = static_cast<uint16>(new_lang);
  vec->push_back(rc);
  if (kShowLettersOriginal) {
    // Optionally print the new chunk original text
    string temp2(&scanner->GetBufferStart()[rc.offset], rc.bytes);
    fprintf(stderr, "Item[%d..%d) '%s'<br>\n",
            rc.offset, rc.offset + rc.bytes,
            GetHtmlEscapedText(temp2).c_str());
  }
}

uint16 PriorVecLang(const ResultChunkVector* vec) {
  if (vec->empty()) {return static_cast<uint16>(UNKNOWN_LANGUAGE);}
  return (*vec)[vec->size() - 1].lang1;
}

uint16 NextChunkLang(const SummaryBuffer* summarybuffer, int i) {
  if ((i + 1) >= summarybuffer->n) {
    return static_cast<uint16>(UNKNOWN_LANGUAGE);
  }
  return summarybuffer->chunksummary[i + 1].lang1;
}

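// Worked example for ItemToVector (illustrative only): if the last vector
// element is {offset = 100, bytes = 40, lang1 = X} and the next chunk with
// the same language X maps back to mapped_offset = 150, mapped_len = 30,
// the prior element is simply extended to
//   bytes = (150 + 30) - 100 = 80,
// absorbing the 10-byte gap between the two chunks, so callers see one
// contiguous span per language run (capped at kMaxResultChunkBytes).
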
// Add n elements of summarybuffer to resultchunk vector:
// Each element is letters-only text [offset..offset+bytes)
// This maps back to original[Back(offset)..Back(offset+bytes))
//
// We go out of our way to minimize the variation in the ResultChunkVector,
// so that the caller has fewer but more meaningful spans in different
// languages, for the likely purpose of translation or spell-check.
//
// The language of each chunk is lang1, but it might be unreliable for
// either of two reasons: its score is relatively too close to the score of
// lang2, or its score is too far away from the expected score of real text in
// the given language. Unreliable languages are mapped to Unknown.
//
void SummaryBufferToVector(ScriptScanner* scanner, const char* text,
                           const SummaryBuffer* summarybuffer,
                           bool more_to_come, ResultChunkVector* vec) {
  if (vec == NULL) {return;}

  if (kShowLettersOriginal) {
    fprintf(stderr, "map2original_ ");
    scanner->map2original_.DumpWindow();
    fprintf(stderr, "<br>\n");
    fprintf(stderr, "map2uplow_ ");
    scanner->map2uplow_.DumpWindow();
    fprintf(stderr, "<br>\n");
  }

  for (int i = 0; i < summarybuffer->n; ++i) {
    const ChunkSummary* cs = &summarybuffer->chunksummary[i];
    int unmapped_offset = cs->offset;
    int unmapped_len = cs->bytes;

    if (kShowLettersOriginal) {
      // Optionally print the chunk lowercase letters/marks text
      string temp(&text[unmapped_offset], unmapped_len);
      fprintf(stderr, "Letters [%d..%d) '%s'<br>\n",
              unmapped_offset, unmapped_offset + unmapped_len,
              GetHtmlEscapedText(temp).c_str());
    }

    int mapped_offset = scanner->MapBack(unmapped_offset);

    // Trim back a little to prefer splicing original at word boundaries
    if (mapped_offset > 0) {
      // Size of prior vector entry, if any
      int prior_size = 0;
      if (!vec->empty()) {
        ResultChunk* rc = &(*vec)[vec->size() - 1];
        prior_size = rc->bytes;
      }
      // Maximum back up size to leave at least 3 bytes in prior,
      // and not entire buffer, and no more than 12 bytes total backup
      int n_limit = minint(prior_size - 3, mapped_offset);
      n_limit = minint(n_limit, 12);

      // Backscan over letters, stopping if prior byte is < 0x41
      // There is some possibility that we will backscan over a different script
      const char* s = &scanner->GetBufferStart()[mapped_offset];
      const unsigned char* us = reinterpret_cast<const unsigned char*>(s);
      int n = 0;
      while ((n < n_limit) && (us[-n - 1] >= 0x41)) {++n;}
      if (n >= n_limit) {n = 0;}    // New boundary not found within range

      // Also back up exactly one leading punctuation character if '"#@
      if (n < n_limit) {
        unsigned char c = us[-n - 1];
        if ((c == '\'') || (c == '"') || (c == '#') || (c == '@')) {++n;}
      }
      // Shrink the previous chunk slightly
      if (n > 0) {
        ResultChunk* rc = &(*vec)[vec->size() - 1];
        rc->bytes -= n;
        mapped_offset -= n;
        if (kShowLettersOriginal) {
          fprintf(stderr, "Back up %d bytes<br>\n", n);
          // Optionally print the prior chunk original text
          string temp2(&scanner->GetBufferStart()[rc->offset], rc->bytes);
          fprintf(stderr, "Prior [%d..%d) '%s'<br>\n",
                  rc->offset, rc->offset + rc->bytes,
                  GetHtmlEscapedText(temp2).c_str());
        }
      }
    }
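    // Note on the backscan above (illustrative, not in the upstream
    // sources): 0x41 is ASCII 'A'. ASCII letters and all UTF-8
    // lead/continuation bytes (>= 0x80) compare as >= 0x41, while space,
    // digits and most punctuation compare lower, so the scan walks back
    // over the tail of the previous word and stops at the preceding space
    // or punctuation. The n_limit bounds keep at least 3 bytes in the
    // prior chunk and never move the boundary by more than 12 bytes.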

    int mapped_len =
        scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset;

    if (kShowLettersOriginal) {
      // Optionally print the chunk original text
      string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len);
      fprintf(stderr, "Original[%d..%d) '%s'<br>\n",
              mapped_offset, mapped_offset + mapped_len,
              GetHtmlEscapedText(temp2).c_str());
    }

    Language new_lang = static_cast<Language>(cs->lang1);
    bool reliability_delta_bad =
        (cs->reliability_delta < kUnreliablePercentThreshold);
    bool reliability_score_bad =
        (cs->reliability_score < kUnreliablePercentThreshold);

    // If the top language matches last vector, ignore reliability_delta
    uint16 prior_lang = PriorVecLang(vec);
    if (prior_lang == cs->lang1) {
      reliability_delta_bad = false;
    }
    // If the top language is in same close set as last vector, set up to merge
    if (SameCloseSet(cs->lang1, prior_lang)) {
      new_lang = static_cast<Language>(prior_lang);
      reliability_delta_bad = false;
    }
    // If the top two languages are in the same close set and the last vector
    // language is the second language, set up to merge
    if (SameCloseSet(cs->lang1, cs->lang2) &&
        (prior_lang == cs->lang2)) {
      new_lang = static_cast<Language>(prior_lang);
      reliability_delta_bad = false;
    }
    // If unreliable and the last and next vector languages are both
    // the second language, set up to merge
    uint16 next_lang = NextChunkLang(summarybuffer, i);
    if (reliability_delta_bad &&
        (prior_lang == cs->lang2) && (next_lang == cs->lang2)) {
      new_lang = static_cast<Language>(prior_lang);
      reliability_delta_bad = false;
    }

    if (reliability_delta_bad || reliability_score_bad) {
      new_lang = UNKNOWN_LANGUAGE;
    }
    ItemToVector(scanner, vec, new_lang, mapped_offset, mapped_len);
  }
}

// Add just one element to resultchunk vector:
// For RTypeNone or RTypeOne
void JustOneItemToVector(ScriptScanner* scanner, const char* text,
                         Language lang1, int unmapped_offset, int unmapped_len,
                         ResultChunkVector* vec) {
  if (vec == NULL) {return;}

  if (kShowLettersOriginal) {
    fprintf(stderr, "map2original_ ");
    scanner->map2original_.DumpWindow();
    fprintf(stderr, "<br>\n");
    fprintf(stderr, "map2uplow_ ");
    scanner->map2uplow_.DumpWindow();
    fprintf(stderr, "<br>\n");
  }

  if (kShowLettersOriginal) {
    // Optionally print the chunk lowercase letters/marks text
    string temp(&text[unmapped_offset], unmapped_len);
    fprintf(stderr, "Letters1 [%d..%d) '%s'<br>\n",
            unmapped_offset, unmapped_offset + unmapped_len,
            GetHtmlEscapedText(temp).c_str());
  }

  int mapped_offset = scanner->MapBack(unmapped_offset);
  int mapped_len =
      scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset;

  if (kShowLettersOriginal) {
    // Optionally print the chunk original text
    string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len);
    fprintf(stderr, "Original1[%d..%d) '%s'<br>\n",
            mapped_offset, mapped_offset + mapped_len,
            GetHtmlEscapedText(temp2).c_str());
  }

  ItemToVector(scanner, vec, lang1, mapped_offset, mapped_len);
}


// Debugging. Not thread safe. Defined in getonescriptspan
char* DisplayPiece(const char* next_byte_, int byte_length_);

// If high bit is on, take out high bit and add 2B to make table2 entries easy
inline int PrintableIndirect(int x) {
  if ((x & 0x80000000u) != 0) {
    return (x & ~0x80000000u) + 2000000000;
  }
  return x;
}

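// Illustrative example (not in the upstream sources): an indirect value of
// 0x80000005 -- i.e. entry 5 of the second (dual) lookup table -- prints as
//   (0x80000005 & ~0x80000000) + 2000000000 = 2000000005,
// so table2 entries are immediately recognizable in the debug dumps below
// by their 2,0xx,xxx,xxx values.
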
void DumpHitBuffer(FILE* df, const char* text,
                   const ScoringHitBuffer* hitbuffer) {
  fprintf(df,
          "<br>DumpHitBuffer[%s, next_base/delta/distinct %d, %d, %d)<br>\n",
          ULScriptCode(hitbuffer->ulscript),
          hitbuffer->next_base, hitbuffer->next_delta,
          hitbuffer->next_distinct);
  for (int i = 0; i < hitbuffer->maxscoringhits; ++i) {
    if (i < hitbuffer->next_base) {
      fprintf(df, "Q[%d]%d,%d,%s ",
              i, hitbuffer->base[i].offset,
              PrintableIndirect(hitbuffer->base[i].indirect),
              DisplayPiece(&text[hitbuffer->base[i].offset], 6));
    }
    if (i < hitbuffer->next_delta) {
      fprintf(df, "DL[%d]%d,%d,%s ",
              i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,
              DisplayPiece(&text[hitbuffer->delta[i].offset], 12));
    }
    if (i < hitbuffer->next_distinct) {
      fprintf(df, "D[%d]%d,%d,%s ",
              i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect,
              DisplayPiece(&text[hitbuffer->distinct[i].offset], 12));
    }
    if (i < hitbuffer->next_base) {
      fprintf(df, "<br>\n");
    }
    if (i > 50) {break;}
  }
  if (hitbuffer->next_base > 50) {
    int i = hitbuffer->next_base;
    fprintf(df, "Q[%d]%d,%d,%s ",
            i, hitbuffer->base[i].offset,
            PrintableIndirect(hitbuffer->base[i].indirect),
            DisplayPiece(&text[hitbuffer->base[i].offset], 6));
  }
  if (hitbuffer->next_delta > 50) {
    int i = hitbuffer->next_delta;
    fprintf(df, "DL[%d]%d,%d,%s ",
            i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,
            DisplayPiece(&text[hitbuffer->delta[i].offset], 12));
  }
  if (hitbuffer->next_distinct > 50) {
    int i = hitbuffer->next_distinct;
    fprintf(df, "D[%d]%d,%d,%s ",
            i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect,
            DisplayPiece(&text[hitbuffer->distinct[i].offset], 12));
  }
  fprintf(df, "<br>\n");
}


void DumpLinearBuffer(FILE* df, const char* text,
                      const ScoringHitBuffer* hitbuffer) {
  fprintf(df, "<br>DumpLinearBuffer[%d)<br>\n",
          hitbuffer->next_linear);
  // Include the dummy entry off the end
  for (int i = 0; i < hitbuffer->next_linear + 1; ++i) {
    if ((50 < i) && (i < (hitbuffer->next_linear - 1))) {continue;}
    fprintf(df, "[%d]%d,%c=%08x,%s<br>\n",
            i, hitbuffer->linear[i].offset,
            "UQLD"[hitbuffer->linear[i].type],
            hitbuffer->linear[i].langprob,
            DisplayPiece(&text[hitbuffer->linear[i].offset], 6));
  }
  fprintf(df, "<br>\n");

  fprintf(df, "DumpChunkStart[%d]<br>\n", hitbuffer->next_chunk_start);
  for (int i = 0; i < hitbuffer->next_chunk_start + 1; ++i) {
    fprintf(df, "[%d]%d\n", i, hitbuffer->chunk_start[i]);
  }
  fprintf(df, "<br>\n");
}

// Move this verbose debugging output to debug.cc eventually
void DumpChunkSummary(FILE* df, const ChunkSummary* cs) {
  // Print chunksummary
  fprintf(df, "%d lin[%d] %s.%d %s.%d %dB %d# %s %dRd %dRs<br>\n",
          cs->offset,
          cs->chunk_start,
          LanguageCode(static_cast<Language>(cs->lang1)),
          cs->score1,
          LanguageCode(static_cast<Language>(cs->lang2)),
          cs->score2,
          cs->bytes,
          cs->grams,
          ULScriptCode(static_cast<ULScript>(cs->ulscript)),
          cs->reliability_delta,
          cs->reliability_score);
}

void DumpSummaryBuffer(FILE* df, const SummaryBuffer* summarybuffer) {
  fprintf(df, "<br>DumpSummaryBuffer[%d]<br>\n", summarybuffer->n);
  fprintf(df, "[i] offset linear[chunk_start] lang.score1 lang.score2 "
          "bytesB ngrams# script rel_delta rel_score<br>\n");
  for (int i = 0; i <= summarybuffer->n; ++i) {
    fprintf(df, "[%d] ", i);
    DumpChunkSummary(df, &summarybuffer->chunksummary[i]);
  }
  fprintf(df, "<br>\n");
}


// Within hitbuffer->linear[]
//  <-- prior chunk --><-- this chunk -->
//  |                  |                 |
//  linear0            linear1           linear2
//  lang0              lang1
// The goal of sharpening is to move this_linear to better separate langs
int BetterBoundary(const char* text,
                   ScoringHitBuffer* hitbuffer,
                   ScoringContext* scoringcontext,
                   uint16 pslang0, uint16 pslang1,
                   int linear0, int linear1, int linear2) {
  // Degenerate case, no change
  if ((linear2 - linear0) <= 8) {return linear1;}

  // Each diff gives pslang0 score - pslang1 score
  // Running diff has four entries + + + + followed by four entries - - - -
  // so that this value is maximal at the sharpest boundary between pslang0
  // (positive diffs) and pslang1 (negative diffs)
  int running_diff = 0;
  int diff[8];    // Ring buffer of pslang0-pslang1 differences
  // Initialize with first 8 diffs
  for (int i = linear0; i < linear0 + 8; ++i) {
    int j = i & 7;
    uint32 langprob = hitbuffer->linear[i].langprob;
    diff[j] = GetLangScore(langprob, pslang0) -
              GetLangScore(langprob, pslang1);
    if (i < linear0 + 4) {
      // First four diffs pslang0 - pslang1
      running_diff += diff[j];
    } else {
      // Second four diffs -(pslang0 - pslang1)
      running_diff -= diff[j];
    }
  }

  // Now scan for sharpest boundary. j is at left end of 8 entries
  // To be a boundary, there must be both >0 and <0 entries in the window
  int better_boundary_value = 0;
  int better_boundary = linear1;
  for (int i = linear0; i < linear2 - 8; ++i) {
    int j = i & 7;
    if (better_boundary_value < running_diff) {
      bool has_plus = false;
      bool has_minus = false;
      for (int kk = 0; kk < 8; ++kk) {
        if (diff[kk] > 0) {has_plus = true;}
        if (diff[kk] < 0) {has_minus = true;}
      }
      if (has_plus && has_minus) {
        better_boundary_value = running_diff;
        better_boundary = i + 4;
      }
    }
    // Shift right one entry
    uint32 langprob = hitbuffer->linear[i + 8].langprob;
    int newdiff = GetLangScore(langprob, pslang0) -
                  GetLangScore(langprob, pslang1);
    int middiff = diff[(i + 4) & 7];
    int olddiff = diff[j];
    diff[j] = newdiff;
    running_diff -= olddiff;        // Remove left
    running_diff += 2 * middiff;    // Convert middle from - to +
    running_diff -= newdiff;        // Insert right
  }

  if (scoringcontext->flags_cld2_verbose && (linear1 != better_boundary)) {
    Language lang0 = FromPerScriptNumber(scoringcontext->ulscript, pslang0);
    Language lang1 = FromPerScriptNumber(scoringcontext->ulscript, pslang1);
    fprintf(scoringcontext->debug_file, " Better lin[%d=>%d] %s^^%s <br>\n",
            linear1, better_boundary,
            LanguageCode(lang0), LanguageCode(lang1));
    int lin0_off = hitbuffer->linear[linear0].offset;
    int lin1_off = hitbuffer->linear[linear1].offset;
    int lin2_off = hitbuffer->linear[linear2].offset;
    int better_offm1 = hitbuffer->linear[better_boundary - 1].offset;
    int better_off = hitbuffer->linear[better_boundary].offset;
    int better_offp1 = hitbuffer->linear[better_boundary + 1].offset;
    string old0(&text[lin0_off], lin1_off - lin0_off);
    string old1(&text[lin1_off], lin2_off - lin1_off);
    string new0(&text[lin0_off], better_offm1 - lin0_off);
    string new0m1(&text[better_offm1], better_off - better_offm1);
    string new1(&text[better_off], better_offp1 - better_off);
    string new1p1(&text[better_offp1], lin2_off - better_offp1);
    fprintf(scoringcontext->debug_file, "%s^^%s => <br>\n%s^%s^^%s^%s<br>\n",
            GetHtmlEscapedText(old0).c_str(),
            GetHtmlEscapedText(old1).c_str(),
            GetHtmlEscapedText(new0).c_str(),
            GetHtmlEscapedText(new0m1).c_str(),
            GetHtmlEscapedText(new1).c_str(),
            GetHtmlEscapedText(new1p1).c_str());
    // Slow picture of differences per linear entry
    int d;
    for (int i = linear0; i < linear2; ++i) {
      if (i == better_boundary) {
        fprintf(scoringcontext->debug_file, "^^ ");
      }
      uint32 langprob = hitbuffer->linear[i].langprob;
      d = GetLangScore(langprob, pslang0) - GetLangScore(langprob, pslang1);
      const char* s = "=";
      //if (d > 2) {s = "\xc2\xaf";}    // Macron
      if (d > 2) {s = "#";}
      else if (d > 0) {s = "+";}
      else if (d < -2) {s = "_";}
      else if (d < 0) {s = "-";}
      fprintf(scoringcontext->debug_file, "%s ", s);
    }
    fprintf(scoringcontext->debug_file, " (scale: #+=-_)<br>\n");
  }
  return better_boundary;
}

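// Worked example of the sliding window above (illustrative only): suppose
// the per-entry differences (pslang0 score minus pslang1 score) around a
// real language switch are
//   ... +3 +2 +3 +1 | -2 -3 -1 -2 ...
// With the window's left edge just before the switch,
//   running_diff = (+3 +2 +3 +1) - (-2 -3 -1 -2) = 9 + 8 = 17,
// the maximum seen during the scan, so better_boundary lands at i + 4,
// i.e. exactly at the switch. The incremental update when sliding right by
// one entry removes the old leftmost diff, flips the sign contribution of
// the middle entry (hence "+= 2 * middiff"), and subtracts the newly
// entered rightmost diff, which is equivalent to recomputing the window.
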

// For all but the first summary, if its top language differs from
// the previous chunk, refine the boundary
// Linearized version
void SharpenBoundaries(const char* text,
                       bool more_to_come,
                       ScoringHitBuffer* hitbuffer,
                       ScoringContext* scoringcontext,
                       SummaryBuffer* summarybuffer) {

  int prior_linear = summarybuffer->chunksummary[0].chunk_start;
  uint16 prior_lang = summarybuffer->chunksummary[0].lang1;

  if (scoringcontext->flags_cld2_verbose) {
    fprintf(scoringcontext->debug_file, "<br>SharpenBoundaries<br>\n");
  }
  for (int i = 1; i < summarybuffer->n; ++i) {
    ChunkSummary* cs = &summarybuffer->chunksummary[i];
    uint16 this_lang = cs->lang1;
    if (this_lang == prior_lang) {
      prior_linear = cs->chunk_start;
      continue;
    }

    int this_linear = cs->chunk_start;
    int next_linear = summarybuffer->chunksummary[i + 1].chunk_start;

    // If this/prior in same close set, don't move boundary
    if (SameCloseSet(prior_lang, this_lang)) {
      prior_linear = this_linear;
      prior_lang = this_lang;
      continue;
    }


    // Within hitbuffer->linear[]
    //  <-- prior chunk --><-- this chunk -->
    //  |                  |                 |
    //  prior_linear       this_linear       next_linear
    //  prior_lang         this_lang
    // The goal of sharpening is to move this_linear to better separate langs

    uint8 pslang0 = PerScriptNumber(scoringcontext->ulscript,
                                    static_cast<Language>(prior_lang));
    uint8 pslang1 = PerScriptNumber(scoringcontext->ulscript,
                                    static_cast<Language>(this_lang));
    int better_linear = BetterBoundary(text,
                                       hitbuffer,
                                       scoringcontext,
                                       pslang0, pslang1,
                                       prior_linear, this_linear, next_linear);

    int old_offset = hitbuffer->linear[this_linear].offset;
    int new_offset = hitbuffer->linear[better_linear].offset;
    cs->chunk_start = better_linear;
    cs->offset = new_offset;
    // If this_linear moved right, make bytes smaller for this, larger for prior
    // If this_linear moved left, make bytes larger for this, smaller for prior
    cs->bytes -= (new_offset - old_offset);
    summarybuffer->chunksummary[i - 1].bytes += (new_offset - old_offset);

    this_linear = better_linear;    // Update so that next chunk doesn't intrude

    // Consider rescoring the two chunks

    // Update for next round (note: using pre-updated boundary)
    prior_linear = this_linear;
    prior_lang = this_lang;
  }
}

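// Worked example of the byte accounting above (illustrative only): if the
// sharpened boundary moves 15 bytes to the right (new_offset - old_offset
// = 15), the current chunk starts 15 bytes later, so cs->bytes shrinks by
// 15 and the previous chunk's bytes grow by 15; the total byte count over
// the summary buffer is unchanged, only the split point moves.
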
// Make a langprob that gives small weight to the default language for ulscript
uint32 DefaultLangProb(ULScript ulscript) {
  Language default_lang = DefaultLanguage(ulscript);
  return MakeLangProb(default_lang, 1);
}

// Effectively, do a merge-sort based on text offsets
// Look up each indirect value in appropriate scoring table and keep
// just the resulting langprobs
void LinearizeAll(ScoringContext* scoringcontext, bool score_cjk,
                  ScoringHitBuffer* hitbuffer) {
  const CLD2TableSummary* base_obj;       // unigram or quadgram
  const CLD2TableSummary* base_obj2;      // quadgram dual table
  const CLD2TableSummary* delta_obj;      // bigram or octagram
  const CLD2TableSummary* distinct_obj;   // bigram or octagram
  uint16 base_hit;
  if (score_cjk) {
    base_obj = scoringcontext->scoringtables->unigram_compat_obj;
    base_obj2 = scoringcontext->scoringtables->unigram_compat_obj;
    delta_obj = scoringcontext->scoringtables->deltabi_obj;
    distinct_obj = scoringcontext->scoringtables->distinctbi_obj;
    base_hit = UNIHIT;
  } else {
    base_obj = scoringcontext->scoringtables->quadgram_obj;
    base_obj2 = scoringcontext->scoringtables->quadgram_obj2;
    delta_obj = scoringcontext->scoringtables->deltaocta_obj;
    distinct_obj = scoringcontext->scoringtables->distinctocta_obj;
    base_hit = QUADHIT;
  }

  int base_limit = hitbuffer->next_base;
  int delta_limit = hitbuffer->next_delta;
  int distinct_limit = hitbuffer->next_distinct;
  int base_i = 0;
  int delta_i = 0;
  int distinct_i = 0;
  int linear_i = 0;

  // Start with an initial base hit for the default language for this script
  // Inserting this avoids edge effects with no hits at all
  hitbuffer->linear[linear_i].offset = hitbuffer->lowest_offset;
  hitbuffer->linear[linear_i].type = base_hit;
  hitbuffer->linear[linear_i].langprob =
      DefaultLangProb(scoringcontext->ulscript);
  ++linear_i;

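  // Illustrative note on the merge below (not in the upstream sources):
  // the base, delta and distinct arrays are each produced in increasing
  // offset order, so the loop is a three-way merge by offset, with ties
  // broken in favor of delta, then distinct, then base. For example, with
  //   base offsets: 10, 40   delta: 10, 40   distinct: 25
  // the linear order becomes delta(10), base(10), distinct(25), delta(40),
  // base(40), so per-word (delta/distinct) hits are never dropped when the
  // base array runs out first.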
  while ((base_i < base_limit) || (delta_i < delta_limit) ||
         (distinct_i < distinct_limit)) {
    int base_off = hitbuffer->base[base_i].offset;
    int delta_off = hitbuffer->delta[delta_i].offset;
    int distinct_off = hitbuffer->distinct[distinct_i].offset;

    // Do delta and distinct first, so that they are not lost at base_limit
    if ((delta_i < delta_limit) &&
        (delta_off <= base_off) && (delta_off <= distinct_off)) {
      // Add delta entry
      int indirect = hitbuffer->delta[delta_i].indirect;
      ++delta_i;
      uint32 langprob = delta_obj->kCLDTableInd[indirect];
      if (langprob > 0) {
        hitbuffer->linear[linear_i].offset = delta_off;
        hitbuffer->linear[linear_i].type = DELTAHIT;
        hitbuffer->linear[linear_i].langprob = langprob;
        ++linear_i;
      }
    }
    else if ((distinct_i < distinct_limit) &&
             (distinct_off <= base_off) && (distinct_off <= delta_off)) {
      // Add distinct entry
      int indirect = hitbuffer->distinct[distinct_i].indirect;
      ++distinct_i;
      uint32 langprob = distinct_obj->kCLDTableInd[indirect];
      if (langprob > 0) {
        hitbuffer->linear[linear_i].offset = distinct_off;
        hitbuffer->linear[linear_i].type = DISTINCTHIT;
        hitbuffer->linear[linear_i].langprob = langprob;
        ++linear_i;
      }
    }
    else {
      // Add one or two base entries
      int indirect = hitbuffer->base[base_i].indirect;
      // First, get right scoring table
      const CLD2TableSummary* local_base_obj = base_obj;
      if ((indirect & 0x80000000u) != 0) {
        local_base_obj = base_obj2;
        indirect &= ~0x80000000u;
      }
      ++base_i;
      // One langprob in kQuadInd[0..SingleSize),
      // two in kQuadInd[SingleSize..Size)
      if (indirect < static_cast<int>(local_base_obj->kCLDTableSizeOne)) {
        // Up to three languages at indirect
        uint32 langprob = local_base_obj->kCLDTableInd[indirect];
        if (langprob > 0) {
          hitbuffer->linear[linear_i].offset = base_off;
          hitbuffer->linear[linear_i].type = base_hit;
          hitbuffer->linear[linear_i].langprob = langprob;
          ++linear_i;
        }
      } else {
        // Up to six languages at start + 2 * (indirect - start)
        indirect += (indirect - local_base_obj->kCLDTableSizeOne);
        uint32 langprob = local_base_obj->kCLDTableInd[indirect];
        uint32 langprob2 = local_base_obj->kCLDTableInd[indirect + 1];
        if (langprob > 0) {
          hitbuffer->linear[linear_i].offset = base_off;
          hitbuffer->linear[linear_i].type = base_hit;
          hitbuffer->linear[linear_i].langprob = langprob;
          ++linear_i;
        }
        if (langprob2 > 0) {
          hitbuffer->linear[linear_i].offset = base_off;
          hitbuffer->linear[linear_i].type = base_hit;
          hitbuffer->linear[linear_i].langprob = langprob2;
          ++linear_i;
        }
      }
    }
  }

  // Update
  hitbuffer->next_linear = linear_i;

  // Add a dummy entry off the end, just to capture final offset
  hitbuffer->linear[linear_i].offset =
      hitbuffer->base[hitbuffer->next_base].offset;
  hitbuffer->linear[linear_i].langprob = 0;
}

// Break linear array into chunks of ~20 quadgram hits or ~50 CJK unigram hits
void ChunkAll(int letter_offset, bool score_cjk, ScoringHitBuffer* hitbuffer) {
  int chunksize;
  uint16 base_hit;
  if (score_cjk) {
    chunksize = kChunksizeUnis;
    base_hit = UNIHIT;
  } else {
    chunksize = kChunksizeQuads;
    base_hit = QUADHIT;
  }

  int linear_i = 0;
  int linear_off_end = hitbuffer->next_linear;
  int text_i = letter_offset;           // Next unseen text offset
  int next_chunk_start = 0;
  int bases_left = hitbuffer->next_base;
  while (bases_left > 0) {
    // Linearize one chunk
    int base_len = chunksize;           // Default; may be changed below
    if (bases_left < (chunksize + (chunksize >> 1))) {
      // If within 1.5 chunks of the end, avoid runts by using it all
      base_len = bases_left;
    } else if (bases_left < (2 * chunksize)) {
      // Avoid runts by splitting 1.5 to 2 chunks in half (about 3/4 each)
      base_len = (bases_left + 1) >> 1;
    }

    hitbuffer->chunk_start[next_chunk_start] = linear_i;
    hitbuffer->chunk_offset[next_chunk_start] = text_i;
    ++next_chunk_start;

    int base_count = 0;
    while ((base_count < base_len) && (linear_i < linear_off_end)) {
      if (hitbuffer->linear[linear_i].type == base_hit) {++base_count;}
      ++linear_i;
    }
    text_i = hitbuffer->linear[linear_i].offset;    // Next unseen text offset
    bases_left -= base_len;
  }

  // If no base hits at all, make a single dummy chunk
  if (next_chunk_start == 0) {
    hitbuffer->chunk_start[next_chunk_start] = 0;
    hitbuffer->chunk_offset[next_chunk_start] = hitbuffer->linear[0].offset;
    ++next_chunk_start;
  }

  // Remember the linear array start of dummy entry
  hitbuffer->next_chunk_start = next_chunk_start;

  // Add a dummy entry off the end, just to capture final linear subscr
  hitbuffer->chunk_start[next_chunk_start] = hitbuffer->next_linear;
  hitbuffer->chunk_offset[next_chunk_start] = text_i;
}

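// Worked example of the runt-avoidance policy above (illustrative;
// kChunksizeQuads is defined elsewhere, assume 20 here per the comment):
//   bases_left = 25  ->  one chunk of 25 (within 1.5 chunks of the end),
//   bases_left = 35  ->  chunks of 18 and 17 (split roughly in half),
//   bases_left = 70  ->  chunks of 20, 20, 15, 15,
// so no trailing chunk ends up much smaller than half a nominal chunk.
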
// Merge-sort the individual hit arrays, go indirect on the scoring subscripts,
// break linear array into chunks.
//
// Input:
//  hitbuffer base, delta, distinct arrays
// Output:
//  linear array
//  chunk_start array
//
void LinearizeHitBuffer(int letter_offset,
                        ScoringContext* scoringcontext,
                        bool more_to_come, bool score_cjk,
                        ScoringHitBuffer* hitbuffer) {
  LinearizeAll(scoringcontext, score_cjk, hitbuffer);
  ChunkAll(letter_offset, score_cjk, hitbuffer);
}



// The hitbuffer is in an awkward form -- three sets of base/delta/distinct
// scores, each with an indirect subscript to one of six scoring tables, some
// of which can yield two langprobs for six languages, others one langprob for
// three languages. The only correlation between base/delta/distinct is their
// offsets into the letters-only text buffer.
//
// SummaryBuffer needs to be built to linear, giving linear offset of start of
// each chunk
//
// So we first do all the langprob lookups and merge-sort by offset to make
// a single linear vector, building a side vector of chunk beginnings as we go.
// The sharpening is simply moving the beginnings, scoring is a simple linear
// sweep, etc.

void ProcessHitBuffer(const LangSpan& scriptspan,
                      int letter_offset,
                      ScoringContext* scoringcontext,
                      DocTote* doc_tote,
                      ResultChunkVector* vec,
                      bool more_to_come, bool score_cjk,
                      ScoringHitBuffer* hitbuffer) {
  if (scoringcontext->flags_cld2_verbose) {
    fprintf(scoringcontext->debug_file, "Hitbuffer[) ");
    DumpHitBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer);
  }

  LinearizeHitBuffer(letter_offset, scoringcontext, more_to_come, score_cjk,
                     hitbuffer);

  if (scoringcontext->flags_cld2_verbose) {
    fprintf(scoringcontext->debug_file, "Linear[) ");
    DumpLinearBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer);
  }

  SummaryBuffer summarybuffer;
  summarybuffer.n = 0;
  ChunkSpan last_cspan;
  ScoreAllHits(scriptspan.text, scriptspan.ulscript,
               more_to_come, score_cjk, hitbuffer,
               scoringcontext,
               &summarybuffer, &last_cspan);

  if (scoringcontext->flags_cld2_verbose) {
    DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer);
  }

  if (vec != NULL) {
    // Sharpen boundaries of summarybuffer
    // This is not a high-performance path
    SharpenBoundaries(scriptspan.text, more_to_come, hitbuffer, scoringcontext,
                      &summarybuffer);
    // Show after the sharpening
    // CLD2_Debug2(scriptspan.text, more_to_come, score_cjk,
    //             hitbuffer, scoringcontext, &summarybuffer);

    if (scoringcontext->flags_cld2_verbose) {
      DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer);
    }
  }

  SummaryBufferToDocTote(&summarybuffer, more_to_come, doc_tote);
  SummaryBufferToVector(scoringcontext->scanner, scriptspan.text,
                        &summarybuffer, more_to_come, vec);
}

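// Descriptive overview of the per-hitbuffer pipeline above:
//   1. LinearizeHitBuffer: look up langprobs and merge base/delta/distinct
//      hits into one offset-ordered linear array, then split it into chunks.
//   2. ScoreAllHits: tote each chunk and record one ChunkSummary per chunk.
//   3. SharpenBoundaries (only when a ResultChunkVector is requested): move
//      chunk boundaries to the sharpest language transition.
//   4. SummaryBufferToDocTote / SummaryBufferToVector: fold the chunk
//      summaries into the document tote and the per-span result vector.
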
void SpliceHitBuffer(ScoringHitBuffer* hitbuffer, int next_offset) {
  // Splice hitbuffer and summarybuffer for next round. With big chunks and
  // distinctive-word state carried across chunks, we might not need to do this.
  hitbuffer->next_base = 0;
  hitbuffer->next_delta = 0;
  hitbuffer->next_distinct = 0;
  hitbuffer->next_linear = 0;
  hitbuffer->next_chunk_start = 0;
  hitbuffer->lowest_offset = next_offset;
}


// Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating
// scoringcontext
void ScoreEntireScriptSpan(const LangSpan& scriptspan,
                           ScoringContext* scoringcontext,
                           DocTote* doc_tote,
                           ResultChunkVector* vec) {
  int bytes = scriptspan.text_bytes;
  // Artificially set score to 1024 per 1KB, or 1 per byte
  int score = bytes;
  int reliability = 100;
  // doc_tote uses full languages
  Language one_one_lang = DefaultLanguage(scriptspan.ulscript);
  doc_tote->Add(one_one_lang, bytes, score, reliability);

  if (scoringcontext->flags_cld2_html) {
    ChunkSummary chunksummary = {
      1, 0,
      one_one_lang, UNKNOWN_LANGUAGE, score, 1,
      bytes, 0, scriptspan.ulscript, reliability, reliability
    };
    CLD2_Debug(scriptspan.text, 1, scriptspan.text_bytes,
               false, false, NULL,
               scoringcontext, NULL, &chunksummary);
  }

  // First byte is always a space
  JustOneItemToVector(scoringcontext->scanner, scriptspan.text,
                      one_one_lang, 1, bytes - 1, vec);

  scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
}

// Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext
void ScoreCJKScriptSpan(const LangSpan& scriptspan,
                        ScoringContext* scoringcontext,
                        DocTote* doc_tote,
                        ResultChunkVector* vec) {
  // Allocate three parallel arrays of scoring hits
  ScoringHitBuffer* hitbuffer = new ScoringHitBuffer;
  hitbuffer->init();
  hitbuffer->ulscript = scriptspan.ulscript;

  scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
  scoringcontext->oldest_distinct_boost = 0;

  // Incoming scriptspan has a single leading space at scriptspan.text[0]
  // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3]

  int letter_offset = 1;        // Skip initial space
  hitbuffer->lowest_offset = letter_offset;
  int letter_limit = scriptspan.text_bytes;
  while (letter_offset < letter_limit) {
    if (scoringcontext->flags_cld2_verbose) {
      fprintf(scoringcontext->debug_file, " ScoreCJKScriptSpan[%d,%d)<br>\n",
              letter_offset, letter_limit);
    }
    //
    // Fill up one hitbuffer, possibly splicing onto previous fragment
    //
    // NOTE: GetUniHits deals with close repeats
    // NOTE: After last chunk there is always a hitbuffer entry with an offset
    // just off the end of the text = next_offset.
    int next_offset = GetUniHits(scriptspan.text, letter_offset, letter_limit,
                                 scoringcontext, hitbuffer);
    // NOTE: GetBiHitVectors deals with close repeats,
    // does one hash and two lookups (delta and distinct) per word
    GetBiHits(scriptspan.text, letter_offset, next_offset,
              scoringcontext, hitbuffer);

    //
    // Score one hitbuffer in chunks to summarybuffer
    //
    bool more_to_come = next_offset < letter_limit;
    bool score_cjk = true;
    ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec,
                     more_to_come, score_cjk, hitbuffer);
    SpliceHitBuffer(hitbuffer, next_offset);

    letter_offset = next_offset;
  }

  delete hitbuffer;
  // Context across buffers is not connected yet
  scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
}


// Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext
// We have a scriptspan with all lowercase text in one script. Look up
// quadgrams and octagrams, saving the hits in three parallel vectors.
// Score from those vectors in chunks, toting each chunk to get a single
// language, and combining into the overall document score. The hit vectors
// in general are not big enough to handle an entire scriptspan, so
// repeat until the entire scriptspan is scored.
// Caller deals with minimizing number of runt scriptspans
// This routine deals with minimizing number of runt chunks.
//
// Returns updated scoringcontext
// Returns updated doc_tote
// If vec != NULL, appends to that vector of ResultChunk's
void ScoreQuadScriptSpan(const LangSpan& scriptspan,
                         ScoringContext* scoringcontext,
                         DocTote* doc_tote,
                         ResultChunkVector* vec) {
  // Allocate three parallel arrays of scoring hits
  ScoringHitBuffer* hitbuffer = new ScoringHitBuffer;
  hitbuffer->init();
  hitbuffer->ulscript = scriptspan.ulscript;

  scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
  scoringcontext->oldest_distinct_boost = 0;

  // Incoming scriptspan has a single leading space at scriptspan.text[0]
  // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3]

  int letter_offset = 1;        // Skip initial space
  hitbuffer->lowest_offset = letter_offset;
  int letter_limit = scriptspan.text_bytes;
  while (letter_offset < letter_limit) {
    //
    // Fill up one hitbuffer, possibly splicing onto previous fragment
    //
    // NOTE: GetQuadHits deals with close repeats
    // NOTE: After last chunk there is always a hitbuffer entry with an offset
    // just off the end of the text = next_offset.
    int next_offset = GetQuadHits(scriptspan.text, letter_offset, letter_limit,
                                  scoringcontext, hitbuffer);
    // If true, there is more text to process in this scriptspan
    // NOTE: GetOctaHitVectors deals with close repeats,
    // does one hash and two lookups (delta and distinct) per word
    GetOctaHits(scriptspan.text, letter_offset, next_offset,
                scoringcontext, hitbuffer);

    //
    // Score one hitbuffer in chunks to summarybuffer
    //
    bool more_to_come = next_offset < letter_limit;
    bool score_cjk = false;
    ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec,
                     more_to_come, score_cjk, hitbuffer);
    SpliceHitBuffer(hitbuffer, next_offset);

    letter_offset = next_offset;
  }

  delete hitbuffer;
}


// Score one scriptspan into doc_tote and vec, updating scoringcontext
// Inputs:
//  One scriptspan of perhaps 40-60KB, all same script lower-case letters
//  and single ASCII spaces. First character is a space to allow simple
//  beginning-of-word detect. End of buffer has three spaces and NUL to
//  allow easy scan-to-end-of-word.
//  Scoring context of
//   scoring tables
//   flags
//   running boosts
// Outputs:
//  Updated doc_tote giving overall languages and byte counts
//  Optional updated chunk vector giving offset, length, language
//
// Caller initializes flags, boosts, doc_tote and vec.
// Caller aggregates across multiple scriptspans
// Caller calculates final document result
// Caller deals with detecting and triggering suppression of repeated text.
//
// This top-level routine just chooses the recognition type and calls one of
// the next-level-down routines.
//
void ScoreOneScriptSpan(const LangSpan& scriptspan,
                        ScoringContext* scoringcontext,
                        DocTote* doc_tote,
                        ResultChunkVector* vec) {
  if (scoringcontext->flags_cld2_verbose) {
    fprintf(scoringcontext->debug_file, "<br>ScoreOneScriptSpan(%s,%d) ",
            ULScriptCode(scriptspan.ulscript), scriptspan.text_bytes);
    // Optionally print the chunk lowercase letters/marks text
    string temp(&scriptspan.text[0], scriptspan.text_bytes);
    fprintf(scoringcontext->debug_file, "'%s'",
            GetHtmlEscapedText(temp).c_str());
    fprintf(scoringcontext->debug_file, "<br>\n");
  }
  scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
  scoringcontext->oldest_distinct_boost = 0;
  ULScriptRType rtype = ULScriptRecognitionType(scriptspan.ulscript);
  if (scoringcontext->flags_cld2_score_as_quads && (rtype != RTypeCJK)) {
    rtype = RTypeMany;
  }
  switch (rtype) {
  case RTypeNone:
  case RTypeOne:
    ScoreEntireScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
    break;
  case RTypeCJK:
    ScoreCJKScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
    break;
  case RTypeMany:
    ScoreQuadScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
    break;
  }
}

}       // End namespace CLD2
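
// Illustrative caller sketch (not part of this file; the real driver lives
// elsewhere in CLD2, and the scanner call shown is a hypothetical name):
//
//   CLD2::DocTote doc_tote;
//   CLD2::ResultChunkVector resultchunkvector;
//   CLD2::ScoringContext scoringcontext;  // tables, flags, boosts set by caller
//   CLD2::LangSpan scriptspan;
//   while (scanner.GetOneScriptSpanLower(&scriptspan)) {
//     scoringcontext.ulscript = scriptspan.ulscript;
//     CLD2::ScoreOneScriptSpan(scriptspan, &scoringcontext,
//                              &doc_tote, &resultchunkvector);
//   }
//   // doc_tote now holds per-language byte counts and scores for the whole
//   // document; resultchunkvector holds per-span language assignments.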