Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.
michael@0 | 1 | // Copyright 2013 Google Inc. All Rights Reserved. |
michael@0 | 2 | // |
michael@0 | 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
michael@0 | 4 | // you may not use this file except in compliance with the License. |
michael@0 | 5 | // You may obtain a copy of the License at |
michael@0 | 6 | // |
michael@0 | 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
michael@0 | 8 | // |
michael@0 | 9 | // Unless required by applicable law or agreed to in writing, software |
michael@0 | 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
michael@0 | 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
michael@0 | 12 | // See the License for the specific language governing permissions and |
michael@0 | 13 | // limitations under the License. |
michael@0 | 14 | |
michael@0 | 15 | // |
michael@0 | 16 | // Author: dsites@google.com (Dick Sites) |
michael@0 | 17 | // Updated 2014.01 for dual table lookup |
michael@0 | 18 | // |
michael@0 | 19 | |
michael@0 | 20 | #include "scoreonescriptspan.h" |
michael@0 | 21 | |
michael@0 | 22 | #include "cldutil.h" |
michael@0 | 23 | #include "debug.h" |
michael@0 | 24 | #include "lang_script.h" |
michael@0 | 25 | |
michael@0 | 26 | #include <stdio.h> |
michael@0 | 27 | |
michael@0 | 28 | using namespace std; |
michael@0 | 29 | |
michael@0 | 30 | namespace CLD2 { |
michael@0 | 31 | |
michael@0 | 32 | static const int kUnreliablePercentThreshold = 75; |
michael@0 | 33 | |
michael@0 | 34 | void AddLangProb(uint32 langprob, Tote* chunk_tote) { |
michael@0 | 35 | ProcessProbV2Tote(langprob, chunk_tote); |
michael@0 | 36 | } |
michael@0 | 37 | |
michael@0 | 38 | void ZeroPSLang(uint32 langprob, Tote* chunk_tote) { |
michael@0 | 39 | uint8 top1 = (langprob >> 8) & 0xff; |
michael@0 | 40 | chunk_tote->SetScore(top1, 0); |
michael@0 | 41 | } |
michael@0 | 42 | |
michael@0 | 43 | bool SameCloseSet(uint16 lang1, uint16 lang2) { |
michael@0 | 44 | int lang1_close_set = LanguageCloseSet(static_cast<Language>(lang1)); |
michael@0 | 45 | if (lang1_close_set == 0) {return false;} |
michael@0 | 46 | int lang2_close_set = LanguageCloseSet(static_cast<Language>(lang2)); |
michael@0 | 47 | return (lang1_close_set == lang2_close_set); |
michael@0 | 48 | } |
michael@0 | 49 | |
michael@0 | 50 | bool SameCloseSet(Language lang1, Language lang2) { |
michael@0 | 51 | int lang1_close_set = LanguageCloseSet(lang1); |
michael@0 | 52 | if (lang1_close_set == 0) {return false;} |
michael@0 | 53 | int lang2_close_set = LanguageCloseSet(lang2); |
michael@0 | 54 | return (lang1_close_set == lang2_close_set); |
michael@0 | 55 | } |
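// Illustrative note (close-set membership is defined in lang_script, not
// here; the groupings named below are assumptions about typical CLD2 sets):
// a "close set" groups languages that are hard to tell apart at the ngram
// level, e.g. Bosnian/Croatian/Serbian or Indonesian/Malay, so two members
// of the same set are treated as effectively interchangeable by the callers
// below.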
michael@0 | 56 | |
michael@0 | 57 | |
michael@0 | 58 | // Needs expected score per 1KB in scoring context |
michael@0 | 59 | void SetChunkSummary(ULScript ulscript, int first_linear_in_chunk, |
michael@0 | 60 | int offset, int len, |
michael@0 | 61 | const ScoringContext* scoringcontext, |
michael@0 | 62 | const Tote* chunk_tote, |
michael@0 | 63 | ChunkSummary* chunksummary) { |
michael@0 | 64 | int key3[3]; |
michael@0 | 65 | chunk_tote->CurrentTopThreeKeys(key3); |
michael@0 | 66 | Language lang1 = FromPerScriptNumber(ulscript, key3[0]); |
michael@0 | 67 | Language lang2 = FromPerScriptNumber(ulscript, key3[1]); |
michael@0 | 68 | |
michael@0 | 69 | int actual_score_per_kb = 0; |
michael@0 | 70 | if (len > 0) { |
michael@0 | 71 | actual_score_per_kb = (chunk_tote->GetScore(key3[0]) << 10) / len; |
michael@0 | 72 | } |
michael@0 | 73 | int expected_subscr = lang1 * 4 + LScript4(ulscript); |
michael@0 | 74 | int expected_score_per_kb = |
michael@0 | 75 | scoringcontext->scoringtables->kExpectedScore[expected_subscr]; |
michael@0 | 76 | |
michael@0 | 77 | chunksummary->offset = offset; |
michael@0 | 78 | chunksummary->chunk_start = first_linear_in_chunk; |
michael@0 | 79 | chunksummary->lang1 = lang1; |
michael@0 | 80 | chunksummary->lang2 = lang2; |
michael@0 | 81 | chunksummary->score1 = chunk_tote->GetScore(key3[0]); |
michael@0 | 82 | chunksummary->score2 = chunk_tote->GetScore(key3[1]); |
michael@0 | 83 | chunksummary->bytes = len; |
michael@0 | 84 | chunksummary->grams = chunk_tote->GetScoreCount(); |
michael@0 | 85 | chunksummary->ulscript = ulscript; |
michael@0 | 86 | chunksummary->reliability_delta = ReliabilityDelta(chunksummary->score1, |
michael@0 | 87 | chunksummary->score2, |
michael@0 | 88 | chunksummary->grams); |
michael@0 | 89 | // If lang1/lang2 in same close set, set delta reliability to 100% |
michael@0 | 90 | if (SameCloseSet(lang1, lang2)) { |
michael@0 | 91 | chunksummary->reliability_delta = 100; |
michael@0 | 92 | } |
michael@0 | 93 | chunksummary->reliability_score = |
michael@0 | 94 | ReliabilityExpected(actual_score_per_kb, expected_score_per_kb); |
michael@0 | 95 | } |
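// Worked example (illustrative numbers only): a 512-byte chunk whose top
// language scored 200 gives actual_score_per_kb = (200 << 10) / 512 = 400;
// ReliabilityExpected() then rates 400 against kExpectedScore[] for that
// language/script pair, while ReliabilityDelta() separately rates how far
// score1 sits above score2 for the number of ngrams seen.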
michael@0 | 96 | |
michael@0 | 97 | // Return true if just lang1 is there: lang2=0 and lang3=0 |
michael@0 | 98 | bool IsSingleLang(uint32 langprob) { |
michael@0 | 99 | // Probably a bug -- which end is lang1? But only used to call empty Boost1 |
michael@0 | 100 | return ((langprob & 0x00ffff00) == 0); |
michael@0 | 101 | } |
michael@0 | 102 | |
michael@0 | 103 | // Update scoring context distinct_boost for single language quad |
michael@0 | 104 | void AddDistinctBoost1(uint32 langprob, ScoringContext* scoringcontext) { |
michael@0 | 105 | // Probably keep this empty -- not a good enough signal |
michael@0 | 106 | } |
michael@0 | 107 | |
michael@0 | 108 | // Update scoring context distinct_boost for distinct octagram |
michael@0 | 109 | // Keep last 4 used. Since these are mostly (except at splices) in |
michael@0 | 110 | // hitbuffer, we might be able to just use a subscript and splice |
michael@0 | 111 | void AddDistinctBoost2(uint32 langprob, ScoringContext* scoringcontext) { |
michael@0 | 112 | // this is called 0..n times per chunk with decoded hitbuffer->distinct... |
michael@0 | 113 | LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn; |
michael@0 | 114 | if (scoringcontext->ulscript != ULScript_Latin) { |
michael@0 | 115 | distinct_boost = &scoringcontext->distinct_boost.othr; |
michael@0 | 116 | } |
michael@0 | 117 | int n = distinct_boost->n; |
michael@0 | 118 | distinct_boost->langprob[n] = langprob; |
michael@0 | 119 | distinct_boost->n = distinct_boost->wrap(n + 1); |
michael@0 | 120 | } |
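// The ring above keeps only the most recent distinct-word langprobs
// (kMaxBoosts entries; the header note says the last 4, assuming wrap() is
// a modulo on the ring size), so each chunk scored in ScoreBoosts() below
// is nudged by distinctive words seen just before it, including ones
// carried across a chunk splice.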
michael@0 | 121 | |
michael@0 | 122 | // For each chunk, add extra weight for language priors (from content-lang and |
michael@0 | 123 | // meta lang=xx) and distinctive tokens |
michael@0 | 124 | void ScoreBoosts(const ScoringContext* scoringcontext, Tote* chunk_tote) { |
michael@0 | 125 | // Get boosts for current script |
michael@0 | 126 | const LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn; |
michael@0 | 127 | const LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn; |
michael@0 | 128 | const LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn; |
michael@0 | 129 | if (scoringcontext->ulscript != ULScript_Latin) { |
michael@0 | 130 | langprior_boost = &scoringcontext->langprior_boost.othr; |
michael@0 | 131 | langprior_whack = &scoringcontext->langprior_whack.othr; |
michael@0 | 132 | distinct_boost = &scoringcontext->distinct_boost.othr; |
michael@0 | 133 | } |
michael@0 | 134 | |
michael@0 | 135 | for (int k = 0; k < kMaxBoosts; ++k) { |
michael@0 | 136 | uint32 langprob = langprior_boost->langprob[k]; |
michael@0 | 137 | if (langprob > 0) {AddLangProb(langprob, chunk_tote);} |
michael@0 | 138 | } |
michael@0 | 139 | for (int k = 0; k < kMaxBoosts; ++k) { |
michael@0 | 140 | uint32 langprob = distinct_boost->langprob[k]; |
michael@0 | 141 | if (langprob > 0) {AddLangProb(langprob, chunk_tote);} |
michael@0 | 142 | } |
michael@0 | 143 | // boost has a packed set of per-script langs and probabilities |
michael@0 | 144 | // whack has a packed set of per-script langs to be suppressed (zeroed) |
michael@0 | 145 | // When a language in a close set is given as an explicit hint, others in |
michael@0 | 146 | // that set will be whacked here. |
michael@0 | 147 | for (int k = 0; k < kMaxBoosts; ++k) { |
michael@0 | 148 | uint32 langprob = langprior_whack->langprob[k]; |
michael@0 | 149 | if (langprob > 0) {ZeroPSLang(langprob, chunk_tote);} |
michael@0 | 150 | } |
michael@0 | 151 | } |
michael@0 | 152 | |
michael@0 | 153 | |
michael@0 | 154 | |
michael@0 | 155 | // At this point, the chunk is described by |
michael@0 | 156 | // hitbuffer->base[cspan->chunk_base .. cspan->chunk_base + cspan->base_len) |
michael@0 | 157 | // hitbuffer->delta[cspan->chunk_delta ... ) |
michael@0 | 158 | // hitbuffer->distinct[cspan->chunk_distinct ... ) |
michael@0 | 159 | // Scored text is in text[lo..hi) where |
michael@0 | 160 | // lo is 0 or the min of first base/delta/distinct hitbuffer offset and |
michael@0 | 161 | // hi is the min of next base/delta/distinct hitbuffer offset after |
michael@0 | 162 | // base_len, etc. |
michael@0 | 163 | void GetTextSpanOffsets(const ScoringHitBuffer* hitbuffer, |
michael@0 | 164 | const ChunkSpan* cspan, int* lo, int* hi) { |
michael@0 | 165 | // Front of this span |
michael@0 | 166 | int lo_base = hitbuffer->base[cspan->chunk_base].offset; |
michael@0 | 167 | int lo_delta = hitbuffer->delta[cspan->chunk_delta].offset; |
michael@0 | 168 | int lo_distinct = hitbuffer->distinct[cspan->chunk_distinct].offset; |
michael@0 | 169 | // Front of next span |
michael@0 | 170 | int hi_base = hitbuffer->base[cspan->chunk_base + |
michael@0 | 171 | cspan->base_len].offset; |
michael@0 | 172 | int hi_delta = hitbuffer->delta[cspan->chunk_delta + |
michael@0 | 173 | cspan->delta_len].offset; |
michael@0 | 174 | int hi_distinct = hitbuffer->distinct[cspan->chunk_distinct + |
michael@0 | 175 | cspan->distinct_len].offset; |
michael@0 | 176 | |
michael@0 | 177 | *lo = 0; |
michael@0 | 178 | // if (cspan->chunk_base > 0) { |
michael@0 | 179 | // *lo = minint(minint(lo_base, lo_delta), lo_distinct); |
michael@0 | 180 | // } |
michael@0 | 181 | *lo = minint(minint(lo_base, lo_delta), lo_distinct); |
michael@0 | 182 | *hi = minint(minint(hi_base, hi_delta), hi_distinct); |
michael@0 | 183 | } |
michael@0 | 184 | |
michael@0 | 185 | |
michael@0 | 186 | int DiffScore(const CLD2TableSummary* obj, int indirect, |
michael@0 | 187 | uint16 lang1, uint16 lang2) { |
michael@0 | 188 | if (indirect < static_cast<int>(obj->kCLDTableSizeOne)) { |
michael@0 | 189 | // Up to three languages at indirect |
michael@0 | 190 | uint32 langprob = obj->kCLDTableInd[indirect]; |
michael@0 | 191 | return GetLangScore(langprob, lang1) - GetLangScore(langprob, lang2); |
michael@0 | 192 | } else { |
michael@0 | 193 | // Up to six languages at start + 2 * (indirect - start) |
michael@0 | 194 | indirect += (indirect - obj->kCLDTableSizeOne); |
michael@0 | 195 | uint32 langprob = obj->kCLDTableInd[indirect]; |
michael@0 | 196 | uint32 langprob2 = obj->kCLDTableInd[indirect + 1]; |
michael@0 | 197 | return (GetLangScore(langprob, lang1) + GetLangScore(langprob2, lang1)) - |
michael@0 | 198 | (GetLangScore(langprob, lang2) + GetLangScore(langprob2, lang2)); |
michael@0 | 199 | } |
michael@0 | 200 | |
michael@0 | 201 | } |
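// Indexing sketch for the two-langprob case above: with kCLDTableSizeOne
// == S, an indirect value i >= S lands at S + 2*(i - S), i.e. i=S -> S,
// i=S+1 -> S+2, i=S+2 -> S+4, so each six-language entry occupies two
// consecutive kCLDTableInd slots.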
michael@0 | 202 | |
michael@0 | 203 | // Score all the bases, deltas, distincts, boosts for one chunk into chunk_tote |
michael@0 | 204 | // After last chunk there is always a hitbuffer entry with an offset just off |
michael@0 | 205 | // the end of the text. |
michael@0 | 206 | // Sets delta_len, and distinct_len |
michael@0 | 207 | void ScoreOneChunk(const char* text, ULScript ulscript, |
michael@0 | 208 | const ScoringHitBuffer* hitbuffer, |
michael@0 | 209 | int chunk_i, |
michael@0 | 210 | ScoringContext* scoringcontext, |
michael@0 | 211 | ChunkSpan* cspan, Tote* chunk_tote, |
michael@0 | 212 | ChunkSummary* chunksummary) { |
michael@0 | 213 | int first_linear_in_chunk = hitbuffer->chunk_start[chunk_i]; |
michael@0 | 214 | int first_linear_in_next_chunk = hitbuffer->chunk_start[chunk_i + 1]; |
michael@0 | 215 | |
michael@0 | 216 | chunk_tote->Reinit(); |
michael@0 | 217 | cspan->delta_len = 0; |
michael@0 | 218 | cspan->distinct_len = 0; |
michael@0 | 219 | if (scoringcontext->flags_cld2_verbose) { |
michael@0 | 220 | fprintf(scoringcontext->debug_file, "<br>ScoreOneChunk[%d..%d) ", |
michael@0 | 221 | first_linear_in_chunk, first_linear_in_next_chunk); |
michael@0 | 222 | } |
michael@0 | 223 | |
michael@0 | 224 | // 2013.02.05 linear design: just use base and base_len for the span |
michael@0 | 225 | cspan->chunk_base = first_linear_in_chunk; |
michael@0 | 226 | cspan->base_len = first_linear_in_next_chunk - first_linear_in_chunk; |
michael@0 | 227 | for (int i = first_linear_in_chunk; i < first_linear_in_next_chunk; ++i) { |
michael@0 | 228 | uint32 langprob = hitbuffer->linear[i].langprob; |
michael@0 | 229 | AddLangProb(langprob, chunk_tote); |
michael@0 | 230 | if (hitbuffer->linear[i].type <= QUADHIT) { |
michael@0 | 231 | chunk_tote->AddScoreCount(); // Just count quads, not octas |
michael@0 | 232 | } |
michael@0 | 233 | if (hitbuffer->linear[i].type == DISTINCTHIT) { |
michael@0 | 234 | AddDistinctBoost2(langprob, scoringcontext); |
michael@0 | 235 | } |
michael@0 | 236 | } |
michael@0 | 237 | |
michael@0 | 238 | // Score language prior boosts |
michael@0 | 239 | // Score distinct word boost |
michael@0 | 240 | ScoreBoosts(scoringcontext, chunk_tote); |
michael@0 | 241 | |
michael@0 | 242 | int lo = hitbuffer->linear[first_linear_in_chunk].offset; |
michael@0 | 243 | int hi = hitbuffer->linear[first_linear_in_next_chunk].offset; |
michael@0 | 244 | |
michael@0 | 245 | // Chunk_tote: get top langs, scores, etc. and fill in chunk summary |
michael@0 | 246 | SetChunkSummary(ulscript, first_linear_in_chunk, lo, hi - lo, |
michael@0 | 247 | scoringcontext, chunk_tote, chunksummary); |
michael@0 | 248 | |
michael@0 | 249 | bool more_to_come = false; |
michael@0 | 250 | bool score_cjk = false; |
michael@0 | 251 | if (scoringcontext->flags_cld2_html) { |
michael@0 | 252 | // Show one chunk in readable output |
michael@0 | 253 | CLD2_Debug(text, lo, hi, more_to_come, score_cjk, hitbuffer, |
michael@0 | 254 | scoringcontext, cspan, chunksummary); |
michael@0 | 255 | } |
michael@0 | 256 | |
michael@0 | 257 | scoringcontext->prior_chunk_lang = static_cast<Language>(chunksummary->lang1); |
michael@0 | 258 | } |
michael@0 | 259 | |
michael@0 | 260 | |
michael@0 | 261 | // Score chunks of text described by hitbuffer, allowing each to be in a |
michael@0 | 262 | // different language, and optionally adjusting the boundaries in between. |
michael@0 | 263 | // Set last_cspan to the last chunkspan used |
michael@0 | 264 | void ScoreAllHits(const char* text, ULScript ulscript, |
michael@0 | 265 | bool more_to_come, bool score_cjk, |
michael@0 | 266 | const ScoringHitBuffer* hitbuffer, |
michael@0 | 267 | ScoringContext* scoringcontext, |
michael@0 | 268 | SummaryBuffer* summarybuffer, ChunkSpan* last_cspan) { |
michael@0 | 269 | ChunkSpan prior_cspan = {0, 0, 0, 0, 0, 0}; |
michael@0 | 270 | ChunkSpan cspan = {0, 0, 0, 0, 0, 0}; |
michael@0 | 271 | |
michael@0 | 272 | for (int i = 0; i < hitbuffer->next_chunk_start; ++i) { |
michael@0 | 273 | // Score one chunk |
michael@0 | 274 | // Sets delta_len, and distinct_len |
michael@0 | 275 | Tote chunk_tote; |
michael@0 | 276 | ChunkSummary chunksummary; |
michael@0 | 277 | ScoreOneChunk(text, ulscript, |
michael@0 | 278 | hitbuffer, i, |
michael@0 | 279 | scoringcontext, &cspan, &chunk_tote, &chunksummary); |
michael@0 | 280 | |
michael@0 | 281 | // Put result in summarybuffer |
michael@0 | 282 | if (summarybuffer->n < kMaxSummaries) { |
michael@0 | 283 | summarybuffer->chunksummary[summarybuffer->n] = chunksummary; |
michael@0 | 284 | summarybuffer->n += 1; |
michael@0 | 285 | } |
michael@0 | 286 | |
michael@0 | 287 | prior_cspan = cspan; |
michael@0 | 288 | cspan.chunk_base += cspan.base_len; |
michael@0 | 289 | cspan.chunk_delta += cspan.delta_len; |
michael@0 | 290 | cspan.chunk_distinct += cspan.distinct_len; |
michael@0 | 291 | } |
michael@0 | 292 | |
michael@0 | 293 | // Add one dummy off the end to hold first unused linear_in_chunk |
michael@0 | 294 | int linear_off_end = hitbuffer->next_linear; |
michael@0 | 295 | int offset_off_end = hitbuffer->linear[linear_off_end].offset; |
michael@0 | 296 | ChunkSummary* cs = &summarybuffer->chunksummary[summarybuffer->n]; |
michael@0 | 297 | memset(cs, 0, sizeof(ChunkSummary)); |
michael@0 | 298 | cs->offset = offset_off_end; |
michael@0 | 299 | cs->chunk_start = linear_off_end; |
michael@0 | 300 | *last_cspan = prior_cspan; |
michael@0 | 301 | } |
michael@0 | 302 | |
michael@0 | 303 | |
michael@0 | 304 | void SummaryBufferToDocTote(const SummaryBuffer* summarybuffer, |
michael@0 | 305 | bool more_to_come, DocTote* doc_tote) { |
michael@0 | 306 | int cs_bytes_sum = 0; |
michael@0 | 307 | for (int i = 0; i < summarybuffer->n; ++i) { |
michael@0 | 308 | const ChunkSummary* cs = &summarybuffer->chunksummary[i]; |
michael@0 | 309 | int reliability = minint(cs->reliability_delta, cs->reliability_score); |
michael@0 | 310 | // doc_tote uses full languages |
michael@0 | 311 | doc_tote->Add(cs->lang1, cs->bytes, cs->score1, reliability); |
michael@0 | 312 | cs_bytes_sum += cs->bytes; |
michael@0 | 313 | } |
michael@0 | 314 | } |
michael@0 | 315 | |
michael@0 | 316 | // Turn on for debugging vectors |
michael@0 | 317 | static const bool kShowLettersOriginal = false; |
michael@0 | 318 | |
michael@0 | 319 | |
michael@0 | 320 | // If next chunk language matches last vector language, extend last element |
michael@0 | 321 | // Otherwise add new element to vector |
michael@0 | 322 | void ItemToVector(ScriptScanner* scanner, |
michael@0 | 323 | ResultChunkVector* vec, Language new_lang, |
michael@0 | 324 | int mapped_offset, int mapped_len) { |
michael@0 | 325 | uint16 last_vec_lang = static_cast<uint16>(UNKNOWN_LANGUAGE); |
michael@0 | 326 | int last_vec_subscr = vec->size() - 1; |
michael@0 | 327 | if (last_vec_subscr >= 0) { |
michael@0 | 328 | ResultChunk* priorrc = &(*vec)[last_vec_subscr]; |
michael@0 | 329 | last_vec_lang = priorrc->lang1; |
michael@0 | 330 | if (new_lang == last_vec_lang) { |
michael@0 | 331 | // Extend prior. Current mapped_offset may be beyond prior end, so do |
michael@0 | 332 | // the arithmetic to include any such gap |
michael@0 | 333 | priorrc->bytes = minint((mapped_offset + mapped_len) - priorrc->offset, |
michael@0 | 334 | kMaxResultChunkBytes); |
michael@0 | 335 | if (kShowLettersOriginal) { |
michael@0 | 336 | // Optionally print the new chunk original text |
michael@0 | 337 | string temp2(&scanner->GetBufferStart()[priorrc->offset], |
michael@0 | 338 | priorrc->bytes); |
michael@0 | 339 | fprintf(stderr, "Item[%d..%d) '%s'<br>\n", |
michael@0 | 340 | priorrc->offset, priorrc->offset + priorrc->bytes, |
michael@0 | 341 | GetHtmlEscapedText(temp2).c_str()); |
michael@0 | 342 | } |
michael@0 | 343 | return; |
michael@0 | 344 | } |
michael@0 | 345 | } |
michael@0 | 346 | // Add new vector element |
michael@0 | 347 | ResultChunk rc; |
michael@0 | 348 | rc.offset = mapped_offset; |
michael@0 | 349 | rc.bytes = minint(mapped_len, kMaxResultChunkBytes); |
michael@0 | 350 | rc.lang1 = static_cast<uint16>(new_lang); |
michael@0 | 351 | vec->push_back(rc); |
michael@0 | 352 | if (kShowLettersOriginal) { |
michael@0 | 353 | // Optionally print the new chunk original text |
michael@0 | 354 | string temp2(&scanner->GetBufferStart()[rc.offset], rc.bytes); |
michael@0 | 355 | fprintf(stderr, "Item[%d..%d) '%s'<br>\n", |
michael@0 | 356 | rc.offset, rc.offset + rc.bytes, |
michael@0 | 357 | GetHtmlEscapedText(temp2).c_str()); |
michael@0 | 358 | } |
michael@0 | 359 | } |
michael@0 | 360 | |
michael@0 | 361 | uint16 PriorVecLang(const ResultChunkVector* vec) { |
michael@0 | 362 | if (vec->empty()) {return static_cast<uint16>(UNKNOWN_LANGUAGE);} |
michael@0 | 363 | return (*vec)[vec->size() - 1].lang1; |
michael@0 | 364 | } |
michael@0 | 365 | |
michael@0 | 366 | uint16 NextChunkLang(const SummaryBuffer* summarybuffer, int i) { |
michael@0 | 367 | if ((i + 1) >= summarybuffer->n) { |
michael@0 | 368 | return static_cast<uint16>(UNKNOWN_LANGUAGE); |
michael@0 | 369 | } |
michael@0 | 370 | return summarybuffer->chunksummary[i + 1].lang1; |
michael@0 | 371 | } |
michael@0 | 372 | |
michael@0 | 373 | |
michael@0 | 374 | |
michael@0 | 375 | // Add n elements of summarybuffer to resultchunk vector: |
michael@0 | 376 | // Each element is letters-only text [offset..offset+bytes) |
michael@0 | 377 | // This maps back to original[Back(offset)..Back(offset+bytes)) |
michael@0 | 378 | // |
michael@0 | 379 | // We go out of our way to minimize the variation in the ResultChunkVector, |
michael@0 | 380 | // so that the caller has fewer but more meaningful spans in different |
michael@0 | 381 | // languages, for the likely purpose of translation or spell-check. |
michael@0 | 382 | // |
michael@0 | 383 | // The language of each chunk is lang1, but it might be unreliable for |
michael@0 | 384 | // either of two reasons: its score is relatively too close to the score of |
michael@0 | 385 | // lang2, or its score is too far away from the expected score of real text in |
michael@0 | 386 | // the given language. Unreliable languages are mapped to Unknown. |
michael@0 | 387 | // |
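// Mapping sketch (illustrative offsets only): if a chunk covers the
// letters-only range [120..180) and scanner->MapBack(120) == 150,
// scanner->MapBack(180) == 230, the emitted ResultChunk covers original
// bytes [150..230), possibly trimmed back a few bytes below so the splice
// between adjacent chunks lands on a word boundary.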
michael@0 | 388 | void SummaryBufferToVector(ScriptScanner* scanner, const char* text, |
michael@0 | 389 | const SummaryBuffer* summarybuffer, |
michael@0 | 390 | bool more_to_come, ResultChunkVector* vec) { |
michael@0 | 391 | if (vec == NULL) {return;} |
michael@0 | 392 | |
michael@0 | 393 | if (kShowLettersOriginal) { |
michael@0 | 394 | fprintf(stderr, "map2original_ "); |
michael@0 | 395 | scanner->map2original_.DumpWindow(); |
michael@0 | 396 | fprintf(stderr, "<br>\n"); |
michael@0 | 397 | fprintf(stderr, "map2uplow_ "); |
michael@0 | 398 | scanner->map2uplow_.DumpWindow(); |
michael@0 | 399 | fprintf(stderr, "<br>\n"); |
michael@0 | 400 | } |
michael@0 | 401 | |
michael@0 | 402 | for (int i = 0; i < summarybuffer->n; ++i) { |
michael@0 | 403 | const ChunkSummary* cs = &summarybuffer->chunksummary[i]; |
michael@0 | 404 | int unmapped_offset = cs->offset; |
michael@0 | 405 | int unmapped_len = cs->bytes; |
michael@0 | 406 | |
michael@0 | 407 | if (kShowLettersOriginal) { |
michael@0 | 408 | // Optionally print the chunk lowercase letters/marks text |
michael@0 | 409 | string temp(&text[unmapped_offset], unmapped_len); |
michael@0 | 410 | fprintf(stderr, "Letters [%d..%d) '%s'<br>\n", |
michael@0 | 411 | unmapped_offset, unmapped_offset + unmapped_len, |
michael@0 | 412 | GetHtmlEscapedText(temp).c_str()); |
michael@0 | 413 | } |
michael@0 | 414 | |
michael@0 | 415 | int mapped_offset = scanner->MapBack(unmapped_offset); |
michael@0 | 416 | |
michael@0 | 417 | // Trim back a little to prefer splicing original at word boundaries |
michael@0 | 418 | if (mapped_offset > 0) { |
michael@0 | 419 | // Size of prior vector entry, if any |
michael@0 | 420 | int prior_size = 0; |
michael@0 | 421 | if (!vec->empty()) { |
michael@0 | 422 | ResultChunk* rc = &(*vec)[vec->size() - 1]; |
michael@0 | 423 | prior_size = rc->bytes; |
michael@0 | 424 | } |
michael@0 | 425 | // Maximum backup size: leave at least 3 bytes in the prior chunk, don't |
michael@0 | 426 | // back up past the start of the buffer, and back up at most 12 bytes total |
michael@0 | 427 | int n_limit = minint(prior_size - 3, mapped_offset); |
michael@0 | 428 | n_limit = minint(n_limit, 12); |
michael@0 | 429 | |
michael@0 | 430 | // Backscan over letters, stopping if prior byte is < 0x41 |
michael@0 | 431 | // There is some possibility that we will backscan over a different script |
michael@0 | 432 | const char* s = &scanner->GetBufferStart()[mapped_offset]; |
michael@0 | 433 | const unsigned char* us = reinterpret_cast<const unsigned char*>(s); |
michael@0 | 434 | int n = 0; |
michael@0 | 435 | while ((n < n_limit) && (us[-n - 1] >= 0x41)) {++n;} |
michael@0 | 436 | if (n >= n_limit) {n = 0;} // New boundary not found within range |
michael@0 | 437 | |
michael@0 | 438 | // Also back up exactly one leading punctuation character if '"#@ |
michael@0 | 439 | if (n < n_limit) { |
michael@0 | 440 | unsigned char c = us[-n - 1]; |
michael@0 | 441 | if ((c == '\'') || (c == '"') || (c == '#') || (c == '@')) {++n;} |
michael@0 | 442 | } |
michael@0 | 443 | // Shrink the previous chunk slightly |
michael@0 | 444 | if (n > 0) { |
michael@0 | 445 | ResultChunk* rc = &(*vec)[vec->size() - 1]; |
michael@0 | 446 | rc->bytes -= n; |
michael@0 | 447 | mapped_offset -= n; |
michael@0 | 448 | if (kShowLettersOriginal) { |
michael@0 | 449 | fprintf(stderr, "Back up %d bytes<br>\n", n); |
michael@0 | 450 | // Optionally print the prior chunk original text |
michael@0 | 451 | string temp2(&scanner->GetBufferStart()[rc->offset], rc->bytes); |
michael@0 | 452 | fprintf(stderr, "Prior [%d..%d) '%s'<br>\n", |
michael@0 | 453 | rc->offset, rc->offset + rc->bytes, |
michael@0 | 454 | GetHtmlEscapedText(temp2).c_str()); |
michael@0 | 455 | } |
michael@0 | 456 | } |
michael@0 | 457 | } |
michael@0 | 458 | |
michael@0 | 459 | int mapped_len = |
michael@0 | 460 | scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset; |
michael@0 | 461 | |
michael@0 | 462 | if (kShowLettersOriginal) { |
michael@0 | 463 | // Optionally print the chunk original text |
michael@0 | 464 | string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len); |
michael@0 | 465 | fprintf(stderr, "Original[%d..%d) '%s'<br>\n", |
michael@0 | 466 | mapped_offset, mapped_offset + mapped_len, |
michael@0 | 467 | GetHtmlEscapedText(temp2).c_str()); |
michael@0 | 468 | } |
michael@0 | 469 | |
michael@0 | 470 | Language new_lang = static_cast<Language>(cs->lang1); |
michael@0 | 471 | bool reliability_delta_bad = |
michael@0 | 472 | (cs->reliability_delta < kUnreliablePercentThreshold); |
michael@0 | 473 | bool reliability_score_bad = |
michael@0 | 474 | (cs->reliability_score < kUnreliablePercentThreshold); |
michael@0 | 475 | |
michael@0 | 476 | // If the top language matches last vector, ignore reliability_delta |
michael@0 | 477 | uint16 prior_lang = PriorVecLang(vec); |
michael@0 | 478 | if (prior_lang == cs->lang1) { |
michael@0 | 479 | reliability_delta_bad = false; |
michael@0 | 480 | } |
michael@0 | 481 | // If the top language is in same close set as last vector, set up to merge |
michael@0 | 482 | if (SameCloseSet(cs->lang1, prior_lang)) { |
michael@0 | 483 | new_lang = static_cast<Language>(prior_lang); |
michael@0 | 484 | reliability_delta_bad = false; |
michael@0 | 485 | } |
michael@0 | 486 | // If the top two languages are in the same close set and the last vector |
michael@0 | 487 | // language is the second language, set up to merge |
michael@0 | 488 | if (SameCloseSet(cs->lang1, cs->lang2) && |
michael@0 | 489 | (prior_lang == cs->lang2)) { |
michael@0 | 490 | new_lang = static_cast<Language>(prior_lang); |
michael@0 | 491 | reliability_delta_bad = false; |
michael@0 | 492 | } |
michael@0 | 493 | // If unreliable and the last and next vector languages are both |
michael@0 | 494 | // the second language, set up to merge |
michael@0 | 495 | uint16 next_lang = NextChunkLang(summarybuffer, i); |
michael@0 | 496 | if (reliability_delta_bad && |
michael@0 | 497 | (prior_lang == cs->lang2) && (next_lang == cs->lang2)) { |
michael@0 | 498 | new_lang = static_cast<Language>(prior_lang); |
michael@0 | 499 | reliability_delta_bad = false; |
michael@0 | 500 | } |
michael@0 | 501 | |
michael@0 | 502 | if (reliability_delta_bad || reliability_score_bad) { |
michael@0 | 503 | new_lang = UNKNOWN_LANGUAGE; |
michael@0 | 504 | } |
michael@0 | 505 | ItemToVector(scanner, vec, new_lang, mapped_offset, mapped_len); |
michael@0 | 506 | } |
michael@0 | 507 | } |
michael@0 | 508 | |
michael@0 | 509 | // Add just one element to resultchunk vector: |
michael@0 | 510 | // For RTypeNone or RTypeOne |
michael@0 | 511 | void JustOneItemToVector(ScriptScanner* scanner, const char* text, |
michael@0 | 512 | Language lang1, int unmapped_offset, int unmapped_len, |
michael@0 | 513 | ResultChunkVector* vec) { |
michael@0 | 514 | if (vec == NULL) {return;} |
michael@0 | 515 | |
michael@0 | 516 | if (kShowLettersOriginal) { |
michael@0 | 517 | fprintf(stderr, "map2original_ "); |
michael@0 | 518 | scanner->map2original_.DumpWindow(); |
michael@0 | 519 | fprintf(stderr, "<br>\n"); |
michael@0 | 520 | fprintf(stderr, "map2uplow_ "); |
michael@0 | 521 | scanner->map2uplow_.DumpWindow(); |
michael@0 | 522 | fprintf(stderr, "<br>\n"); |
michael@0 | 523 | } |
michael@0 | 524 | |
michael@0 | 525 | if (kShowLettersOriginal) { |
michael@0 | 526 | // Optionally print the chunk lowercase letters/marks text |
michael@0 | 527 | string temp(&text[unmapped_offset], unmapped_len); |
michael@0 | 528 | fprintf(stderr, "Letters1 [%d..%d) '%s'<br>\n", |
michael@0 | 529 | unmapped_offset, unmapped_offset + unmapped_len, |
michael@0 | 530 | GetHtmlEscapedText(temp).c_str()); |
michael@0 | 531 | } |
michael@0 | 532 | |
michael@0 | 533 | int mapped_offset = scanner->MapBack(unmapped_offset); |
michael@0 | 534 | int mapped_len = |
michael@0 | 535 | scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset; |
michael@0 | 536 | |
michael@0 | 537 | if (kShowLettersOriginal) { |
michael@0 | 538 | // Optionally print the chunk original text |
michael@0 | 539 | string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len); |
michael@0 | 540 | fprintf(stderr, "Original1[%d..%d) '%s'<br>\n", |
michael@0 | 541 | mapped_offset, mapped_offset + mapped_len, |
michael@0 | 542 | GetHtmlEscapedText(temp2).c_str()); |
michael@0 | 543 | } |
michael@0 | 544 | |
michael@0 | 545 | ItemToVector(scanner, vec, lang1, mapped_offset, mapped_len); |
michael@0 | 546 | } |
michael@0 | 547 | |
michael@0 | 548 | |
michael@0 | 549 | // Debugging. Not thread safe. Defined in getonescriptspan |
michael@0 | 550 | char* DisplayPiece(const char* next_byte_, int byte_length_); |
michael@0 | 551 | |
michael@0 | 552 | // If high bit is on, take out high bit and add 2B to make table2 entries easy |
michael@0 | 553 | inline int PrintableIndirect(int x) { |
michael@0 | 554 | if ((x & 0x80000000u) != 0) { |
michael@0 | 555 | return (x & ~0x80000000u) + 2000000000; |
michael@0 | 556 | } |
michael@0 | 557 | return x; |
michael@0 | 558 | } |
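// e.g. an indirect of 0x80000005 (table2, slot 5) prints as 2000000005,
// while plain table entries print unchanged.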
michael@0 | 559 | void DumpHitBuffer(FILE* df, const char* text, |
michael@0 | 560 | const ScoringHitBuffer* hitbuffer) { |
michael@0 | 561 | fprintf(df, |
michael@0 | 562 | "<br>DumpHitBuffer[%s, next_base/delta/distinct %d, %d, %d)<br>\n", |
michael@0 | 563 | ULScriptCode(hitbuffer->ulscript), |
michael@0 | 564 | hitbuffer->next_base, hitbuffer->next_delta, |
michael@0 | 565 | hitbuffer->next_distinct); |
michael@0 | 566 | for (int i = 0; i < hitbuffer->maxscoringhits; ++i) { |
michael@0 | 567 | if (i < hitbuffer->next_base) { |
michael@0 | 568 | fprintf(df, "Q[%d]%d,%d,%s ", |
michael@0 | 569 | i, hitbuffer->base[i].offset, |
michael@0 | 570 | PrintableIndirect(hitbuffer->base[i].indirect), |
michael@0 | 571 | DisplayPiece(&text[hitbuffer->base[i].offset], 6)); |
michael@0 | 572 | } |
michael@0 | 573 | if (i < hitbuffer->next_delta) { |
michael@0 | 574 | fprintf(df, "DL[%d]%d,%d,%s ", |
michael@0 | 575 | i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect, |
michael@0 | 576 | DisplayPiece(&text[hitbuffer->delta[i].offset], 12)); |
michael@0 | 577 | } |
michael@0 | 578 | if (i < hitbuffer->next_distinct) { |
michael@0 | 579 | fprintf(df, "D[%d]%d,%d,%s ", |
michael@0 | 580 | i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect, |
michael@0 | 581 | DisplayPiece(&text[hitbuffer->distinct[i].offset], 12)); |
michael@0 | 582 | } |
michael@0 | 583 | if (i < hitbuffer->next_base) { |
michael@0 | 584 | fprintf(df, "<br>\n"); |
michael@0 | 585 | } |
michael@0 | 586 | if (i > 50) {break;} |
michael@0 | 587 | } |
michael@0 | 588 | if (hitbuffer->next_base > 50) { |
michael@0 | 589 | int i = hitbuffer->next_base; |
michael@0 | 590 | fprintf(df, "Q[%d]%d,%d,%s ", |
michael@0 | 591 | i, hitbuffer->base[i].offset, |
michael@0 | 592 | PrintableIndirect(hitbuffer->base[i].indirect), |
michael@0 | 593 | DisplayPiece(&text[hitbuffer->base[i].offset], 6)); |
michael@0 | 594 | } |
michael@0 | 595 | if (hitbuffer->next_delta > 50) { |
michael@0 | 596 | int i = hitbuffer->next_delta; |
michael@0 | 597 | fprintf(df, "DL[%d]%d,%d,%s ", |
michael@0 | 598 | i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect, |
michael@0 | 599 | DisplayPiece(&text[hitbuffer->delta[i].offset], 12)); |
michael@0 | 600 | } |
michael@0 | 601 | if (hitbuffer->next_distinct > 50) { |
michael@0 | 602 | int i = hitbuffer->next_distinct; |
michael@0 | 603 | fprintf(df, "D[%d]%d,%d,%s ", |
michael@0 | 604 | i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect, |
michael@0 | 605 | DisplayPiece(&text[hitbuffer->distinct[i].offset], 12)); |
michael@0 | 606 | } |
michael@0 | 607 | fprintf(df, "<br>\n"); |
michael@0 | 608 | } |
michael@0 | 609 | |
michael@0 | 610 | |
michael@0 | 611 | void DumpLinearBuffer(FILE* df, const char* text, |
michael@0 | 612 | const ScoringHitBuffer* hitbuffer) { |
michael@0 | 613 | fprintf(df, "<br>DumpLinearBuffer[%d)<br>\n", |
michael@0 | 614 | hitbuffer->next_linear); |
michael@0 | 615 | // Include the dummy entry off the end |
michael@0 | 616 | for (int i = 0; i < hitbuffer->next_linear + 1; ++i) { |
michael@0 | 617 | if ((50 < i) && (i < (hitbuffer->next_linear - 1))) {continue;} |
michael@0 | 618 | fprintf(df, "[%d]%d,%c=%08x,%s<br>\n", |
michael@0 | 619 | i, hitbuffer->linear[i].offset, |
michael@0 | 620 | "UQLD"[hitbuffer->linear[i].type], |
michael@0 | 621 | hitbuffer->linear[i].langprob, |
michael@0 | 622 | DisplayPiece(&text[hitbuffer->linear[i].offset], 6)); |
michael@0 | 623 | } |
michael@0 | 624 | fprintf(df, "<br>\n"); |
michael@0 | 625 | |
michael@0 | 626 | fprintf(df, "DumpChunkStart[%d]<br>\n", hitbuffer->next_chunk_start); |
michael@0 | 627 | for (int i = 0; i < hitbuffer->next_chunk_start + 1; ++i) { |
michael@0 | 628 | fprintf(df, "[%d]%d\n", i, hitbuffer->chunk_start[i]); |
michael@0 | 629 | } |
michael@0 | 630 | fprintf(df, "<br>\n"); |
michael@0 | 631 | } |
michael@0 | 632 | |
michael@0 | 633 | // Move this verbose debugging output to debug.cc eventually |
michael@0 | 634 | void DumpChunkSummary(FILE* df, const ChunkSummary* cs) { |
michael@0 | 635 | // Print chunksummary |
michael@0 | 636 | fprintf(df, "%d lin[%d] %s.%d %s.%d %dB %d# %s %dRd %dRs<br>\n", |
michael@0 | 637 | cs->offset, |
michael@0 | 638 | cs->chunk_start, |
michael@0 | 639 | LanguageCode(static_cast<Language>(cs->lang1)), |
michael@0 | 640 | cs->score1, |
michael@0 | 641 | LanguageCode(static_cast<Language>(cs->lang2)), |
michael@0 | 642 | cs->score2, |
michael@0 | 643 | cs->bytes, |
michael@0 | 644 | cs->grams, |
michael@0 | 645 | ULScriptCode(static_cast<ULScript>(cs->ulscript)), |
michael@0 | 646 | cs->reliability_delta, |
michael@0 | 647 | cs->reliability_score); |
michael@0 | 648 | } |
michael@0 | 649 | |
michael@0 | 650 | void DumpSummaryBuffer(FILE* df, const SummaryBuffer* summarybuffer) { |
michael@0 | 651 | fprintf(df, "<br>DumpSummaryBuffer[%d]<br>\n", summarybuffer->n); |
michael@0 | 652 | fprintf(df, "[i] offset linear[chunk_start] lang.score1 lang.score2 " |
michael@0 | 653 | "bytesB ngrams# script rel_delta rel_score<br>\n"); |
michael@0 | 654 | for (int i = 0; i <= summarybuffer->n; ++i) { |
michael@0 | 655 | fprintf(df, "[%d] ", i); |
michael@0 | 656 | DumpChunkSummary(df, &summarybuffer->chunksummary[i]); |
michael@0 | 657 | } |
michael@0 | 658 | fprintf(df, "<br>\n"); |
michael@0 | 659 | } |
michael@0 | 660 | |
michael@0 | 661 | |
michael@0 | 662 | |
michael@0 | 663 | // Within hitbuffer->linear[] |
michael@0 | 664 | //   <-- prior chunk --><-- this chunk --> |
michael@0 | 665 | //   |                  |                 | |
michael@0 | 666 | //   linear0            linear1           linear2 |
michael@0 | 667 | //        lang0               lang1 |
michael@0 | 668 | // The goal of sharpening is to move this_linear to better separate langs |
michael@0 | 669 | int BetterBoundary(const char* text, |
michael@0 | 670 | ScoringHitBuffer* hitbuffer, |
michael@0 | 671 | ScoringContext* scoringcontext, |
michael@0 | 672 | uint16 pslang0, uint16 pslang1, |
michael@0 | 673 | int linear0, int linear1, int linear2) { |
michael@0 | 674 | // Degenerate case, no change |
michael@0 | 675 | if ((linear2 - linear0) <= 8) {return linear1;} |
michael@0 | 676 | |
michael@0 | 677 | // Each diff gives pslang0 score - pslang1 score |
michael@0 | 678 | // Running diff has four entries + + + + followed by four entries - - - - |
michael@0 | 679 | // so that this value is maximal at the sharpest boundary between pslang0 |
michael@0 | 680 | // (positive diffs) and pslang1 (negative diffs) |
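// Worked example (illustrative diffs only): if the eight window entries are
// +3 +2 +4 +1 | -2 -3 -1 -4, then running_diff = (3+2+4+1) - (-2-3-1-4)
// = 10 + 10 = 20, and the window midpoint is a strong pslang0 -> pslang1
// boundary; a window of all-positive or all-negative diffs is ignored by
// the has_plus/has_minus test below.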
michael@0 | 681 | int running_diff = 0; |
michael@0 | 682 | int diff[8]; // Ring buffer of pslang0-pslang1 differences |
michael@0 | 683 | // Initialize with first 8 diffs |
michael@0 | 684 | for (int i = linear0; i < linear0 + 8; ++i) { |
michael@0 | 685 | int j = i & 7; |
michael@0 | 686 | uint32 langprob = hitbuffer->linear[i].langprob; |
michael@0 | 687 | diff[j] = GetLangScore(langprob, pslang0) - |
michael@0 | 688 | GetLangScore(langprob, pslang1); |
michael@0 | 689 | if (i < linear0 + 4) { |
michael@0 | 690 | // First four diffs pslang0 - pslang1 |
michael@0 | 691 | running_diff += diff[j]; |
michael@0 | 692 | } else { |
michael@0 | 693 | // Second four diffs -(pslang0 - pslang1) |
michael@0 | 694 | running_diff -= diff[j]; |
michael@0 | 695 | } |
michael@0 | 696 | } |
michael@0 | 697 | |
michael@0 | 698 | // Now scan for sharpest boundary. j is at left end of 8 entries |
michael@0 | 699 | // To be a boundary, there must be both >0 and <0 entries in the window |
michael@0 | 700 | int better_boundary_value = 0; |
michael@0 | 701 | int better_boundary = linear1; |
michael@0 | 702 | for (int i = linear0; i < linear2 - 8; ++i) { |
michael@0 | 703 | int j = i & 7; |
michael@0 | 704 | if (better_boundary_value < running_diff) { |
michael@0 | 705 | bool has_plus = false; |
michael@0 | 706 | bool has_minus = false; |
michael@0 | 707 | for (int kk = 0; kk < 8; ++kk) { |
michael@0 | 708 | if (diff[kk] > 0) {has_plus = true;} |
michael@0 | 709 | if (diff[kk] < 0) {has_minus = true;} |
michael@0 | 710 | } |
michael@0 | 711 | if (has_plus && has_minus) { |
michael@0 | 712 | better_boundary_value = running_diff; |
michael@0 | 713 | better_boundary = i + 4; |
michael@0 | 714 | } |
michael@0 | 715 | } |
michael@0 | 716 | // Shift right one entry |
michael@0 | 717 | uint32 langprob = hitbuffer->linear[i + 8].langprob; |
michael@0 | 718 | int newdiff = GetLangScore(langprob, pslang0) - |
michael@0 | 719 | GetLangScore(langprob, pslang1); |
michael@0 | 720 | int middiff = diff[(i + 4) & 7]; |
michael@0 | 721 | int olddiff = diff[j]; |
michael@0 | 722 | diff[j] = newdiff; |
michael@0 | 723 | running_diff -= olddiff; // Remove left |
michael@0 | 724 | running_diff += 2 * middiff; // Convert middle from - to + |
michael@0 | 725 | running_diff -= newdiff; // Insert right |
michael@0 | 726 | } |
michael@0 | 727 | |
michael@0 | 728 | if (scoringcontext->flags_cld2_verbose && (linear1 != better_boundary)) { |
michael@0 | 729 | Language lang0 = FromPerScriptNumber(scoringcontext->ulscript, pslang0); |
michael@0 | 730 | Language lang1 = FromPerScriptNumber(scoringcontext->ulscript, pslang1); |
michael@0 | 731 | fprintf(scoringcontext->debug_file, " Better lin[%d=>%d] %s^^%s <br>\n", |
michael@0 | 732 | linear1, better_boundary, |
michael@0 | 733 | LanguageCode(lang0), LanguageCode(lang1)); |
michael@0 | 734 | int lin0_off = hitbuffer->linear[linear0].offset; |
michael@0 | 735 | int lin1_off = hitbuffer->linear[linear1].offset; |
michael@0 | 736 | int lin2_off = hitbuffer->linear[linear2].offset; |
michael@0 | 737 | int better_offm1 = hitbuffer->linear[better_boundary - 1].offset; |
michael@0 | 738 | int better_off = hitbuffer->linear[better_boundary].offset; |
michael@0 | 739 | int better_offp1 = hitbuffer->linear[better_boundary + 1].offset; |
michael@0 | 740 | string old0(&text[lin0_off], lin1_off - lin0_off); |
michael@0 | 741 | string old1(&text[lin1_off], lin2_off - lin1_off); |
michael@0 | 742 | string new0(&text[lin0_off], better_offm1 - lin0_off); |
michael@0 | 743 | string new0m1(&text[better_offm1], better_off - better_offm1); |
michael@0 | 744 | string new1(&text[better_off], better_offp1 - better_off); |
michael@0 | 745 | string new1p1(&text[better_offp1], lin2_off - better_offp1); |
michael@0 | 746 | fprintf(scoringcontext->debug_file, "%s^^%s => <br>\n%s^%s^^%s^%s<br>\n", |
michael@0 | 747 | GetHtmlEscapedText(old0).c_str(), |
michael@0 | 748 | GetHtmlEscapedText(old1).c_str(), |
michael@0 | 749 | GetHtmlEscapedText(new0).c_str(), |
michael@0 | 750 | GetHtmlEscapedText(new0m1).c_str(), |
michael@0 | 751 | GetHtmlEscapedText(new1).c_str(), |
michael@0 | 752 | GetHtmlEscapedText(new1p1).c_str()); |
michael@0 | 753 | // Slow picture of differences per linear entry |
michael@0 | 754 | int d; |
michael@0 | 755 | for (int i = linear0; i < linear2; ++i) { |
michael@0 | 756 | if (i == better_boundary) { |
michael@0 | 757 | fprintf(scoringcontext->debug_file, "^^ "); |
michael@0 | 758 | } |
michael@0 | 759 | uint32 langprob = hitbuffer->linear[i].langprob; |
michael@0 | 760 | d = GetLangScore(langprob, pslang0) - GetLangScore(langprob, pslang1); |
michael@0 | 761 | const char* s = "="; |
michael@0 | 762 | //if (d > 2) {s = "\xc2\xaf";} // Macron |
michael@0 | 763 | if (d > 2) {s = "#";} |
michael@0 | 764 | else if (d > 0) {s = "+";} |
michael@0 | 765 | else if (d < -2) {s = "_";} |
michael@0 | 766 | else if (d < 0) {s = "-";} |
michael@0 | 767 | fprintf(scoringcontext->debug_file, "%s ", s); |
michael@0 | 768 | } |
michael@0 | 769 | fprintf(scoringcontext->debug_file, " (scale: #+=-_)<br>\n"); |
michael@0 | 770 | } |
michael@0 | 771 | return better_boundary; |
michael@0 | 772 | } |
michael@0 | 773 | |
michael@0 | 774 | |
michael@0 | 775 | // For all but the first summary, if its top language differs from |
michael@0 | 776 | // the previous chunk, refine the boundary |
michael@0 | 777 | // Linearized version |
michael@0 | 778 | void SharpenBoundaries(const char* text, |
michael@0 | 779 | bool more_to_come, |
michael@0 | 780 | ScoringHitBuffer* hitbuffer, |
michael@0 | 781 | ScoringContext* scoringcontext, |
michael@0 | 782 | SummaryBuffer* summarybuffer) { |
michael@0 | 783 | |
michael@0 | 784 | int prior_linear = summarybuffer->chunksummary[0].chunk_start; |
michael@0 | 785 | uint16 prior_lang = summarybuffer->chunksummary[0].lang1; |
michael@0 | 786 | |
michael@0 | 787 | if (scoringcontext->flags_cld2_verbose) { |
michael@0 | 788 | fprintf(scoringcontext->debug_file, "<br>SharpenBoundaries<br>\n"); |
michael@0 | 789 | } |
michael@0 | 790 | for (int i = 1; i < summarybuffer->n; ++i) { |
michael@0 | 791 | ChunkSummary* cs = &summarybuffer->chunksummary[i]; |
michael@0 | 792 | uint16 this_lang = cs->lang1; |
michael@0 | 793 | if (this_lang == prior_lang) { |
michael@0 | 794 | prior_linear = cs->chunk_start; |
michael@0 | 795 | continue; |
michael@0 | 796 | } |
michael@0 | 797 | |
michael@0 | 798 | int this_linear = cs->chunk_start; |
michael@0 | 799 | int next_linear = summarybuffer->chunksummary[i + 1].chunk_start; |
michael@0 | 800 | |
michael@0 | 801 | // If this/prior in same close set, don't move boundary |
michael@0 | 802 | if (SameCloseSet(prior_lang, this_lang)) { |
michael@0 | 803 | prior_linear = this_linear; |
michael@0 | 804 | prior_lang = this_lang; |
michael@0 | 805 | continue; |
michael@0 | 806 | } |
michael@0 | 807 | |
michael@0 | 808 | |
michael@0 | 809 | // Within hitbuffer->linear[] |
michael@0 | 810 | //   <-- prior chunk --><-- this chunk --> |
michael@0 | 811 | //   |                  |                 | |
michael@0 | 812 | //   prior_linear       this_linear       next_linear |
michael@0 | 813 | //        prior_lang          this_lang |
michael@0 | 814 | // The goal of sharpening is to move this_linear to better separate langs |
michael@0 | 815 | |
michael@0 | 816 | uint8 pslang0 = PerScriptNumber(scoringcontext->ulscript, |
michael@0 | 817 | static_cast<Language>(prior_lang)); |
michael@0 | 818 | uint8 pslang1 = PerScriptNumber(scoringcontext->ulscript, |
michael@0 | 819 | static_cast<Language>(this_lang)); |
michael@0 | 820 | int better_linear = BetterBoundary(text, |
michael@0 | 821 | hitbuffer, |
michael@0 | 822 | scoringcontext, |
michael@0 | 823 | pslang0, pslang1, |
michael@0 | 824 | prior_linear, this_linear, next_linear); |
michael@0 | 825 | |
michael@0 | 826 | int old_offset = hitbuffer->linear[this_linear].offset; |
michael@0 | 827 | int new_offset = hitbuffer->linear[better_linear].offset; |
michael@0 | 828 | cs->chunk_start = better_linear; |
michael@0 | 829 | cs->offset = new_offset; |
michael@0 | 830 | // If this_linear moved right, make bytes smaller for this, larger for prior |
michael@0 | 831 | // If this_linear moved left, make bytes larger for this, smaller for prior |
michael@0 | 832 | cs->bytes -= (new_offset - old_offset); |
michael@0 | 833 | summarybuffer->chunksummary[i - 1].bytes += (new_offset - old_offset); |
michael@0 | 834 | |
michael@0 | 835 | this_linear = better_linear; // Update so that next chunk doesn't intrude |
michael@0 | 836 | |
michael@0 | 837 | // Consider rescoring the two chunks |
michael@0 | 838 | |
michael@0 | 839 | // Update for next round (note: using pre-updated boundary) |
michael@0 | 840 | prior_linear = this_linear; |
michael@0 | 841 | prior_lang = this_lang; |
michael@0 | 842 | } |
michael@0 | 843 | } |
michael@0 | 844 | |
michael@0 | 845 | // Make a langprob that gives small weight to the default language for ulscript |
michael@0 | 846 | uint32 DefaultLangProb(ULScript ulscript) { |
michael@0 | 847 | Language default_lang = DefaultLanguage(ulscript); |
michael@0 | 848 | return MakeLangProb(default_lang, 1); |
michael@0 | 849 | } |
michael@0 | 850 | |
michael@0 | 851 | // Effectively, do a merge-sort based on text offsets |
michael@0 | 852 | // Look up each indirect value in appropriate scoring table and keep |
michael@0 | 853 | // just the resulting langprobs |
michael@0 | 854 | void LinearizeAll(ScoringContext* scoringcontext, bool score_cjk, |
michael@0 | 855 | ScoringHitBuffer* hitbuffer) { |
michael@0 | 856 | const CLD2TableSummary* base_obj; // unigram or quadgram |
michael@0 | 857 | const CLD2TableSummary* base_obj2; // quadgram dual table |
michael@0 | 858 | const CLD2TableSummary* delta_obj; // bigram or octagram |
michael@0 | 859 | const CLD2TableSummary* distinct_obj; // bigram or octagram |
michael@0 | 860 | uint16 base_hit; |
michael@0 | 861 | if (score_cjk) { |
michael@0 | 862 | base_obj = scoringcontext->scoringtables->unigram_compat_obj; |
michael@0 | 863 | base_obj2 = scoringcontext->scoringtables->unigram_compat_obj; |
michael@0 | 864 | delta_obj = scoringcontext->scoringtables->deltabi_obj; |
michael@0 | 865 | distinct_obj = scoringcontext->scoringtables->distinctbi_obj; |
michael@0 | 866 | base_hit = UNIHIT; |
michael@0 | 867 | } else { |
michael@0 | 868 | base_obj = scoringcontext->scoringtables->quadgram_obj; |
michael@0 | 869 | base_obj2 = scoringcontext->scoringtables->quadgram_obj2; |
michael@0 | 870 | delta_obj = scoringcontext->scoringtables->deltaocta_obj; |
michael@0 | 871 | distinct_obj = scoringcontext->scoringtables->distinctocta_obj; |
michael@0 | 872 | base_hit = QUADHIT; |
michael@0 | 873 | } |
michael@0 | 874 | |
michael@0 | 875 | int base_limit = hitbuffer->next_base; |
michael@0 | 876 | int delta_limit = hitbuffer->next_delta; |
michael@0 | 877 | int distinct_limit = hitbuffer->next_distinct; |
michael@0 | 878 | int base_i = 0; |
michael@0 | 879 | int delta_i = 0; |
michael@0 | 880 | int distinct_i = 0; |
michael@0 | 881 | int linear_i = 0; |
michael@0 | 882 | |
michael@0 | 883 | // Start with an initial base hit for the default language for this script |
michael@0 | 884 | // Inserting this avoids edge effects with no hits at all |
michael@0 | 885 | hitbuffer->linear[linear_i].offset = hitbuffer->lowest_offset; |
michael@0 | 886 | hitbuffer->linear[linear_i].type = base_hit; |
michael@0 | 887 | hitbuffer->linear[linear_i].langprob = |
michael@0 | 888 | DefaultLangProb(scoringcontext->ulscript); |
michael@0 | 889 | ++linear_i; |
michael@0 | 890 | |
michael@0 | 891 | while ((base_i < base_limit) || (delta_i < delta_limit) || |
michael@0 | 892 | (distinct_i < distinct_limit)) { |
michael@0 | 893 | int base_off = hitbuffer->base[base_i].offset; |
michael@0 | 894 | int delta_off = hitbuffer->delta[delta_i].offset; |
michael@0 | 895 | int distinct_off = hitbuffer->distinct[distinct_i].offset; |
michael@0 | 896 | |
michael@0 | 897 | // Do delta and distinct first, so that they are not lost at base_limit |
michael@0 | 898 | if ((delta_i < delta_limit) && |
michael@0 | 899 | (delta_off <= base_off) && (delta_off <= distinct_off)) { |
michael@0 | 900 | // Add delta entry |
michael@0 | 901 | int indirect = hitbuffer->delta[delta_i].indirect; |
michael@0 | 902 | ++delta_i; |
michael@0 | 903 | uint32 langprob = delta_obj->kCLDTableInd[indirect]; |
michael@0 | 904 | if (langprob > 0) { |
michael@0 | 905 | hitbuffer->linear[linear_i].offset = delta_off; |
michael@0 | 906 | hitbuffer->linear[linear_i].type = DELTAHIT; |
michael@0 | 907 | hitbuffer->linear[linear_i].langprob = langprob; |
michael@0 | 908 | ++linear_i; |
michael@0 | 909 | } |
michael@0 | 910 | } |
michael@0 | 911 | else if ((distinct_i < distinct_limit) && |
michael@0 | 912 | (distinct_off <= base_off) && (distinct_off <= delta_off)) { |
michael@0 | 913 | // Add distinct entry |
michael@0 | 914 | int indirect = hitbuffer->distinct[distinct_i].indirect; |
michael@0 | 915 | ++distinct_i; |
michael@0 | 916 | uint32 langprob = distinct_obj->kCLDTableInd[indirect]; |
michael@0 | 917 | if (langprob > 0) { |
michael@0 | 918 | hitbuffer->linear[linear_i].offset = distinct_off; |
michael@0 | 919 | hitbuffer->linear[linear_i].type = DISTINCTHIT; |
michael@0 | 920 | hitbuffer->linear[linear_i].langprob = langprob; |
michael@0 | 921 | ++linear_i; |
michael@0 | 922 | } |
michael@0 | 923 | } |
michael@0 | 924 | else { |
michael@0 | 925 | // Add one or two base entries |
michael@0 | 926 | int indirect = hitbuffer->base[base_i].indirect; |
michael@0 | 927 | // First, get right scoring table |
michael@0 | 928 | const CLD2TableSummary* local_base_obj = base_obj; |
michael@0 | 929 | if ((indirect & 0x80000000u) != 0) { |
michael@0 | 930 | local_base_obj = base_obj2; |
michael@0 | 931 | indirect &= ~0x80000000u; |
michael@0 | 932 | } |
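// The high bit of indirect selects the second quadgram table (the 2014.01
// dual-table lookup mentioned in the file header); the remaining bits index
// into that table as usual.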
michael@0 | 933 | ++base_i; |
michael@0 | 934 | // One langprob in kQuadInd[0..SingleSize), |
michael@0 | 935 | // two in kQuadInd[SingleSize..Size) |
michael@0 | 936 | if (indirect < static_cast<int>(local_base_obj->kCLDTableSizeOne)) { |
michael@0 | 937 | // Up to three languages at indirect |
michael@0 | 938 | uint32 langprob = local_base_obj->kCLDTableInd[indirect]; |
michael@0 | 939 | if (langprob > 0) { |
michael@0 | 940 | hitbuffer->linear[linear_i].offset = base_off; |
michael@0 | 941 | hitbuffer->linear[linear_i].type = base_hit; |
michael@0 | 942 | hitbuffer->linear[linear_i].langprob = langprob; |
michael@0 | 943 | ++linear_i; |
michael@0 | 944 | } |
michael@0 | 945 | } else { |
michael@0 | 946 | // Up to six languages at start + 2 * (indirect - start) |
michael@0 | 947 | indirect += (indirect - local_base_obj->kCLDTableSizeOne); |
michael@0 | 948 | uint32 langprob = local_base_obj->kCLDTableInd[indirect]; |
michael@0 | 949 | uint32 langprob2 = local_base_obj->kCLDTableInd[indirect + 1]; |
michael@0 | 950 | if (langprob > 0) { |
michael@0 | 951 | hitbuffer->linear[linear_i].offset = base_off; |
michael@0 | 952 | hitbuffer->linear[linear_i].type = base_hit; |
michael@0 | 953 | hitbuffer->linear[linear_i].langprob = langprob; |
michael@0 | 954 | ++linear_i; |
michael@0 | 955 | } |
michael@0 | 956 | if (langprob2 > 0) { |
michael@0 | 957 | hitbuffer->linear[linear_i].offset = base_off; |
michael@0 | 958 | hitbuffer->linear[linear_i].type = base_hit; |
michael@0 | 959 | hitbuffer->linear[linear_i].langprob = langprob2; |
michael@0 | 960 | ++linear_i; |
michael@0 | 961 | } |
michael@0 | 962 | } |
michael@0 | 963 | } |
michael@0 | 964 | } |
michael@0 | 965 | |
michael@0 | 966 | // Update |
michael@0 | 967 | hitbuffer->next_linear = linear_i; |
michael@0 | 968 | |
michael@0 | 969 | // Add a dummy entry off the end, just to capture final offset |
michael@0 | 970 | hitbuffer->linear[linear_i].offset = |
michael@0 | 971 | hitbuffer->base[hitbuffer->next_base].offset; |
michael@0 | 972 | hitbuffer->linear[linear_i].langprob = 0; |
michael@0 | 973 | } |
michael@0 | 974 | |
michael@0 | 975 | // Break linear array into chunks of ~20 quadgram hits or ~50 CJK unigram hits |
michael@0 | 976 | void ChunkAll(int letter_offset, bool score_cjk, ScoringHitBuffer* hitbuffer) { |
michael@0 | 977 | int chunksize; |
michael@0 | 978 | uint16 base_hit; |
michael@0 | 979 | if (score_cjk) { |
michael@0 | 980 | chunksize = kChunksizeUnis; |
michael@0 | 981 | base_hit = UNIHIT; |
michael@0 | 982 | } else { |
michael@0 | 983 | chunksize = kChunksizeQuads; |
michael@0 | 984 | base_hit = QUADHIT; |
michael@0 | 985 | } |
michael@0 | 986 | |
michael@0 | 987 | int linear_i = 0; |
michael@0 | 988 | int linear_off_end = hitbuffer->next_linear; |
michael@0 | 989 | int text_i = letter_offset; // Next unseen text offset |
michael@0 | 990 | int next_chunk_start = 0; |
michael@0 | 991 | int bases_left = hitbuffer->next_base; |
michael@0 | 992 | while (bases_left > 0) { |
michael@0 | 993 | // Linearize one chunk |
michael@0 | 994 | int base_len = chunksize; // Default; may be changed below |
michael@0 | 995 | if (bases_left < (chunksize + (chunksize >> 1))) { |
michael@0 | 996 | // If within 1.5 chunks of the end, avoid runts by using it all |
michael@0 | 997 | base_len = bases_left; |
michael@0 | 998 | } else if (bases_left < (2 * chunksize)) { |
michael@0 | 999 | // Avoid runts by splitting 1.5 to 2 chunks in half (about 3/4 each) |
michael@0 | 1000 | base_len = (bases_left + 1) >> 1; |
michael@0 | 1001 | } |
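// Worked example (chunksize 20, illustrative): 25 base hits left -> one
// final chunk of 25 (under 1.5 * 20 = 30); 35 left -> chunks of 18 then 17
// (35 is under 2 * 20, so it is split roughly in half); 45 left -> a normal
// chunk of 20, leaving 25 for the next pass.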
michael@0 | 1002 | |
michael@0 | 1003 | hitbuffer->chunk_start[next_chunk_start] = linear_i; |
michael@0 | 1004 | hitbuffer->chunk_offset[next_chunk_start] = text_i; |
michael@0 | 1005 | ++next_chunk_start; |
michael@0 | 1006 | |
michael@0 | 1007 | int base_count = 0; |
michael@0 | 1008 | while ((base_count < base_len) && (linear_i < linear_off_end)) { |
michael@0 | 1009 | if (hitbuffer->linear[linear_i].type == base_hit) {++base_count;} |
michael@0 | 1010 | ++linear_i; |
michael@0 | 1011 | } |
michael@0 | 1012 | text_i = hitbuffer->linear[linear_i].offset; // Next unseen text offset |
michael@0 | 1013 | bases_left -= base_len; |
michael@0 | 1014 | } |
michael@0 | 1015 | |
michael@0 | 1016 | // If no base hits at all, make a single dummy chunk |
michael@0 | 1017 | if (next_chunk_start == 0) { |
michael@0 | 1018 | hitbuffer->chunk_start[next_chunk_start] = 0; |
michael@0 | 1019 | hitbuffer->chunk_offset[next_chunk_start] = hitbuffer->linear[0].offset; |
michael@0 | 1020 | ++next_chunk_start; |
michael@0 | 1021 | } |
michael@0 | 1022 | |
michael@0 | 1023 | // Remember the linear array start of dummy entry |
michael@0 | 1024 | hitbuffer->next_chunk_start = next_chunk_start; |
michael@0 | 1025 | |
michael@0 | 1026 | // Add a dummy entry off the end, just to capture final linear subscr |
michael@0 | 1027 | hitbuffer->chunk_start[next_chunk_start] = hitbuffer->next_linear; |
michael@0 | 1028 | hitbuffer->chunk_offset[next_chunk_start] = text_i; |
michael@0 | 1029 | } |
michael@0 | 1030 | |
michael@0 | 1031 | |
michael@0 | 1032 | // Merge-sort the individual hit arrays, go indirect on the scoring subscripts, |
michael@0 | 1033 | // break linear array into chunks. |
michael@0 | 1034 | // |
michael@0 | 1035 | // Input: |
michael@0 | 1036 | // hitbuffer base, delta, distinct arrays |
michael@0 | 1037 | // Output: |
michael@0 | 1038 | // linear array |
michael@0 | 1039 | // chunk_start array |
michael@0 | 1040 | // |
michael@0 | 1041 | void LinearizeHitBuffer(int letter_offset, |
michael@0 | 1042 | ScoringContext* scoringcontext, |
michael@0 | 1043 | bool more_to_come, bool score_cjk, |
michael@0 | 1044 | ScoringHitBuffer* hitbuffer) { |
michael@0 | 1045 | LinearizeAll(scoringcontext, score_cjk, hitbuffer); |
michael@0 | 1046 | ChunkAll(letter_offset, score_cjk, hitbuffer); |
michael@0 | 1047 | } |
michael@0 | 1048 | |
michael@0 | 1049 | |
michael@0 | 1050 | |
michael@0 | 1051 | // The hitbuffer is in an awkward form -- three sets of base/delta/distinct |
michael@0 | 1052 | // scores, each with an indirect subscript to one of six scoring tables, some |
michael@0 | 1053 | // of which can yield two langprobs for six languages, others one langprob for |
michael@0 | 1054 | // three languages. The only correlation between base/delta/distinct is their |
michael@0 | 1055 | // offsets into the letters-only text buffer. |
michael@0 | 1056 | // |
michael@0 | 1057 | // The SummaryBuffer needs to be built against the linear array, giving the |
michael@0 | 1058 | // linear offset of the start of each chunk |
michael@0 | 1059 | // |
michael@0 | 1060 | // So we first do all the langprob lookups and merge-sort by offset to make |
michael@0 | 1061 | // a single linear vector, building a side vector of chunk beginnings as we go. |
michael@0 | 1062 | // The sharpening is simply moving the beginnings, scoring is a simple linear |
michael@0 | 1063 | // sweep, etc. |
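// Illustrative example (hypothetical data, added for clarity): if base hits
// sit at text offsets 3, 9, 15 and delta hits at offsets 6, 12, merging by
// offset gives linear = [3:base, 6:delta, 9:base, 12:delta, 15:base]; with
// two base hits per chunk the side vector is chunk_start = [0, 3], plus a
// dummy entry at the end holding the linear array length (5).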
michael@0 | 1064 | |
michael@0 | 1065 | void ProcessHitBuffer(const LangSpan& scriptspan, |
michael@0 | 1066 | int letter_offset, |
michael@0 | 1067 | ScoringContext* scoringcontext, |
michael@0 | 1068 | DocTote* doc_tote, |
michael@0 | 1069 | ResultChunkVector* vec, |
michael@0 | 1070 | bool more_to_come, bool score_cjk, |
michael@0 | 1071 | ScoringHitBuffer* hitbuffer) { |
michael@0 | 1072 | if (scoringcontext->flags_cld2_verbose) { |
michael@0 | 1073 | fprintf(scoringcontext->debug_file, "Hitbuffer[) "); |
michael@0 | 1074 | DumpHitBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer); |
michael@0 | 1075 | } |
michael@0 | 1076 | |
michael@0 | 1077 | LinearizeHitBuffer(letter_offset, scoringcontext, more_to_come, score_cjk, |
michael@0 | 1078 | hitbuffer); |
michael@0 | 1079 | |
michael@0 | 1080 | if (scoringcontext->flags_cld2_verbose) { |
michael@0 | 1081 | fprintf(scoringcontext->debug_file, "Linear[) "); |
michael@0 | 1082 | DumpLinearBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer); |
michael@0 | 1083 | } |
michael@0 | 1084 | |
michael@0 | 1085 | SummaryBuffer summarybuffer; |
michael@0 | 1086 | summarybuffer.n = 0; |
michael@0 | 1087 | ChunkSpan last_cspan; |
michael@0 | 1088 | ScoreAllHits(scriptspan.text, scriptspan.ulscript, |
michael@0 | 1089 | more_to_come, score_cjk, hitbuffer, |
michael@0 | 1090 | scoringcontext, |
michael@0 | 1091 | &summarybuffer, &last_cspan); |
michael@0 | 1092 | |
michael@0 | 1093 | if (scoringcontext->flags_cld2_verbose) { |
michael@0 | 1094 | DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer); |
michael@0 | 1095 | } |
michael@0 | 1096 | |
michael@0 | 1097 | if (vec != NULL) { |
michael@0 | 1098 | // Sharpen boundaries of summarybuffer |
michael@0 | 1099 | // This is not a high-performance path |
michael@0 | 1100 | SharpenBoundaries(scriptspan.text, more_to_come, hitbuffer, scoringcontext, |
michael@0 | 1101 | &summarybuffer); |
michael@0 | 1102 | // Show after the sharpening |
michael@0 | 1103 | // CLD2_Debug2(scriptspan.text, more_to_come, score_cjk, |
michael@0 | 1104 | // hitbuffer, scoringcontext, &summarybuffer); |
michael@0 | 1105 | |
michael@0 | 1106 | if (scoringcontext->flags_cld2_verbose) { |
michael@0 | 1107 | DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer); |
michael@0 | 1108 | } |
michael@0 | 1109 | } |
michael@0 | 1110 | |
michael@0 | 1111 | SummaryBufferToDocTote(&summarybuffer, more_to_come, doc_tote); |
michael@0 | 1112 | SummaryBufferToVector(scoringcontext->scanner, scriptspan.text, |
michael@0 | 1113 | &summarybuffer, more_to_come, vec); |
michael@0 | 1114 | } |
michael@0 | 1115 | |
michael@0 | 1116 | void SpliceHitBuffer(ScoringHitBuffer* hitbuffer, int next_offset) { |
michael@0 | 1117 | // Splice hitbuffer and summarybuffer for next round. With big chunks and |
michael@0 | 1118 | // distinctive-word state carried across chunks, we might not need to do this. |
michael@0 | 1119 | hitbuffer->next_base = 0; |
michael@0 | 1120 | hitbuffer->next_delta = 0; |
michael@0 | 1121 | hitbuffer->next_distinct = 0; |
michael@0 | 1122 | hitbuffer->next_linear = 0; |
michael@0 | 1123 | hitbuffer->next_chunk_start = 0; |
michael@0 | 1124 | hitbuffer->lowest_offset = next_offset; |
michael@0 | 1125 | } |
michael@0 | 1126 | |
michael@0 | 1127 | |
michael@0 | 1128 | // Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating |
michael@0 | 1129 | // scoringcontext |
michael@0 | 1130 | void ScoreEntireScriptSpan(const LangSpan& scriptspan, |
michael@0 | 1131 | ScoringContext* scoringcontext, |
michael@0 | 1132 | DocTote* doc_tote, |
michael@0 | 1133 | ResultChunkVector* vec) { |
michael@0 | 1134 | int bytes = scriptspan.text_bytes; |
michael@0 | 1135 | // Artificially set score to 1024 per 1KB, or 1 per byte |
michael@0 | 1136 | int score = bytes; |
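// (Added note: e.g. a 2048-byte span gets score 2048, i.e. 1024 per KB.)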
michael@0 | 1137 | int reliability = 100; |
michael@0 | 1138 | // doc_tote uses full languages |
michael@0 | 1139 | Language one_one_lang = DefaultLanguage(scriptspan.ulscript); |
michael@0 | 1140 | doc_tote->Add(one_one_lang, bytes, score, reliability); |
michael@0 | 1141 | |
michael@0 | 1142 | if (scoringcontext->flags_cld2_html) { |
michael@0 | 1143 | ChunkSummary chunksummary = { |
michael@0 | 1144 | 1, 0, |
michael@0 | 1145 | one_one_lang, UNKNOWN_LANGUAGE, score, 1, |
michael@0 | 1146 | bytes, 0, scriptspan.ulscript, reliability, reliability |
michael@0 | 1147 | }; |
michael@0 | 1148 | CLD2_Debug(scriptspan.text, 1, scriptspan.text_bytes, |
michael@0 | 1149 | false, false, NULL, |
michael@0 | 1150 | scoringcontext, NULL, &chunksummary); |
michael@0 | 1151 | } |
michael@0 | 1152 | |
michael@0 | 1153 | // First byte is always a space |
michael@0 | 1154 | JustOneItemToVector(scoringcontext->scanner, scriptspan.text, |
michael@0 | 1155 | one_one_lang, 1, bytes - 1, vec); |
michael@0 | 1156 | |
michael@0 | 1157 | scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE; |
michael@0 | 1158 | } |
michael@0 | 1159 | |
michael@0 | 1160 | // Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext |
michael@0 | 1161 | void ScoreCJKScriptSpan(const LangSpan& scriptspan, |
michael@0 | 1162 | ScoringContext* scoringcontext, |
michael@0 | 1163 | DocTote* doc_tote, |
michael@0 | 1164 | ResultChunkVector* vec) { |
michael@0 | 1165 | // Allocate three parallel arrays of scoring hits |
michael@0 | 1166 | ScoringHitBuffer* hitbuffer = new ScoringHitBuffer; |
michael@0 | 1167 | hitbuffer->init(); |
michael@0 | 1168 | hitbuffer->ulscript = scriptspan.ulscript; |
michael@0 | 1169 | |
michael@0 | 1170 | scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE; |
michael@0 | 1171 | scoringcontext->oldest_distinct_boost = 0; |
michael@0 | 1172 | |
michael@0 | 1173 | // Incoming scriptspan has a single leading space at scriptspan.text[0] |
michael@0 | 1174 | // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3] |
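// Illustrative layout (hypothetical example, added for clarity): for the
// single word "abc", text_bytes == 4 and the buffer holds
//   { ' ', 'a', 'b', 'c', ' ', ' ', ' ', '\0' }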
michael@0 | 1175 | |
michael@0 | 1176 | int letter_offset = 1; // Skip initial space |
michael@0 | 1177 | hitbuffer->lowest_offset = letter_offset; |
michael@0 | 1178 | int letter_limit = scriptspan.text_bytes; |
michael@0 | 1179 | while (letter_offset < letter_limit) { |
michael@0 | 1180 | if (scoringcontext->flags_cld2_verbose) { |
michael@0 | 1181 | fprintf(scoringcontext->debug_file, " ScoreCJKScriptSpan[%d,%d)<br>\n", |
michael@0 | 1182 | letter_offset, letter_limit); |
michael@0 | 1183 | } |
michael@0 | 1184 | // |
michael@0 | 1185 | // Fill up one hitbuffer, possibly splicing onto previous fragment |
michael@0 | 1186 | // |
michael@0 | 1187 | // NOTE: GetUniHits deals with close repeats |
michael@0 | 1188 | // NOTE: After last chunk there is always a hitbuffer entry with an offset |
michael@0 | 1189 | // just off the end of the text = next_offset. |
michael@0 | 1190 | int next_offset = GetUniHits(scriptspan.text, letter_offset, letter_limit, |
michael@0 | 1191 | scoringcontext, hitbuffer); |
michael@0 | 1192 | // NOTE: GetBiHitVectors deals with close repeats, |
michael@0 | 1193 | // does one hash and two lookups (delta and distinct) per word |
michael@0 | 1194 | GetBiHits(scriptspan.text, letter_offset, next_offset, |
michael@0 | 1195 | scoringcontext, hitbuffer); |
michael@0 | 1196 | |
michael@0 | 1197 | // |
michael@0 | 1198 | // Score one hitbuffer in chunks to summarybuffer |
michael@0 | 1199 | // |
michael@0 | 1200 | bool more_to_come = next_offset < letter_limit; |
michael@0 | 1201 | bool score_cjk = true; |
michael@0 | 1202 | ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec, |
michael@0 | 1203 | more_to_come, score_cjk, hitbuffer); |
michael@0 | 1204 | SpliceHitBuffer(hitbuffer, next_offset); |
michael@0 | 1205 | |
michael@0 | 1206 | letter_offset = next_offset; |
michael@0 | 1207 | } |
michael@0 | 1208 | |
michael@0 | 1209 | delete hitbuffer; |
michael@0 | 1210 | // Context across buffers is not connected yet |
michael@0 | 1211 | scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE; |
michael@0 | 1212 | } |
michael@0 | 1213 | |
michael@0 | 1214 | |
michael@0 | 1215 | |
michael@0 | 1216 | // Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext |
michael@0 | 1217 | // We have a scriptspan with all lowercase text in one script. Look up |
michael@0 | 1218 | // quadgrams and octagrams, saving the hits in three parallel vectors. |
michael@0 | 1219 | // Score from those vectors in chunks, toting each chunk to get a single |
michael@0 | 1220 | // language, and combining into the overall document score. The hit vectors |
michael@0 | 1221 | // in general are not big enough to handle an entire scriptspan, so |
michael@0 | 1222 | // repeat until the entire scriptspan is scored. |
michael@0 | 1223 | // Caller deals with minimizing number of runt scriptspans. |
michael@0 | 1224 | // This routine deals with minimizing number of runt chunks. |
michael@0 | 1225 | // |
michael@0 | 1226 | // Returns updated scoringcontext |
michael@0 | 1227 | // Returns updated doc_tote |
michael@0 | 1228 | // If vec != NULL, appends to that vector of ResultChunk's |
michael@0 | 1229 | void ScoreQuadScriptSpan(const LangSpan& scriptspan, |
michael@0 | 1230 | ScoringContext* scoringcontext, |
michael@0 | 1231 | DocTote* doc_tote, |
michael@0 | 1232 | ResultChunkVector* vec) { |
michael@0 | 1233 | // Allocate three parallel arrays of scoring hits |
michael@0 | 1234 | ScoringHitBuffer* hitbuffer = new ScoringHitBuffer; |
michael@0 | 1235 | hitbuffer->init(); |
michael@0 | 1236 | hitbuffer->ulscript = scriptspan.ulscript; |
michael@0 | 1237 | |
michael@0 | 1238 | scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE; |
michael@0 | 1239 | scoringcontext->oldest_distinct_boost = 0; |
michael@0 | 1240 | |
michael@0 | 1241 | // Incoming scriptspan has a single leading space at scriptspan.text[0] |
michael@0 | 1242 | // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3] |
michael@0 | 1243 | |
michael@0 | 1244 | int letter_offset = 1; // Skip initial space |
michael@0 | 1245 | hitbuffer->lowest_offset = letter_offset; |
michael@0 | 1246 | int letter_limit = scriptspan.text_bytes; |
michael@0 | 1247 | while (letter_offset < letter_limit) { |
michael@0 | 1248 | // |
michael@0 | 1249 | // Fill up one hitbuffer, possibly splicing onto previous fragment |
michael@0 | 1250 | // |
michael@0 | 1251 | // NOTE: GetQuadHits deals with close repeats |
michael@0 | 1252 | // NOTE: After last chunk there is always a hitbuffer entry with an offset |
michael@0 | 1253 | // just off the end of the text = next_offset. |
michael@0 | 1254 | int next_offset = GetQuadHits(scriptspan.text, letter_offset, letter_limit, |
michael@0 | 1255 | scoringcontext, hitbuffer); |
michael@0 | 1256 | // If next_offset < letter_limit, there is more text to process in this scriptspan |
michael@0 | 1257 | // NOTE: GetOctaHitVectors deals with close repeats, |
michael@0 | 1258 | // does one hash and two lookups (delta and distinct) per word |
michael@0 | 1259 | GetOctaHits(scriptspan.text, letter_offset, next_offset, |
michael@0 | 1260 | scoringcontext, hitbuffer); |
michael@0 | 1261 | |
michael@0 | 1262 | // |
michael@0 | 1263 | // Score one hitbuffer in chunks to summarybuffer |
michael@0 | 1264 | // |
michael@0 | 1265 | bool more_to_come = next_offset < letter_limit; |
michael@0 | 1266 | bool score_cjk = false; |
michael@0 | 1267 | ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec, |
michael@0 | 1268 | more_to_come, score_cjk, hitbuffer); |
michael@0 | 1269 | SpliceHitBuffer(hitbuffer, next_offset); |
michael@0 | 1270 | |
michael@0 | 1271 | letter_offset = next_offset; |
michael@0 | 1272 | } |
michael@0 | 1273 | |
michael@0 | 1274 | delete hitbuffer; |
michael@0 | 1275 | } |
michael@0 | 1276 | |
michael@0 | 1277 | |
michael@0 | 1278 | // Score one scriptspan into doc_tote and vec, updating scoringcontext |
michael@0 | 1279 | // Inputs: |
michael@0 | 1280 | // One scriptspan of perhaps 40-60KB, all same script lower-case letters |
michael@0 | 1281 | // and single ASCII spaces. First character is a space to allow simple |
michael@0 | 1282 | // begining-of-word detect. End of buffer has three spaces and NUL to |
michael@0 | 1283 | // beginning-of-word detection. End of buffer has three spaces and NUL to |
michael@0 | 1284 | // Scoring context of |
michael@0 | 1285 | // scoring tables |
michael@0 | 1286 | // flags |
michael@0 | 1287 | // running boosts |
michael@0 | 1288 | // Outputs: |
michael@0 | 1289 | // Updated doc_tote giving overall languages and byte counts |
michael@0 | 1290 | // Optional updated chunk vector giving offset, length, language |
michael@0 | 1291 | // |
michael@0 | 1292 | // Caller initializes flags, boosts, doc_tote and vec. |
michael@0 | 1293 | // Caller aggregates across multiple scriptspans |
michael@0 | 1294 | // Caller calculates final document result |
michael@0 | 1295 | // Caller deals with detecting and triggering suppression of repeated text. |
michael@0 | 1296 | // |
michael@0 | 1297 | // This top-level routine just chooses the recognition type and calls one of |
michael@0 | 1298 | // the next-level-down routines. |
michael@0 | 1299 | // |
michael@0 | 1300 | void ScoreOneScriptSpan(const LangSpan& scriptspan, |
michael@0 | 1301 | ScoringContext* scoringcontext, |
michael@0 | 1302 | DocTote* doc_tote, |
michael@0 | 1303 | ResultChunkVector* vec) { |
michael@0 | 1304 | if (scoringcontext->flags_cld2_verbose) { |
michael@0 | 1305 | fprintf(scoringcontext->debug_file, "<br>ScoreOneScriptSpan(%s,%d) ", |
michael@0 | 1306 | ULScriptCode(scriptspan.ulscript), scriptspan.text_bytes); |
michael@0 | 1307 | // Optionally print the chunk lowercase letters/marks text |
michael@0 | 1308 | string temp(&scriptspan.text[0], scriptspan.text_bytes); |
michael@0 | 1309 | fprintf(scoringcontext->debug_file, "'%s'", |
michael@0 | 1310 | GetHtmlEscapedText(temp).c_str()); |
michael@0 | 1311 | fprintf(scoringcontext->debug_file, "<br>\n"); |
michael@0 | 1312 | } |
michael@0 | 1313 | scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE; |
michael@0 | 1314 | scoringcontext->oldest_distinct_boost = 0; |
michael@0 | 1315 | ULScriptRType rtype = ULScriptRecognitionType(scriptspan.ulscript); |
michael@0 | 1316 | if (scoringcontext->flags_cld2_score_as_quads && (rtype != RTypeCJK)) { |
michael@0 | 1317 | rtype = RTypeMany; |
michael@0 | 1318 | } |
michael@0 | 1319 | switch (rtype) { |
michael@0 | 1320 | case RTypeNone: |
michael@0 | 1321 | case RTypeOne: |
michael@0 | 1322 | ScoreEntireScriptSpan(scriptspan, scoringcontext, doc_tote, vec); |
michael@0 | 1323 | break; |
michael@0 | 1324 | case RTypeCJK: |
michael@0 | 1325 | ScoreCJKScriptSpan(scriptspan, scoringcontext, doc_tote, vec); |
michael@0 | 1326 | break; |
michael@0 | 1327 | case RTypeMany: |
michael@0 | 1328 | ScoreQuadScriptSpan(scriptspan, scoringcontext, doc_tote, vec); |
michael@0 | 1329 | break; |
michael@0 | 1330 | } |
michael@0 | 1331 | } |
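
// Hypothetical caller sketch (added for illustration only; the surrounding
// setup names such as spans/num_spans are assumptions, not part of this
// file).  The caller initializes the context, loops over scriptspans, and
// then derives the final document result from doc_tote:
//
//   ScoringContext scoringcontext;   // scoring tables, flags, running boosts
//   DocTote doc_tote;                // aggregated languages and byte counts
//   ResultChunkVector vec;           // optional per-chunk results
//   for (int i = 0; i < num_spans; ++i) {
//     ScoreOneScriptSpan(spans[i], &scoringcontext, &doc_tote, &vec);
//   }
//   // ... caller computes the final document result from doc_tote ...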
michael@0 | 1332 | |
michael@0 | 1333 | } // End namespace CLD2 |
michael@0 | 1334 |