browser/components/translation/cld2/internal/scoreonescriptspan.cc

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/browser/components/translation/cld2/internal/scoreonescriptspan.cc	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1334 @@
     1.4 +// Copyright 2013 Google Inc. All Rights Reserved.
     1.5 +//
     1.6 +// Licensed under the Apache License, Version 2.0 (the "License");
     1.7 +// you may not use this file except in compliance with the License.
     1.8 +// You may obtain a copy of the License at
     1.9 +//
    1.10 +//     http://www.apache.org/licenses/LICENSE-2.0
    1.11 +//
    1.12 +// Unless required by applicable law or agreed to in writing, software
    1.13 +// distributed under the License is distributed on an "AS IS" BASIS,
    1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    1.15 +// See the License for the specific language governing permissions and
    1.16 +// limitations under the License.
    1.17 +
    1.18 +//
    1.19 +// Author: dsites@google.com (Dick Sites)
    1.20 +// Updated 2014.01 for dual table lookup
    1.21 +//
    1.22 +
    1.23 +#include "scoreonescriptspan.h"
    1.24 +
    1.25 +#include "cldutil.h"
    1.26 +#include "debug.h"
    1.27 +#include "lang_script.h"
    1.28 +
    1.29 +#include <stdio.h>
    1.30 +
    1.31 +using namespace std;
    1.32 +
    1.33 +namespace CLD2 {
    1.34 +
// Chunks whose reliability (delta or expected-score) falls below this percent
// are reported as UNKNOWN_LANGUAGE when building the result vector.
static const int kUnreliablePercentThreshold = 75;
    1.36 +
// Fold one packed language/probability word into the chunk's running totals.
// Thin wrapper so scoring code reads uniformly; the real work is in
// ProcessProbV2Tote (cldutil).
void AddLangProb(uint32 langprob, Tote* chunk_tote) {
  ProcessProbV2Tote(langprob, chunk_tote);
}
    1.40 +
    1.41 +void ZeroPSLang(uint32 langprob, Tote* chunk_tote) {
    1.42 +  uint8 top1 = (langprob >> 8) & 0xff;
    1.43 +  chunk_tote->SetScore(top1, 0);
    1.44 +}
    1.45 +
    1.46 +bool SameCloseSet(uint16 lang1, uint16 lang2) {
    1.47 +  int lang1_close_set = LanguageCloseSet(static_cast<Language>(lang1));
    1.48 +  if (lang1_close_set == 0) {return false;}
    1.49 +  int lang2_close_set = LanguageCloseSet(static_cast<Language>(lang2));
    1.50 +  return (lang1_close_set == lang2_close_set);
    1.51 +}
    1.52 +
    1.53 +bool SameCloseSet(Language lang1, Language lang2) {
    1.54 +  int lang1_close_set = LanguageCloseSet(lang1);
    1.55 +  if (lang1_close_set == 0) {return false;}
    1.56 +  int lang2_close_set = LanguageCloseSet(lang2);
    1.57 +  return (lang1_close_set == lang2_close_set);
    1.58 +}
    1.59 +
    1.60 +
    1.61 +// Needs expected score per 1KB in scoring context
    1.62 +void SetChunkSummary(ULScript ulscript, int first_linear_in_chunk,
    1.63 +                     int offset, int len,
    1.64 +                     const ScoringContext* scoringcontext,
    1.65 +                     const Tote* chunk_tote,
    1.66 +                     ChunkSummary* chunksummary) {
    1.67 +  int key3[3];
    1.68 +  chunk_tote->CurrentTopThreeKeys(key3);
    1.69 +  Language lang1 = FromPerScriptNumber(ulscript, key3[0]);
    1.70 +  Language lang2 = FromPerScriptNumber(ulscript, key3[1]);
    1.71 +
    1.72 +  int actual_score_per_kb = 0;
    1.73 +  if (len > 0) {
    1.74 +    actual_score_per_kb = (chunk_tote->GetScore(key3[0]) << 10) / len;
    1.75 +  }
    1.76 +  int expected_subscr = lang1 * 4 + LScript4(ulscript);
    1.77 +  int expected_score_per_kb =
    1.78 +     scoringcontext->scoringtables->kExpectedScore[expected_subscr];
    1.79 +
    1.80 +  chunksummary->offset = offset;
    1.81 +  chunksummary->chunk_start = first_linear_in_chunk;
    1.82 +  chunksummary->lang1 = lang1;
    1.83 +  chunksummary->lang2 = lang2;
    1.84 +  chunksummary->score1 = chunk_tote->GetScore(key3[0]);
    1.85 +  chunksummary->score2 = chunk_tote->GetScore(key3[1]);
    1.86 +  chunksummary->bytes = len;
    1.87 +  chunksummary->grams = chunk_tote->GetScoreCount();
    1.88 +  chunksummary->ulscript = ulscript;
    1.89 +  chunksummary->reliability_delta = ReliabilityDelta(chunksummary->score1,
    1.90 +                                                     chunksummary->score2,
    1.91 +                                                     chunksummary->grams);
    1.92 +  // If lang1/lang2 in same close set, set delta reliability to 100%
    1.93 +  if (SameCloseSet(lang1, lang2)) {
    1.94 +    chunksummary->reliability_delta = 100;
    1.95 +  }
    1.96 +  chunksummary->reliability_score =
    1.97 +     ReliabilityExpected(actual_score_per_kb, expected_score_per_kb);
    1.98 +}
    1.99 +
   1.100 +// Return true if just lang1 is there: lang2=0 and lang3=0
   1.101 +bool IsSingleLang(uint32 langprob) {
   1.102 +  // Probably a bug -- which end is lang1? But only used to call empty Boost1
   1.103 +  return ((langprob & 0x00ffff00) == 0);
   1.104 +}
   1.105 +
// Update scoring context distinct_boost for a single-language quadgram.
// Intentionally a no-op: a lone-language quad was judged too weak a signal
// to boost. The signature is kept so call sites stay uniform with
// AddDistinctBoost2.
void AddDistinctBoost1(uint32 langprob, ScoringContext* scoringcontext) {
  // Probably keep this empty -- not a good enough signal
}
   1.110 +
   1.111 +// Update scoring context distinct_boost for distinct octagram
   1.112 +// Keep last 4 used. Since these are mostly (except at splices) in
   1.113 +// hitbuffer, we might be able to just use a subscript and splice
   1.114 +void AddDistinctBoost2(uint32 langprob, ScoringContext* scoringcontext) {
   1.115 +// this is called 0..n times per chunk with decoded hitbuffer->distinct...
   1.116 +  LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
   1.117 +  if (scoringcontext->ulscript != ULScript_Latin) {
   1.118 +    distinct_boost = &scoringcontext->distinct_boost.othr;
   1.119 +  }
   1.120 +  int n = distinct_boost->n;
   1.121 +  distinct_boost->langprob[n] = langprob;
   1.122 +  distinct_boost->n = distinct_boost->wrap(n + 1);
   1.123 +}
   1.124 +
   1.125 +// For each chunk, add extra weight for language priors (from content-lang and
   1.126 +// meta lang=xx) and distinctive tokens
   1.127 +void ScoreBoosts(const ScoringContext* scoringcontext, Tote* chunk_tote) {
   1.128 +  // Get boosts for current script
   1.129 +  const LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
   1.130 +  const LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
   1.131 +  const LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
   1.132 +  if (scoringcontext->ulscript != ULScript_Latin) {
   1.133 +    langprior_boost = &scoringcontext->langprior_boost.othr;
   1.134 +    langprior_whack = &scoringcontext->langprior_whack.othr;
   1.135 +    distinct_boost = &scoringcontext->distinct_boost.othr;
   1.136 +  }
   1.137 +
   1.138 +  for (int k = 0; k < kMaxBoosts; ++k) {
   1.139 +    uint32 langprob = langprior_boost->langprob[k];
   1.140 +    if (langprob > 0) {AddLangProb(langprob, chunk_tote);}
   1.141 +  }
   1.142 +  for (int k = 0; k < kMaxBoosts; ++k) {
   1.143 +    uint32 langprob = distinct_boost->langprob[k];
   1.144 +    if (langprob > 0) {AddLangProb(langprob, chunk_tote);}
   1.145 +  }
   1.146 +  // boost has a packed set of per-script langs and probabilites
   1.147 +  // whack has a packed set of per-script lang to be suppressed (zeroed)
   1.148 +  // When a language in a close set is given as an explicit hint, others in
   1.149 +  //  that set will be whacked here.
   1.150 +  for (int k = 0; k < kMaxBoosts; ++k) {
   1.151 +    uint32 langprob = langprior_whack->langprob[k];
   1.152 +    if (langprob > 0) {ZeroPSLang(langprob, chunk_tote);}
   1.153 +  }
   1.154 +}
   1.155 +
   1.156 +
   1.157 +
   1.158 +// At this point, The chunk is described by
   1.159 +//  hitbuffer->base[cspan->chunk_base .. cspan->chunk_base + cspan->base_len)
   1.160 +//  hitbuffer->delta[cspan->chunk_delta ... )
   1.161 +//  hitbuffer->distinct[cspan->chunk_distinct ... )
   1.162 +// Scored text is in text[lo..hi) where
   1.163 +//  lo is 0 or the min of first base/delta/distinct hitbuffer offset and
   1.164 +//  hi is the min of next base/delta/distinct hitbuffer offset after
   1.165 +//  base_len, etc.
   1.166 +void GetTextSpanOffsets(const ScoringHitBuffer* hitbuffer,
   1.167 +                        const ChunkSpan* cspan, int* lo, int* hi) {
   1.168 +  // Front of this span
   1.169 +  int lo_base = hitbuffer->base[cspan->chunk_base].offset;
   1.170 +  int lo_delta = hitbuffer->delta[cspan->chunk_delta].offset;
   1.171 +  int lo_distinct = hitbuffer->distinct[cspan->chunk_distinct].offset;
   1.172 +  // Front of next span
   1.173 +  int hi_base = hitbuffer->base[cspan->chunk_base +
   1.174 +    cspan->base_len].offset;
   1.175 +  int hi_delta = hitbuffer->delta[cspan->chunk_delta +
   1.176 +    cspan->delta_len].offset;
   1.177 +  int hi_distinct = hitbuffer->distinct[cspan->chunk_distinct +
   1.178 +    cspan->distinct_len].offset;
   1.179 +
   1.180 +  *lo = 0;
   1.181 +//  if (cspan->chunk_base > 0) {
   1.182 +//    *lo = minint(minint(lo_base, lo_delta), lo_distinct);
   1.183 +//  }
   1.184 +  *lo = minint(minint(lo_base, lo_delta), lo_distinct);
   1.185 +  *hi = minint(minint(hi_base, hi_delta), hi_distinct);
   1.186 +}
   1.187 +
   1.188 +
   1.189 +int DiffScore(const CLD2TableSummary* obj, int indirect,
   1.190 +              uint16 lang1, uint16 lang2) {
   1.191 +  if (indirect < static_cast<int>(obj->kCLDTableSizeOne)) {
   1.192 +    // Up to three languages at indirect
   1.193 +    uint32 langprob = obj->kCLDTableInd[indirect];
   1.194 +    return GetLangScore(langprob, lang1) - GetLangScore(langprob, lang2);
   1.195 +  } else {
   1.196 +    // Up to six languages at start + 2 * (indirect - start)
   1.197 +    indirect += (indirect - obj->kCLDTableSizeOne);
   1.198 +    uint32 langprob = obj->kCLDTableInd[indirect];
   1.199 +    uint32 langprob2 = obj->kCLDTableInd[indirect + 1];
   1.200 +    return (GetLangScore(langprob, lang1) + GetLangScore(langprob2, lang1)) -
   1.201 +      (GetLangScore(langprob, lang2) + GetLangScore(langprob2, lang2));
   1.202 +  }
   1.203 +
   1.204 +}
   1.205 +
// Score all the bases, deltas, distincts, boosts for one chunk into
// chunk_tote, then summarize the result into *chunksummary.
// After the last chunk there is always a hitbuffer entry with an offset just
// off the end of the text, so the [chunk_i + 1] lookups below are safe.
// Sets cspan's base span plus delta_len and distinct_len (both zeroed here;
// the linear design carries everything in base/base_len).
// Side effect: records the chunk's winning language in
// scoringcontext->prior_chunk_lang for the next chunk's scoring.
void ScoreOneChunk(const char* text, ULScript ulscript,
                   const ScoringHitBuffer* hitbuffer,
                   int chunk_i,
                   ScoringContext* scoringcontext,
                   ChunkSpan* cspan, Tote* chunk_tote,
                   ChunkSummary* chunksummary) {
  // Linear subscripts [first_linear_in_chunk..first_linear_in_next_chunk)
  // bound this chunk's hits.
  int first_linear_in_chunk = hitbuffer->chunk_start[chunk_i];
  int first_linear_in_next_chunk = hitbuffer->chunk_start[chunk_i + 1];

  chunk_tote->Reinit();
  cspan->delta_len = 0;
  cspan->distinct_len = 0;
  if (scoringcontext->flags_cld2_verbose) {
    fprintf(scoringcontext->debug_file, "<br>ScoreOneChunk[%d..%d) ",
            first_linear_in_chunk, first_linear_in_next_chunk);
  }

  // 2013.02.05 linear design: just use base and base_len for the span
  cspan->chunk_base = first_linear_in_chunk;
  cspan->base_len = first_linear_in_next_chunk - first_linear_in_chunk;
  for (int i = first_linear_in_chunk; i < first_linear_in_next_chunk; ++i) {
    uint32 langprob = hitbuffer->linear[i].langprob;
    AddLangProb(langprob, chunk_tote);
    if (hitbuffer->linear[i].type <= QUADHIT) {
      chunk_tote->AddScoreCount();      // Just count quads, not octas
    }
    if (hitbuffer->linear[i].type == DISTINCTHIT) {
      // Remember recent distinctive tokens for ScoreBoosts below.
      AddDistinctBoost2(langprob, scoringcontext);
    }
  }

  // Score language prior boosts
  // Score distinct word boost
  ScoreBoosts(scoringcontext, chunk_tote);

  // Text byte range [lo..hi) covered by this chunk.
  int lo = hitbuffer->linear[first_linear_in_chunk].offset;
  int hi = hitbuffer->linear[first_linear_in_next_chunk].offset;

  // Chunk_tote: get top langs, scores, etc. and fill in chunk summary
  SetChunkSummary(ulscript, first_linear_in_chunk, lo, hi - lo,
                  scoringcontext, chunk_tote, chunksummary);

  bool more_to_come = false;
  bool score_cjk = false;
  if (scoringcontext->flags_cld2_html) {
    // Show one chunk in readable output
    CLD2_Debug(text, lo, hi, more_to_come, score_cjk, hitbuffer,
               scoringcontext, cspan, chunksummary);
  }

  scoringcontext->prior_chunk_lang = static_cast<Language>(chunksummary->lang1);
}
   1.262 +
   1.263 +
// Score chunks of text described by hitbuffer, allowing each to be in a
// different language, and optionally adjusting the boundaries inbetween.
// Sets last_cspan to the last chunkspan used.
// Appends one ChunkSummary per chunk to summarybuffer (silently dropping
// chunks past kMaxSummaries), then writes a zeroed dummy entry after the
// last one holding the first unused linear subscript and its text offset.
// more_to_come and score_cjk are not consulted here.
void ScoreAllHits(const char* text,  ULScript ulscript,
                  bool more_to_come, bool score_cjk,
                  const ScoringHitBuffer* hitbuffer,
                  ScoringContext* scoringcontext,
                  SummaryBuffer* summarybuffer, ChunkSpan* last_cspan) {
  ChunkSpan prior_cspan = {0, 0, 0, 0, 0, 0};
  ChunkSpan cspan = {0, 0, 0, 0, 0, 0};

  for (int i = 0; i < hitbuffer->next_chunk_start; ++i) {
    // Score one chunk
    // Sets delta_len, and distinct_len
    Tote chunk_tote;
    ChunkSummary chunksummary;
    ScoreOneChunk(text, ulscript,
                  hitbuffer, i,
                  scoringcontext, &cspan, &chunk_tote, &chunksummary);

    // Put result in summarybuffer
    if (summarybuffer->n < kMaxSummaries) {
      summarybuffer->chunksummary[summarybuffer->n] = chunksummary;
      summarybuffer->n += 1;
    }

    // Advance the span starts past the hits this chunk consumed.
    prior_cspan = cspan;
    cspan.chunk_base += cspan.base_len;
    cspan.chunk_delta += cspan.delta_len;
    cspan.chunk_distinct += cspan.distinct_len;
  }

  // Add one dummy off the end to hold first unused linear_in_chunk
  int linear_off_end = hitbuffer->next_linear;
  int offset_off_end = hitbuffer->linear[linear_off_end].offset;
  ChunkSummary* cs = &summarybuffer->chunksummary[summarybuffer->n];
  memset(cs, 0, sizeof(ChunkSummary));
  cs->offset = offset_off_end;
  cs->chunk_start = linear_off_end;
  *last_cspan = prior_cspan;
}
   1.305 +
   1.306 +
   1.307 +void SummaryBufferToDocTote(const SummaryBuffer* summarybuffer,
   1.308 +                            bool more_to_come, DocTote* doc_tote) {
   1.309 +  int cs_bytes_sum = 0;
   1.310 +  for (int i = 0; i < summarybuffer->n; ++i) {
   1.311 +    const ChunkSummary* cs = &summarybuffer->chunksummary[i];
   1.312 +    int reliability = minint(cs->reliability_delta, cs->reliability_score);
   1.313 +    // doc_tote uses full languages
   1.314 +    doc_tote->Add(cs->lang1, cs->bytes, cs->score1, reliability);
   1.315 +    cs_bytes_sum += cs->bytes;
   1.316 +  }
   1.317 +}
   1.318 +
// Compile-time switch for debugging the result vectors: when true, the
// vector-building code below prints each chunk's letters-only and original
// text to stderr.
static const bool kShowLettersOriginal = false;
   1.321 +
   1.322 +
   1.323 +// If next chunk language matches last vector language, extend last element
   1.324 +// Otherwise add new element to vector
   1.325 +void ItemToVector(ScriptScanner* scanner,
   1.326 +                  ResultChunkVector* vec, Language new_lang,
   1.327 +                  int mapped_offset, int mapped_len) {
   1.328 +  uint16 last_vec_lang = static_cast<uint16>(UNKNOWN_LANGUAGE);
   1.329 +  int last_vec_subscr = vec->size() - 1;
   1.330 +  if (last_vec_subscr >= 0) {
   1.331 +    ResultChunk* priorrc = &(*vec)[last_vec_subscr];
   1.332 +    last_vec_lang = priorrc->lang1;
   1.333 +    if (new_lang == last_vec_lang) {
   1.334 +      // Extend prior. Current mapped_offset may be beyond prior end, so do
   1.335 +      // the arithmetic to include any such gap
   1.336 +      priorrc->bytes = minint((mapped_offset + mapped_len) - priorrc->offset,
   1.337 +                              kMaxResultChunkBytes);
   1.338 +      if (kShowLettersOriginal) {
   1.339 +        // Optionally print the new chunk original text
   1.340 +        string temp2(&scanner->GetBufferStart()[priorrc->offset],
   1.341 +                     priorrc->bytes);
   1.342 +        fprintf(stderr, "Item[%d..%d) '%s'<br>\n",
   1.343 +                priorrc->offset, priorrc->offset + priorrc->bytes,
   1.344 +                GetHtmlEscapedText(temp2).c_str());
   1.345 +      }
   1.346 +      return;
   1.347 +    }
   1.348 +  }
   1.349 +  // Add new vector element
   1.350 +  ResultChunk rc;
   1.351 +  rc.offset = mapped_offset;
   1.352 +  rc.bytes = minint(mapped_len, kMaxResultChunkBytes);
   1.353 +  rc.lang1 = static_cast<uint16>(new_lang);
   1.354 +  vec->push_back(rc);
   1.355 +  if (kShowLettersOriginal) {
   1.356 +    // Optionally print the new chunk original text
   1.357 +    string temp2(&scanner->GetBufferStart()[rc.offset], rc.bytes);
   1.358 +    fprintf(stderr, "Item[%d..%d) '%s'<br>\n",
   1.359 +            rc.offset, rc.offset + rc.bytes,
   1.360 +            GetHtmlEscapedText(temp2).c_str());
   1.361 +  }
   1.362 +}
   1.363 +
   1.364 +uint16 PriorVecLang(const ResultChunkVector* vec) {
   1.365 +  if (vec->empty()) {return static_cast<uint16>(UNKNOWN_LANGUAGE);}
   1.366 +  return (*vec)[vec->size() - 1].lang1;
   1.367 +}
   1.368 +
   1.369 +uint16 NextChunkLang(const SummaryBuffer* summarybuffer, int i) {
   1.370 +  if ((i + 1) >= summarybuffer->n) {
   1.371 +    return static_cast<uint16>(UNKNOWN_LANGUAGE);
   1.372 +  }
   1.373 +  return summarybuffer->chunksummary[i + 1].lang1;
   1.374 +}
   1.375 +
   1.376 +
   1.377 +
// Add n elements of summarybuffer to resultchunk vector:
// Each element is letters-only text [offset..offset+bytes)
// This maps back to original[Back(offset)..Back(offset+bytes))
//
// We go out of our way to minimize the variation in the ResultChunkVector,
// so that the caller has fewer but more meaningful spans in different
// languages, for the likely purpose of translation or spell-check.
//
// The language of each chunk is lang1, but it might be unreliable for
// either of two reasons: its score is relatively too close to the score of
// lang2, or its score is too far away from the expected score of real text
// in the given language. Unreliable languages are mapped to Unknown.
//
// more_to_come is not consulted here.
void SummaryBufferToVector(ScriptScanner* scanner, const char* text,
                           const SummaryBuffer* summarybuffer,
                           bool more_to_come, ResultChunkVector* vec) {
  if (vec == NULL) {return;}

  if (kShowLettersOriginal) {
    fprintf(stderr, "map2original_ ");
    scanner->map2original_.DumpWindow();
    fprintf(stderr, "<br>\n");
    fprintf(stderr, "map2uplow_ ");
    scanner->map2uplow_.DumpWindow();
    fprintf(stderr, "<br>\n");
  }

  for (int i = 0; i < summarybuffer->n; ++i) {
    const ChunkSummary* cs = &summarybuffer->chunksummary[i];
    int unmapped_offset = cs->offset;
    int unmapped_len = cs->bytes;

    if (kShowLettersOriginal) {
      // Optionally print the chunk lowercase letters/marks text
      string temp(&text[unmapped_offset], unmapped_len);
      fprintf(stderr, "Letters [%d..%d) '%s'<br>\n",
              unmapped_offset, unmapped_offset + unmapped_len,
              GetHtmlEscapedText(temp).c_str());
    }

    // Map the letters-only offset back to the original text.
    int mapped_offset = scanner->MapBack(unmapped_offset);

    // Trim back a little to prefer splicing original at word boundaries
    if (mapped_offset > 0) {
      // Size of prior vector entry, if any
      int prior_size = 0;
      if (!vec->empty()) {
        ResultChunk* rc = &(*vec)[vec->size() - 1];
        prior_size = rc->bytes;
      }
      // Maximum back up size to leave at least 3 bytes in prior,
      // and not entire buffer, and no more than 12 bytes total backup
      int n_limit = minint(prior_size - 3, mapped_offset);
      n_limit = minint(n_limit, 12);

      // Backscan over letters, stopping if prior byte is < 0x41
      // (i.e. any byte below ASCII 'A' ends the scan).
      // There is some possibility that we will backscan over a different
      // script.
      const char* s = &scanner->GetBufferStart()[mapped_offset];
      const unsigned char* us = reinterpret_cast<const unsigned char*>(s);
      int n = 0;
      while ((n < n_limit) && (us[-n - 1] >= 0x41)) {++n;}
      if (n >= n_limit) {n = 0;} // New boundary not found within range

      // Also back up exactly one leading punctuation character if '"#@
      if (n < n_limit) {
        unsigned char c = us[-n - 1];
        if ((c == '\'') || (c == '"') || (c == '#') || (c == '@')) {++n;}
      }
      // Shrink the previous chunk slightly
      if (n > 0) {
        ResultChunk* rc = &(*vec)[vec->size() - 1];
        rc->bytes -= n;
        mapped_offset -= n;
        if (kShowLettersOriginal) {
          fprintf(stderr, "Back up %d bytes<br>\n", n);
          // Optionally print the prior chunk original text
          string temp2(&scanner->GetBufferStart()[rc->offset], rc->bytes);
          fprintf(stderr, "Prior   [%d..%d) '%s'<br>\n",
                  rc->offset, rc->offset + rc->bytes,
                  GetHtmlEscapedText(temp2).c_str());
        }
      }
    }

    int mapped_len =
      scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset;

    if (kShowLettersOriginal) {
      // Optionally print the chunk original text
      string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len);
      fprintf(stderr, "Original[%d..%d) '%s'<br>\n",
              mapped_offset, mapped_offset + mapped_len,
              GetHtmlEscapedText(temp2).c_str());
    }

    Language new_lang = static_cast<Language>(cs->lang1);
    bool reliability_delta_bad =
      (cs->reliability_delta < kUnreliablePercentThreshold);
    bool reliability_score_bad =
      (cs->reliability_score < kUnreliablePercentThreshold);

    // If the top language matches last vector, ignore reliability_delta
    uint16 prior_lang = PriorVecLang(vec);
    if (prior_lang == cs->lang1) {
      reliability_delta_bad = false;
    }
    // If the top language is in same close set as last vector, set up to merge
    if (SameCloseSet(cs->lang1, prior_lang)) {
      new_lang = static_cast<Language>(prior_lang);
      reliability_delta_bad = false;
    }
    // If the top two languages are in the same close set and the last vector
    // language is the second language, set up to merge
    if (SameCloseSet(cs->lang1, cs->lang2) &&
        (prior_lang == cs->lang2)) {
      new_lang = static_cast<Language>(prior_lang);
      reliability_delta_bad = false;
    }
    // If unreliable and the last and next vector languages are both
    // the second language, set up to merge
    uint16 next_lang = NextChunkLang(summarybuffer, i);
    if (reliability_delta_bad &&
        (prior_lang == cs->lang2) && (next_lang == cs->lang2)) {
      new_lang = static_cast<Language>(prior_lang);
      reliability_delta_bad = false;
    }

    // Anything still unreliable is reported as Unknown.
    if (reliability_delta_bad || reliability_score_bad) {
      new_lang = UNKNOWN_LANGUAGE;
    }
    ItemToVector(scanner, vec, new_lang, mapped_offset, mapped_len);
  }
}
   1.511 +
   1.512 +// Add just one element to resultchunk vector:
   1.513 +// For RTypeNone or RTypeOne
   1.514 +void JustOneItemToVector(ScriptScanner* scanner, const char* text,
   1.515 +                         Language lang1, int unmapped_offset, int unmapped_len,
   1.516 +                         ResultChunkVector* vec) {
   1.517 +  if (vec == NULL) {return;}
   1.518 +
   1.519 +  if (kShowLettersOriginal) {
   1.520 +    fprintf(stderr, "map2original_ ");
   1.521 +    scanner->map2original_.DumpWindow();
   1.522 +    fprintf(stderr, "<br>\n");
   1.523 +    fprintf(stderr, "map2uplow_ ");
   1.524 +    scanner->map2uplow_.DumpWindow();
   1.525 +    fprintf(stderr, "<br>\n");
   1.526 +  }
   1.527 +
   1.528 +  if (kShowLettersOriginal) {
   1.529 +   // Optionally print the chunk lowercase letters/marks text
   1.530 +   string temp(&text[unmapped_offset], unmapped_len);
   1.531 +   fprintf(stderr, "Letters1 [%d..%d) '%s'<br>\n",
   1.532 +           unmapped_offset, unmapped_offset + unmapped_len,
   1.533 +           GetHtmlEscapedText(temp).c_str());
   1.534 +  }
   1.535 +
   1.536 +  int mapped_offset = scanner->MapBack(unmapped_offset);
   1.537 +  int mapped_len =
   1.538 +    scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset;
   1.539 +
   1.540 +  if (kShowLettersOriginal) {
   1.541 +    // Optionally print the chunk original text
   1.542 +    string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len);
   1.543 +    fprintf(stderr, "Original1[%d..%d) '%s'<br>\n",
   1.544 +            mapped_offset, mapped_offset + mapped_len,
   1.545 +            GetHtmlEscapedText(temp2).c_str());
   1.546 +  }
   1.547 +
   1.548 +  ItemToVector(scanner, vec, lang1, mapped_offset, mapped_len);
   1.549 +}
   1.550 +
   1.551 +
// Debugging helper. Not thread safe. Defined in getonescriptspan.cc;
// declared here for the Dump* routines below, which pass its result to
// fprintf %s.
char* DisplayPiece(const char* next_byte_, int byte_length_);
   1.554 +
   1.555 +// If high bit is on, take out high bit and add 2B to make table2 entries easy
   1.556 +inline int PrintableIndirect(int x) {
   1.557 +  if ((x & 0x80000000u) != 0) {
   1.558 +    return (x & ~0x80000000u) + 2000000000;
   1.559 +  }
   1.560 +  return x;
   1.561 +}
// Debug only: dump the base (quad), delta, and distinct hit arrays side by
// side. Output is truncated after about 50 rows; when an array is longer
// than that, its entry at subscript next_* (presumably the sentinel just
// past the last real hit -- confirm against the builder) is printed as a
// tail.
void DumpHitBuffer(FILE* df, const char* text,
                   const ScoringHitBuffer* hitbuffer) {
  fprintf(df,
          "<br>DumpHitBuffer[%s, next_base/delta/distinct %d, %d, %d)<br>\n",
          ULScriptCode(hitbuffer->ulscript),
          hitbuffer->next_base, hitbuffer->next_delta,
          hitbuffer->next_distinct);
  for (int i = 0; i < hitbuffer->maxscoringhits; ++i) {
    if (i < hitbuffer->next_base) {
      fprintf(df, "Q[%d]%d,%d,%s ",
              i, hitbuffer->base[i].offset,
              PrintableIndirect(hitbuffer->base[i].indirect),
              DisplayPiece(&text[hitbuffer->base[i].offset], 6));
    }
    if (i < hitbuffer->next_delta) {
      fprintf(df, "DL[%d]%d,%d,%s ",
              i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,
              DisplayPiece(&text[hitbuffer->delta[i].offset], 12));
    }
    if (i < hitbuffer->next_distinct) {
      fprintf(df, "D[%d]%d,%d,%s ",
              i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect,
              DisplayPiece(&text[hitbuffer->distinct[i].offset], 12));
    }
    if (i < hitbuffer->next_base) {
      fprintf(df, "<br>\n");
    }
    if (i > 50) {break;}
  }
  if (hitbuffer->next_base > 50) {
    int i = hitbuffer->next_base;
    fprintf(df, "Q[%d]%d,%d,%s ",
            i, hitbuffer->base[i].offset,
            PrintableIndirect(hitbuffer->base[i].indirect),
            DisplayPiece(&text[hitbuffer->base[i].offset], 6));
  }
  if (hitbuffer->next_delta > 50) {
    int i = hitbuffer->next_delta;
    fprintf(df, "DL[%d]%d,%d,%s ",
            i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,
            DisplayPiece(&text[hitbuffer->delta[i].offset], 12));
  }
  if (hitbuffer->next_distinct > 50) {
    int i = hitbuffer->next_distinct;
    fprintf(df, "D[%d]%d,%d,%s ",
            i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect,
            DisplayPiece(&text[hitbuffer->distinct[i].offset], 12));
  }
  fprintf(df, "<br>\n");
}
   1.612 +
   1.613 +
// Debug only: dump the merged linear hit buffer and the chunk_start table.
// Each linear row shows subscript, text offset, hit type as a letter
// (subscript into "UQLD"), the packed langprob word in hex, and a snippet
// of the text at that offset.
void DumpLinearBuffer(FILE* df, const char* text,
                      const ScoringHitBuffer* hitbuffer) {
  fprintf(df, "<br>DumpLinearBuffer[%d)<br>\n",
          hitbuffer->next_linear);
  // Include the dummy entry off the end
  for (int i = 0; i < hitbuffer->next_linear + 1; ++i) {
    // Elide the middle of long buffers: print the first ~50 entries and
    // the final two.
    if ((50 < i) && (i < (hitbuffer->next_linear - 1))) {continue;}
    fprintf(df, "[%d]%d,%c=%08x,%s<br>\n",
            i, hitbuffer->linear[i].offset,
            "UQLD"[hitbuffer->linear[i].type],
            hitbuffer->linear[i].langprob,
            DisplayPiece(&text[hitbuffer->linear[i].offset], 6));
  }
  fprintf(df, "<br>\n");

  // Chunk boundaries as subscripts into the linear buffer, including the
  // entry one past the last chunk.
  fprintf(df, "DumpChunkStart[%d]<br>\n", hitbuffer->next_chunk_start);
  for (int i = 0; i < hitbuffer->next_chunk_start + 1; ++i) {
    fprintf(df, "[%d]%d\n", i, hitbuffer->chunk_start[i]);
  }
  fprintf(df, "<br>\n");
}
   1.635 +
// Debug only: print one ChunkSummary on a single line.
// Format: offset lin[chunk_start] lang1.score1 lang2.score2 bytesB grams#
//         script reliability_deltaRd reliability_scoreRs
// Move this verbose debugging output to debug.cc eventually.
void DumpChunkSummary(FILE* df, const ChunkSummary* cs) {
  // Print chunksummary
  fprintf(df, "%d lin[%d] %s.%d %s.%d %dB %d# %s %dRd %dRs<br>\n",
          cs->offset,
          cs->chunk_start,
          LanguageCode(static_cast<Language>(cs->lang1)),
          cs->score1,
          LanguageCode(static_cast<Language>(cs->lang2)),
          cs->score2,
          cs->bytes,
          cs->grams,
          ULScriptCode(static_cast<ULScript>(cs->ulscript)),
          cs->reliability_delta,
          cs->reliability_score);
}
   1.652 +
// Debug only: print the whole summary buffer, one line per chunk.
// The loop runs to <= n so the dummy entry past the last chunk is shown too.
void DumpSummaryBuffer(FILE* df, const SummaryBuffer* summarybuffer) {
  fprintf(df, "<br>DumpSummaryBuffer[%d]<br>\n", summarybuffer->n);
  fprintf(df, "[i] offset linear[chunk_start] lang.score1 lang.score2 "
              "bytesB ngrams# script rel_delta rel_score<br>\n");
  for (int i = 0; i <= summarybuffer->n; ++i) {
    fprintf(df, "[%d] ", i);
    DumpChunkSummary(df, &summarybuffer->chunksummary[i]);
  }
  fprintf(df, "<br>\n");
}
   1.663 +
   1.664 +
   1.665 +
// Within hitbuffer->linear[]
// <-- prior chunk --><-- this chunk -->
// |                  |                 |
// linear0            linear1           linear2
//     lang0              lang1
// The goal of sharpening is to move this_linear to better separate langs
//
// Scans linear[linear0..linear2) with an 8-entry sliding window and returns
// the linear index with the sharpest pslang0 -> pslang1 transition, or
// linear1 unchanged if no better boundary is found (or the span is too small).
// pslang0/pslang1 are per-script language numbers for the prior/this chunk.
int BetterBoundary(const char* text,
                   ScoringHitBuffer* hitbuffer,
                   ScoringContext* scoringcontext,
                   uint16 pslang0, uint16 pslang1,
                   int linear0, int linear1, int linear2) {
  // Degenerate case, no change (window needs at least 8 entries plus one
  // lookahead; see the i+8 access in the scan loop below)
  if ((linear2 - linear0) <= 8) {return linear1;}

  // Each diff gives pslang0 score - pslang1 score
  // Running diff has four entries + + + + followed by four entries - - - -
  // so that this value is maximal at the sharpest boundary between pslang0
  // (positive diffs) and pslang1 (negative diffs)
  int running_diff = 0;
  int diff[8];    // Ring buffer of pslang0-pslang1 differences
  // Initialize with first 8 diffs
  for (int i = linear0; i < linear0 + 8; ++i) {
    int j = i & 7;  // ring-buffer slot for entry i
    uint32 langprob = hitbuffer->linear[i].langprob;
    diff[j] = GetLangScore(langprob, pslang0) -
       GetLangScore(langprob, pslang1);
    if (i < linear0 + 4) {
      // First four diffs pslang0 - pslang1
      running_diff += diff[j];
    } else {
      // Second four diffs -(pslang0 - pslang1)
      running_diff -= diff[j];
    }
  }

  // Now scan for sharpest boundary. j is at left end of 8 entries
  // To be a boundary, there must be both >0 and <0 entries in the window
  // (an all-positive or all-negative window is one language, not a boundary)
  int better_boundary_value = 0;
  int better_boundary = linear1;
  for (int i = linear0; i < linear2 - 8; ++i) {
    int j = i & 7;
    if (better_boundary_value < running_diff) {
      bool has_plus = false;
      bool has_minus = false;
      for (int kk = 0; kk < 8; ++kk) {
        if (diff[kk] > 0) {has_plus = true;}
        if (diff[kk] < 0) {has_minus = true;}
      }
      if (has_plus && has_minus) {
        better_boundary_value = running_diff;
        better_boundary = i + 4;  // boundary sits at the window's midpoint
      }
    }
    // Shift right one entry: the entry at i+4 crosses from the "minus" half
    // to the "plus" half of the window, hence the 2 * middiff adjustment
    uint32 langprob = hitbuffer->linear[i + 8].langprob;
    int newdiff = GetLangScore(langprob, pslang0) -
       GetLangScore(langprob, pslang1);
    int middiff = diff[(i + 4) & 7];
    int olddiff = diff[j];
    diff[j] = newdiff;
    running_diff -= olddiff;                  // Remove left
    running_diff += 2 * middiff;              // Convert middle from - to +
    running_diff -= newdiff;                  // Insert right
  }

  // Verbose-only: show the old/new split of the text and a per-entry picture
  if (scoringcontext->flags_cld2_verbose && (linear1 != better_boundary)) {
    Language lang0 = FromPerScriptNumber(scoringcontext->ulscript, pslang0);
    Language lang1 = FromPerScriptNumber(scoringcontext->ulscript, pslang1);
    fprintf(scoringcontext->debug_file, " Better lin[%d=>%d] %s^^%s <br>\n",
            linear1, better_boundary,
            LanguageCode(lang0), LanguageCode(lang1));
    int lin0_off = hitbuffer->linear[linear0].offset;
    int lin1_off = hitbuffer->linear[linear1].offset;
    int lin2_off = hitbuffer->linear[linear2].offset;
    int better_offm1 = hitbuffer->linear[better_boundary - 1].offset;
    int better_off = hitbuffer->linear[better_boundary].offset;
    int better_offp1 = hitbuffer->linear[better_boundary + 1].offset;
    string old0(&text[lin0_off], lin1_off - lin0_off);
    string old1(&text[lin1_off], lin2_off - lin1_off);
    string new0(&text[lin0_off], better_offm1 - lin0_off);
    string new0m1(&text[better_offm1], better_off - better_offm1);
    string new1(&text[better_off], better_offp1 - better_off);
    string new1p1(&text[better_offp1], lin2_off - better_offp1);
    fprintf(scoringcontext->debug_file, "%s^^%s => <br>\n%s^%s^^%s^%s<br>\n",
            GetHtmlEscapedText(old0).c_str(),
            GetHtmlEscapedText(old1).c_str(),
            GetHtmlEscapedText(new0).c_str(),
            GetHtmlEscapedText(new0m1).c_str(),
            GetHtmlEscapedText(new1).c_str(),
            GetHtmlEscapedText(new1p1).c_str());
    // Slow picture of differences per linear entry
    int d;
    for (int i = linear0; i < linear2; ++i) {
      if (i == better_boundary) {
        fprintf(scoringcontext->debug_file, "^^ ");
      }
      uint32 langprob = hitbuffer->linear[i].langprob;
      d = GetLangScore(langprob, pslang0) - GetLangScore(langprob, pslang1);
      const char* s = "=";
      //if (d > 2) {s = "\xc2\xaf";}    // Macron
      if (d > 2) {s = "#";}
      else if (d > 0) {s = "+";}
      else if (d < -2) {s = "_";}
      else if (d < 0) {s = "-";}
      fprintf(scoringcontext->debug_file, "%s ", s);
    }
    fprintf(scoringcontext->debug_file, " &nbsp;&nbsp;(scale: #+=-_)<br>\n");
  }
  return better_boundary;
}
   1.776 +
   1.777 +
// For all but the first summary, if its top language differs from
// the previous chunk, refine the boundary
// Linearized version
//
// For each adjacent pair of chunks whose top languages differ (and are not in
// the same close set), calls BetterBoundary and moves the chunk boundary:
// updates chunk_start/offset of this chunk and the byte counts of this and
// the previous chunk. Mutates summarybuffer in place.
// NOTE: more_to_come is accepted but not read in this function.
void SharpenBoundaries(const char* text,
                       bool more_to_come,
                       ScoringHitBuffer* hitbuffer,
                       ScoringContext* scoringcontext,
                       SummaryBuffer* summarybuffer) {

  int prior_linear = summarybuffer->chunksummary[0].chunk_start;
  uint16 prior_lang = summarybuffer->chunksummary[0].lang1;

  if (scoringcontext->flags_cld2_verbose) {
    fprintf(scoringcontext->debug_file, "<br>SharpenBoundaries<br>\n");
  }
  for (int i = 1; i < summarybuffer->n; ++i) {
    ChunkSummary* cs = &summarybuffer->chunksummary[i];
    uint16 this_lang = cs->lang1;
    if (this_lang == prior_lang) {
      // Same language on both sides; nothing to sharpen
      prior_linear = cs->chunk_start;
      continue;
    }

    int this_linear = cs->chunk_start;
    // Entry [i + 1] supplies the right edge of this chunk
    int next_linear = summarybuffer->chunksummary[i + 1].chunk_start;

    // If this/prior in same close set, don't move boundary
    if (SameCloseSet(prior_lang, this_lang)) {
      prior_linear = this_linear;
      prior_lang = this_lang;
      continue;
    }


    // Within hitbuffer->linear[]
    // <-- prior chunk --><-- this chunk -->
    // |                  |                 |
    // prior_linear       this_linear       next_linear
    //     prior_lang         this_lang
    // The goal of sharpening is to move this_linear to better separate langs

    uint8 pslang0 = PerScriptNumber(scoringcontext->ulscript,
                                    static_cast<Language>(prior_lang));
    uint8 pslang1 = PerScriptNumber(scoringcontext->ulscript,
                                    static_cast<Language>(this_lang));
    int better_linear = BetterBoundary(text,
                                       hitbuffer,
                                       scoringcontext,
                                       pslang0, pslang1,
                                       prior_linear, this_linear, next_linear);

    int old_offset = hitbuffer->linear[this_linear].offset;
    int new_offset = hitbuffer->linear[better_linear].offset;
    cs->chunk_start = better_linear;
    cs->offset = new_offset;
    // If this_linear moved right, make bytes smaller for this, larger for prior
    // If this_linear moved left, make bytes larger for this, smaller for prior
    cs->bytes -= (new_offset - old_offset);
    summarybuffer->chunksummary[i - 1].bytes += (new_offset - old_offset);

    this_linear = better_linear;    // Update so that next chunk doesn't intrude

    // Consider rescoring the two chunks

    // Update for next round (note: using pre-updated boundary)
    prior_linear = this_linear;
    prior_lang = this_lang;
  }
}
   1.847 +
   1.848 +// Make a langprob that gives small weight to the default language for ulscript
   1.849 +uint32 DefaultLangProb(ULScript ulscript) {
   1.850 +  Language default_lang = DefaultLanguage(ulscript);
   1.851 +  return MakeLangProb(default_lang, 1);
   1.852 +}
   1.853 +
// Effectively, do a merge-sort based on text offsets
// Look up each indirect value in appropriate scoring table and keep
// just the resulting langprobs
//
// Input:  hitbuffer base/delta/distinct arrays (indirect table subscripts),
//         terminated by sentinel entries at [next_*] with large offsets.
// Output: hitbuffer->linear[], hitbuffer->next_linear, plus a dummy trailing
//         entry capturing the end-of-text offset.
// score_cjk selects the CJK unigram tables vs. the quadgram tables.
void LinearizeAll(ScoringContext* scoringcontext, bool score_cjk,
                  ScoringHitBuffer* hitbuffer) {
  const CLD2TableSummary* base_obj;       // unigram or quadgram
  const CLD2TableSummary* base_obj2;      // quadgram dual table
  const CLD2TableSummary* delta_obj;      // bigram or octagram
  const CLD2TableSummary* distinct_obj;   // bigram or octagram
  uint16 base_hit;                        // linear[] type tag for base hits
  if (score_cjk) {
    base_obj = scoringcontext->scoringtables->unigram_compat_obj;
    base_obj2 = scoringcontext->scoringtables->unigram_compat_obj;
    delta_obj = scoringcontext->scoringtables->deltabi_obj;
    distinct_obj = scoringcontext->scoringtables->distinctbi_obj;
    base_hit = UNIHIT;
  } else {
    base_obj = scoringcontext->scoringtables->quadgram_obj;
    base_obj2 = scoringcontext->scoringtables->quadgram_obj2;
    delta_obj = scoringcontext->scoringtables->deltaocta_obj;
    distinct_obj = scoringcontext->scoringtables->distinctocta_obj;
    base_hit = QUADHIT;
  }

  int base_limit = hitbuffer->next_base;
  int delta_limit = hitbuffer->next_delta;
  int distinct_limit = hitbuffer->next_distinct;
  int base_i = 0;
  int delta_i = 0;
  int distinct_i = 0;
  int linear_i = 0;

  // Start with an initial base hit for the default language for this script
  // Inserting this avoids edge effects with no hits at all
  hitbuffer->linear[linear_i].offset = hitbuffer->lowest_offset;
  hitbuffer->linear[linear_i].type = base_hit;
  hitbuffer->linear[linear_i].langprob =
    DefaultLangProb(scoringcontext->ulscript);
  ++linear_i;

  // Three-way merge by ascending text offset. Reading offset at [*_i] when
  // *_i == *_limit reads the entry one past the last hit; presumably a
  // sentinel set by the Get*Hits routines — TODO confirm.
  while ((base_i < base_limit) || (delta_i < delta_limit) ||
         (distinct_i < distinct_limit)) {
    int base_off = hitbuffer->base[base_i].offset;
    int delta_off = hitbuffer->delta[delta_i].offset;
    int distinct_off = hitbuffer->distinct[distinct_i].offset;

    // Do delta and distinct first, so that they are not lost at base_limit
    if ((delta_i < delta_limit) &&
        (delta_off <= base_off) && (delta_off <= distinct_off)) {
      // Add delta entry (zero langprob entries are dropped)
      int indirect = hitbuffer->delta[delta_i].indirect;
      ++delta_i;
      uint32 langprob = delta_obj->kCLDTableInd[indirect];
      if (langprob > 0) {
        hitbuffer->linear[linear_i].offset = delta_off;
        hitbuffer->linear[linear_i].type = DELTAHIT;
        hitbuffer->linear[linear_i].langprob = langprob;
        ++linear_i;
      }
    }
    else if ((distinct_i < distinct_limit) &&
             (distinct_off <= base_off) && (distinct_off <= delta_off)) {
      // Add distinct entry (zero langprob entries are dropped)
      int indirect = hitbuffer->distinct[distinct_i].indirect;
      ++distinct_i;
      uint32 langprob = distinct_obj->kCLDTableInd[indirect];
      if (langprob > 0) {
        hitbuffer->linear[linear_i].offset = distinct_off;
        hitbuffer->linear[linear_i].type = DISTINCTHIT;
        hitbuffer->linear[linear_i].langprob = langprob;
        ++linear_i;
      }
    }
    else {
      // Add one or two base entries
      int indirect = hitbuffer->base[base_i].indirect;
      // First, get right scoring table; high bit selects the dual table
      const CLD2TableSummary* local_base_obj = base_obj;
      if ((indirect & 0x80000000u) != 0) {
        local_base_obj = base_obj2;
        indirect &= ~0x80000000u;
      }
      ++base_i;
      // One langprob in kQuadInd[0..SingleSize),
      // two in kQuadInd[SingleSize..Size)
      if (indirect < static_cast<int>(local_base_obj->kCLDTableSizeOne)) {
        // Up to three languages at indirect
        uint32 langprob = local_base_obj->kCLDTableInd[indirect];
        if (langprob > 0) {
          hitbuffer->linear[linear_i].offset = base_off;
          hitbuffer->linear[linear_i].type = base_hit;
          hitbuffer->linear[linear_i].langprob = langprob;
          ++linear_i;
        }
      } else {
        // Up to six languages at start + 2 * (indirect - start)
        indirect += (indirect - local_base_obj->kCLDTableSizeOne);
        uint32 langprob = local_base_obj->kCLDTableInd[indirect];
        uint32 langprob2 = local_base_obj->kCLDTableInd[indirect + 1];
        if (langprob > 0) {
          hitbuffer->linear[linear_i].offset = base_off;
          hitbuffer->linear[linear_i].type = base_hit;
          hitbuffer->linear[linear_i].langprob = langprob;
          ++linear_i;
        }
        if (langprob2 > 0) {
          hitbuffer->linear[linear_i].offset = base_off;
          hitbuffer->linear[linear_i].type = base_hit;
          hitbuffer->linear[linear_i].langprob = langprob2;
          ++linear_i;
        }
      }
    }
  }

  // Update
  hitbuffer->next_linear = linear_i;

  // Add a dummy entry off the end, just to capture final offset
  // (taken from the base array's one-past-the-end entry)
  hitbuffer->linear[linear_i].offset =
  hitbuffer->base[hitbuffer->next_base].offset;
  hitbuffer->linear[linear_i].langprob = 0;
}
   1.977 +
// Break linear array into chunks of ~20 quadgram hits or ~50 CJK unigram hits
//
// Fills hitbuffer->chunk_start[] / chunk_offset[] with the linear index and
// text offset of each chunk's first entry, sets next_chunk_start, and writes
// a dummy trailing entry capturing the end of the linear array.
// letter_offset is the text offset where chunk 0 begins.
void ChunkAll(int letter_offset, bool score_cjk, ScoringHitBuffer* hitbuffer) {
  int chunksize;
  uint16 base_hit;    // Only entries of this type count toward chunksize
  if (score_cjk) {
    chunksize = kChunksizeUnis;
    base_hit = UNIHIT;
  } else {
    chunksize = kChunksizeQuads;
    base_hit = QUADHIT;
  }

  int linear_i = 0;
  int linear_off_end = hitbuffer->next_linear;
  int text_i = letter_offset;               // Next unseen text offset
  int next_chunk_start = 0;
  int bases_left = hitbuffer->next_base;
  while (bases_left > 0) {
    // Linearize one chunk
    int base_len = chunksize;     // Default; may be changed below
    if (bases_left < (chunksize + (chunksize >> 1))) {
      // If within 1.5 chunks of the end, avoid runts by using it all
      base_len = bases_left;
    } else if (bases_left < (2 * chunksize)) {
      // Avoid runts by splitting 1.5 to 2 chunks in half (about 3/4 each)
      base_len = (bases_left + 1) >> 1;
    }

    hitbuffer->chunk_start[next_chunk_start] = linear_i;
    hitbuffer->chunk_offset[next_chunk_start] = text_i;
    ++next_chunk_start;

    // Advance past base_len base hits; delta/distinct entries ride along
    int base_count = 0;
    while ((base_count < base_len) && (linear_i < linear_off_end)) {
      if (hitbuffer->linear[linear_i].type == base_hit) {++base_count;}
      ++linear_i;
    }
    text_i = hitbuffer->linear[linear_i].offset;    // Next unseen text offset
    bases_left -= base_len;
  }

  // If no base hits at all, make a single dummy chunk
  if (next_chunk_start == 0) {
     hitbuffer->chunk_start[next_chunk_start] = 0;
     hitbuffer->chunk_offset[next_chunk_start] = hitbuffer->linear[0].offset;
     ++next_chunk_start;
  }

  // Remember the linear array start of dummy entry
  hitbuffer->next_chunk_start = next_chunk_start;

  // Add a dummy entry off the end, just to capture final linear subscr
  hitbuffer->chunk_start[next_chunk_start] = hitbuffer->next_linear;
  hitbuffer->chunk_offset[next_chunk_start] = text_i;
}
  1.1033 +
  1.1034 +
// Merge-sort the individual hit arrays, go indirect on the scoring subscripts,
// break linear array into chunks.
//
// Input:
//  hitbuffer base, delta, distinct arrays
// Output:
//  linear array
//  chunk_start array
//
// Thin wrapper: LinearizeAll does the merge and table lookups, ChunkAll
// splits the result into scoring chunks.
// NOTE: more_to_come is accepted but not used by either callee here.
void LinearizeHitBuffer(int letter_offset,
                        ScoringContext* scoringcontext,
                        bool more_to_come, bool score_cjk,
                        ScoringHitBuffer* hitbuffer) {
  LinearizeAll(scoringcontext, score_cjk, hitbuffer);
  ChunkAll(letter_offset, score_cjk, hitbuffer);
}
  1.1051 +
  1.1052 +
  1.1053 +
// The hitbuffer is in an awkward form -- three sets of base/delta/distinct
// scores, each with an indirect subscript to one of six scoring tables, some
// of which can yield two langprobs for six languages, others one langprob for
// three languages. The only correlation between base/delta/distinct is their
// offsets into the letters-only text buffer.
//
// SummaryBuffer needs to be built to linear, giving linear offset of start of
// each chunk
//
// So we first do all the langprob lookups and merge-sort by offset to make
// a single linear vector, building a side vector of chunk beginnings as we go.
// The sharpening is simply moving the beginnings, scoring is a simple linear
// sweep, etc.

// Score one filled hitbuffer: linearize, chunk, score to a SummaryBuffer,
// optionally sharpen chunk boundaries (only when a per-chunk result vector
// is requested), then fold results into doc_tote and vec.
void ProcessHitBuffer(const LangSpan& scriptspan,
                      int letter_offset,
                      ScoringContext* scoringcontext,
                      DocTote* doc_tote,
                      ResultChunkVector* vec,
                      bool more_to_come, bool score_cjk,
                      ScoringHitBuffer* hitbuffer) {
  if (scoringcontext->flags_cld2_verbose) {
    fprintf(scoringcontext->debug_file, "Hitbuffer[) ");
    DumpHitBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer);
  }

  // Merge-sort the three hit arrays into linear[] and split into chunks
  LinearizeHitBuffer(letter_offset, scoringcontext, more_to_come, score_cjk,
                     hitbuffer);

  if (scoringcontext->flags_cld2_verbose) {
    fprintf(scoringcontext->debug_file, "Linear[) ");
    DumpLinearBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer);
  }

  // Score each chunk to a ChunkSummary in summarybuffer
  SummaryBuffer summarybuffer;
  summarybuffer.n = 0;
  ChunkSpan last_cspan;
  ScoreAllHits(scriptspan.text, scriptspan.ulscript,
                    more_to_come, score_cjk, hitbuffer,
                    scoringcontext,
                    &summarybuffer, &last_cspan);

  if (scoringcontext->flags_cld2_verbose) {
    DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer);
  }

  if (vec != NULL) {
    // Sharpen boundaries of summarybuffer
    // This is not a high-performance path
    SharpenBoundaries(scriptspan.text, more_to_come, hitbuffer, scoringcontext,
                      &summarybuffer);
    // Show after the sharpening
    // CLD2_Debug2(scriptspan.text, more_to_come, score_cjk,
    //             hitbuffer, scoringcontext, &summarybuffer);

    if (scoringcontext->flags_cld2_verbose) {
      DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer);
    }
  }

  // Fold the chunk summaries into the document total and the result vector
  SummaryBufferToDocTote(&summarybuffer, more_to_come, doc_tote);
  SummaryBufferToVector(scoringcontext->scanner, scriptspan.text,
                        &summarybuffer, more_to_come, vec);
}
  1.1118 +
  1.1119 +void SpliceHitBuffer(ScoringHitBuffer* hitbuffer, int next_offset) {
  1.1120 +  // Splice hitbuffer and summarybuffer for next round. With big chunks and
  1.1121 +  // distinctive-word state carried across chunks, we might not need to do this.
  1.1122 +  hitbuffer->next_base = 0;
  1.1123 +  hitbuffer->next_delta = 0;
  1.1124 +  hitbuffer->next_distinct = 0;
  1.1125 +  hitbuffer->next_linear = 0;
  1.1126 +  hitbuffer->next_chunk_start = 0;
  1.1127 +  hitbuffer->lowest_offset = next_offset;
  1.1128 +}
  1.1129 +
  1.1130 +
// Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating
// scoringcontext
//
// For scripts with zero or one possible language there is nothing to detect:
// the whole span is credited to the script's default language with an
// artificial score of 1 per byte and 100% reliability.
void ScoreEntireScriptSpan(const LangSpan& scriptspan,
                           ScoringContext* scoringcontext,
                           DocTote* doc_tote,
                           ResultChunkVector* vec) {
  int bytes = scriptspan.text_bytes;
  // Artificially set score to 1024 per 1KB, or 1 per byte
  int score = bytes;
  int reliability = 100;
  // doc_tote uses full languages
  Language one_one_lang = DefaultLanguage(scriptspan.ulscript);
  doc_tote->Add(one_one_lang, bytes, score, reliability);

  if (scoringcontext->flags_cld2_html) {
    // Build a one-chunk summary purely for debug output
    ChunkSummary chunksummary = {
      1, 0,
      one_one_lang, UNKNOWN_LANGUAGE, score, 1,
      bytes, 0, scriptspan.ulscript, reliability, reliability
    };
    CLD2_Debug(scriptspan.text, 1, scriptspan.text_bytes,
               false, false, NULL,
               scoringcontext, NULL, &chunksummary);
  }

  // First byte is always a space
  JustOneItemToVector(scoringcontext->scanner, scriptspan.text,
                      one_one_lang, 1, bytes - 1, vec);

  scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
}
  1.1162 +
// Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext
//
// Repeatedly fills a hit buffer with CJK unigram and bigram hits and scores
// it, until the whole scriptspan is consumed. The hit buffer in general is
// not big enough for an entire scriptspan, hence the loop.
void ScoreCJKScriptSpan(const LangSpan& scriptspan,
                        ScoringContext* scoringcontext,
                        DocTote* doc_tote,
                        ResultChunkVector* vec) {
  // Allocate three parallel arrays of scoring hits
  // (heap-allocated because ScoringHitBuffer is large)
  ScoringHitBuffer* hitbuffer = new ScoringHitBuffer;
  hitbuffer->init();
  hitbuffer->ulscript = scriptspan.ulscript;

  scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
  scoringcontext->oldest_distinct_boost = 0;

  // Incoming scriptspan has a single leading space at scriptspan.text[0]
  // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3]

  int letter_offset = 1;        // Skip initial space
  hitbuffer->lowest_offset = letter_offset;
  int letter_limit = scriptspan.text_bytes;
  while (letter_offset < letter_limit) {
    if (scoringcontext->flags_cld2_verbose) {
      fprintf(scoringcontext->debug_file, " ScoreCJKScriptSpan[%d,%d)<br>\n",
              letter_offset, letter_limit);
    }
    //
    // Fill up one hitbuffer, possibly splicing onto previous fragment
    //
    // NOTE: GetUniHits deals with close repeats
    // NOTE: After last chunk there is always a hitbuffer entry with an offset
    // just off the end of the text = next_offset.
    int next_offset = GetUniHits(scriptspan.text, letter_offset, letter_limit,
                                  scoringcontext, hitbuffer);
    // NOTE: GetBiHitVectors deals with close repeats,
    // does one hash and two lookups (delta and distinct) per word
    GetBiHits(scriptspan.text, letter_offset, next_offset,
                scoringcontext, hitbuffer);

    //
    // Score one hitbuffer in chunks to summarybuffer
    //
    bool more_to_come = next_offset < letter_limit;
    bool score_cjk = true;
    ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec,
                     more_to_come, score_cjk, hitbuffer);
    SpliceHitBuffer(hitbuffer, next_offset);

    letter_offset = next_offset;
  }

  delete hitbuffer;
  // Context across buffers is not connected yet
  scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
}
  1.1216 +
  1.1217 +
  1.1218 +
// Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext
// We have a scriptspan with all lowercase text in one script. Look up
// quadgrams and octagrams, saving the hits in three parallel vectors.
// Score from those vectors in chunks, toting each chunk to get a single
// language, and combining into the overall document score. The hit vectors
// in general are not big enough to handle an entire scriptspan, so
// repeat until the entire scriptspan is scored.
// Caller deals with minimizing number of runt scriptspans
// This routine deals with minimizing number of runt chunks.
//
// Returns updated scoringcontext
// Returns updated doc_tote
// If vec != NULL, appends to that vector of ResultChunk's
void ScoreQuadScriptSpan(const LangSpan& scriptspan,
                         ScoringContext* scoringcontext,
                         DocTote* doc_tote,
                         ResultChunkVector* vec) {
  // Allocate three parallel arrays of scoring hits
  // (heap-allocated because ScoringHitBuffer is large)
  ScoringHitBuffer* hitbuffer = new ScoringHitBuffer;
  hitbuffer->init();
  hitbuffer->ulscript = scriptspan.ulscript;

  scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
  scoringcontext->oldest_distinct_boost = 0;

  // Incoming scriptspan has a single leading space at scriptspan.text[0]
  // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3]

  int letter_offset = 1;        // Skip initial space
  hitbuffer->lowest_offset = letter_offset;
  int letter_limit = scriptspan.text_bytes;
  while (letter_offset < letter_limit) {
    //
    // Fill up one hitbuffer, possibly splicing onto previous fragment
    //
    // NOTE: GetQuadHits deals with close repeats
    // NOTE: After last chunk there is always a hitbuffer entry with an offset
    // just off the end of the text = next_offset.
    int next_offset = GetQuadHits(scriptspan.text, letter_offset, letter_limit,
                                  scoringcontext, hitbuffer);
    // If true, there is more text to process in this scriptspan
    // NOTE: GetOctaHitVectors deals with close repeats,
    // does one hash and two lookups (delta and distinct) per word
    GetOctaHits(scriptspan.text, letter_offset, next_offset,
                scoringcontext, hitbuffer);

    //
    // Score one hitbuffer in chunks to summarybuffer
    //
    bool more_to_come = next_offset < letter_limit;
    bool score_cjk = false;
    ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec,
                     more_to_come, score_cjk, hitbuffer);
    SpliceHitBuffer(hitbuffer, next_offset);

    letter_offset = next_offset;
  }

  delete hitbuffer;
}
  1.1279 +
  1.1280 +
  1.1281 +// Score one scriptspan into doc_tote and vec, updating scoringcontext
  1.1282 +// Inputs:
  1.1283 +//  One scriptspan of perhaps 40-60KB, all same script lower-case letters
  1.1284 +//    and single ASCII spaces. First character is a space to allow simple
  1.1285 +//    begining-of-word detect. End of buffer has three spaces and NUL to
  1.1286 +//    allow easy scan-to-end-of-word.
  1.1287 +//  Scoring context of
  1.1288 +//    scoring tables
  1.1289 +//    flags
  1.1290 +//    running boosts
  1.1291 +// Outputs:
  1.1292 +//  Updated doc_tote giving overall languages and byte counts
  1.1293 +//  Optional updated chunk vector giving offset, length, language
  1.1294 +//
  1.1295 +// Caller initializes flags, boosts, doc_tote and vec.
  1.1296 +// Caller aggregates across multiple scriptspans
  1.1297 +// Caller calculates final document result
  1.1298 +// Caller deals with detecting and triggering suppression of repeated text.
  1.1299 +//
  1.1300 +// This top-level routine just chooses the recognition type and calls one of
  1.1301 +// the next-level-down routines.
  1.1302 +//
  1.1303 +void ScoreOneScriptSpan(const LangSpan& scriptspan,
  1.1304 +                        ScoringContext* scoringcontext,
  1.1305 +                        DocTote* doc_tote,
  1.1306 +                        ResultChunkVector* vec) {
  1.1307 +  if (scoringcontext->flags_cld2_verbose) {
  1.1308 +    fprintf(scoringcontext->debug_file, "<br>ScoreOneScriptSpan(%s,%d) ",
  1.1309 +            ULScriptCode(scriptspan.ulscript), scriptspan.text_bytes);
  1.1310 +    // Optionally print the chunk lowercase letters/marks text
  1.1311 +    string temp(&scriptspan.text[0], scriptspan.text_bytes);
  1.1312 +    fprintf(scoringcontext->debug_file, "'%s'",
  1.1313 +            GetHtmlEscapedText(temp).c_str());
  1.1314 +    fprintf(scoringcontext->debug_file, "<br>\n");
  1.1315 +  }
  1.1316 +  scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
  1.1317 +  scoringcontext->oldest_distinct_boost = 0;
  1.1318 +  ULScriptRType rtype = ULScriptRecognitionType(scriptspan.ulscript);
  1.1319 +  if (scoringcontext->flags_cld2_score_as_quads && (rtype != RTypeCJK)) {
  1.1320 +    rtype = RTypeMany;
  1.1321 +  }
  1.1322 +  switch (rtype) {
  1.1323 +  case RTypeNone:
  1.1324 +  case RTypeOne:
  1.1325 +    ScoreEntireScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
  1.1326 +    break;
  1.1327 +  case RTypeCJK:
  1.1328 +    ScoreCJKScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
  1.1329 +    break;
  1.1330 +  case RTypeMany:
  1.1331 +    ScoreQuadScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
  1.1332 +    break;
  1.1333 +  }
  1.1334 +}
  1.1335 +
  1.1336 +}       // End namespace CLD2
  1.1337 +

mercurial