browser/components/translation/cld2/internal/scoreonescriptspan.cc

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 // Copyright 2013 Google Inc. All Rights Reserved.
     2 //
     3 // Licensed under the Apache License, Version 2.0 (the "License");
     4 // you may not use this file except in compliance with the License.
     5 // You may obtain a copy of the License at
     6 //
     7 //     http://www.apache.org/licenses/LICENSE-2.0
     8 //
     9 // Unless required by applicable law or agreed to in writing, software
    10 // distributed under the License is distributed on an "AS IS" BASIS,
    11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12 // See the License for the specific language governing permissions and
    13 // limitations under the License.
    15 //
    16 // Author: dsites@google.com (Dick Sites)
    17 // Updated 2014.01 for dual table lookup
    18 //
    20 #include "scoreonescriptspan.h"
    22 #include "cldutil.h"
    23 #include "debug.h"
    24 #include "lang_script.h"
    26 #include <stdio.h>
    28 using namespace std;
    30 namespace CLD2 {
    32 static const int kUnreliablePercentThreshold = 75;  // Chunk reliability percentages below this are treated as unreliable in SummaryBufferToVector
    34 void AddLangProb(uint32 langprob, Tote* chunk_tote) {
    35   ProcessProbV2Tote(langprob, chunk_tote);
    36 }
    38 void ZeroPSLang(uint32 langprob, Tote* chunk_tote) {
    39   uint8 top1 = (langprob >> 8) & 0xff;
    40   chunk_tote->SetScore(top1, 0);
    41 }
    43 bool SameCloseSet(uint16 lang1, uint16 lang2) {
    44   int lang1_close_set = LanguageCloseSet(static_cast<Language>(lang1));
    45   if (lang1_close_set == 0) {return false;}
    46   int lang2_close_set = LanguageCloseSet(static_cast<Language>(lang2));
    47   return (lang1_close_set == lang2_close_set);
    48 }
    50 bool SameCloseSet(Language lang1, Language lang2) {
    51   int lang1_close_set = LanguageCloseSet(lang1);
    52   if (lang1_close_set == 0) {return false;}
    53   int lang2_close_set = LanguageCloseSet(lang2);
    54   return (lang1_close_set == lang2_close_set);
    55 }
    58 // Needs expected score per 1KB in scoring context
    59 void SetChunkSummary(ULScript ulscript, int first_linear_in_chunk,
    60                      int offset, int len,
    61                      const ScoringContext* scoringcontext,
    62                      const Tote* chunk_tote,
    63                      ChunkSummary* chunksummary) {
    64   int key3[3];
    65   chunk_tote->CurrentTopThreeKeys(key3);
    66   Language lang1 = FromPerScriptNumber(ulscript, key3[0]);
    67   Language lang2 = FromPerScriptNumber(ulscript, key3[1]);
    69   int actual_score_per_kb = 0;
    70   if (len > 0) {
    71     actual_score_per_kb = (chunk_tote->GetScore(key3[0]) << 10) / len;
    72   }
    73   int expected_subscr = lang1 * 4 + LScript4(ulscript);
    74   int expected_score_per_kb =
    75      scoringcontext->scoringtables->kExpectedScore[expected_subscr];
    77   chunksummary->offset = offset;
    78   chunksummary->chunk_start = first_linear_in_chunk;
    79   chunksummary->lang1 = lang1;
    80   chunksummary->lang2 = lang2;
    81   chunksummary->score1 = chunk_tote->GetScore(key3[0]);
    82   chunksummary->score2 = chunk_tote->GetScore(key3[1]);
    83   chunksummary->bytes = len;
    84   chunksummary->grams = chunk_tote->GetScoreCount();
    85   chunksummary->ulscript = ulscript;
    86   chunksummary->reliability_delta = ReliabilityDelta(chunksummary->score1,
    87                                                      chunksummary->score2,
    88                                                      chunksummary->grams);
    89   // If lang1/lang2 in same close set, set delta reliability to 100%
    90   if (SameCloseSet(lang1, lang2)) {
    91     chunksummary->reliability_delta = 100;
    92   }
    93   chunksummary->reliability_score =
    94      ReliabilityExpected(actual_score_per_kb, expected_score_per_kb);
    95 }
    97 // Return true if just lang1 is there: lang2=0 and lang3=0
    98 bool IsSingleLang(uint32 langprob) {
    99   // Probably a bug -- which end is lang1? But only used to call empty Boost1
   100   return ((langprob & 0x00ffff00) == 0);
   101 }
   103 // Update scoring context distinct_boost for single language quad
   104 void AddDistinctBoost1(uint32 langprob, ScoringContext* scoringcontext) {
   105   // Probably keep this empty -- not a good enough signal
   106 }
   108 // Update scoring context distinct_boost for distinct octagram
   109 // Keep last 4 used. Since these are mostly (except at splices) in
   110 // hitbuffer, we might be able to just use a subscript and splice
   111 void AddDistinctBoost2(uint32 langprob, ScoringContext* scoringcontext) {
   112 // this is called 0..n times per chunk with decoded hitbuffer->distinct...
   113   LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
   114   if (scoringcontext->ulscript != ULScript_Latin) {
   115     distinct_boost = &scoringcontext->distinct_boost.othr;
   116   }
   117   int n = distinct_boost->n;
   118   distinct_boost->langprob[n] = langprob;
   119   distinct_boost->n = distinct_boost->wrap(n + 1);
   120 }
   122 // For each chunk, add extra weight for language priors (from content-lang and
   123 // meta lang=xx) and distinctive tokens
   124 void ScoreBoosts(const ScoringContext* scoringcontext, Tote* chunk_tote) {
   125   // Get boosts for current script
   126   const LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
   127   const LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
   128   const LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
   129   if (scoringcontext->ulscript != ULScript_Latin) {
   130     langprior_boost = &scoringcontext->langprior_boost.othr;
   131     langprior_whack = &scoringcontext->langprior_whack.othr;
   132     distinct_boost = &scoringcontext->distinct_boost.othr;
   133   }
   135   for (int k = 0; k < kMaxBoosts; ++k) {
   136     uint32 langprob = langprior_boost->langprob[k];
   137     if (langprob > 0) {AddLangProb(langprob, chunk_tote);}
   138   }
   139   for (int k = 0; k < kMaxBoosts; ++k) {
   140     uint32 langprob = distinct_boost->langprob[k];
   141     if (langprob > 0) {AddLangProb(langprob, chunk_tote);}
   142   }
   143   // boost has a packed set of per-script langs and probabilites
   144   // whack has a packed set of per-script lang to be suppressed (zeroed)
   145   // When a language in a close set is given as an explicit hint, others in
   146   //  that set will be whacked here.
   147   for (int k = 0; k < kMaxBoosts; ++k) {
   148     uint32 langprob = langprior_whack->langprob[k];
   149     if (langprob > 0) {ZeroPSLang(langprob, chunk_tote);}
   150   }
   151 }
   155 // At this point, The chunk is described by
   156 //  hitbuffer->base[cspan->chunk_base .. cspan->chunk_base + cspan->base_len)
   157 //  hitbuffer->delta[cspan->chunk_delta ... )
   158 //  hitbuffer->distinct[cspan->chunk_distinct ... )
   159 // Scored text is in text[lo..hi) where
   160 //  lo is 0 or the min of first base/delta/distinct hitbuffer offset and
   161 //  hi is the min of next base/delta/distinct hitbuffer offset after
   162 //  base_len, etc.
   163 void GetTextSpanOffsets(const ScoringHitBuffer* hitbuffer,
   164                         const ChunkSpan* cspan, int* lo, int* hi) {
   165   // Front of this span
   166   int lo_base = hitbuffer->base[cspan->chunk_base].offset;
   167   int lo_delta = hitbuffer->delta[cspan->chunk_delta].offset;
   168   int lo_distinct = hitbuffer->distinct[cspan->chunk_distinct].offset;
   169   // Front of next span
   170   int hi_base = hitbuffer->base[cspan->chunk_base +
   171     cspan->base_len].offset;
   172   int hi_delta = hitbuffer->delta[cspan->chunk_delta +
   173     cspan->delta_len].offset;
   174   int hi_distinct = hitbuffer->distinct[cspan->chunk_distinct +
   175     cspan->distinct_len].offset;
   177   *lo = 0;
   178 //  if (cspan->chunk_base > 0) {
   179 //    *lo = minint(minint(lo_base, lo_delta), lo_distinct);
   180 //  }
   181   *lo = minint(minint(lo_base, lo_delta), lo_distinct);
   182   *hi = minint(minint(hi_base, hi_delta), hi_distinct);
   183 }
   186 int DiffScore(const CLD2TableSummary* obj, int indirect,
   187               uint16 lang1, uint16 lang2) {
   188   if (indirect < static_cast<int>(obj->kCLDTableSizeOne)) {
   189     // Up to three languages at indirect
   190     uint32 langprob = obj->kCLDTableInd[indirect];
   191     return GetLangScore(langprob, lang1) - GetLangScore(langprob, lang2);
   192   } else {
   193     // Up to six languages at start + 2 * (indirect - start)
   194     indirect += (indirect - obj->kCLDTableSizeOne);
   195     uint32 langprob = obj->kCLDTableInd[indirect];
   196     uint32 langprob2 = obj->kCLDTableInd[indirect + 1];
   197     return (GetLangScore(langprob, lang1) + GetLangScore(langprob2, lang1)) -
   198       (GetLangScore(langprob, lang2) + GetLangScore(langprob2, lang2));
   199   }
   201 }
   203 // Score all the bases, deltas, distincts, boosts for one chunk into chunk_tote
   204 // After last chunk there is always a hitbuffer entry with an offset just off
   205 // the end of the text.
   206 // Sets delta_len, and distinct_len
   207 void ScoreOneChunk(const char* text, ULScript ulscript,
   208                    const ScoringHitBuffer* hitbuffer,
   209                    int chunk_i,
   210                    ScoringContext* scoringcontext,
   211                    ChunkSpan* cspan, Tote* chunk_tote,
   212                    ChunkSummary* chunksummary) {
   213   int first_linear_in_chunk = hitbuffer->chunk_start[chunk_i];
   214   int first_linear_in_next_chunk = hitbuffer->chunk_start[chunk_i + 1];
   216   chunk_tote->Reinit();
   217   cspan->delta_len = 0;
   218   cspan->distinct_len = 0;
   219   if (scoringcontext->flags_cld2_verbose) {
   220     fprintf(scoringcontext->debug_file, "<br>ScoreOneChunk[%d..%d) ",
   221             first_linear_in_chunk, first_linear_in_next_chunk);
   222   }
   224   // 2013.02.05 linear design: just use base and base_len for the span
   225   cspan->chunk_base = first_linear_in_chunk;
   226   cspan->base_len = first_linear_in_next_chunk - first_linear_in_chunk;
   227   for (int i = first_linear_in_chunk; i < first_linear_in_next_chunk; ++i) {
   228     uint32 langprob = hitbuffer->linear[i].langprob;
   229     AddLangProb(langprob, chunk_tote);
   230     if (hitbuffer->linear[i].type <= QUADHIT) {
   231       chunk_tote->AddScoreCount();      // Just count quads, not octas
   232     }
   233     if (hitbuffer->linear[i].type == DISTINCTHIT) {
   234       AddDistinctBoost2(langprob, scoringcontext);
   235     }
   236   }
   238   // Score language prior boosts
   239   // Score distinct word boost
   240   ScoreBoosts(scoringcontext, chunk_tote);
   242   int lo = hitbuffer->linear[first_linear_in_chunk].offset;
   243   int hi = hitbuffer->linear[first_linear_in_next_chunk].offset;
   245   // Chunk_tote: get top langs, scores, etc. and fill in chunk summary
   246   SetChunkSummary(ulscript, first_linear_in_chunk, lo, hi - lo,
   247                   scoringcontext, chunk_tote, chunksummary);
   249   bool more_to_come = false;
   250   bool score_cjk = false;
   251   if (scoringcontext->flags_cld2_html) {
   252     // Show one chunk in readable output
   253     CLD2_Debug(text, lo, hi, more_to_come, score_cjk, hitbuffer,
   254                scoringcontext, cspan, chunksummary);
   255   }
   257   scoringcontext->prior_chunk_lang = static_cast<Language>(chunksummary->lang1);
   258 }
   261 // Score chunks of text described by hitbuffer, allowing each to be in a
   262 // different language, and optionally adjusting the boundaries inbetween.
   263 // Set last_cspan to the last chunkspan used
   264 void ScoreAllHits(const char* text,  ULScript ulscript,
   265                   bool more_to_come, bool score_cjk,
   266                   const ScoringHitBuffer* hitbuffer,
   267                   ScoringContext* scoringcontext,
   268                   SummaryBuffer* summarybuffer, ChunkSpan* last_cspan) {
   269   ChunkSpan prior_cspan = {0, 0, 0, 0, 0, 0};
   270   ChunkSpan cspan = {0, 0, 0, 0, 0, 0};
   272   for (int i = 0; i < hitbuffer->next_chunk_start; ++i) {
   273     // Score one chunk
   274     // Sets delta_len, and distinct_len
   275     Tote chunk_tote;
   276     ChunkSummary chunksummary;
   277     ScoreOneChunk(text, ulscript,
   278                   hitbuffer, i,
   279                   scoringcontext, &cspan, &chunk_tote, &chunksummary);
   281     // Put result in summarybuffer
   282     if (summarybuffer->n < kMaxSummaries) {
   283       summarybuffer->chunksummary[summarybuffer->n] = chunksummary;
   284       summarybuffer->n += 1;
   285     }
   287     prior_cspan = cspan;
   288     cspan.chunk_base += cspan.base_len;
   289     cspan.chunk_delta += cspan.delta_len;
   290     cspan.chunk_distinct += cspan.distinct_len;
   291   }
   293   // Add one dummy off the end to hold first unused linear_in_chunk
   294   int linear_off_end = hitbuffer->next_linear;
   295   int offset_off_end = hitbuffer->linear[linear_off_end].offset;
   296   ChunkSummary* cs = &summarybuffer->chunksummary[summarybuffer->n];
   297   memset(cs, 0, sizeof(ChunkSummary));
   298   cs->offset = offset_off_end;
   299   cs->chunk_start = linear_off_end;
   300   *last_cspan = prior_cspan;
   301 }
   304 void SummaryBufferToDocTote(const SummaryBuffer* summarybuffer,
   305                             bool more_to_come, DocTote* doc_tote) {
   306   int cs_bytes_sum = 0;
   307   for (int i = 0; i < summarybuffer->n; ++i) {
   308     const ChunkSummary* cs = &summarybuffer->chunksummary[i];
   309     int reliability = minint(cs->reliability_delta, cs->reliability_score);
   310     // doc_tote uses full languages
   311     doc_tote->Add(cs->lang1, cs->bytes, cs->score1, reliability);
   312     cs_bytes_sum += cs->bytes;
   313   }
   314 }
   316 // Turn on for debugging vectors: when true, the *ToVector routines in this file print chunk text and offsets to stderr
   317 static const bool kShowLettersOriginal = false;
   320 // If next chunk language matches last vector language, extend last element
   321 // Otherwise add new element to vector
   322 void ItemToVector(ScriptScanner* scanner,
   323                   ResultChunkVector* vec, Language new_lang,
   324                   int mapped_offset, int mapped_len) {
   325   uint16 last_vec_lang = static_cast<uint16>(UNKNOWN_LANGUAGE);
   326   int last_vec_subscr = vec->size() - 1;
   327   if (last_vec_subscr >= 0) {
   328     ResultChunk* priorrc = &(*vec)[last_vec_subscr];
   329     last_vec_lang = priorrc->lang1;
   330     if (new_lang == last_vec_lang) {
   331       // Extend prior. Current mapped_offset may be beyond prior end, so do
   332       // the arithmetic to include any such gap
   333       priorrc->bytes = minint((mapped_offset + mapped_len) - priorrc->offset,
   334                               kMaxResultChunkBytes);
   335       if (kShowLettersOriginal) {
   336         // Optionally print the new chunk original text
   337         string temp2(&scanner->GetBufferStart()[priorrc->offset],
   338                      priorrc->bytes);
   339         fprintf(stderr, "Item[%d..%d) '%s'<br>\n",
   340                 priorrc->offset, priorrc->offset + priorrc->bytes,
   341                 GetHtmlEscapedText(temp2).c_str());
   342       }
   343       return;
   344     }
   345   }
   346   // Add new vector element
   347   ResultChunk rc;
   348   rc.offset = mapped_offset;
   349   rc.bytes = minint(mapped_len, kMaxResultChunkBytes);
   350   rc.lang1 = static_cast<uint16>(new_lang);
   351   vec->push_back(rc);
   352   if (kShowLettersOriginal) {
   353     // Optionally print the new chunk original text
   354     string temp2(&scanner->GetBufferStart()[rc.offset], rc.bytes);
   355     fprintf(stderr, "Item[%d..%d) '%s'<br>\n",
   356             rc.offset, rc.offset + rc.bytes,
   357             GetHtmlEscapedText(temp2).c_str());
   358   }
   359 }
   361 uint16 PriorVecLang(const ResultChunkVector* vec) {
   362   if (vec->empty()) {return static_cast<uint16>(UNKNOWN_LANGUAGE);}
   363   return (*vec)[vec->size() - 1].lang1;
   364 }
   366 uint16 NextChunkLang(const SummaryBuffer* summarybuffer, int i) {
   367   if ((i + 1) >= summarybuffer->n) {
   368     return static_cast<uint16>(UNKNOWN_LANGUAGE);
   369   }
   370   return summarybuffer->chunksummary[i + 1].lang1;
   371 }
   375 // Add n elements of summarybuffer to resultchunk vector:
   376 // Each element is letters-only text [offset..offset+bytes)
   377 // This maps back to original[Back(offset)..Back(offset+bytes))
   378 //
   379 // We go out of our way to minimize the variation in the ResultChunkVector,
   380 // so that the caller has fewer but more meaningful spans in different
   381 // lanaguges, for the likely purpose of translation or spell-check.
   382 //
   383 // The language of each chunk is lang1, but it might be unreliable for
   384 // either of two reasons: its score is relatively too close to the score of
   385 // lang2, or its score is too far away from the expected score of real text in
   386 // the given language. Unreliable languages are mapped to Unknown.
   387 //
   388 void SummaryBufferToVector(ScriptScanner* scanner, const char* text,
   389                            const SummaryBuffer* summarybuffer,
   390                            bool more_to_come, ResultChunkVector* vec) {
   391   if (vec == NULL) {return;}
   393   if (kShowLettersOriginal) {
   394     fprintf(stderr, "map2original_ ");
   395     scanner->map2original_.DumpWindow();
   396     fprintf(stderr, "<br>\n");
   397     fprintf(stderr, "map2uplow_ ");
   398     scanner->map2uplow_.DumpWindow();
   399     fprintf(stderr, "<br>\n");
   400   }
   402   for (int i = 0; i < summarybuffer->n; ++i) {
   403     const ChunkSummary* cs = &summarybuffer->chunksummary[i];
   404     int unmapped_offset = cs->offset;
   405     int unmapped_len = cs->bytes;
   407     if (kShowLettersOriginal) {
   408       // Optionally print the chunk lowercase letters/marks text
   409       string temp(&text[unmapped_offset], unmapped_len);
   410       fprintf(stderr, "Letters [%d..%d) '%s'<br>\n",
   411               unmapped_offset, unmapped_offset + unmapped_len,
   412               GetHtmlEscapedText(temp).c_str());
   413     }
   415     int mapped_offset = scanner->MapBack(unmapped_offset);
   417     // Trim back a little to prefer splicing original at word boundaries
   418     if (mapped_offset > 0) {
   419       // Size of prior vector entry, if any
   420       int prior_size = 0;
   421       if (!vec->empty()) {
   422         ResultChunk* rc = &(*vec)[vec->size() - 1];
   423         prior_size = rc->bytes;
   424       }
   425       // Maximum back up size to leave at least 3 bytes in prior,
   426       // and not entire buffer, and no more than 12 bytes total backup
   427       int n_limit = minint(prior_size - 3, mapped_offset);
   428       n_limit = minint(n_limit, 12);
   430       // Backscan over letters, stopping if prior byte is < 0x41
   431       // There is some possibility that we will backscan over a different script
   432       const char* s = &scanner->GetBufferStart()[mapped_offset];
   433       const unsigned char* us = reinterpret_cast<const unsigned char*>(s);
   434       int n = 0;
   435       while ((n < n_limit) && (us[-n - 1] >= 0x41)) {++n;}
   436       if (n >= n_limit) {n = 0;} // New boundary not found within range
   438       // Also back up exactly one leading punctuation character if '"#@
   439       if (n < n_limit) {
   440         unsigned char c = us[-n - 1];
   441         if ((c == '\'') || (c == '"') || (c == '#') || (c == '@')) {++n;}
   442       }
   443       // Shrink the previous chunk slightly
   444       if (n > 0) {
   445         ResultChunk* rc = &(*vec)[vec->size() - 1];
   446         rc->bytes -= n;
   447         mapped_offset -= n;
   448         if (kShowLettersOriginal) {
   449           fprintf(stderr, "Back up %d bytes<br>\n", n);
   450           // Optionally print the prior chunk original text
   451           string temp2(&scanner->GetBufferStart()[rc->offset], rc->bytes);
   452           fprintf(stderr, "Prior   [%d..%d) '%s'<br>\n",
   453                   rc->offset, rc->offset + rc->bytes,
   454                   GetHtmlEscapedText(temp2).c_str());
   455         }
   456       }
   457     }
   459     int mapped_len =
   460       scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset;
   462     if (kShowLettersOriginal) {
   463       // Optionally print the chunk original text
   464       string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len);
   465       fprintf(stderr, "Original[%d..%d) '%s'<br>\n",
   466               mapped_offset, mapped_offset + mapped_len,
   467               GetHtmlEscapedText(temp2).c_str());
   468     }
   470     Language new_lang = static_cast<Language>(cs->lang1);
   471     bool reliability_delta_bad =
   472       (cs->reliability_delta < kUnreliablePercentThreshold);
   473     bool reliability_score_bad =
   474       (cs->reliability_score < kUnreliablePercentThreshold);
   476     // If the top language matches last vector, ignore reliability_delta
   477     uint16 prior_lang = PriorVecLang(vec);
   478     if (prior_lang == cs->lang1) {
   479       reliability_delta_bad = false;
   480     }
   481     // If the top language is in same close set as last vector, set up to merge
   482     if (SameCloseSet(cs->lang1, prior_lang)) {
   483       new_lang = static_cast<Language>(prior_lang);
   484       reliability_delta_bad = false;
   485     }
   486     // If the top two languages are in the same close set and the last vector
   487     // language is the second language, set up to merge
   488     if (SameCloseSet(cs->lang1, cs->lang2) &&
   489         (prior_lang == cs->lang2)) {
   490       new_lang = static_cast<Language>(prior_lang);
   491       reliability_delta_bad = false;
   492     }
   493     // If unreliable and the last and next vector languages are both
   494     // the second language, set up to merge
   495     uint16 next_lang = NextChunkLang(summarybuffer, i);
   496     if (reliability_delta_bad &&
   497         (prior_lang == cs->lang2) && (next_lang == cs->lang2)) {
   498       new_lang = static_cast<Language>(prior_lang);
   499       reliability_delta_bad = false;
   500     }
   502     if (reliability_delta_bad || reliability_score_bad) {
   503       new_lang = UNKNOWN_LANGUAGE;
   504     }
   505     ItemToVector(scanner, vec, new_lang, mapped_offset, mapped_len);
   506   }
   507 }
   509 // Add just one element to resultchunk vector:
   510 // For RTypeNone or RTypeOne
   511 void JustOneItemToVector(ScriptScanner* scanner, const char* text,
   512                          Language lang1, int unmapped_offset, int unmapped_len,
   513                          ResultChunkVector* vec) {
   514   if (vec == NULL) {return;}
   516   if (kShowLettersOriginal) {
   517     fprintf(stderr, "map2original_ ");
   518     scanner->map2original_.DumpWindow();
   519     fprintf(stderr, "<br>\n");
   520     fprintf(stderr, "map2uplow_ ");
   521     scanner->map2uplow_.DumpWindow();
   522     fprintf(stderr, "<br>\n");
   523   }
   525   if (kShowLettersOriginal) {
   526    // Optionally print the chunk lowercase letters/marks text
   527    string temp(&text[unmapped_offset], unmapped_len);
   528    fprintf(stderr, "Letters1 [%d..%d) '%s'<br>\n",
   529            unmapped_offset, unmapped_offset + unmapped_len,
   530            GetHtmlEscapedText(temp).c_str());
   531   }
   533   int mapped_offset = scanner->MapBack(unmapped_offset);
   534   int mapped_len =
   535     scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset;
   537   if (kShowLettersOriginal) {
   538     // Optionally print the chunk original text
   539     string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len);
   540     fprintf(stderr, "Original1[%d..%d) '%s'<br>\n",
   541             mapped_offset, mapped_offset + mapped_len,
   542             GetHtmlEscapedText(temp2).c_str());
   543   }
   545   ItemToVector(scanner, vec, lang1, mapped_offset, mapped_len);
   546 }
   549 // Debugging. Not thread safe. Defined in getonescriptspan. Result is printed with "%s" by the Dump* routines in this file.
   550 char* DisplayPiece(const char* next_byte_, int byte_length_);  // NOTE(review): presumably a printable rendering of byte_length_ bytes at next_byte_ -- confirm in getonescriptspan
   552 // If high bit is on, take out high bit and add 2B to make table2 entries easy
   553 inline int PrintableIndirect(int x) {
   554   if ((x & 0x80000000u) != 0) {
   555     return (x & ~0x80000000u) + 2000000000;
   556   }
   557   return x;
   558 }
   559 void DumpHitBuffer(FILE* df, const char* text,
   560                    const ScoringHitBuffer* hitbuffer) {
   561   fprintf(df,
   562           "<br>DumpHitBuffer[%s, next_base/delta/distinct %d, %d, %d)<br>\n",
   563           ULScriptCode(hitbuffer->ulscript),
   564           hitbuffer->next_base, hitbuffer->next_delta,
   565           hitbuffer->next_distinct);
   566   for (int i = 0; i < hitbuffer->maxscoringhits; ++i) {
   567     if (i < hitbuffer->next_base) {
   568       fprintf(df, "Q[%d]%d,%d,%s ",
   569               i, hitbuffer->base[i].offset,
   570               PrintableIndirect(hitbuffer->base[i].indirect),
   571               DisplayPiece(&text[hitbuffer->base[i].offset], 6));
   572     }
   573     if (i < hitbuffer->next_delta) {
   574       fprintf(df, "DL[%d]%d,%d,%s ",
   575               i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,
   576               DisplayPiece(&text[hitbuffer->delta[i].offset], 12));
   577     }
   578     if (i < hitbuffer->next_distinct) {
   579       fprintf(df, "D[%d]%d,%d,%s ",
   580               i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect,
   581               DisplayPiece(&text[hitbuffer->distinct[i].offset], 12));
   582     }
   583     if (i < hitbuffer->next_base) {
   584       fprintf(df, "<br>\n");
   585     }
   586     if (i > 50) {break;}
   587   }
   588   if (hitbuffer->next_base > 50) {
   589     int i = hitbuffer->next_base;
   590     fprintf(df, "Q[%d]%d,%d,%s ",
   591             i, hitbuffer->base[i].offset,
   592             PrintableIndirect(hitbuffer->base[i].indirect),
   593             DisplayPiece(&text[hitbuffer->base[i].offset], 6));
   594   }
   595   if (hitbuffer->next_delta > 50) {
   596     int i = hitbuffer->next_delta;
   597     fprintf(df, "DL[%d]%d,%d,%s ",
   598             i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,
   599             DisplayPiece(&text[hitbuffer->delta[i].offset], 12));
   600   }
   601   if (hitbuffer->next_distinct > 50) {
   602     int i = hitbuffer->next_distinct;
   603     fprintf(df, "D[%d]%d,%d,%s ",
   604             i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect,
   605             DisplayPiece(&text[hitbuffer->distinct[i].offset], 12));
   606   }
   607   fprintf(df, "<br>\n");
   608 }
   611 void DumpLinearBuffer(FILE* df, const char* text,
   612                       const ScoringHitBuffer* hitbuffer) {
   613   fprintf(df, "<br>DumpLinearBuffer[%d)<br>\n",
   614           hitbuffer->next_linear);
   615   // Include the dummy entry off the end
   616   for (int i = 0; i < hitbuffer->next_linear + 1; ++i) {
   617     if ((50 < i) && (i < (hitbuffer->next_linear - 1))) {continue;}
   618     fprintf(df, "[%d]%d,%c=%08x,%s<br>\n",
   619             i, hitbuffer->linear[i].offset,
   620             "UQLD"[hitbuffer->linear[i].type],
   621             hitbuffer->linear[i].langprob,
   622             DisplayPiece(&text[hitbuffer->linear[i].offset], 6));
   623   }
   624   fprintf(df, "<br>\n");
   626   fprintf(df, "DumpChunkStart[%d]<br>\n", hitbuffer->next_chunk_start);
   627   for (int i = 0; i < hitbuffer->next_chunk_start + 1; ++i) {
   628     fprintf(df, "[%d]%d\n", i, hitbuffer->chunk_start[i]);
   629   }
   630   fprintf(df, "<br>\n");
   631 }
   633 // Move this verbose debugging output to debug.cc eventually
   634 void DumpChunkSummary(FILE* df, const ChunkSummary* cs) {
   635   // Print chunksummary
   636   fprintf(df, "%d lin[%d] %s.%d %s.%d %dB %d# %s %dRd %dRs<br>\n",
   637           cs->offset,
   638           cs->chunk_start,
   639           LanguageCode(static_cast<Language>(cs->lang1)),
   640           cs->score1,
   641           LanguageCode(static_cast<Language>(cs->lang2)),
   642           cs->score2,
   643           cs->bytes,
   644           cs->grams,
   645           ULScriptCode(static_cast<ULScript>(cs->ulscript)),
   646           cs->reliability_delta,
   647           cs->reliability_score);
   648 }
   650 void DumpSummaryBuffer(FILE* df, const SummaryBuffer* summarybuffer) {
   651   fprintf(df, "<br>DumpSummaryBuffer[%d]<br>\n", summarybuffer->n);
   652   fprintf(df, "[i] offset linear[chunk_start] lang.score1 lang.score2 "
   653               "bytesB ngrams# script rel_delta rel_score<br>\n");
   654   for (int i = 0; i <= summarybuffer->n; ++i) {
   655     fprintf(df, "[%d] ", i);
   656     DumpChunkSummary(df, &summarybuffer->chunksummary[i]);
   657   }
   658   fprintf(df, "<br>\n");
   659 }
   663 // Within hitbuffer->linear[]
   664 // <-- prior chunk --><-- this chunk -->
   665 // |                  |                 |
   666 // linear0            linear1           linear2
   667 //     lang0              lang1
   668 // The goal of sharpening is to move this_linear to better separate langs
   669 int BetterBoundary(const char* text,
   670                    ScoringHitBuffer* hitbuffer,
   671                    ScoringContext* scoringcontext,
   672                    uint16 pslang0, uint16 pslang1,
   673                    int linear0, int linear1, int linear2) {
   674   // Degenerate case, no change
   675   if ((linear2 - linear0) <= 8) {return linear1;}
   677   // Each diff gives pslang0 score - pslang1 score
   678   // Running diff has four entries + + + + followed by four entries - - - -
   679   // so that this value is maximal at the sharpest boundary between pslang0
   680   // (positive diffs) and pslang1 (negative diffs)
   681   int running_diff = 0;
   682   int diff[8];    // Ring buffer of pslang0-pslang1 differences
   683   // Initialize with first 8 diffs
   684   for (int i = linear0; i < linear0 + 8; ++i) {
   685     int j = i & 7;
   686     uint32 langprob = hitbuffer->linear[i].langprob;
   687     diff[j] = GetLangScore(langprob, pslang0) -
   688        GetLangScore(langprob, pslang1);
   689     if (i < linear0 + 4) {
   690       // First four diffs pslang0 - pslang1
   691       running_diff += diff[j];
   692     } else {
   693       // Second four diffs -(pslang0 - pslang1)
   694       running_diff -= diff[j];
   695     }
   696   }
   698   // Now scan for sharpest boundary. j is at left end of 8 entries
   699   // To be a boundary, there must be both >0 and <0 entries in the window
   700   int better_boundary_value = 0;
   701   int better_boundary = linear1;
   702   for (int i = linear0; i < linear2 - 8; ++i) {
   703     int j = i & 7;
   704     if (better_boundary_value < running_diff) {
   705       bool has_plus = false;
   706       bool has_minus = false;
   707       for (int kk = 0; kk < 8; ++kk) {
   708         if (diff[kk] > 0) {has_plus = true;}
   709         if (diff[kk] < 0) {has_minus = true;}
   710       }
   711       if (has_plus && has_minus) {
   712         better_boundary_value = running_diff;
   713         better_boundary = i + 4;
   714       }
   715     }
   716     // Shift right one entry
   717     uint32 langprob = hitbuffer->linear[i + 8].langprob;
   718     int newdiff = GetLangScore(langprob, pslang0) -
   719        GetLangScore(langprob, pslang1);
   720     int middiff = diff[(i + 4) & 7];
   721     int olddiff = diff[j];
   722     diff[j] = newdiff;
   723     running_diff -= olddiff;                  // Remove left
   724     running_diff += 2 * middiff;              // Convert middle from - to +
   725     running_diff -= newdiff;                  // Insert right
   726   }
   728   if (scoringcontext->flags_cld2_verbose && (linear1 != better_boundary)) {
   729     Language lang0 = FromPerScriptNumber(scoringcontext->ulscript, pslang0);
   730     Language lang1 = FromPerScriptNumber(scoringcontext->ulscript, pslang1);
   731     fprintf(scoringcontext->debug_file, " Better lin[%d=>%d] %s^^%s <br>\n",
   732             linear1, better_boundary,
   733             LanguageCode(lang0), LanguageCode(lang1));
   734     int lin0_off = hitbuffer->linear[linear0].offset;
   735     int lin1_off = hitbuffer->linear[linear1].offset;
   736     int lin2_off = hitbuffer->linear[linear2].offset;
   737     int better_offm1 = hitbuffer->linear[better_boundary - 1].offset;
   738     int better_off = hitbuffer->linear[better_boundary].offset;
   739     int better_offp1 = hitbuffer->linear[better_boundary + 1].offset;
   740     string old0(&text[lin0_off], lin1_off - lin0_off);
   741     string old1(&text[lin1_off], lin2_off - lin1_off);
   742     string new0(&text[lin0_off], better_offm1 - lin0_off);
   743     string new0m1(&text[better_offm1], better_off - better_offm1);
   744     string new1(&text[better_off], better_offp1 - better_off);
   745     string new1p1(&text[better_offp1], lin2_off - better_offp1);
   746     fprintf(scoringcontext->debug_file, "%s^^%s => <br>\n%s^%s^^%s^%s<br>\n",
   747             GetHtmlEscapedText(old0).c_str(),
   748             GetHtmlEscapedText(old1).c_str(),
   749             GetHtmlEscapedText(new0).c_str(),
   750             GetHtmlEscapedText(new0m1).c_str(),
   751             GetHtmlEscapedText(new1).c_str(),
   752             GetHtmlEscapedText(new1p1).c_str());
   753     // Slow picture of differences per linear entry
   754     int d;
   755     for (int i = linear0; i < linear2; ++i) {
   756       if (i == better_boundary) {
   757         fprintf(scoringcontext->debug_file, "^^ ");
   758       }
   759       uint32 langprob = hitbuffer->linear[i].langprob;
   760       d = GetLangScore(langprob, pslang0) - GetLangScore(langprob, pslang1);
   761       const char* s = "=";
   762       //if (d > 2) {s = "\xc2\xaf";}    // Macron
   763       if (d > 2) {s = "#";}
   764       else if (d > 0) {s = "+";}
   765       else if (d < -2) {s = "_";}
   766       else if (d < 0) {s = "-";}
   767       fprintf(scoringcontext->debug_file, "%s ", s);
   768     }
   769     fprintf(scoringcontext->debug_file, " &nbsp;&nbsp;(scale: #+=-_)<br>\n");
   770   }
   771   return better_boundary;
   772 }
   775 // For all but the first summary, if its top language differs from
   776 // the previous chunk, refine the boundary
   777 // Linearized version
   778 void SharpenBoundaries(const char* text,
   779                        bool more_to_come,
   780                        ScoringHitBuffer* hitbuffer,
   781                        ScoringContext* scoringcontext,
   782                        SummaryBuffer* summarybuffer) {
   784   int prior_linear = summarybuffer->chunksummary[0].chunk_start;
   785   uint16 prior_lang = summarybuffer->chunksummary[0].lang1;
   787   if (scoringcontext->flags_cld2_verbose) {
   788     fprintf(scoringcontext->debug_file, "<br>SharpenBoundaries<br>\n");
   789   }
   790   for (int i = 1; i < summarybuffer->n; ++i) {
   791     ChunkSummary* cs = &summarybuffer->chunksummary[i];
   792     uint16 this_lang = cs->lang1;
   793     if (this_lang == prior_lang) {
   794       prior_linear = cs->chunk_start;
   795       continue;
   796     }
   798     int this_linear = cs->chunk_start;
   799     int next_linear = summarybuffer->chunksummary[i + 1].chunk_start;
   801     // If this/prior in same close set, don't move boundary
   802     if (SameCloseSet(prior_lang, this_lang)) {
   803       prior_linear = this_linear;
   804       prior_lang = this_lang;
   805       continue;
   806     }
   809     // Within hitbuffer->linear[]
   810     // <-- prior chunk --><-- this chunk -->
   811     // |                  |                 |
   812     // prior_linear       this_linear       next_linear
   813     //     prior_lang         this_lang
   814     // The goal of sharpening is to move this_linear to better separate langs
   816     uint8 pslang0 = PerScriptNumber(scoringcontext->ulscript,
   817                                     static_cast<Language>(prior_lang));
   818     uint8 pslang1 = PerScriptNumber(scoringcontext->ulscript,
   819                                     static_cast<Language>(this_lang));
   820     int better_linear = BetterBoundary(text,
   821                                        hitbuffer,
   822                                        scoringcontext,
   823                                        pslang0, pslang1,
   824                                        prior_linear, this_linear, next_linear);
   826     int old_offset = hitbuffer->linear[this_linear].offset;
   827     int new_offset = hitbuffer->linear[better_linear].offset;
   828     cs->chunk_start = better_linear;
   829     cs->offset = new_offset;
   830     // If this_linear moved right, make bytes smaller for this, larger for prior
   831     // If this_linear moved left, make bytes larger for this, smaller for prior
   832     cs->bytes -= (new_offset - old_offset);
   833     summarybuffer->chunksummary[i - 1].bytes += (new_offset - old_offset);
   835     this_linear = better_linear;    // Update so that next chunk doesn't intrude
   837     // Consider rescoring the two chunks
   839     // Update for next round (note: using pre-updated boundary)
   840     prior_linear = this_linear;
   841     prior_lang = this_lang;
   842   }
   843 }
   845 // Make a langprob that gives small weight to the default language for ulscript
   846 uint32 DefaultLangProb(ULScript ulscript) {
   847   Language default_lang = DefaultLanguage(ulscript);
   848   return MakeLangProb(default_lang, 1);
   849 }
   851 // Effectively, do a merge-sort based on text offsets
   852 // Look up each indirect value in appropriate scoring table and keep
   853 // just the resulting langprobs
   854 void LinearizeAll(ScoringContext* scoringcontext, bool score_cjk,
   855                   ScoringHitBuffer* hitbuffer) {
   856   const CLD2TableSummary* base_obj;       // unigram or quadgram
   857   const CLD2TableSummary* base_obj2;      // quadgram dual table
   858   const CLD2TableSummary* delta_obj;      // bigram or octagram
   859   const CLD2TableSummary* distinct_obj;   // bigram or octagram
   860   uint16 base_hit;
   861   if (score_cjk) {
   862     base_obj = scoringcontext->scoringtables->unigram_compat_obj;
   863     base_obj2 = scoringcontext->scoringtables->unigram_compat_obj;
   864     delta_obj = scoringcontext->scoringtables->deltabi_obj;
   865     distinct_obj = scoringcontext->scoringtables->distinctbi_obj;
   866     base_hit = UNIHIT;
   867   } else {
   868     base_obj = scoringcontext->scoringtables->quadgram_obj;
   869     base_obj2 = scoringcontext->scoringtables->quadgram_obj2;
   870     delta_obj = scoringcontext->scoringtables->deltaocta_obj;
   871     distinct_obj = scoringcontext->scoringtables->distinctocta_obj;
   872     base_hit = QUADHIT;
   873   }
   875   int base_limit = hitbuffer->next_base;
   876   int delta_limit = hitbuffer->next_delta;
   877   int distinct_limit = hitbuffer->next_distinct;
   878   int base_i = 0;
   879   int delta_i = 0;
   880   int distinct_i = 0;
   881   int linear_i = 0;
   883   // Start with an initial base hit for the default language for this script
   884   // Inserting this avoids edge effects with no hits at all
   885   hitbuffer->linear[linear_i].offset = hitbuffer->lowest_offset;
   886   hitbuffer->linear[linear_i].type = base_hit;
   887   hitbuffer->linear[linear_i].langprob =
   888     DefaultLangProb(scoringcontext->ulscript);
   889   ++linear_i;
   891   while ((base_i < base_limit) || (delta_i < delta_limit) ||
   892          (distinct_i < distinct_limit)) {
   893     int base_off = hitbuffer->base[base_i].offset;
   894     int delta_off = hitbuffer->delta[delta_i].offset;
   895     int distinct_off = hitbuffer->distinct[distinct_i].offset;
   897     // Do delta and distinct first, so that they are not lost at base_limit
   898     if ((delta_i < delta_limit) &&
   899         (delta_off <= base_off) && (delta_off <= distinct_off)) {
   900       // Add delta entry
   901       int indirect = hitbuffer->delta[delta_i].indirect;
   902       ++delta_i;
   903       uint32 langprob = delta_obj->kCLDTableInd[indirect];
   904       if (langprob > 0) {
   905         hitbuffer->linear[linear_i].offset = delta_off;
   906         hitbuffer->linear[linear_i].type = DELTAHIT;
   907         hitbuffer->linear[linear_i].langprob = langprob;
   908         ++linear_i;
   909       }
   910     }
   911     else if ((distinct_i < distinct_limit) &&
   912              (distinct_off <= base_off) && (distinct_off <= delta_off)) {
   913       // Add distinct entry
   914       int indirect = hitbuffer->distinct[distinct_i].indirect;
   915       ++distinct_i;
   916       uint32 langprob = distinct_obj->kCLDTableInd[indirect];
   917       if (langprob > 0) {
   918         hitbuffer->linear[linear_i].offset = distinct_off;
   919         hitbuffer->linear[linear_i].type = DISTINCTHIT;
   920         hitbuffer->linear[linear_i].langprob = langprob;
   921         ++linear_i;
   922       }
   923     }
   924     else {
   925       // Add one or two base entries
   926       int indirect = hitbuffer->base[base_i].indirect;
   927       // First, get right scoring table
   928       const CLD2TableSummary* local_base_obj = base_obj;
   929       if ((indirect & 0x80000000u) != 0) {
   930         local_base_obj = base_obj2;
   931         indirect &= ~0x80000000u;
   932       }
   933       ++base_i;
   934       // One langprob in kQuadInd[0..SingleSize),
   935       // two in kQuadInd[SingleSize..Size)
   936       if (indirect < static_cast<int>(local_base_obj->kCLDTableSizeOne)) {
   937         // Up to three languages at indirect
   938         uint32 langprob = local_base_obj->kCLDTableInd[indirect];
   939         if (langprob > 0) {
   940           hitbuffer->linear[linear_i].offset = base_off;
   941           hitbuffer->linear[linear_i].type = base_hit;
   942           hitbuffer->linear[linear_i].langprob = langprob;
   943           ++linear_i;
   944         }
   945       } else {
   946         // Up to six languages at start + 2 * (indirect - start)
   947         indirect += (indirect - local_base_obj->kCLDTableSizeOne);
   948         uint32 langprob = local_base_obj->kCLDTableInd[indirect];
   949         uint32 langprob2 = local_base_obj->kCLDTableInd[indirect + 1];
   950         if (langprob > 0) {
   951           hitbuffer->linear[linear_i].offset = base_off;
   952           hitbuffer->linear[linear_i].type = base_hit;
   953           hitbuffer->linear[linear_i].langprob = langprob;
   954           ++linear_i;
   955         }
   956         if (langprob2 > 0) {
   957           hitbuffer->linear[linear_i].offset = base_off;
   958           hitbuffer->linear[linear_i].type = base_hit;
   959           hitbuffer->linear[linear_i].langprob = langprob2;
   960           ++linear_i;
   961         }
   962       }
   963     }
   964   }
   966   // Update
   967   hitbuffer->next_linear = linear_i;
   969   // Add a dummy entry off the end, just to capture final offset
   970   hitbuffer->linear[linear_i].offset =
   971   hitbuffer->base[hitbuffer->next_base].offset;
   972   hitbuffer->linear[linear_i].langprob = 0;
   973 }
   975 // Break linear array into chunks of ~20 quadgram hits or ~50 CJK unigram hits
   976 void ChunkAll(int letter_offset, bool score_cjk, ScoringHitBuffer* hitbuffer) {
   977   int chunksize;
   978   uint16 base_hit;
   979   if (score_cjk) {
   980     chunksize = kChunksizeUnis;
   981     base_hit = UNIHIT;
   982   } else {
   983     chunksize = kChunksizeQuads;
   984     base_hit = QUADHIT;
   985   }
   987   int linear_i = 0;
   988   int linear_off_end = hitbuffer->next_linear;
   989   int text_i = letter_offset;               // Next unseen text offset
   990   int next_chunk_start = 0;
   991   int bases_left = hitbuffer->next_base;
   992   while (bases_left > 0) {
   993     // Linearize one chunk
   994     int base_len = chunksize;     // Default; may be changed below
   995     if (bases_left < (chunksize + (chunksize >> 1))) {
   996       // If within 1.5 chunks of the end, avoid runts by using it all
   997       base_len = bases_left;
   998     } else if (bases_left < (2 * chunksize)) {
   999       // Avoid runts by splitting 1.5 to 2 chunks in half (about 3/4 each)
  1000       base_len = (bases_left + 1) >> 1;
  1003     hitbuffer->chunk_start[next_chunk_start] = linear_i;
  1004     hitbuffer->chunk_offset[next_chunk_start] = text_i;
  1005     ++next_chunk_start;
  1007     int base_count = 0;
  1008     while ((base_count < base_len) && (linear_i < linear_off_end)) {
  1009       if (hitbuffer->linear[linear_i].type == base_hit) {++base_count;}
  1010       ++linear_i;
  1012     text_i = hitbuffer->linear[linear_i].offset;    // Next unseen text offset
  1013     bases_left -= base_len;
  1016   // If no base hits at all, make a single dummy chunk
  1017   if (next_chunk_start == 0) {
  1018      hitbuffer->chunk_start[next_chunk_start] = 0;
  1019      hitbuffer->chunk_offset[next_chunk_start] = hitbuffer->linear[0].offset;
  1020      ++next_chunk_start;
  1023   // Remember the linear array start of dummy entry
  1024   hitbuffer->next_chunk_start = next_chunk_start;
  1026   // Add a dummy entry off the end, just to capture final linear subscr
  1027   hitbuffer->chunk_start[next_chunk_start] = hitbuffer->next_linear;
  1028   hitbuffer->chunk_offset[next_chunk_start] = text_i;
  1032 // Merge-sort the individual hit arrays, go indirect on the scoring subscripts,
  1033 // break linear array into chunks.
  1034 //
  1035 // Input:
  1036 //  hitbuffer base, delta, distinct arrays
  1037 // Output:
  1038 //  linear array
  1039 //  chunk_start array
  1040 //
  1041 void LinearizeHitBuffer(int letter_offset,
  1042                         ScoringContext* scoringcontext,
  1043                         bool more_to_come, bool score_cjk,
  1044                         ScoringHitBuffer* hitbuffer) {
  1045   LinearizeAll(scoringcontext, score_cjk, hitbuffer);
  1046   ChunkAll(letter_offset, score_cjk, hitbuffer);
  1051 // The hitbuffer is in an awkward form -- three sets of base/delta/distinct
  1052 // scores, each with an indirect subscript to one of six scoring tables, some
  1053 // of which can yield two langprobs for six languages, others one langprob for
  1054 // three languages. The only correlation between base/delta/distinct is their
  1055 // offsets into the letters-only text buffer.
  1056 //
  1057 // SummaryBuffer needs to be built to linear, giving linear offset of start of
  1058 // each chunk
  1059 //
  1060 // So we first do all the langprob lookups and merge-sort by offset to make
  1061 // a single linear vector, building a side vector of chunk beginnings as we go.
  1062 // The sharpening is simply moving the beginnings, scoring is a simple linear
  1063 // sweep, etc.
  1065 void ProcessHitBuffer(const LangSpan& scriptspan,
  1066                       int letter_offset,
  1067                       ScoringContext* scoringcontext,
  1068                       DocTote* doc_tote,
  1069                       ResultChunkVector* vec,
  1070                       bool more_to_come, bool score_cjk,
  1071                       ScoringHitBuffer* hitbuffer) {
  1072   if (scoringcontext->flags_cld2_verbose) {
  1073     fprintf(scoringcontext->debug_file, "Hitbuffer[) ");
  1074     DumpHitBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer);
  1077   LinearizeHitBuffer(letter_offset, scoringcontext, more_to_come, score_cjk,
  1078                      hitbuffer);
  1080   if (scoringcontext->flags_cld2_verbose) {
  1081     fprintf(scoringcontext->debug_file, "Linear[) ");
  1082     DumpLinearBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer);
  1085   SummaryBuffer summarybuffer;
  1086   summarybuffer.n = 0;
  1087   ChunkSpan last_cspan;
  1088   ScoreAllHits(scriptspan.text, scriptspan.ulscript,
  1089                     more_to_come, score_cjk, hitbuffer,
  1090                     scoringcontext,
  1091                     &summarybuffer, &last_cspan);
  1093   if (scoringcontext->flags_cld2_verbose) {
  1094     DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer);
  1097   if (vec != NULL) {
  1098     // Sharpen boundaries of summarybuffer
  1099     // This is not a high-performance path
  1100     SharpenBoundaries(scriptspan.text, more_to_come, hitbuffer, scoringcontext,
  1101                       &summarybuffer);
  1102     // Show after the sharpening
  1103     // CLD2_Debug2(scriptspan.text, more_to_come, score_cjk,
  1104     //             hitbuffer, scoringcontext, &summarybuffer);
  1106     if (scoringcontext->flags_cld2_verbose) {
  1107       DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer);
  1111   SummaryBufferToDocTote(&summarybuffer, more_to_come, doc_tote);
  1112   SummaryBufferToVector(scoringcontext->scanner, scriptspan.text,
  1113                         &summarybuffer, more_to_come, vec);
  1116 void SpliceHitBuffer(ScoringHitBuffer* hitbuffer, int next_offset) {
  1117   // Splice hitbuffer and summarybuffer for next round. With big chunks and
  1118   // distinctive-word state carried across chunks, we might not need to do this.
  1119   hitbuffer->next_base = 0;
  1120   hitbuffer->next_delta = 0;
  1121   hitbuffer->next_distinct = 0;
  1122   hitbuffer->next_linear = 0;
  1123   hitbuffer->next_chunk_start = 0;
  1124   hitbuffer->lowest_offset = next_offset;
  1128 // Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating
  1129 // scoringcontext
  1130 void ScoreEntireScriptSpan(const LangSpan& scriptspan,
  1131                            ScoringContext* scoringcontext,
  1132                            DocTote* doc_tote,
  1133                            ResultChunkVector* vec) {
  1134   int bytes = scriptspan.text_bytes;
  1135   // Artificially set score to 1024 per 1KB, or 1 per byte
  1136   int score = bytes;
  1137   int reliability = 100;
  1138   // doc_tote uses full languages
  1139   Language one_one_lang = DefaultLanguage(scriptspan.ulscript);
  1140   doc_tote->Add(one_one_lang, bytes, score, reliability);
  1142   if (scoringcontext->flags_cld2_html) {
  1143     ChunkSummary chunksummary = {
  1144       1, 0,
  1145       one_one_lang, UNKNOWN_LANGUAGE, score, 1,
  1146       bytes, 0, scriptspan.ulscript, reliability, reliability
  1147     };
  1148     CLD2_Debug(scriptspan.text, 1, scriptspan.text_bytes,
  1149                false, false, NULL,
  1150                scoringcontext, NULL, &chunksummary);
  1153   // First byte is always a space
  1154   JustOneItemToVector(scoringcontext->scanner, scriptspan.text,
  1155                       one_one_lang, 1, bytes - 1, vec);
  1157   scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
  1160 // Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext
  1161 void ScoreCJKScriptSpan(const LangSpan& scriptspan,
  1162                         ScoringContext* scoringcontext,
  1163                         DocTote* doc_tote,
  1164                         ResultChunkVector* vec) {
  1165   // Allocate three parallel arrays of scoring hits
  1166   ScoringHitBuffer* hitbuffer = new ScoringHitBuffer;
  1167   hitbuffer->init();
  1168   hitbuffer->ulscript = scriptspan.ulscript;
  1170   scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
  1171   scoringcontext->oldest_distinct_boost = 0;
  1173   // Incoming scriptspan has a single leading space at scriptspan.text[0]
  1174   // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3]
  1176   int letter_offset = 1;        // Skip initial space
  1177   hitbuffer->lowest_offset = letter_offset;
  1178   int letter_limit = scriptspan.text_bytes;
  1179   while (letter_offset < letter_limit) {
  1180     if (scoringcontext->flags_cld2_verbose) {
  1181       fprintf(scoringcontext->debug_file, " ScoreCJKScriptSpan[%d,%d)<br>\n",
  1182               letter_offset, letter_limit);
  1184     //
  1185     // Fill up one hitbuffer, possibly splicing onto previous fragment
  1186     //
  1187     // NOTE: GetUniHits deals with close repeats
  1188     // NOTE: After last chunk there is always a hitbuffer entry with an offset
  1189     // just off the end of the text = next_offset.
  1190     int next_offset = GetUniHits(scriptspan.text, letter_offset, letter_limit,
  1191                                   scoringcontext, hitbuffer);
  1192     // NOTE: GetBiHitVectors deals with close repeats,
  1193     // does one hash and two lookups (delta and distinct) per word
  1194     GetBiHits(scriptspan.text, letter_offset, next_offset,
  1195                 scoringcontext, hitbuffer);
  1197     //
  1198     // Score one hitbuffer in chunks to summarybuffer
  1199     //
  1200     bool more_to_come = next_offset < letter_limit;
  1201     bool score_cjk = true;
  1202     ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec,
  1203                      more_to_come, score_cjk, hitbuffer);
  1204     SpliceHitBuffer(hitbuffer, next_offset);
  1206     letter_offset = next_offset;
  1209   delete hitbuffer;
  1210   // Context across buffers is not connected yet
  1211   scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
  1216 // Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext
  1217 // We have a scriptspan with all lowercase text in one script. Look up
  1218 // quadgrams and octagrams, saving the hits in three parallel vectors.
  1219 // Score from those vectors in chunks, toting each chunk to get a single
  1220 // language, and combining into the overall document score. The hit vectors
  1221 // in general are not big enough to handle and entire scriptspan, so
  1222 // repeat until the entire scriptspan is scored.
  1223 // Caller deals with minimizing numbr of runt scriptspans
  1224 // This routine deals with minimizing number of runt chunks.
  1225 //
  1226 // Returns updated scoringcontext
  1227 // Returns updated doc_tote
  1228 // If vec != NULL, appends to that vector of ResultChunk's
  1229 void ScoreQuadScriptSpan(const LangSpan& scriptspan,
  1230                          ScoringContext* scoringcontext,
  1231                          DocTote* doc_tote,
  1232                          ResultChunkVector* vec) {
  1233   // Allocate three parallel arrays of scoring hits
  1234   ScoringHitBuffer* hitbuffer = new ScoringHitBuffer;
  1235   hitbuffer->init();
  1236   hitbuffer->ulscript = scriptspan.ulscript;
  1238   scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
  1239   scoringcontext->oldest_distinct_boost = 0;
  1241   // Incoming scriptspan has a single leading space at scriptspan.text[0]
  1242   // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3]
  1244   int letter_offset = 1;        // Skip initial space
  1245   hitbuffer->lowest_offset = letter_offset;
  1246   int letter_limit = scriptspan.text_bytes;
  1247   while (letter_offset < letter_limit) {
  1248     //
  1249     // Fill up one hitbuffer, possibly splicing onto previous fragment
  1250     //
  1251     // NOTE: GetQuadHits deals with close repeats
  1252     // NOTE: After last chunk there is always a hitbuffer entry with an offset
  1253     // just off the end of the text = next_offset.
  1254     int next_offset = GetQuadHits(scriptspan.text, letter_offset, letter_limit,
  1255                                   scoringcontext, hitbuffer);
  1256     // If true, there is more text to process in this scriptspan
  1257     // NOTE: GetOctaHitVectors deals with close repeats,
  1258     // does one hash and two lookups (delta and distinct) per word
  1259     GetOctaHits(scriptspan.text, letter_offset, next_offset,
  1260                 scoringcontext, hitbuffer);
  1262     //
  1263     // Score one hitbuffer in chunks to summarybuffer
  1264     //
  1265     bool more_to_come = next_offset < letter_limit;
  1266     bool score_cjk = false;
  1267     ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec,
  1268                      more_to_come, score_cjk, hitbuffer);
  1269     SpliceHitBuffer(hitbuffer, next_offset);
  1271     letter_offset = next_offset;
  1274   delete hitbuffer;
  1278 // Score one scriptspan into doc_tote and vec, updating scoringcontext
  1279 // Inputs:
  1280 //  One scriptspan of perhaps 40-60KB, all same script lower-case letters
  1281 //    and single ASCII spaces. First character is a space to allow simple
  1282 //    begining-of-word detect. End of buffer has three spaces and NUL to
  1283 //    allow easy scan-to-end-of-word.
  1284 //  Scoring context of
  1285 //    scoring tables
  1286 //    flags
  1287 //    running boosts
  1288 // Outputs:
  1289 //  Updated doc_tote giving overall languages and byte counts
  1290 //  Optional updated chunk vector giving offset, length, language
  1291 //
  1292 // Caller initializes flags, boosts, doc_tote and vec.
  1293 // Caller aggregates across multiple scriptspans
  1294 // Caller calculates final document result
  1295 // Caller deals with detecting and triggering suppression of repeated text.
  1296 //
  1297 // This top-level routine just chooses the recognition type and calls one of
  1298 // the next-level-down routines.
  1299 //
  1300 void ScoreOneScriptSpan(const LangSpan& scriptspan,
  1301                         ScoringContext* scoringcontext,
  1302                         DocTote* doc_tote,
  1303                         ResultChunkVector* vec) {
  1304   if (scoringcontext->flags_cld2_verbose) {
  1305     fprintf(scoringcontext->debug_file, "<br>ScoreOneScriptSpan(%s,%d) ",
  1306             ULScriptCode(scriptspan.ulscript), scriptspan.text_bytes);
  1307     // Optionally print the chunk lowercase letters/marks text
  1308     string temp(&scriptspan.text[0], scriptspan.text_bytes);
  1309     fprintf(scoringcontext->debug_file, "'%s'",
  1310             GetHtmlEscapedText(temp).c_str());
  1311     fprintf(scoringcontext->debug_file, "<br>\n");
  1313   scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
  1314   scoringcontext->oldest_distinct_boost = 0;
  1315   ULScriptRType rtype = ULScriptRecognitionType(scriptspan.ulscript);
  1316   if (scoringcontext->flags_cld2_score_as_quads && (rtype != RTypeCJK)) {
  1317     rtype = RTypeMany;
  1319   switch (rtype) {
  1320   case RTypeNone:
  1321   case RTypeOne:
  1322     ScoreEntireScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
  1323     break;
  1324   case RTypeCJK:
  1325     ScoreCJKScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
  1326     break;
  1327   case RTypeMany:
  1328     ScoreQuadScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
  1329     break;
  1333 }       // End namespace CLD2

mercurial