The Tor Browser: browser/components/translation/cld2/internal/scoreonescriptspan.cc@6474c204b198

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 // Copyright 2013 Google Inc. All Rights Reserved.

     2 //

     3 // Licensed under the Apache License, Version 2.0 (the "License");

     4 // you may not use this file except in compliance with the License.

     5 // You may obtain a copy of the License at

     6 //

     7 //     http://www.apache.org/licenses/LICENSE-2.0

     8 //

     9 // Unless required by applicable law or agreed to in writing, software

    10 // distributed under the License is distributed on an "AS IS" BASIS,

    11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

    12 // See the License for the specific language governing permissions and

    13 // limitations under the License.

    15 //

    16 // Author: dsites@google.com (Dick Sites)

    17 // Updated 2014.01 for dual table lookup

    18 //

    20 #include "scoreonescriptspan.h"

    22 #include "cldutil.h"

    23 #include "debug.h"

    24 #include "lang_script.h"

    26 #include <stdio.h>

    28 using namespace std;

    30 namespace CLD2 {

    // Percent cutoff for chunk reliability: SummaryBufferToVector treats a
    // reliability_delta or reliability_score below this value as "bad".
    32 static const int kUnreliablePercentThreshold = 75;

    34 void AddLangProb(uint32 langprob, Tote* chunk_tote) {

    35   ProcessProbV2Tote(langprob, chunk_tote);

    36 }

    38 void ZeroPSLang(uint32 langprob, Tote* chunk_tote) {

    39   uint8 top1 = (langprob >> 8) & 0xff;

    40   chunk_tote->SetScore(top1, 0);

    41 }

    43 bool SameCloseSet(uint16 lang1, uint16 lang2) {

    44   int lang1_close_set = LanguageCloseSet(static_cast<Language>(lang1));

    45   if (lang1_close_set == 0) {return false;}

    46   int lang2_close_set = LanguageCloseSet(static_cast<Language>(lang2));

    47   return (lang1_close_set == lang2_close_set);

    48 }

    50 bool SameCloseSet(Language lang1, Language lang2) {

    51   int lang1_close_set = LanguageCloseSet(lang1);

    52   if (lang1_close_set == 0) {return false;}

    53   int lang2_close_set = LanguageCloseSet(lang2);

    54   return (lang1_close_set == lang2_close_set);

    55 }

    58 // Needs expected score per 1KB in scoring context

    59 void SetChunkSummary(ULScript ulscript, int first_linear_in_chunk,

    60                      int offset, int len,

    61                      const ScoringContext* scoringcontext,

    62                      const Tote* chunk_tote,

    63                      ChunkSummary* chunksummary) {

    64   int key3[3];

    65   chunk_tote->CurrentTopThreeKeys(key3);

    66   Language lang1 = FromPerScriptNumber(ulscript, key3[0]);

    67   Language lang2 = FromPerScriptNumber(ulscript, key3[1]);

    69   int actual_score_per_kb = 0;

    70   if (len > 0) {

    71     actual_score_per_kb = (chunk_tote->GetScore(key3[0]) << 10) / len;

    72   }

    73   int expected_subscr = lang1 * 4 + LScript4(ulscript);

    74   int expected_score_per_kb =

    75      scoringcontext->scoringtables->kExpectedScore[expected_subscr];

    77   chunksummary->offset = offset;

    78   chunksummary->chunk_start = first_linear_in_chunk;

    79   chunksummary->lang1 = lang1;

    80   chunksummary->lang2 = lang2;

    81   chunksummary->score1 = chunk_tote->GetScore(key3[0]);

    82   chunksummary->score2 = chunk_tote->GetScore(key3[1]);

    83   chunksummary->bytes = len;

    84   chunksummary->grams = chunk_tote->GetScoreCount();

    85   chunksummary->ulscript = ulscript;

    86   chunksummary->reliability_delta = ReliabilityDelta(chunksummary->score1,

    87                                                      chunksummary->score2,

    88                                                      chunksummary->grams);

    89   // If lang1/lang2 in same close set, set delta reliability to 100%

    90   if (SameCloseSet(lang1, lang2)) {

    91     chunksummary->reliability_delta = 100;

    92   }

    93   chunksummary->reliability_score =

    94      ReliabilityExpected(actual_score_per_kb, expected_score_per_kb);

    95 }

    97 // Return true if just lang1 is there: lang2=0 and lang3=0

    98 bool IsSingleLang(uint32 langprob) {

    99   // Probably a bug -- which end is lang1? But only used to call empty Boost1

   100   return ((langprob & 0x00ffff00) == 0);

   101 }

   103 // Update scoring context distinct_boost for single language quad

   104 void AddDistinctBoost1(uint32 langprob, ScoringContext* scoringcontext) {

   105   // Probably keep this empty -- not a good enough signal

   106 }

   108 // Update scoring context distinct_boost for distinct octagram

   109 // Keep last 4 used. Since these are mostly (except at splices) in

   110 // hitbuffer, we might be able to just use a subscript and splice

   111 void AddDistinctBoost2(uint32 langprob, ScoringContext* scoringcontext) {

   112 // this is called 0..n times per chunk with decoded hitbuffer->distinct...

   113   LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;

   114   if (scoringcontext->ulscript != ULScript_Latin) {

   115     distinct_boost = &scoringcontext->distinct_boost.othr;

   116   }

   117   int n = distinct_boost->n;

   118   distinct_boost->langprob[n] = langprob;

   119   distinct_boost->n = distinct_boost->wrap(n + 1);

   120 }

   122 // For each chunk, add extra weight for language priors (from content-lang and

   123 // meta lang=xx) and distinctive tokens

   124 void ScoreBoosts(const ScoringContext* scoringcontext, Tote* chunk_tote) {

   125   // Get boosts for current script

   126   const LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;

   127   const LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;

   128   const LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;

   129   if (scoringcontext->ulscript != ULScript_Latin) {

   130     langprior_boost = &scoringcontext->langprior_boost.othr;

   131     langprior_whack = &scoringcontext->langprior_whack.othr;

   132     distinct_boost = &scoringcontext->distinct_boost.othr;

   133   }

   135   for (int k = 0; k < kMaxBoosts; ++k) {

   136     uint32 langprob = langprior_boost->langprob[k];

   137     if (langprob > 0) {AddLangProb(langprob, chunk_tote);}

   138   }

   139   for (int k = 0; k < kMaxBoosts; ++k) {

   140     uint32 langprob = distinct_boost->langprob[k];

   141     if (langprob > 0) {AddLangProb(langprob, chunk_tote);}

   142   }

   143   // boost has a packed set of per-script langs and probabilites

   144   // whack has a packed set of per-script lang to be suppressed (zeroed)

   145   // When a language in a close set is given as an explicit hint, others in

   146   //  that set will be whacked here.

   147   for (int k = 0; k < kMaxBoosts; ++k) {

   148     uint32 langprob = langprior_whack->langprob[k];

   149     if (langprob > 0) {ZeroPSLang(langprob, chunk_tote);}

   150   }

   151 }

   155 // At this point, The chunk is described by

   156 //  hitbuffer->base[cspan->chunk_base .. cspan->chunk_base + cspan->base_len)

   157 //  hitbuffer->delta[cspan->chunk_delta ... )

   158 //  hitbuffer->distinct[cspan->chunk_distinct ... )

   159 // Scored text is in text[lo..hi) where

   160 //  lo is 0 or the min of first base/delta/distinct hitbuffer offset and

   161 //  hi is the min of next base/delta/distinct hitbuffer offset after

   162 //  base_len, etc.

   163 void GetTextSpanOffsets(const ScoringHitBuffer* hitbuffer,

   164                         const ChunkSpan* cspan, int* lo, int* hi) {

   165   // Front of this span

   166   int lo_base = hitbuffer->base[cspan->chunk_base].offset;

   167   int lo_delta = hitbuffer->delta[cspan->chunk_delta].offset;

   168   int lo_distinct = hitbuffer->distinct[cspan->chunk_distinct].offset;

   169   // Front of next span

   170   int hi_base = hitbuffer->base[cspan->chunk_base +

   171     cspan->base_len].offset;

   172   int hi_delta = hitbuffer->delta[cspan->chunk_delta +

   173     cspan->delta_len].offset;

   174   int hi_distinct = hitbuffer->distinct[cspan->chunk_distinct +

   175     cspan->distinct_len].offset;

   177   *lo = 0;

   178 //  if (cspan->chunk_base > 0) {

   179 //    *lo = minint(minint(lo_base, lo_delta), lo_distinct);

   180 //  }

   181   *lo = minint(minint(lo_base, lo_delta), lo_distinct);

   182   *hi = minint(minint(hi_base, hi_delta), hi_distinct);

   183 }

   186 int DiffScore(const CLD2TableSummary* obj, int indirect,

   187               uint16 lang1, uint16 lang2) {

   188   if (indirect < static_cast<int>(obj->kCLDTableSizeOne)) {

   189     // Up to three languages at indirect

   190     uint32 langprob = obj->kCLDTableInd[indirect];

   191     return GetLangScore(langprob, lang1) - GetLangScore(langprob, lang2);

   192   } else {

   193     // Up to six languages at start + 2 * (indirect - start)

   194     indirect += (indirect - obj->kCLDTableSizeOne);

   195     uint32 langprob = obj->kCLDTableInd[indirect];

   196     uint32 langprob2 = obj->kCLDTableInd[indirect + 1];

   197     return (GetLangScore(langprob, lang1) + GetLangScore(langprob2, lang1)) -

   198       (GetLangScore(langprob, lang2) + GetLangScore(langprob2, lang2));

   199   }

   201 }

   203 // Score all the bases, deltas, distincts, boosts for one chunk into chunk_tote

   204 // After last chunk there is always a hitbuffer entry with an offset just off

   205 // the end of the text.

   206 // Sets delta_len, and distinct_len

   207 void ScoreOneChunk(const char* text, ULScript ulscript,

   208                    const ScoringHitBuffer* hitbuffer,

   209                    int chunk_i,

   210                    ScoringContext* scoringcontext,

   211                    ChunkSpan* cspan, Tote* chunk_tote,

   212                    ChunkSummary* chunksummary) {

   213   int first_linear_in_chunk = hitbuffer->chunk_start[chunk_i];

   214   int first_linear_in_next_chunk = hitbuffer->chunk_start[chunk_i + 1];

   216   chunk_tote->Reinit();

   217   cspan->delta_len = 0;

   218   cspan->distinct_len = 0;

   219   if (scoringcontext->flags_cld2_verbose) {

   220     fprintf(scoringcontext->debug_file, "<br>ScoreOneChunk[%d..%d) ",

   221             first_linear_in_chunk, first_linear_in_next_chunk);

   222   }

   224   // 2013.02.05 linear design: just use base and base_len for the span

   225   cspan->chunk_base = first_linear_in_chunk;

   226   cspan->base_len = first_linear_in_next_chunk - first_linear_in_chunk;

   227   for (int i = first_linear_in_chunk; i < first_linear_in_next_chunk; ++i) {

   228     uint32 langprob = hitbuffer->linear[i].langprob;

   229     AddLangProb(langprob, chunk_tote);

   230     if (hitbuffer->linear[i].type <= QUADHIT) {

   231       chunk_tote->AddScoreCount();      // Just count quads, not octas

   232     }

   233     if (hitbuffer->linear[i].type == DISTINCTHIT) {

   234       AddDistinctBoost2(langprob, scoringcontext);

   235     }

   236   }

   238   // Score language prior boosts

   239   // Score distinct word boost

   240   ScoreBoosts(scoringcontext, chunk_tote);

   242   int lo = hitbuffer->linear[first_linear_in_chunk].offset;

   243   int hi = hitbuffer->linear[first_linear_in_next_chunk].offset;

   245   // Chunk_tote: get top langs, scores, etc. and fill in chunk summary

   246   SetChunkSummary(ulscript, first_linear_in_chunk, lo, hi - lo,

   247                   scoringcontext, chunk_tote, chunksummary);

   249   bool more_to_come = false;

   250   bool score_cjk = false;

   251   if (scoringcontext->flags_cld2_html) {

   252     // Show one chunk in readable output

   253     CLD2_Debug(text, lo, hi, more_to_come, score_cjk, hitbuffer,

   254                scoringcontext, cspan, chunksummary);

   255   }

   257   scoringcontext->prior_chunk_lang = static_cast<Language>(chunksummary->lang1);

   258 }

   261 // Score chunks of text described by hitbuffer, allowing each to be in a

   262 // different language, and optionally adjusting the boundaries inbetween.

   263 // Set last_cspan to the last chunkspan used

   264 void ScoreAllHits(const char* text,  ULScript ulscript,

   265                   bool more_to_come, bool score_cjk,

   266                   const ScoringHitBuffer* hitbuffer,

   267                   ScoringContext* scoringcontext,

   268                   SummaryBuffer* summarybuffer, ChunkSpan* last_cspan) {

   269   ChunkSpan prior_cspan = {0, 0, 0, 0, 0, 0};

   270   ChunkSpan cspan = {0, 0, 0, 0, 0, 0};

   272   for (int i = 0; i < hitbuffer->next_chunk_start; ++i) {

   273     // Score one chunk

   274     // Sets delta_len, and distinct_len

   275     Tote chunk_tote;

   276     ChunkSummary chunksummary;

   277     ScoreOneChunk(text, ulscript,

   278                   hitbuffer, i,

   279                   scoringcontext, &cspan, &chunk_tote, &chunksummary);

   281     // Put result in summarybuffer

   282     if (summarybuffer->n < kMaxSummaries) {

   283       summarybuffer->chunksummary[summarybuffer->n] = chunksummary;

   284       summarybuffer->n += 1;

   285     }

   287     prior_cspan = cspan;

   288     cspan.chunk_base += cspan.base_len;

   289     cspan.chunk_delta += cspan.delta_len;

   290     cspan.chunk_distinct += cspan.distinct_len;

   291   }

   293   // Add one dummy off the end to hold first unused linear_in_chunk

   294   int linear_off_end = hitbuffer->next_linear;

   295   int offset_off_end = hitbuffer->linear[linear_off_end].offset;

   296   ChunkSummary* cs = &summarybuffer->chunksummary[summarybuffer->n];

   297   memset(cs, 0, sizeof(ChunkSummary));

   298   cs->offset = offset_off_end;

   299   cs->chunk_start = linear_off_end;

   300   *last_cspan = prior_cspan;

   301 }

   304 void SummaryBufferToDocTote(const SummaryBuffer* summarybuffer,

   305                             bool more_to_come, DocTote* doc_tote) {

   306   int cs_bytes_sum = 0;

   307   for (int i = 0; i < summarybuffer->n; ++i) {

   308     const ChunkSummary* cs = &summarybuffer->chunksummary[i];

   309     int reliability = minint(cs->reliability_delta, cs->reliability_score);

   310     // doc_tote uses full languages

   311     doc_tote->Add(cs->lang1, cs->bytes, cs->score1, reliability);

   312     cs_bytes_sum += cs->bytes;

   313   }

   314 }

   316 // Turn on for debugging vectors
   // When true, the ResultChunkVector helpers in this file print mapping
   // windows, chunk offsets and chunk text to stderr.

   317 static const bool kShowLettersOriginal = false;

   320 // If next chunk language matches last vector language, extend last element

   321 // Otherwise add new element to vector

   322 void ItemToVector(ScriptScanner* scanner,

   323                   ResultChunkVector* vec, Language new_lang,

   324                   int mapped_offset, int mapped_len) {

   325   uint16 last_vec_lang = static_cast<uint16>(UNKNOWN_LANGUAGE);

   326   int last_vec_subscr = vec->size() - 1;

   327   if (last_vec_subscr >= 0) {

   328     ResultChunk* priorrc = &(*vec)[last_vec_subscr];

   329     last_vec_lang = priorrc->lang1;

   330     if (new_lang == last_vec_lang) {

   331       // Extend prior. Current mapped_offset may be beyond prior end, so do

   332       // the arithmetic to include any such gap

   333       priorrc->bytes = minint((mapped_offset + mapped_len) - priorrc->offset,

   334                               kMaxResultChunkBytes);

   335       if (kShowLettersOriginal) {

   336         // Optionally print the new chunk original text

   337         string temp2(&scanner->GetBufferStart()[priorrc->offset],

   338                      priorrc->bytes);

   339         fprintf(stderr, "Item[%d..%d) '%s'<br>\n",

   340                 priorrc->offset, priorrc->offset + priorrc->bytes,

   341                 GetHtmlEscapedText(temp2).c_str());

   342       }

   343       return;

   344     }

   345   }

   346   // Add new vector element

   347   ResultChunk rc;

   348   rc.offset = mapped_offset;

   349   rc.bytes = minint(mapped_len, kMaxResultChunkBytes);

   350   rc.lang1 = static_cast<uint16>(new_lang);

   351   vec->push_back(rc);

   352   if (kShowLettersOriginal) {

   353     // Optionally print the new chunk original text

   354     string temp2(&scanner->GetBufferStart()[rc.offset], rc.bytes);

   355     fprintf(stderr, "Item[%d..%d) '%s'<br>\n",

   356             rc.offset, rc.offset + rc.bytes,

   357             GetHtmlEscapedText(temp2).c_str());

   358   }

   359 }

   361 uint16 PriorVecLang(const ResultChunkVector* vec) {

   362   if (vec->empty()) {return static_cast<uint16>(UNKNOWN_LANGUAGE);}

   363   return (*vec)[vec->size() - 1].lang1;

   364 }

   366 uint16 NextChunkLang(const SummaryBuffer* summarybuffer, int i) {

   367   if ((i + 1) >= summarybuffer->n) {

   368     return static_cast<uint16>(UNKNOWN_LANGUAGE);

   369   }

   370   return summarybuffer->chunksummary[i + 1].lang1;

   371 }

   375 // Add n elements of summarybuffer to resultchunk vector:

   376 // Each element is letters-only text [offset..offset+bytes)

   377 // This maps back to original[Back(offset)..Back(offset+bytes))

   378 //

   379 // We go out of our way to minimize the variation in the ResultChunkVector,

   380 // so that the caller has fewer but more meaningful spans in different

   381 // lanaguges, for the likely purpose of translation or spell-check.

   382 //

   383 // The language of each chunk is lang1, but it might be unreliable for

   384 // either of two reasons: its score is relatively too close to the score of

   385 // lang2, or its score is too far away from the expected score of real text in

   386 // the given language. Unreliable languages are mapped to Unknown.

   387 //

   388 void SummaryBufferToVector(ScriptScanner* scanner, const char* text,

   389                            const SummaryBuffer* summarybuffer,

   390                            bool more_to_come, ResultChunkVector* vec) {

   391   if (vec == NULL) {return;}

   393   if (kShowLettersOriginal) {

   394     fprintf(stderr, "map2original_ ");

   395     scanner->map2original_.DumpWindow();

   396     fprintf(stderr, "<br>\n");

   397     fprintf(stderr, "map2uplow_ ");

   398     scanner->map2uplow_.DumpWindow();

   399     fprintf(stderr, "<br>\n");

   400   }

   402   for (int i = 0; i < summarybuffer->n; ++i) {

   403     const ChunkSummary* cs = &summarybuffer->chunksummary[i];

   404     int unmapped_offset = cs->offset;

   405     int unmapped_len = cs->bytes;

   407     if (kShowLettersOriginal) {

   408       // Optionally print the chunk lowercase letters/marks text

   409       string temp(&text[unmapped_offset], unmapped_len);

   410       fprintf(stderr, "Letters [%d..%d) '%s'<br>\n",

   411               unmapped_offset, unmapped_offset + unmapped_len,

   412               GetHtmlEscapedText(temp).c_str());

   413     }

   415     int mapped_offset = scanner->MapBack(unmapped_offset);

   417     // Trim back a little to prefer splicing original at word boundaries

   418     if (mapped_offset > 0) {

   419       // Size of prior vector entry, if any

   420       int prior_size = 0;

   421       if (!vec->empty()) {

   422         ResultChunk* rc = &(*vec)[vec->size() - 1];

   423         prior_size = rc->bytes;

   424       }

   425       // Maximum back up size to leave at least 3 bytes in prior,

   426       // and not entire buffer, and no more than 12 bytes total backup

   427       int n_limit = minint(prior_size - 3, mapped_offset);

   428       n_limit = minint(n_limit, 12);

   430       // Backscan over letters, stopping if prior byte is < 0x41

   431       // There is some possibility that we will backscan over a different script

   432       const char* s = &scanner->GetBufferStart()[mapped_offset];

   433       const unsigned char* us = reinterpret_cast<const unsigned char*>(s);

   434       int n = 0;

   435       while ((n < n_limit) && (us[-n - 1] >= 0x41)) {++n;}

   436       if (n >= n_limit) {n = 0;} // New boundary not found within range

   438       // Also back up exactly one leading punctuation character if '"#@

   439       if (n < n_limit) {

   440         unsigned char c = us[-n - 1];

   441         if ((c == '\'') || (c == '"') || (c == '#') || (c == '@')) {++n;}

   442       }

   443       // Shrink the previous chunk slightly

   444       if (n > 0) {

   445         ResultChunk* rc = &(*vec)[vec->size() - 1];

   446         rc->bytes -= n;

   447         mapped_offset -= n;

   448         if (kShowLettersOriginal) {

   449           fprintf(stderr, "Back up %d bytes<br>\n", n);

   450           // Optionally print the prior chunk original text

   451           string temp2(&scanner->GetBufferStart()[rc->offset], rc->bytes);

   452           fprintf(stderr, "Prior   [%d..%d) '%s'<br>\n",

   453                   rc->offset, rc->offset + rc->bytes,

   454                   GetHtmlEscapedText(temp2).c_str());

   455         }

   456       }

   457     }

   459     int mapped_len =

   460       scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset;

   462     if (kShowLettersOriginal) {

   463       // Optionally print the chunk original text

   464       string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len);

   465       fprintf(stderr, "Original[%d..%d) '%s'<br>\n",

   466               mapped_offset, mapped_offset + mapped_len,

   467               GetHtmlEscapedText(temp2).c_str());

   468     }

   470     Language new_lang = static_cast<Language>(cs->lang1);

   471     bool reliability_delta_bad =

   472       (cs->reliability_delta < kUnreliablePercentThreshold);

   473     bool reliability_score_bad =

   474       (cs->reliability_score < kUnreliablePercentThreshold);

   476     // If the top language matches last vector, ignore reliability_delta

   477     uint16 prior_lang = PriorVecLang(vec);

   478     if (prior_lang == cs->lang1) {

   479       reliability_delta_bad = false;

   480     }

   481     // If the top language is in same close set as last vector, set up to merge

   482     if (SameCloseSet(cs->lang1, prior_lang)) {

   483       new_lang = static_cast<Language>(prior_lang);

   484       reliability_delta_bad = false;

   485     }

   486     // If the top two languages are in the same close set and the last vector

   487     // language is the second language, set up to merge

   488     if (SameCloseSet(cs->lang1, cs->lang2) &&

   489         (prior_lang == cs->lang2)) {

   490       new_lang = static_cast<Language>(prior_lang);

   491       reliability_delta_bad = false;

   492     }

   493     // If unreliable and the last and next vector languages are both

   494     // the second language, set up to merge

   495     uint16 next_lang = NextChunkLang(summarybuffer, i);

   496     if (reliability_delta_bad &&

   497         (prior_lang == cs->lang2) && (next_lang == cs->lang2)) {

   498       new_lang = static_cast<Language>(prior_lang);

   499       reliability_delta_bad = false;

   500     }

   502     if (reliability_delta_bad || reliability_score_bad) {

   503       new_lang = UNKNOWN_LANGUAGE;

   504     }

   505     ItemToVector(scanner, vec, new_lang, mapped_offset, mapped_len);

   506   }

   507 }

   509 // Add just one element to resultchunk vector:

   510 // For RTypeNone or RTypeOne

   511 void JustOneItemToVector(ScriptScanner* scanner, const char* text,

   512                          Language lang1, int unmapped_offset, int unmapped_len,

   513                          ResultChunkVector* vec) {

   514   if (vec == NULL) {return;}

   516   if (kShowLettersOriginal) {

   517     fprintf(stderr, "map2original_ ");

   518     scanner->map2original_.DumpWindow();

   519     fprintf(stderr, "<br>\n");

   520     fprintf(stderr, "map2uplow_ ");

   521     scanner->map2uplow_.DumpWindow();

   522     fprintf(stderr, "<br>\n");

   523   }

   525   if (kShowLettersOriginal) {

   526    // Optionally print the chunk lowercase letters/marks text

   527    string temp(&text[unmapped_offset], unmapped_len);

   528    fprintf(stderr, "Letters1 [%d..%d) '%s'<br>\n",

   529            unmapped_offset, unmapped_offset + unmapped_len,

   530            GetHtmlEscapedText(temp).c_str());

   531   }

   533   int mapped_offset = scanner->MapBack(unmapped_offset);

   534   int mapped_len =

   535     scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset;

   537   if (kShowLettersOriginal) {

   538     // Optionally print the chunk original text

   539     string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len);

   540     fprintf(stderr, "Original1[%d..%d) '%s'<br>\n",

   541             mapped_offset, mapped_offset + mapped_len,

   542             GetHtmlEscapedText(temp2).c_str());

   543   }

   545   ItemToVector(scanner, vec, lang1, mapped_offset, mapped_len);

   546 }

   549 // Debugging. Not thread safe. Defined in getonescriptspan

   550 char* DisplayPiece(const char* next_byte_, int byte_length_);

   // If high bit is on, take out high bit and add 2B to make table2 entries easy
   // to spot in debug output; values without the high bit are returned as-is.
   inline int PrintableIndirect(int x) {
     const unsigned int kHighBit = 0x80000000u;
     if ((x & kHighBit) == 0) {
       return x;
     }
     return (x & ~kHighBit) + 2000000000;
   }

   559 void DumpHitBuffer(FILE* df, const char* text,

   560                    const ScoringHitBuffer* hitbuffer) {

   561   fprintf(df,

   562           "<br>DumpHitBuffer[%s, next_base/delta/distinct %d, %d, %d)<br>\n",

   563           ULScriptCode(hitbuffer->ulscript),

   564           hitbuffer->next_base, hitbuffer->next_delta,

   565           hitbuffer->next_distinct);

   566   for (int i = 0; i < hitbuffer->maxscoringhits; ++i) {

   567     if (i < hitbuffer->next_base) {

   568       fprintf(df, "Q[%d]%d,%d,%s ",

   569               i, hitbuffer->base[i].offset,

   570               PrintableIndirect(hitbuffer->base[i].indirect),

   571               DisplayPiece(&text[hitbuffer->base[i].offset], 6));

   572     }

   573     if (i < hitbuffer->next_delta) {

   574       fprintf(df, "DL[%d]%d,%d,%s ",

   575               i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,

   576               DisplayPiece(&text[hitbuffer->delta[i].offset], 12));

   577     }

   578     if (i < hitbuffer->next_distinct) {

   579       fprintf(df, "D[%d]%d,%d,%s ",

   580               i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect,

   581               DisplayPiece(&text[hitbuffer->distinct[i].offset], 12));

   582     }

   583     if (i < hitbuffer->next_base) {

   584       fprintf(df, "<br>\n");

   585     }

   586     if (i > 50) {break;}

   587   }

   588   if (hitbuffer->next_base > 50) {

   589     int i = hitbuffer->next_base;

   590     fprintf(df, "Q[%d]%d,%d,%s ",

   591             i, hitbuffer->base[i].offset,

   592             PrintableIndirect(hitbuffer->base[i].indirect),

   593             DisplayPiece(&text[hitbuffer->base[i].offset], 6));

   594   }

   595   if (hitbuffer->next_delta > 50) {

   596     int i = hitbuffer->next_delta;

   597     fprintf(df, "DL[%d]%d,%d,%s ",

   598             i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,

   599             DisplayPiece(&text[hitbuffer->delta[i].offset], 12));

   600   }

   601   if (hitbuffer->next_distinct > 50) {

   602     int i = hitbuffer->next_distinct;

   603     fprintf(df, "D[%d]%d,%d,%s ",

   604             i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect,

   605             DisplayPiece(&text[hitbuffer->distinct[i].offset], 12));

   606   }

   607   fprintf(df, "<br>\n");

   608 }

   611 void DumpLinearBuffer(FILE* df, const char* text,

   612                       const ScoringHitBuffer* hitbuffer) {

   613   fprintf(df, "<br>DumpLinearBuffer[%d)<br>\n",

   614           hitbuffer->next_linear);

   615   // Include the dummy entry off the end

   616   for (int i = 0; i < hitbuffer->next_linear + 1; ++i) {

   617     if ((50 < i) && (i < (hitbuffer->next_linear - 1))) {continue;}

   618     fprintf(df, "[%d]%d,%c=%08x,%s<br>\n",

   619             i, hitbuffer->linear[i].offset,

   620             "UQLD"[hitbuffer->linear[i].type],

   621             hitbuffer->linear[i].langprob,

   622             DisplayPiece(&text[hitbuffer->linear[i].offset], 6));

   623   }

   624   fprintf(df, "<br>\n");

   626   fprintf(df, "DumpChunkStart[%d]<br>\n", hitbuffer->next_chunk_start);

   627   for (int i = 0; i < hitbuffer->next_chunk_start + 1; ++i) {

   628     fprintf(df, "[%d]%d\n", i, hitbuffer->chunk_start[i]);

   629   }

   630   fprintf(df, "<br>\n");

   631 }

   633 // Move this verbose debugging output to debug.cc eventually

   634 void DumpChunkSummary(FILE* df, const ChunkSummary* cs) {

   635   // Print chunksummary

   636   fprintf(df, "%d lin[%d] %s.%d %s.%d %dB %d# %s %dRd %dRs<br>\n",

   637           cs->offset,

   638           cs->chunk_start,

   639           LanguageCode(static_cast<Language>(cs->lang1)),

   640           cs->score1,

   641           LanguageCode(static_cast<Language>(cs->lang2)),

   642           cs->score2,

   643           cs->bytes,

   644           cs->grams,

   645           ULScriptCode(static_cast<ULScript>(cs->ulscript)),

   646           cs->reliability_delta,

   647           cs->reliability_score);

   648 }

   650 void DumpSummaryBuffer(FILE* df, const SummaryBuffer* summarybuffer) {

   651   fprintf(df, "<br>DumpSummaryBuffer[%d]<br>\n", summarybuffer->n);

   652   fprintf(df, "[i] offset linear[chunk_start] lang.score1 lang.score2 "

   653               "bytesB ngrams# script rel_delta rel_score<br>\n");

   654   for (int i = 0; i <= summarybuffer->n; ++i) {

   655     fprintf(df, "[%d] ", i);

   656     DumpChunkSummary(df, &summarybuffer->chunksummary[i]);

   657   }

   658   fprintf(df, "<br>\n");

   659 }

   663 // Within hitbuffer->linear[]

   664 // <-- prior chunk --><-- this chunk -->

   665 // |                  |                 |

   666 // linear0            linear1           linear2

   667 //     lang0              lang1

   668 // The goal of sharpening is to move this_linear to better separate langs

   669 int BetterBoundary(const char* text,

   670                    ScoringHitBuffer* hitbuffer,

   671                    ScoringContext* scoringcontext,

   672                    uint16 pslang0, uint16 pslang1,

   673                    int linear0, int linear1, int linear2) {

   674   // Degenerate case, no change

   675   if ((linear2 - linear0) <= 8) {return linear1;}

   677   // Each diff gives pslang0 score - pslang1 score

   678   // Running diff has four entries + + + + followed by four entries - - - -

   679   // so that this value is maximal at the sharpest boundary between pslang0

   680   // (positive diffs) and pslang1 (negative diffs)

   681   int running_diff = 0;

   682   int diff[8];    // Ring buffer of pslang0-pslang1 differences

   683   // Initialize with first 8 diffs

   684   for (int i = linear0; i < linear0 + 8; ++i) {

   685     int j = i & 7;

   686     uint32 langprob = hitbuffer->linear[i].langprob;

   687     diff[j] = GetLangScore(langprob, pslang0) -

   688        GetLangScore(langprob, pslang1);

   689     if (i < linear0 + 4) {

   690       // First four diffs pslang0 - pslang1

   691       running_diff += diff[j];

   692     } else {

   693       // Second four diffs -(pslang0 - pslang1)

   694       running_diff -= diff[j];

   695     }

   696   }

   698   // Now scan for sharpest boundary. j is at left end of 8 entries

   699   // To be a boundary, there must be both >0 and <0 entries in the window

   700   int better_boundary_value = 0;

   701   int better_boundary = linear1;

   702   for (int i = linear0; i < linear2 - 8; ++i) {

   703     int j = i & 7;

   704     if (better_boundary_value < running_diff) {

   705       bool has_plus = false;

   706       bool has_minus = false;

   707       for (int kk = 0; kk < 8; ++kk) {

   708         if (diff[kk] > 0) {has_plus = true;}

   709         if (diff[kk] < 0) {has_minus = true;}

   710       }

   711       if (has_plus && has_minus) {

   712         better_boundary_value = running_diff;

   713         better_boundary = i + 4;

   714       }

   715     }

   716     // Shift right one entry

   717     uint32 langprob = hitbuffer->linear[i + 8].langprob;

   718     int newdiff = GetLangScore(langprob, pslang0) -

   719        GetLangScore(langprob, pslang1);

   720     int middiff = diff[(i + 4) & 7];

   721     int olddiff = diff[j];

   722     diff[j] = newdiff;

   723     running_diff -= olddiff;                  // Remove left

   724     running_diff += 2 * middiff;              // Convert middle from - to +

   725     running_diff -= newdiff;                  // Insert right

   726   }

   728   if (scoringcontext->flags_cld2_verbose && (linear1 != better_boundary)) {

   729     Language lang0 = FromPerScriptNumber(scoringcontext->ulscript, pslang0);

   730     Language lang1 = FromPerScriptNumber(scoringcontext->ulscript, pslang1);

   731     fprintf(scoringcontext->debug_file, " Better lin[%d=>%d] %s^^%s <br>\n",

   732             linear1, better_boundary,

   733             LanguageCode(lang0), LanguageCode(lang1));

   734     int lin0_off = hitbuffer->linear[linear0].offset;

   735     int lin1_off = hitbuffer->linear[linear1].offset;

   736     int lin2_off = hitbuffer->linear[linear2].offset;

   737     int better_offm1 = hitbuffer->linear[better_boundary - 1].offset;

   738     int better_off = hitbuffer->linear[better_boundary].offset;

   739     int better_offp1 = hitbuffer->linear[better_boundary + 1].offset;

   740     string old0(&text[lin0_off], lin1_off - lin0_off);

   741     string old1(&text[lin1_off], lin2_off - lin1_off);

   742     string new0(&text[lin0_off], better_offm1 - lin0_off);

   743     string new0m1(&text[better_offm1], better_off - better_offm1);

   744     string new1(&text[better_off], better_offp1 - better_off);

   745     string new1p1(&text[better_offp1], lin2_off - better_offp1);

   746     fprintf(scoringcontext->debug_file, "%s^^%s => <br>\n%s^%s^^%s^%s<br>\n",

   747             GetHtmlEscapedText(old0).c_str(),

   748             GetHtmlEscapedText(old1).c_str(),

   749             GetHtmlEscapedText(new0).c_str(),

   750             GetHtmlEscapedText(new0m1).c_str(),

   751             GetHtmlEscapedText(new1).c_str(),

   752             GetHtmlEscapedText(new1p1).c_str());

   753     // Slow picture of differences per linear entry

   754     int d;

   755     for (int i = linear0; i < linear2; ++i) {

   756       if (i == better_boundary) {

   757         fprintf(scoringcontext->debug_file, "^^ ");

   758       }

   759       uint32 langprob = hitbuffer->linear[i].langprob;

   760       d = GetLangScore(langprob, pslang0) - GetLangScore(langprob, pslang1);

   761       const char* s = "=";

   762       //if (d > 2) {s = "\xc2\xaf";}    // Macron

   763       if (d > 2) {s = "#";}

   764       else if (d > 0) {s = "+";}

   765       else if (d < -2) {s = "_";}

   766       else if (d < 0) {s = "-";}

   767       fprintf(scoringcontext->debug_file, "%s ", s);

   768     }

   769     fprintf(scoringcontext->debug_file, " &nbsp;&nbsp;(scale: #+=-_)<br>\n");

   770   }

   771   return better_boundary;

   772 }

   775 // For all but the first summary, if its top language differs from

   776 // the previous chunk, refine the boundary

   777 // Linearized version

   778 void SharpenBoundaries(const char* text,

   779                        bool more_to_come,

   780                        ScoringHitBuffer* hitbuffer,

   781                        ScoringContext* scoringcontext,

   782                        SummaryBuffer* summarybuffer) {

   784   int prior_linear = summarybuffer->chunksummary[0].chunk_start;

   785   uint16 prior_lang = summarybuffer->chunksummary[0].lang1;

   787   if (scoringcontext->flags_cld2_verbose) {

   788     fprintf(scoringcontext->debug_file, "<br>SharpenBoundaries<br>\n");

   789   }

   790   for (int i = 1; i < summarybuffer->n; ++i) {

   791     ChunkSummary* cs = &summarybuffer->chunksummary[i];

   792     uint16 this_lang = cs->lang1;

   793     if (this_lang == prior_lang) {

   794       prior_linear = cs->chunk_start;

   795       continue;

   796     }

   798     int this_linear = cs->chunk_start;

   799     int next_linear = summarybuffer->chunksummary[i + 1].chunk_start;

   801     // If this/prior in same close set, don't move boundary

   802     if (SameCloseSet(prior_lang, this_lang)) {

   803       prior_linear = this_linear;

   804       prior_lang = this_lang;

   805       continue;

   806     }

   809     // Within hitbuffer->linear[]

   810     // <-- prior chunk --><-- this chunk -->

   811     // |                  |                 |

   812     // prior_linear       this_linear       next_linear

   813     //     prior_lang         this_lang

   814     // The goal of sharpening is to move this_linear to better separate langs

   816     uint8 pslang0 = PerScriptNumber(scoringcontext->ulscript,

   817                                     static_cast<Language>(prior_lang));

   818     uint8 pslang1 = PerScriptNumber(scoringcontext->ulscript,

   819                                     static_cast<Language>(this_lang));

   820     int better_linear = BetterBoundary(text,

   821                                        hitbuffer,

   822                                        scoringcontext,

   823                                        pslang0, pslang1,

   824                                        prior_linear, this_linear, next_linear);

   826     int old_offset = hitbuffer->linear[this_linear].offset;

   827     int new_offset = hitbuffer->linear[better_linear].offset;

   828     cs->chunk_start = better_linear;

   829     cs->offset = new_offset;

   830     // If this_linear moved right, make bytes smaller for this, larger for prior

   831     // If this_linear moved left, make bytes larger for this, smaller for prior

   832     cs->bytes -= (new_offset - old_offset);

   833     summarybuffer->chunksummary[i - 1].bytes += (new_offset - old_offset);

   835     this_linear = better_linear;    // Update so that next chunk doesn't intrude

   837     // Consider rescoring the two chunks

   839     // Update for next round (note: using pre-updated boundary)

   840     prior_linear = this_linear;

   841     prior_lang = this_lang;

   842   }

   843 }

   845 // Make a langprob that gives small weight to the default language for ulscript

   846 uint32 DefaultLangProb(ULScript ulscript) {

   847   Language default_lang = DefaultLanguage(ulscript);

   848   return MakeLangProb(default_lang, 1);

   849 }

   851 // Effectively, do a merge-sort based on text offsets

   852 // Look up each indirect value in appropriate scoring table and keep

   853 // just the resulting langprobs

   854 void LinearizeAll(ScoringContext* scoringcontext, bool score_cjk,

   855                   ScoringHitBuffer* hitbuffer) {

   856   const CLD2TableSummary* base_obj;       // unigram or quadgram

   857   const CLD2TableSummary* base_obj2;      // quadgram dual table

   858   const CLD2TableSummary* delta_obj;      // bigram or octagram

   859   const CLD2TableSummary* distinct_obj;   // bigram or octagram

   860   uint16 base_hit;

   861   if (score_cjk) {

   862     base_obj = scoringcontext->scoringtables->unigram_compat_obj;

   863     base_obj2 = scoringcontext->scoringtables->unigram_compat_obj;

   864     delta_obj = scoringcontext->scoringtables->deltabi_obj;

   865     distinct_obj = scoringcontext->scoringtables->distinctbi_obj;

   866     base_hit = UNIHIT;

   867   } else {

   868     base_obj = scoringcontext->scoringtables->quadgram_obj;

   869     base_obj2 = scoringcontext->scoringtables->quadgram_obj2;

   870     delta_obj = scoringcontext->scoringtables->deltaocta_obj;

   871     distinct_obj = scoringcontext->scoringtables->distinctocta_obj;

   872     base_hit = QUADHIT;

   873   }

   875   int base_limit = hitbuffer->next_base;

   876   int delta_limit = hitbuffer->next_delta;

   877   int distinct_limit = hitbuffer->next_distinct;

   878   int base_i = 0;

   879   int delta_i = 0;

   880   int distinct_i = 0;

   881   int linear_i = 0;

   883   // Start with an initial base hit for the default language for this script

   884   // Inserting this avoids edge effects with no hits at all

   885   hitbuffer->linear[linear_i].offset = hitbuffer->lowest_offset;

   886   hitbuffer->linear[linear_i].type = base_hit;

   887   hitbuffer->linear[linear_i].langprob =

   888     DefaultLangProb(scoringcontext->ulscript);

   889   ++linear_i;

   891   while ((base_i < base_limit) || (delta_i < delta_limit) ||

   892          (distinct_i < distinct_limit)) {

   893     int base_off = hitbuffer->base[base_i].offset;

   894     int delta_off = hitbuffer->delta[delta_i].offset;

   895     int distinct_off = hitbuffer->distinct[distinct_i].offset;

   897     // Do delta and distinct first, so that they are not lost at base_limit

   898     if ((delta_i < delta_limit) &&

   899         (delta_off <= base_off) && (delta_off <= distinct_off)) {

   900       // Add delta entry

   901       int indirect = hitbuffer->delta[delta_i].indirect;

   902       ++delta_i;

   903       uint32 langprob = delta_obj->kCLDTableInd[indirect];

   904       if (langprob > 0) {

   905         hitbuffer->linear[linear_i].offset = delta_off;

   906         hitbuffer->linear[linear_i].type = DELTAHIT;

   907         hitbuffer->linear[linear_i].langprob = langprob;

   908         ++linear_i;

   909       }

   910     }

   911     else if ((distinct_i < distinct_limit) &&

   912              (distinct_off <= base_off) && (distinct_off <= delta_off)) {

   913       // Add distinct entry

   914       int indirect = hitbuffer->distinct[distinct_i].indirect;

   915       ++distinct_i;

   916       uint32 langprob = distinct_obj->kCLDTableInd[indirect];

   917       if (langprob > 0) {

   918         hitbuffer->linear[linear_i].offset = distinct_off;

   919         hitbuffer->linear[linear_i].type = DISTINCTHIT;

   920         hitbuffer->linear[linear_i].langprob = langprob;

   921         ++linear_i;

   922       }

   923     }

   924     else {

   925       // Add one or two base entries

   926       int indirect = hitbuffer->base[base_i].indirect;

   927       // First, get right scoring table

   928       const CLD2TableSummary* local_base_obj = base_obj;

   929       if ((indirect & 0x80000000u) != 0) {

   930         local_base_obj = base_obj2;

   931         indirect &= ~0x80000000u;

   932       }

   933       ++base_i;

   934       // One langprob in kQuadInd[0..SingleSize),

   935       // two in kQuadInd[SingleSize..Size)

   936       if (indirect < static_cast<int>(local_base_obj->kCLDTableSizeOne)) {

   937         // Up to three languages at indirect

   938         uint32 langprob = local_base_obj->kCLDTableInd[indirect];

   939         if (langprob > 0) {

   940           hitbuffer->linear[linear_i].offset = base_off;

   941           hitbuffer->linear[linear_i].type = base_hit;

   942           hitbuffer->linear[linear_i].langprob = langprob;

   943           ++linear_i;

   944         }

   945       } else {

   946         // Up to six languages at start + 2 * (indirect - start)

   947         indirect += (indirect - local_base_obj->kCLDTableSizeOne);

   948         uint32 langprob = local_base_obj->kCLDTableInd[indirect];

   949         uint32 langprob2 = local_base_obj->kCLDTableInd[indirect + 1];

   950         if (langprob > 0) {

   951           hitbuffer->linear[linear_i].offset = base_off;

   952           hitbuffer->linear[linear_i].type = base_hit;

   953           hitbuffer->linear[linear_i].langprob = langprob;

   954           ++linear_i;

   955         }

   956         if (langprob2 > 0) {

   957           hitbuffer->linear[linear_i].offset = base_off;

   958           hitbuffer->linear[linear_i].type = base_hit;

   959           hitbuffer->linear[linear_i].langprob = langprob2;

   960           ++linear_i;

   961         }

   962       }

   963     }

   964   }

   966   // Update

   967   hitbuffer->next_linear = linear_i;

   969   // Add a dummy entry off the end, just to capture final offset

   970   hitbuffer->linear[linear_i].offset =

   971   hitbuffer->base[hitbuffer->next_base].offset;

   972   hitbuffer->linear[linear_i].langprob = 0;

   973 }

   975 // Break linear array into chunks of ~20 quadgram hits or ~50 CJK unigram hits

   976 void ChunkAll(int letter_offset, bool score_cjk, ScoringHitBuffer* hitbuffer) {

   977   int chunksize;

   978   uint16 base_hit;

   979   if (score_cjk) {

   980     chunksize = kChunksizeUnis;

   981     base_hit = UNIHIT;

   982   } else {

   983     chunksize = kChunksizeQuads;

   984     base_hit = QUADHIT;

   985   }

   987   int linear_i = 0;

   988   int linear_off_end = hitbuffer->next_linear;

   989   int text_i = letter_offset;               // Next unseen text offset

   990   int next_chunk_start = 0;

   991   int bases_left = hitbuffer->next_base;

   992   while (bases_left > 0) {

   993     // Linearize one chunk

   994     int base_len = chunksize;     // Default; may be changed below

   995     if (bases_left < (chunksize + (chunksize >> 1))) {

   996       // If within 1.5 chunks of the end, avoid runts by using it all

   997       base_len = bases_left;

   998     } else if (bases_left < (2 * chunksize)) {

   999       // Avoid runts by splitting 1.5 to 2 chunks in half (about 3/4 each)

  1000       base_len = (bases_left + 1) >> 1;

  1001     }

  1003     hitbuffer->chunk_start[next_chunk_start] = linear_i;

  1004     hitbuffer->chunk_offset[next_chunk_start] = text_i;

  1005     ++next_chunk_start;

  1007     int base_count = 0;

  1008     while ((base_count < base_len) && (linear_i < linear_off_end)) {

  1009       if (hitbuffer->linear[linear_i].type == base_hit) {++base_count;}

  1010       ++linear_i;

  1011     }

  1012     text_i = hitbuffer->linear[linear_i].offset;    // Next unseen text offset

  1013     bases_left -= base_len;

  1014   }

  1016   // If no base hits at all, make a single dummy chunk

  1017   if (next_chunk_start == 0) {

  1018      hitbuffer->chunk_start[next_chunk_start] = 0;

  1019      hitbuffer->chunk_offset[next_chunk_start] = hitbuffer->linear[0].offset;

  1020      ++next_chunk_start;

  1021   }

  1023   // Remember the linear array start of dummy entry

  1024   hitbuffer->next_chunk_start = next_chunk_start;

  1026   // Add a dummy entry off the end, just to capture final linear subscr

  1027   hitbuffer->chunk_start[next_chunk_start] = hitbuffer->next_linear;

  1028   hitbuffer->chunk_offset[next_chunk_start] = text_i;

  1029 }

  1032 // Merge-sort the individual hit arrays, go indirect on the scoring subscripts,

  1033 // break linear array into chunks.

  1034 //

  1035 // Input:

  1036 //  hitbuffer base, delta, distinct arrays

  1037 // Output:

  1038 //  linear array

  1039 //  chunk_start array

  1040 //

  1041 void LinearizeHitBuffer(int letter_offset,

  1042                         ScoringContext* scoringcontext,

  1043                         bool more_to_come, bool score_cjk,

  1044                         ScoringHitBuffer* hitbuffer) {

  1045   LinearizeAll(scoringcontext, score_cjk, hitbuffer);

  1046   ChunkAll(letter_offset, score_cjk, hitbuffer);

  1047 }

  1051 // The hitbuffer is in an awkward form -- three sets of base/delta/distinct

  1052 // scores, each with an indirect subscript to one of six scoring tables, some

  1053 // of which can yield two langprobs for six languages, others one langprob for

  1054 // three languages. The only correlation between base/delta/distinct is their

  1055 // offsets into the letters-only text buffer.

  1056 //

  1057 // SummaryBuffer needs to be built to linear, giving linear offset of start of

  1058 // each chunk

  1059 //

  1060 // So we first do all the langprob lookups and merge-sort by offset to make

  1061 // a single linear vector, building a side vector of chunk beginnings as we go.

  1062 // The sharpening is simply moving the beginnings, scoring is a simple linear

  1063 // sweep, etc.

  1065 void ProcessHitBuffer(const LangSpan& scriptspan,

  1066                       int letter_offset,

  1067                       ScoringContext* scoringcontext,

  1068                       DocTote* doc_tote,

  1069                       ResultChunkVector* vec,

  1070                       bool more_to_come, bool score_cjk,

  1071                       ScoringHitBuffer* hitbuffer) {

  1072   if (scoringcontext->flags_cld2_verbose) {

  1073     fprintf(scoringcontext->debug_file, "Hitbuffer[) ");

  1074     DumpHitBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer);

  1075   }

  1077   LinearizeHitBuffer(letter_offset, scoringcontext, more_to_come, score_cjk,

  1078                      hitbuffer);

  1080   if (scoringcontext->flags_cld2_verbose) {

  1081     fprintf(scoringcontext->debug_file, "Linear[) ");

  1082     DumpLinearBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer);

  1083   }

  1085   SummaryBuffer summarybuffer;

  1086   summarybuffer.n = 0;

  1087   ChunkSpan last_cspan;

  1088   ScoreAllHits(scriptspan.text, scriptspan.ulscript,

  1089                     more_to_come, score_cjk, hitbuffer,

  1090                     scoringcontext,

  1091                     &summarybuffer, &last_cspan);

  1093   if (scoringcontext->flags_cld2_verbose) {

  1094     DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer);

  1095   }

  1097   if (vec != NULL) {

  1098     // Sharpen boundaries of summarybuffer

  1099     // This is not a high-performance path

  1100     SharpenBoundaries(scriptspan.text, more_to_come, hitbuffer, scoringcontext,

  1101                       &summarybuffer);

  1102     // Show after the sharpening

  1103     // CLD2_Debug2(scriptspan.text, more_to_come, score_cjk,

  1104     //             hitbuffer, scoringcontext, &summarybuffer);

  1106     if (scoringcontext->flags_cld2_verbose) {

  1107       DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer);

  1108     }

  1109   }

  1111   SummaryBufferToDocTote(&summarybuffer, more_to_come, doc_tote);

  1112   SummaryBufferToVector(scoringcontext->scanner, scriptspan.text,

  1113                         &summarybuffer, more_to_come, vec);

  1114 }

  1116 void SpliceHitBuffer(ScoringHitBuffer* hitbuffer, int next_offset) {

  1117   // Splice hitbuffer and summarybuffer for next round. With big chunks and

  1118   // distinctive-word state carried across chunks, we might not need to do this.

  1119   hitbuffer->next_base = 0;

  1120   hitbuffer->next_delta = 0;

  1121   hitbuffer->next_distinct = 0;

  1122   hitbuffer->next_linear = 0;

  1123   hitbuffer->next_chunk_start = 0;

  1124   hitbuffer->lowest_offset = next_offset;

  1125 }

  1128 // Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating

  1129 // scoringcontext

  1130 void ScoreEntireScriptSpan(const LangSpan& scriptspan,

  1131                            ScoringContext* scoringcontext,

  1132                            DocTote* doc_tote,

  1133                            ResultChunkVector* vec) {

  1134   int bytes = scriptspan.text_bytes;

  1135   // Artificially set score to 1024 per 1KB, or 1 per byte

  1136   int score = bytes;

  1137   int reliability = 100;

  1138   // doc_tote uses full languages

  1139   Language one_one_lang = DefaultLanguage(scriptspan.ulscript);

  1140   doc_tote->Add(one_one_lang, bytes, score, reliability);

  1142   if (scoringcontext->flags_cld2_html) {

  1143     ChunkSummary chunksummary = {

  1144       1, 0,

  1145       one_one_lang, UNKNOWN_LANGUAGE, score, 1,

  1146       bytes, 0, scriptspan.ulscript, reliability, reliability

  1147     };

  1148     CLD2_Debug(scriptspan.text, 1, scriptspan.text_bytes,

  1149                false, false, NULL,

  1150                scoringcontext, NULL, &chunksummary);

  1151   }

  1153   // First byte is always a space

  1154   JustOneItemToVector(scoringcontext->scanner, scriptspan.text,

  1155                       one_one_lang, 1, bytes - 1, vec);

  1157   scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;

  1158 }

  1160 // Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext

  1161 void ScoreCJKScriptSpan(const LangSpan& scriptspan,

  1162                         ScoringContext* scoringcontext,

  1163                         DocTote* doc_tote,

  1164                         ResultChunkVector* vec) {

  1165   // Allocate three parallel arrays of scoring hits

  1166   ScoringHitBuffer* hitbuffer = new ScoringHitBuffer;

  1167   hitbuffer->init();

  1168   hitbuffer->ulscript = scriptspan.ulscript;

  1170   scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;

  1171   scoringcontext->oldest_distinct_boost = 0;

  1173   // Incoming scriptspan has a single leading space at scriptspan.text[0]

  1174   // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3]

  1176   int letter_offset = 1;        // Skip initial space

  1177   hitbuffer->lowest_offset = letter_offset;

  1178   int letter_limit = scriptspan.text_bytes;

  1179   while (letter_offset < letter_limit) {

  1180     if (scoringcontext->flags_cld2_verbose) {

  1181       fprintf(scoringcontext->debug_file, " ScoreCJKScriptSpan[%d,%d)<br>\n",

  1182               letter_offset, letter_limit);

  1183     }

  1184     //

  1185     // Fill up one hitbuffer, possibly splicing onto previous fragment

  1186     //

  1187     // NOTE: GetUniHits deals with close repeats

  1188     // NOTE: After last chunk there is always a hitbuffer entry with an offset

  1189     // just off the end of the text = next_offset.

  1190     int next_offset = GetUniHits(scriptspan.text, letter_offset, letter_limit,

  1191                                   scoringcontext, hitbuffer);

  1192     // NOTE: GetBiHitVectors deals with close repeats,

  1193     // does one hash and two lookups (delta and distinct) per word

  1194     GetBiHits(scriptspan.text, letter_offset, next_offset,

  1195                 scoringcontext, hitbuffer);

  1197     //

  1198     // Score one hitbuffer in chunks to summarybuffer

  1199     //

  1200     bool more_to_come = next_offset < letter_limit;

  1201     bool score_cjk = true;

  1202     ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec,

  1203                      more_to_come, score_cjk, hitbuffer);

  1204     SpliceHitBuffer(hitbuffer, next_offset);

  1206     letter_offset = next_offset;

  1207   }

  1209   delete hitbuffer;

  1210   // Context across buffers is not connected yet

  1211   scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;

  1212 }

  1216 // Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext

  1217 // We have a scriptspan with all lowercase text in one script. Look up

  1218 // quadgrams and octagrams, saving the hits in three parallel vectors.

  1219 // Score from those vectors in chunks, toting each chunk to get a single

  1220 // language, and combining into the overall document score. The hit vectors

  1221 // in general are not big enough to handle and entire scriptspan, so

  1222 // repeat until the entire scriptspan is scored.

  1223 // Caller deals with minimizing numbr of runt scriptspans

  1224 // This routine deals with minimizing number of runt chunks.

  1225 //

  1226 // Returns updated scoringcontext

  1227 // Returns updated doc_tote

  1228 // If vec != NULL, appends to that vector of ResultChunk's

  1229 void ScoreQuadScriptSpan(const LangSpan& scriptspan,

  1230                          ScoringContext* scoringcontext,

  1231                          DocTote* doc_tote,

  1232                          ResultChunkVector* vec) {

  1233   // Allocate three parallel arrays of scoring hits

  1234   ScoringHitBuffer* hitbuffer = new ScoringHitBuffer;

  1235   hitbuffer->init();

  1236   hitbuffer->ulscript = scriptspan.ulscript;

  1238   scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;

  1239   scoringcontext->oldest_distinct_boost = 0;

  1241   // Incoming scriptspan has a single leading space at scriptspan.text[0]

  1242   // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3]

  1244   int letter_offset = 1;        // Skip initial space

  1245   hitbuffer->lowest_offset = letter_offset;

  1246   int letter_limit = scriptspan.text_bytes;

  1247   while (letter_offset < letter_limit) {

  1248     //

  1249     // Fill up one hitbuffer, possibly splicing onto previous fragment

  1250     //

  1251     // NOTE: GetQuadHits deals with close repeats

  1252     // NOTE: After last chunk there is always a hitbuffer entry with an offset

  1253     // just off the end of the text = next_offset.

  1254     int next_offset = GetQuadHits(scriptspan.text, letter_offset, letter_limit,

  1255                                   scoringcontext, hitbuffer);

  1256     // If true, there is more text to process in this scriptspan

  1257     // NOTE: GetOctaHitVectors deals with close repeats,

  1258     // does one hash and two lookups (delta and distinct) per word

  1259     GetOctaHits(scriptspan.text, letter_offset, next_offset,

  1260                 scoringcontext, hitbuffer);

  1262     //

  1263     // Score one hitbuffer in chunks to summarybuffer

  1264     //

  1265     bool more_to_come = next_offset < letter_limit;

  1266     bool score_cjk = false;

  1267     ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec,

  1268                      more_to_come, score_cjk, hitbuffer);

  1269     SpliceHitBuffer(hitbuffer, next_offset);

  1271     letter_offset = next_offset;

  1272   }

  1274   delete hitbuffer;

  1275 }

  1278 // Score one scriptspan into doc_tote and vec, updating scoringcontext

  1279 // Inputs:

  1280 //  One scriptspan of perhaps 40-60KB, all same script lower-case letters

  1281 //    and single ASCII spaces. First character is a space to allow simple

  1282 //    begining-of-word detect. End of buffer has three spaces and NUL to

  1283 //    allow easy scan-to-end-of-word.

  1284 //  Scoring context of

  1285 //    scoring tables

  1286 //    flags

  1287 //    running boosts

  1288 // Outputs:

  1289 //  Updated doc_tote giving overall languages and byte counts

  1290 //  Optional updated chunk vector giving offset, length, language

  1291 //

  1292 // Caller initializes flags, boosts, doc_tote and vec.

  1293 // Caller aggregates across multiple scriptspans

  1294 // Caller calculates final document result

  1295 // Caller deals with detecting and triggering suppression of repeated text.

  1296 //

  1297 // This top-level routine just chooses the recognition type and calls one of

  1298 // the next-level-down routines.

  1299 //

  1300 void ScoreOneScriptSpan(const LangSpan& scriptspan,

  1301                         ScoringContext* scoringcontext,

  1302                         DocTote* doc_tote,

  1303                         ResultChunkVector* vec) {

  1304   if (scoringcontext->flags_cld2_verbose) {

  1305     fprintf(scoringcontext->debug_file, "<br>ScoreOneScriptSpan(%s,%d) ",

  1306             ULScriptCode(scriptspan.ulscript), scriptspan.text_bytes);

  1307     // Optionally print the chunk lowercase letters/marks text

  1308     string temp(&scriptspan.text[0], scriptspan.text_bytes);

  1309     fprintf(scoringcontext->debug_file, "'%s'",

  1310             GetHtmlEscapedText(temp).c_str());

  1311     fprintf(scoringcontext->debug_file, "<br>\n");

  1312   }

  1313   scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;

  1314   scoringcontext->oldest_distinct_boost = 0;

  1315   ULScriptRType rtype = ULScriptRecognitionType(scriptspan.ulscript);

  1316   if (scoringcontext->flags_cld2_score_as_quads && (rtype != RTypeCJK)) {

  1317     rtype = RTypeMany;

  1318   }

  1319   switch (rtype) {

  1320   case RTypeNone:

  1321   case RTypeOne:

  1322     ScoreEntireScriptSpan(scriptspan, scoringcontext, doc_tote, vec);

  1323     break;

  1324   case RTypeCJK:

  1325     ScoreCJKScriptSpan(scriptspan, scoringcontext, doc_tote, vec);

  1326     break;

  1327   case RTypeMany:

  1328     ScoreQuadScriptSpan(scriptspan, scoringcontext, doc_tote, vec);

  1329     break;

  1330   }

  1331 }

  1333 }       // End namespace CLD2

The Tor Browser / file revision

browser/components/translation/cld2/internal/scoreonescriptspan.cc@6474c204b198

browser/components/translation/cld2/internal/scoreonescriptspan.cc