The Tor Browser: browser/components/translation/cld2/internal/cldutil.cc@6474c204b198

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 // Copyright 2013 Google Inc. All Rights Reserved.

     2 //

     3 // Licensed under the Apache License, Version 2.0 (the "License");

     4 // you may not use this file except in compliance with the License.

     5 // You may obtain a copy of the License at

     6 //

     7 //     http://www.apache.org/licenses/LICENSE-2.0

     8 //

     9 // Unless required by applicable law or agreed to in writing, software

    10 // distributed under the License is distributed on an "AS IS" BASIS,

    11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

    12 // See the License for the specific language governing permissions and

    13 // limitations under the License.

    15 //

    16 // Author: dsites@google.com (Dick Sites)

    17 // Updated 2014.01 for dual table lookup

    18 //

    20 #include "cldutil.h"

    21 #include <string>

    23 #include "cld2tablesummary.h"

    24 #include "integral_types.h"

    25 #include "port.h"

    26 #include "utf8statetable.h"

    28 namespace CLD2 {

    30 // Caller supplies the right tables in scoringcontext

    32 // Runtime routines for hashing, looking up, and scoring

    33 // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.

    34 // Unigrams and bigrams are for CJK languages only, including simplified/

    35 // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and

    36 // Zhuang Han characters. Surrounding spaces are not considered.

    37 // Quadgrams and octagrams for for non-CJK and include two bits indicating

    38 // preceding and trailing spaces (word boundaries).

    41 static const int kMinCJKUTF8CharBytes = 3;

    43 static const int kMinGramCount = 3;

    44 static const int kMaxGramCount = 16;

    46 static const int UTFmax = 4;        // Max number of bytes in a UTF-8 character

    48   // 1 to skip ASCII space, vowels AEIOU aeiou and UTF-8 continuation bytes 80-BF

    49   static const uint8 kSkipSpaceVowelContinue[256] = {

    50     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

    51     1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

    52     0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,

    53     0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,

    55     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

    56     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

    57     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

    58     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

    59   };

    61   // 1 to skip ASCII space, and UTF-8 continuation bytes 80-BF

    62   static const uint8 kSkipSpaceContinue[256] = {

    63     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

    64     1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

    65     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

    66     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

    68     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

    69     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

    70     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

    71     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

    72   };

    75   // Always advances one UTF-8 character

    76   static const uint8 kAdvanceOneChar[256] = {

    77     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

    78     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

    79     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

    80     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

    82     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

    83     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

    84     2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,

    85     3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4,

    86   };

    88   // Advances *only* on space (or illegal byte)

    89   static const uint8 kAdvanceOneCharSpace[256] = {

    90     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

    91     1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

    92     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

    93     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

    95     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

    96     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

    97     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

    98     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

    99   };

   102 // Routines to access a hash table of <key:wordhash, value:probs> pairs

   103 // Buckets have 4-byte wordhash for sizes < 32K buckets, but only

   104 // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as

   105 // bucket subscript.

   106 // Probs is a packed: three languages plus a subscript for probability table

   107 // Buckets have all the keys together, then all the values.Key array never

   108 // crosses a cache-line boundary, so no-match case takes exactly one cache miss.

   109 // Match case may sometimes take an additional cache miss on value access.

   110 //

   111 // Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64

   112 // byte buckets with single cache miss.

   113 // Or 2-byte key and 6-byte value, allowing 5 languages instead  of three.

   114 //------------------------------------------------------------------------------

   116 //----------------------------------------------------------------------------//

   117 // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores      //

   118 //----------------------------------------------------------------------------//

   120 //----------------------------------------------------------------------------//

   121 // Scoring single groups of letters                                           //

   122 //----------------------------------------------------------------------------//

   124 // BIGRAM, QUADGRAM, OCTAGRAM score one => tote

   125 // Input: 4-byte entry of 3 language numbers and one probability subscript, plus

   126 //  an accumulator tote. (language 0 means unused entry)

   127 // Output: running sums in tote updated

   128 void ProcessProbV2Tote(uint32 probs, Tote* tote) {

   129   uint8 prob123 = (probs >> 0) & 0xff;

   130   const uint8* prob123_entry = LgProb2TblEntry(prob123);

   132   uint8 top1 = (probs >> 8) & 0xff;

   133   if (top1 > 0) {tote->Add(top1, LgProb3(prob123_entry, 0));}

   134   uint8 top2 = (probs >> 16) & 0xff;

   135   if (top2 > 0) {tote->Add(top2, LgProb3(prob123_entry, 1));}

   136   uint8 top3 = (probs >> 24) & 0xff;

   137   if (top3 > 0) {tote->Add(top3, LgProb3(prob123_entry, 2));}

   138 }

   140 // Return score for a particular per-script language, or zero

   141 int GetLangScore(uint32 probs, uint8 pslang) {

   142   uint8 prob123 = (probs >> 0) & 0xff;

   143   const uint8* prob123_entry = LgProb2TblEntry(prob123);

   144   int retval = 0;

   145   uint8 top1 = (probs >> 8) & 0xff;

   146   if (top1 == pslang) {retval += LgProb3(prob123_entry, 0);}

   147   uint8 top2 = (probs >> 16) & 0xff;

   148   if (top2 == pslang) {retval += LgProb3(prob123_entry, 1);}

   149   uint8 top3 = (probs >> 24) & 0xff;

   150   if (top3 == pslang) {retval += LgProb3(prob123_entry, 2);}

   151   return retval;

   152 }

   154 //----------------------------------------------------------------------------//

   155 // Routines to accumulate probabilities                                       //

   156 //----------------------------------------------------------------------------//

   159 // BIGRAM, using hash table, always advancing by 1 char

   160 // Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj

   161 // Score all bigrams in isrc, using languages that have bigrams (CJK)

   162 // Return number of bigrams that hit in the hash table

   163 int DoBigramScoreV3(const CLD2TableSummary* bigram_obj,

   164                          const char* isrc, int srclen, Tote* chunk_tote) {

   165   int hit_count = 0;

   166   const char* src = isrc;

   168   // Hashtable-based CJK bigram lookup

   169   const uint8* usrc = reinterpret_cast<const uint8*>(src);

   170   const uint8* usrclimit1 = usrc + srclen - UTFmax;

   172   while (usrc < usrclimit1) {

   173     int len = kAdvanceOneChar[usrc[0]];

   174     int len2 = kAdvanceOneChar[usrc[len]] + len;

   176     if ((kMinCJKUTF8CharBytes * 2) <= len2) {      // Two CJK chars possible

   177       // Lookup and score this bigram

   178       // Always ignore pre/post spaces

   179       uint32 bihash = BiHashV2(reinterpret_cast<const char*>(usrc), len2);

   180       uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash);

   181       // Now go indirect on the subscript

   182       probs = bigram_obj->kCLDTableInd[probs &

   183         ~bigram_obj->kCLDTableKeyMask];

   185       // Process the bigram

   186       if (probs != 0) {

   187         ProcessProbV2Tote(probs, chunk_tote);

   188         ++hit_count;

   189       }

   190     }

   191     usrc += len;  // Advance by one char

   192   }

   194   return hit_count;

   195 }

   198 // Score up to 64KB of a single script span in one pass

   199 // Make a dummy entry off the end to calc length of last span

   200 // Return offset of first unused input byte

   201 int GetUniHits(const char* text,

   202                      int letter_offset, int letter_limit,

   203                      ScoringContext* scoringcontext,

   204                      ScoringHitBuffer* hitbuffer) {

   205   const char* isrc = &text[letter_offset];

   206   const char* src = isrc;

   207   // Limit is end, which has extra 20 20 20 00 past len

   208   const char* srclimit = &text[letter_limit];

   210   // Local copies

   211   const UTF8PropObj* unigram_obj =

   212     scoringcontext->scoringtables->unigram_obj;

   213   int next_base = hitbuffer->next_base;

   214   int next_base_limit = hitbuffer->maxscoringhits;

   216   // Visit all unigrams

   217   if (src[0] == ' ') {++src;}   // skip any initial space

   218   while (src < srclimit) {

   219     const uint8* usrc = reinterpret_cast<const uint8*>(src);

   220     int len = kAdvanceOneChar[usrc[0]];

   221     src += len;

   222     // Look up property of one UTF-8 character and advance over it.

   223     // Updates usrc and len (bad interface design), hence increment above

   224     int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &len);

   225     if (propval > 0) {

   226       // Save indirect subscript for later scoring; 1 or 2 langprobs

   227       int indirect_subscr = propval;

   228       hitbuffer->base[next_base].offset = src - text;     // Offset in text

   229       hitbuffer->base[next_base].indirect = indirect_subscr;

   230       ++next_base;

   231     }

   233     if (next_base >= next_base_limit) {break;}

   234   }

   236   hitbuffer->next_base = next_base;

   238   // Make a dummy entry off the end to calc length of last span

   239   int dummy_offset = src - text;

   240   hitbuffer->base[hitbuffer->next_base].offset = dummy_offset;

   241   hitbuffer->base[hitbuffer->next_base].indirect = 0;

   243   return src - text;

   244 }

   246 // Score up to 64KB of a single script span, doing both delta-bi and

   247 // distinct bis in one pass

   248 void GetBiHits(const char* text,

   249                      int letter_offset, int letter_limit,

   250                      ScoringContext* scoringcontext,

   251                      ScoringHitBuffer* hitbuffer) {

   252   const char* isrc = &text[letter_offset];

   253   const char* src = isrc;

   254   // Limit is end

   255   const char* srclimit1 = &text[letter_limit];

   257   // Local copies

   258   const CLD2TableSummary* deltabi_obj =

   259     scoringcontext->scoringtables->deltabi_obj;

   260   const CLD2TableSummary* distinctbi_obj =

   261     scoringcontext->scoringtables->distinctbi_obj;

   262   int next_delta = hitbuffer->next_delta;

   263   int next_delta_limit = hitbuffer->maxscoringhits;

   264   int next_distinct = hitbuffer->next_distinct;

   265   // We can do 2 inserts per loop, so -1

   266   int next_distinct_limit = hitbuffer->maxscoringhits - 1;

   268   while (src < srclimit1) {

   269     const uint8* usrc = reinterpret_cast<const uint8*>(src);

   270     int len = kAdvanceOneChar[usrc[0]];

   271     int len2 = kAdvanceOneChar[usrc[len]] + len;

   273     if ((kMinCJKUTF8CharBytes * 2) <= len2) {      // Two CJK chars possible

   274       // Lookup and this bigram and save <offset, indirect>

   275       uint32 bihash = BiHashV2(src, len2);

   276       uint32 probs = QuadHashV3Lookup4(deltabi_obj, bihash);

   277       // Now go indirect on the subscript

   278       if (probs != 0) {

   279         // Save indirect subscript for later scoring; 1 langprob

   280         int indirect_subscr = probs & ~deltabi_obj->kCLDTableKeyMask;

   281         hitbuffer->delta[next_delta].offset = src - text;

   282         hitbuffer->delta[next_delta].indirect = indirect_subscr;

   283         ++next_delta;

   284       }

   285       // Lookup this distinct bigram and save <offset, indirect>

   286       probs = QuadHashV3Lookup4(distinctbi_obj, bihash);

   287       if (probs != 0) {

   288         int indirect_subscr = probs & ~distinctbi_obj->kCLDTableKeyMask;

   289         hitbuffer->distinct[next_distinct].offset = src - text;

   290         hitbuffer->distinct[next_distinct].indirect = indirect_subscr;

   291         ++next_distinct;

   292       }

   293     }

   294     src += len;  // Advance by one char (not two)

   296     // Almost always srclimit hit first

   297     if (next_delta >= next_delta_limit) {break;}

   298     if (next_distinct >= next_distinct_limit) {break;}

   299   }

   301   hitbuffer->next_delta = next_delta;

   302   hitbuffer->next_distinct = next_distinct;

   304   // Make a dummy entry off the end to calc length of last span

   305   int dummy_offset = src - text;

   306   hitbuffer->delta[hitbuffer->next_delta].offset = dummy_offset;

   307   hitbuffer->delta[hitbuffer->next_delta].indirect = 0;

   308   hitbuffer->distinct[hitbuffer->next_distinct].offset = dummy_offset;

   309   hitbuffer->distinct[hitbuffer->next_distinct].indirect = 0;

   310 }

   312 // Score up to 64KB of a single script span in one pass

   313 // Make a dummy entry off the end to calc length of last span

   314 // Return offset of first unused input byte

   315 int GetQuadHits(const char* text,

   316                      int letter_offset, int letter_limit,

   317                      ScoringContext* scoringcontext,

   318                      ScoringHitBuffer* hitbuffer) {

   319   const char* isrc = &text[letter_offset];

   320   const char* src = isrc;

   321   // Limit is end, which has extra 20 20 20 00 past len

   322   const char* srclimit = &text[letter_limit];

   324   // Local copies

   325   const CLD2TableSummary* quadgram_obj =

   326     scoringcontext->scoringtables->quadgram_obj;

   327   const CLD2TableSummary* quadgram_obj2 =

   328     scoringcontext->scoringtables->quadgram_obj2;

   329   int next_base = hitbuffer->next_base;

   330   int next_base_limit = hitbuffer->maxscoringhits;

   332   // Run a little cache of last quad hits to catch overly-repetitive "text"

   333   // We don't care if we miss a couple repetitions at scriptspan boundaries

   334   int next_prior_quadhash = 0;

   335   uint32 prior_quadhash[2] = {0, 0};

   337   // Visit all quadgrams

   338   if (src[0] == ' ') {++src;}   // skip any initial space

   339   while (src < srclimit) {

   340     // Find one quadgram

   341     const char* src_end = src;

   342     src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];

   343     src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];

   344     const char* src_mid = src_end;

   345     src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];

   346     src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];

   347     int len = src_end - src;

   348     // Hash the quadgram

   349     uint32 quadhash = QuadHashV2(src, len);

   351     // Filter out recent repeats

   352     if ((quadhash != prior_quadhash[0]) && (quadhash != prior_quadhash[1])) {

   353       // Look up this quadgram and save <offset, indirect>

   354       uint32 indirect_flag = 0;   // For dual tables

   355       const CLD2TableSummary* hit_obj = quadgram_obj;

   356       uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash);

   357       if ((probs == 0) && (quadgram_obj2->kCLDTableSize != 0)) {

   358         // Try lookup in dual table if not found in first one

   359         // Note: we need to know later which of two indirect tables to use.

   360         indirect_flag = 0x80000000u;

   361         hit_obj = quadgram_obj2;

   362         probs = QuadHashV3Lookup4(quadgram_obj2, quadhash);

   363       }

   364       if (probs != 0) {

   365         // Round-robin two entries of actual hits

   366         prior_quadhash[next_prior_quadhash] = quadhash;

   367         next_prior_quadhash = (next_prior_quadhash + 1) & 1;

   369         // Save indirect subscript for later scoring; 1 or 2 langprobs

   370         int indirect_subscr = probs & ~hit_obj->kCLDTableKeyMask;

   371         hitbuffer->base[next_base].offset = src - text;     // Offset in text

   372         // Flip the high bit for table2

   373         hitbuffer->base[next_base].indirect = indirect_subscr | indirect_flag;

   374         ++next_base;

   375       }

   376     }

   378     // Advance: all the way past word if at end-of-word, else 2 chars

   379     if (src_end[0] == ' ') {

   380       src = src_end;

   381     } else {

   382       src = src_mid;

   383     }

   385     // Skip over space at end of word, or ASCII vowel in middle of word

   386     // Use kAdvanceOneCharSpace instead to get rid of vowel hack

   387     if (src < srclimit) {

   388       src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];

   389     } else {

   390       // Advancing by 4/8/16 can overshoot, but we are about to exit anyway

   391       src = srclimit;

   392     }

   394     if (next_base >= next_base_limit) {break;}

   395   }

   397   hitbuffer->next_base = next_base;

   399   // Make a dummy entry off the end to calc length of last span

   400   int dummy_offset = src - text;

   401   hitbuffer->base[hitbuffer->next_base].offset = dummy_offset;

   402   hitbuffer->base[hitbuffer->next_base].indirect = 0;

   404   return src - text;

   405 }

   407 // inputs:

   408 //  const tables

   409 //  const char* isrc, int srclen (in sscriptbuffer)

   410 // intermediates:

   411 //  vector of octa <offset, probs>   (which need indirect table to decode)

   412 //  vector of distinct <offset, probs>   (which need indirect table to decode)

   414 // Score up to 64KB of a single script span, doing both delta-octa and

   415 // distinct words in one pass

   416 void GetOctaHits(const char* text,

   417                      int letter_offset, int letter_limit,

   418                      ScoringContext* scoringcontext,

   419                      ScoringHitBuffer* hitbuffer) {

   420   const char* isrc = &text[letter_offset];

   421   const char* src = isrc;

   422   // Limit is end+1, to include extra space char (0x20) off the end

   423   const char* srclimit = &text[letter_limit + 1];

   425   // Local copies

   426   const CLD2TableSummary* deltaocta_obj =

   427     scoringcontext->scoringtables->deltaocta_obj;

   428   int next_delta = hitbuffer->next_delta;

   429   int next_delta_limit = hitbuffer->maxscoringhits;

   431   const CLD2TableSummary* distinctocta_obj =

   432     scoringcontext->scoringtables->distinctocta_obj;

   433   int next_distinct = hitbuffer->next_distinct;

   434   // We can do 2 inserts per loop, so -1

   435   int next_distinct_limit = hitbuffer->maxscoringhits - 1;

   437   // Run a little cache of last octa hits to catch overly-repetitive "text"

   438   // We don't care if we miss a couple repetitions at scriptspan boundaries

   439   int next_prior_octahash = 0;

   440   uint64 prior_octahash[2] = {0, 0};

   442   // Score all words truncated to 8 characters

   443   int charcount = 0;

   444   // Skip any initial space

   445   if (src[0] == ' ') {++src;}

   447   // Begin the first word

   448   const char* prior_word_start = src;

   449   const char* word_start = src;

   450   const char* word_end = word_start;

   451   while (src < srclimit) {

   452     // Terminate previous word or continue current word

   453     if (src[0] == ' ') {

   454       int len = word_end - word_start;

   455       // Hash the word

   456       uint64 wordhash40 = OctaHash40(word_start, len);

   457       uint32 probs;

   459       // Filter out recent repeats. Unlike quads, we update even if no hit,

   460       // so we can get hits on same word if separated by non-hit words

   461       if ((wordhash40 != prior_octahash[0]) &&

   462           (wordhash40 != prior_octahash[1])) {

   463         // Round-robin two entries of words

   464         prior_octahash[next_prior_octahash] = wordhash40;

   465         next_prior_octahash = 1 - next_prior_octahash;    // Alternates 0,1,0,1

   467         // (1) Lookup distinct word PAIR. For a pair, we want an asymmetrical

   468         // function of the two word hashs. For words A B C, B-A and C-B are good

   469         // enough and fast. We use the same table as distinct single words

   470         // Do not look up a pair of identical words -- all pairs hash to zero

   471         // Both 1- and 2-word distinct lookups are in distinctocta_obj now

   472         // Do this first, because it has the lowest offset

   473         uint64 tmp_prior_hash = prior_octahash[next_prior_octahash];

   474         if ((tmp_prior_hash != 0) && (tmp_prior_hash != wordhash40)) {

   475           uint64 pair_hash = PairHash(tmp_prior_hash, wordhash40);

   476           probs = OctaHashV3Lookup4(distinctocta_obj, pair_hash);

   477           if (probs != 0) {

   478             int indirect_subscr = probs & ~distinctocta_obj->kCLDTableKeyMask;

   479             hitbuffer->distinct[next_distinct].offset = prior_word_start - text;

   480             hitbuffer->distinct[next_distinct].indirect = indirect_subscr;

   481             ++next_distinct;

   482           }

   483         }

   485         // (2) Lookup this distinct word and save <offset, indirect>

   486         probs = OctaHashV3Lookup4(distinctocta_obj, wordhash40);

   487         if (probs != 0) {

   488           int indirect_subscr = probs & ~distinctocta_obj->kCLDTableKeyMask;

   489           hitbuffer->distinct[next_distinct].offset = word_start - text;

   490           hitbuffer->distinct[next_distinct].indirect = indirect_subscr;

   491           ++next_distinct;

   492         }

   494         // (3) Lookup this word and save <offset, indirect>

   495         probs = OctaHashV3Lookup4(deltaocta_obj, wordhash40);

   496         if (probs != 0) {

   497           // Save indirect subscript for later scoring; 1 langprob

   498           int indirect_subscr = probs & ~deltaocta_obj->kCLDTableKeyMask;

   499           hitbuffer->delta[next_delta].offset = word_start - text;

   500           hitbuffer->delta[next_delta].indirect = indirect_subscr;

   501           ++next_delta;

   502         }

   503       }

   505       // Begin the next word

   506       charcount = 0;

   507       prior_word_start = word_start;

   508       word_start = src + 1;   // Over the space

   509       word_end = word_start;

   510     } else {

   511       ++charcount;

   512     }

   514     // Advance to next char

   515     src += UTF8OneCharLen(src);

   516     if (charcount <= 8) {

   517       word_end = src;

   518     }

   519     // Almost always srclimit hit first

   520     if (next_delta >= next_delta_limit) {break;}

   521     if (next_distinct >= next_distinct_limit) {break;}

   522   }

   524   hitbuffer->next_delta = next_delta;

   525   hitbuffer->next_distinct = next_distinct;

   527   // Make a dummy entry off the end to calc length of last span

   528   int dummy_offset = src - text;

   529   hitbuffer->delta[hitbuffer->next_delta].offset = dummy_offset;

   530   hitbuffer->delta[hitbuffer->next_delta].indirect = 0;

   531   hitbuffer->distinct[hitbuffer->next_distinct].offset = dummy_offset;

   532   hitbuffer->distinct[hitbuffer->next_distinct].indirect = 0;

   533 }

   536 //----------------------------------------------------------------------------//

   537 // Reliability calculations, for single language and between languages        //

   538 //----------------------------------------------------------------------------//

   540 // Return reliablity of result 0..100 for top two scores

   541 // delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable

   542 // (on a scale where +1 is a factor of  2 ** 1.6 = 3.02)

   543 // Threshold is uni/quadgram increment count, bounded above and below.

   544 //

   545 // Requiring a factor of 3 improvement (e.g. +1 log base 3)

   546 // for each scored quadgram is too stringent, so I've backed this off to a

   547 // factor of 2 (e.g. +5/8 log base 3).

   548 //

   549 // I also somewhat lowered the Min/MaxGramCount limits above

   550 //

   551 // Added: if fewer than 8 quads/unis, max reliability is 12*n percent

   552 //

   553 int ReliabilityDelta(int value1, int value2, int gramcount) {

   554   int max_reliability_percent = 100;

   555   if (gramcount < 8) {

   556     max_reliability_percent = 12 * gramcount;

   557   }

   558   int fully_reliable_thresh = (gramcount * 5) >> 3;     // see note above

   559   if (fully_reliable_thresh < kMinGramCount) {          // Fully = 3..16

   560     fully_reliable_thresh = kMinGramCount;

   561   } else if (fully_reliable_thresh > kMaxGramCount) {

   562     fully_reliable_thresh = kMaxGramCount;

   563   }

   565   int delta = value1 - value2;

   566   if (delta >= fully_reliable_thresh) {return max_reliability_percent;}

   567   if (delta <= 0) {return 0;}

   568   return minint(max_reliability_percent,

   569                      (100 * delta) / fully_reliable_thresh);

   570 }

   572 // Return reliablity of result 0..100 for top score vs. expected mainsteam score

   573 // Values are score per 1024 bytes of input

   574 // ratio = max(top/mainstream, mainstream/top)

   575 // ratio > 4.0 is 0% reliable, <= 2.0 is 100% reliable

   576 // Change: short-text word scoring can give unusually good results.

   577 //  Let top exceed mainstream by 4x at 50% reliable

   578 //

   579 // dsites April 2010: These could be tightened up. It would be

   580 // reasonable with newer data and round-robin table allocation to start ramping

   581 // down at mean * 1.5 and mean/1.5, while letting mean*2 and mean/2 pass,

   582 // but just barely.

   583 //

   584 // dsites March 2013: Tightened up a bit.

   585 static const double kRatio100 = 1.5;

   586 static const double kRatio0 = 4.0;

   587 int ReliabilityExpected(int actual_score_1kb, int expected_score_1kb) {

   588   if (expected_score_1kb == 0) {return 100;}    // No reliability data available yet

   589   if (actual_score_1kb == 0) {return 0;}        // zero score = unreliable

   590   double ratio;

   591   if (expected_score_1kb > actual_score_1kb) {

   592     ratio = (1.0 * expected_score_1kb) / actual_score_1kb;

   593   } else {

   594     ratio = (1.0 * actual_score_1kb) / expected_score_1kb;

   595   }

   596   // Ratio 1.0 .. 1.5 scores 100%

   597   // Ratio 2.0 scores 80%

   598   // Linear decline, to ratio 4.0 scores 0%

   599   if (ratio <= kRatio100) {return 100;}

   600   if (ratio > kRatio0) {return 0;}

   602   int percent_good = 100.0 * (kRatio0 - ratio) / (kRatio0 - kRatio100);

   603   return percent_good;

   604 }

   606 // Create a langprob packed value from its parts.

   607 // qprob is quantized [0..12]

   608 // We use Latn script to represent any RTypeMany language

   609 uint32 MakeLangProb(Language lang, int qprob) {

   610   uint32 pslang = PerScriptNumber(ULScript_Latin, lang);

   611   uint32 retval = (pslang << 8) | kLgProbV2TblBackmap[qprob];

   612   return retval;

   613 }

   615 }       // End namespace CLD2

The Tor Browser / file revision

browser/components/translation/cld2/internal/cldutil.cc@6474c204b198

browser/components/translation/cld2/internal/cldutil.cc