The Tor Browser: browser/components/translation/cld2/internal/cldutil

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 // Copyright 2013 Google Inc. All Rights Reserved.

     2 //

     3 // Licensed under the Apache License, Version 2.0 (the "License");

     4 // you may not use this file except in compliance with the License.

     5 // You may obtain a copy of the License at

     6 //

     7 //     http://www.apache.org/licenses/LICENSE-2.0

     8 //

     9 // Unless required by applicable law or agreed to in writing, software

    10 // distributed under the License is distributed on an "AS IS" BASIS,

    11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

    12 // See the License for the specific language governing permissions and

    13 // limitations under the License.

    15 //

    16 // Author: dsites@google.com (Dick Sites)

    17 //

    19 #include "cldutil_shared.h"

    20 #include <string>

    22 #include "cld2tablesummary.h"

    23 #include "integral_types.h"

    24 #include "port.h"

    25 #include "utf8statetable.h"

    27 namespace CLD2 {

    29 // Runtime routines for hashing, looking up, and scoring

    30 // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.

    31 // Unigrams and bigrams are for CJK languages only, including simplified/

    32 // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and

    33 // Zhuang Han characters. Surrounding spaces are not considered.

    34 // Quadgrams and octagrams for for non-CJK and include two bits indicating

    35 // preceding and trailing spaces (word boundaries).

    38 // Indicator bits for leading/trailing space around quad/octagram

    39 // NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of

    40 // 1-, 2-, or 3-bytes each.

    41 static const uint32 kPreSpaceIndicator =  0x00004444;

    42 static const uint32 kPostSpaceIndicator = 0x44440000;

    44 // Little-endian masks for 0..24 bytes picked up as uint32's

    45 static const uint32 kWordMask0[4] = {

    46   0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF

    47 };

    49 static const int kMinCJKUTF8CharBytes = 3;

    51 static const int kMinGramCount = 3;

    52 static const int kMaxGramCount = 16;

    54 static const int UTFmax = 4;        // Max number of bytes in a UTF-8 character

    57 // Routines to access a hash table of <key:wordhash, value:probs> pairs

    58 // Buckets have 4-byte wordhash for sizes < 32K buckets, but only

    59 // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as

    60 // bucket subscript.

    61 // Probs is a packed: three languages plus a subscript for probability table

    62 // Buckets have all the keys together, then all the values.Key array never

    63 // crosses a cache-line boundary, so no-match case takes exactly one cache miss.

    64 // Match case may sometimes take an additional cache miss on value access.

    65 //

    66 // Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64

    67 // byte buckets with single cache miss.

    68 // Or 2-byte key and 6-byte value, allowing 5 languages instead  of three.

    71 //----------------------------------------------------------------------------//

    72 // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores      //

    73 //----------------------------------------------------------------------------//

    75 // Design principles for these hash functions

    76 // - Few operations

    77 // - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in

    78 //   Latin script expect 1- and 2-byte mixtures.

    79 // - Last byte of each character has about 5 bits of information

    80 // - Spread good bits around so they can interact in at least two ways

    81 //   with other characters

    82 // - Use add for additional mixing thorugh carries

    84 // CJK Three-byte bigram

    85 //   ....dddd..cccccc..bbbbbb....aaaa

    86 //   ..................ffffff..eeeeee

    87 // make

    88 //   ....dddd..cccccc..bbbbbb....aaaa

    89 //   000....dddd..cccccc..bbbbbb....a

    90 //   ..................ffffff..eeeeee

    91 //   ffffff..eeeeee000000000000000000

    92 //

    93 // CJK Four-byte bigram

    94 //   ..dddddd..cccccc....bbbb....aaaa

    95 //   ..hhhhhh..gggggg....ffff....eeee

    96 // make

    97 //   ..dddddd..cccccc....bbbb....aaaa

    98 //   000..dddddd..cccccc....bbbb....a

    99 //   ..hhhhhh..gggggg....ffff....eeee

   100 //   ..ffff....eeee000000000000000000

   102 // BIGRAM

   103 // Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post

   104 // OVERSHOOTS up to 3 bytes

   105 // For runtime use of tables

   106 // Does X86 unaligned loads

   107 uint32 BiHashV2(const char* word_ptr, int bytecount) {

   108   if (bytecount == 0) {return 0;}

   109   const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);

   110   uint32 word0, word1;

   111   if (bytecount <= 4) {

   112     word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3];

   113     word0 = word0 ^ (word0 >> 3);

   114     return word0;

   115   }

   116   // Else do 8 bytes

   117   word0 = UNALIGNED_LOAD32(word_ptr32);

   118   word0 = word0 ^ (word0 >> 3);

   119   word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3];

   120   word1 = word1 ^ (word1 << 18);

   121   return word0 + word1;

   122 }

   124 //

   125 // Ascii-7 One-byte chars

   126 //   ...ddddd...ccccc...bbbbb...aaaaa

   127 // make

   128 //   ...ddddd...ccccc...bbbbb...aaaaa

   129 //   000...ddddd...ccccc...bbbbb...aa

   130 //

   131 // Latin 1- and 2-byte chars

   132 //   ...ddddd...ccccc...bbbbb...aaaaa

   133 //   ...................fffff...eeeee

   134 // make

   135 //   ...ddddd...ccccc...bbbbb...aaaaa

   136 //   000...ddddd...ccccc...bbbbb...aa

   137 //   ...................fffff...eeeee

   138 //   ...............fffff...eeeee0000

   139 //

   140 // Non-CJK Two-byte chars

   141 //   ...ddddd...........bbbbb........

   142 //   ...hhhhh...........fffff........

   143 // make

   144 //   ...ddddd...........bbbbb........

   145 //   000...ddddd...........bbbbb.....

   146 //   ...hhhhh...........fffff........

   147 //   hhhh...........fffff........0000

   148 //

   149 // Non-CJK Three-byte chars

   150 //   ...........ccccc................

   151 //   ...................fffff........

   152 //   ...lllll...................iiiii

   153 // make

   154 //   ...........ccccc................

   155 //   000...........ccccc.............

   156 //   ...................fffff........

   157 //   ...............fffff........0000

   158 //   ...lllll...................iiiii

   159 //   .lllll...................iiiii00

   160 //

   162 // QUADGRAM

   163 // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add

   164 // OVERSHOOTS up to 3 bytes

   165 // For runtime use of tables

   166 // Does X86 unaligned loads

   167 uint32 QuadHashV2Mix(const char* word_ptr, int bytecount, uint32 prepost) {

   168   const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);

   169   uint32 word0, word1, word2;

   170   if (bytecount <= 4) {

   171     word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3];

   172     word0 = word0 ^ (word0 >> 3);

   173     return word0 ^ prepost;

   174   } else if (bytecount <= 8) {

   175     word0 = UNALIGNED_LOAD32(word_ptr32);

   176     word0 = word0 ^ (word0 >> 3);

   177     word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3];

   178     word1 = word1 ^ (word1 << 4);

   179     return (word0 ^ prepost) + word1;

   180   }

   181   // else do 12 bytes

   182   word0 = UNALIGNED_LOAD32(word_ptr32);

   183   word0 = word0 ^ (word0 >> 3);

   184   word1 = UNALIGNED_LOAD32(word_ptr32 + 1);

   185   word1 = word1 ^ (word1 << 4);

   186   word2 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3];

   187   word2 = word2 ^ (word2 << 2);

   188   return (word0 ^ prepost) + word1 + word2;

   189 }

   192 // QUADGRAM wrapper with surrounding spaces

   193 // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add

   194 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes

   195 // For runtime use of tables

   196 uint32 QuadHashV2(const char* word_ptr, int bytecount) {

   197   if (bytecount == 0) {return 0;}

   198   uint32 prepost = 0;

   199   if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}

   200   if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}

   201   return QuadHashV2Mix(word_ptr, bytecount, prepost);

   202 }

   204 // QUADGRAM wrapper with surrounding underscores (offline use)

   205 // Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add

   206 // OVERSHOOTS up to 3 bytes

   207 // For offline construction of tables

   208 uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount) {

   209   if (bytecount == 0) {return 0;}

   210   const char* local_word_ptr = word_ptr;

   211   int local_bytecount = bytecount;

   212   uint32 prepost = 0;

   213   if (local_word_ptr[0] == '_') {

   214     prepost |= kPreSpaceIndicator;

   215     ++local_word_ptr;

   216     --local_bytecount;

   217   }

   218   if (local_word_ptr[local_bytecount - 1] == '_') {

   219     prepost |= kPostSpaceIndicator;

   220     --local_bytecount;

   221   }

   222   return QuadHashV2Mix(local_word_ptr, local_bytecount, prepost);

   223 }

   226 // OCTAGRAM

   227 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add

   228 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes

   229 //

   230 // The low 32 bits follow the pattern from above, tuned to different scripts

   231 // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each

   232 // For runtime use of tables V3

   233 // Does X86 unaligned loads

   234 uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) {

   235   const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);

   236   uint64 word0;

   237   uint64 word1;

   238   uint64 sum;

   240   if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}

   241   if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}

   242   switch ((bytecount - 1) >> 2) {

   243   case 0:       // 1..4 bytes

   244     word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3];

   245     sum = word0;

   246     word0 = word0 ^ (word0 >> 3);

   247     break;

   248   case 1:       // 5..8 bytes

   249     word0 = UNALIGNED_LOAD32(word_ptr32);

   250     sum = word0;

   251     word0 = word0 ^ (word0 >> 3);

   252     word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3];

   253     sum += word1;

   254     word1 = word1 ^ (word1 << 4);

   255     word0 += word1;

   256     break;

   257   case 2:       // 9..12 bytes

   258     word0 = UNALIGNED_LOAD32(word_ptr32);

   259     sum = word0;

   260     word0 = word0 ^ (word0 >> 3);

   261     word1 = UNALIGNED_LOAD32(word_ptr32 + 1);

   262     sum += word1;

   263     word1 = word1 ^ (word1 << 4);

   264     word0 += word1;

   265     word1 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3];

   266     sum += word1;

   267     word1 = word1 ^ (word1 << 2);

   268     word0 += word1;

   269     break;

   270   case 3:       // 13..16 bytes

   271     word0 =UNALIGNED_LOAD32(word_ptr32);

   272     sum = word0;

   273     word0 = word0 ^ (word0 >> 3);

   274     word1 = UNALIGNED_LOAD32(word_ptr32 + 1);

   275     sum += word1;

   276     word1 = word1 ^ (word1 << 4);

   277     word0 += word1;

   278     word1 = UNALIGNED_LOAD32(word_ptr32 + 2);

   279     sum += word1;

   280     word1 = word1 ^ (word1 << 2);

   281     word0 += word1;

   282     word1 = UNALIGNED_LOAD32(word_ptr32 + 3) & kWordMask0[bytecount & 3];

   283     sum += word1;

   284     word1 = word1 ^ (word1 >> 8);

   285     word0 += word1;

   286     break;

   287   case 4:       // 17..20 bytes

   288     word0 = UNALIGNED_LOAD32(word_ptr32);

   289     sum = word0;

   290     word0 = word0 ^ (word0 >> 3);

   291     word1 = UNALIGNED_LOAD32(word_ptr32 + 1);

   292     sum += word1;

   293     word1 = word1 ^ (word1 << 4);

   294     word0 += word1;

   295     word1 = UNALIGNED_LOAD32(word_ptr32 + 2);

   296     sum += word1;

   297     word1 = word1 ^ (word1 << 2);

   298     word0 += word1;

   299     word1 = UNALIGNED_LOAD32(word_ptr32 + 3);

   300     sum += word1;

   301     word1 = word1 ^ (word1 >> 8);

   302     word0 += word1;

   303     word1 = UNALIGNED_LOAD32(word_ptr32 + 4) & kWordMask0[bytecount & 3];

   304     sum += word1;

   305     word1 = word1 ^ (word1 >> 4);

   306     word0 += word1;

   307     break;

   308   default:      // 21..24 bytes and higher (ignores beyond 24)

   309     word0 = UNALIGNED_LOAD32(word_ptr32);

   310     sum = word0;

   311     word0 = word0 ^ (word0 >> 3);

   312     word1 = UNALIGNED_LOAD32(word_ptr32 + 1);

   313     sum += word1;

   314     word1 = word1 ^ (word1 << 4);

   315     word0 += word1;

   316     word1 = UNALIGNED_LOAD32(word_ptr32 + 2);

   317     sum += word1;

   318     word1 = word1 ^ (word1 << 2);

   319     word0 += word1;

   320     word1 = UNALIGNED_LOAD32(word_ptr32 + 3);

   321     sum += word1;

   322     word1 = word1 ^ (word1 >> 8);

   323     word0 += word1;

   324     word1 = UNALIGNED_LOAD32(word_ptr32 + 4);

   325     sum += word1;

   326     word1 = word1 ^ (word1 >> 4);

   327     word0 += word1;

   328     word1 = UNALIGNED_LOAD32(word_ptr32 + 5) & kWordMask0[bytecount & 3];

   329     sum += word1;

   330     word1 = word1 ^ (word1 >> 6);

   331     word0 += word1;

   332     break;

   333   }

   335   sum += (sum >> 17);             // extra 1-bit shift for bytes 2 & 3

   336   sum += (sum >> 9);              // extra 1-bit shift for bytes 1 & 3

   337   sum = (sum & 0xff) << 32;

   338   return (word0 ^ prepost) + sum;

   339 }

   341 // OCTAGRAM wrapper with surrounding spaces

   342 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add

   343 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes

   344 //

   345 // The low 32 bits follow the pattern from above, tuned to different scripts

   346 // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each

   347 // For runtime use of tables V3

   348 uint64 OctaHash40(const char* word_ptr, int bytecount) {

   349   if (bytecount == 0) {return 0;}

   350   uint64 prepost = 0;

   351   if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}

   352   if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}

   353   return OctaHash40Mix(word_ptr, bytecount, prepost);

   354 }

   357 // OCTAGRAM wrapper with surrounding underscores (offline use)

   358 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add

   359 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes

   360 //

   361 // The low 32 bits follow the pattern from above, tuned to different scripts

   362 // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each

   363 // For offline construction of tables

   364 uint64 OctaHash40underscore(const char* word_ptr, int bytecount) {

   365   if (bytecount == 0) {return 0;}

   366   const char* local_word_ptr = word_ptr;

   367   int local_bytecount = bytecount;

   368   uint64 prepost = 0;

   369   if (local_word_ptr[0] == '_') {

   370     prepost |= kPreSpaceIndicator;

   371     ++local_word_ptr;

   372     --local_bytecount;

   373   }

   374   if (local_word_ptr[local_bytecount - 1] == '_') {

   375     prepost |= kPostSpaceIndicator;

   376     --local_bytecount;

   377   }

   378   return OctaHash40Mix(local_word_ptr, local_bytecount, prepost);

   379 }

   381 // Hash a consecutive pair of tokens/words A B

   382 // Old: hash is B - A, which gives too many false hits on one-char diffs

   383 // Now: rotate(A,13) + B

   384 uint64 PairHash(uint64 worda_hash, uint64 wordb_hash) {

   385    return ((worda_hash >> 13) | (worda_hash << (64 - 13))) + wordb_hash;

   386 }

   391 //----------------------------------------------------------------------------//

   392 // Finding groups of 1/2/4/8 letters                                          //

   393 //----------------------------------------------------------------------------//

   395 // src points to a letter. Find the byte length of a unigram starting there.

   396 int UniLen(const char* src) {

   397   const char* src_end = src;

   398   src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];

   399   return src_end - src;

   400 }

   402 // src points to a letter. Find the byte length of a bigram starting there.

   403 int BiLen(const char* src) {

   404   const char* src_end = src;

   405   src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];

   406   src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];

   407   return src_end - src;

   408 }

   410 // src points to a letter. Find the byte length of a quadgram starting there.

   411 int QuadLen(const char* src) {

   412   const char* src_end = src;

   413   src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];

   414   src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];

   415   src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];

   416   src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];

   417   return src_end - src;

   418 }

   420 // src points to a letter. Find the byte length of an octagram starting there.

   421 int OctaLen(const char* src) {

   422   const char* src_end = src;

   423   int charcount = 0;

   424   while (src_end[0] != ' ') {

   425     src_end += UTF8OneCharLen(src);

   426     ++charcount;

   427     if (charcount == 8) {break;}

   428   }

   429   return src_end - src;

   430 }

   432 }       // End namespace CLD2

The Tor Browser / file revision

browser/components/translation/cld2/internal/cldutil_shared.cc@6474c204b198

browser/components/translation/cld2/internal/cldutil_shared.cc