browser/components/translation/cld2/internal/cldutil_shared.cc

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 // Copyright 2013 Google Inc. All Rights Reserved.
     2 //
     3 // Licensed under the Apache License, Version 2.0 (the "License");
     4 // you may not use this file except in compliance with the License.
     5 // You may obtain a copy of the License at
     6 //
     7 //     http://www.apache.org/licenses/LICENSE-2.0
     8 //
     9 // Unless required by applicable law or agreed to in writing, software
    10 // distributed under the License is distributed on an "AS IS" BASIS,
    11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12 // See the License for the specific language governing permissions and
    13 // limitations under the License.
    15 //
    16 // Author: dsites@google.com (Dick Sites)
    17 //
    19 #include "cldutil_shared.h"
    20 #include <string>
    22 #include "cld2tablesummary.h"
    23 #include "integral_types.h"
    24 #include "port.h"
    25 #include "utf8statetable.h"
    27 namespace CLD2 {
    29 // Runtime routines for hashing, looking up, and scoring
    30 // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
    31 // Unigrams and bigrams are for CJK languages only, including simplified/
    32 // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
    33 // Zhuang Han characters. Surrounding spaces are not considered.
    34 // Quadgrams and octagrams for for non-CJK and include two bits indicating
    35 // preceding and trailing spaces (word boundaries).
    38 // Indicator bits for leading/trailing space around quad/octagram
    39 // NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of
    40 // 1-, 2-, or 3-bytes each.
    41 static const uint32 kPreSpaceIndicator =  0x00004444;
    42 static const uint32 kPostSpaceIndicator = 0x44440000;
    44 // Little-endian masks for 0..24 bytes picked up as uint32's
    45 static const uint32 kWordMask0[4] = {
    46   0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF
    47 };
    49 static const int kMinCJKUTF8CharBytes = 3;
    51 static const int kMinGramCount = 3;
    52 static const int kMaxGramCount = 16;
    54 static const int UTFmax = 4;        // Max number of bytes in a UTF-8 character
    57 // Routines to access a hash table of <key:wordhash, value:probs> pairs
    58 // Buckets have 4-byte wordhash for sizes < 32K buckets, but only
    59 // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
    60 // bucket subscript.
    61 // Probs is a packed: three languages plus a subscript for probability table
    62 // Buckets have all the keys together, then all the values.Key array never
    63 // crosses a cache-line boundary, so no-match case takes exactly one cache miss.
    64 // Match case may sometimes take an additional cache miss on value access.
    65 //
    66 // Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64
    67 // byte buckets with single cache miss.
    68 // Or 2-byte key and 6-byte value, allowing 5 languages instead  of three.
    71 //----------------------------------------------------------------------------//
    72 // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores      //
    73 //----------------------------------------------------------------------------//
    75 // Design principles for these hash functions
    76 // - Few operations
    77 // - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in
    78 //   Latin script expect 1- and 2-byte mixtures.
    79 // - Last byte of each character has about 5 bits of information
    80 // - Spread good bits around so they can interact in at least two ways
    81 //   with other characters
    82 // - Use add for additional mixing thorugh carries
    84 // CJK Three-byte bigram
    85 //   ....dddd..cccccc..bbbbbb....aaaa
    86 //   ..................ffffff..eeeeee
    87 // make
    88 //   ....dddd..cccccc..bbbbbb....aaaa
    89 //   000....dddd..cccccc..bbbbbb....a
    90 //   ..................ffffff..eeeeee
    91 //   ffffff..eeeeee000000000000000000
    92 //
    93 // CJK Four-byte bigram
    94 //   ..dddddd..cccccc....bbbb....aaaa
    95 //   ..hhhhhh..gggggg....ffff....eeee
    96 // make
    97 //   ..dddddd..cccccc....bbbb....aaaa
    98 //   000..dddddd..cccccc....bbbb....a
    99 //   ..hhhhhh..gggggg....ffff....eeee
   100 //   ..ffff....eeee000000000000000000
   102 // BIGRAM
   103 // Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
   104 // OVERSHOOTS up to 3 bytes
   105 // For runtime use of tables
   106 // Does X86 unaligned loads
   107 uint32 BiHashV2(const char* word_ptr, int bytecount) {
   108   if (bytecount == 0) {return 0;}
   109   const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
   110   uint32 word0, word1;
   111   if (bytecount <= 4) {
   112     word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3];
   113     word0 = word0 ^ (word0 >> 3);
   114     return word0;
   115   }
   116   // Else do 8 bytes
   117   word0 = UNALIGNED_LOAD32(word_ptr32);
   118   word0 = word0 ^ (word0 >> 3);
   119   word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3];
   120   word1 = word1 ^ (word1 << 18);
   121   return word0 + word1;
   122 }
   124 //
   125 // Ascii-7 One-byte chars
   126 //   ...ddddd...ccccc...bbbbb...aaaaa
   127 // make
   128 //   ...ddddd...ccccc...bbbbb...aaaaa
   129 //   000...ddddd...ccccc...bbbbb...aa
   130 //
   131 // Latin 1- and 2-byte chars
   132 //   ...ddddd...ccccc...bbbbb...aaaaa
   133 //   ...................fffff...eeeee
   134 // make
   135 //   ...ddddd...ccccc...bbbbb...aaaaa
   136 //   000...ddddd...ccccc...bbbbb...aa
   137 //   ...................fffff...eeeee
   138 //   ...............fffff...eeeee0000
   139 //
   140 // Non-CJK Two-byte chars
   141 //   ...ddddd...........bbbbb........
   142 //   ...hhhhh...........fffff........
   143 // make
   144 //   ...ddddd...........bbbbb........
   145 //   000...ddddd...........bbbbb.....
   146 //   ...hhhhh...........fffff........
   147 //   hhhh...........fffff........0000
   148 //
   149 // Non-CJK Three-byte chars
   150 //   ...........ccccc................
   151 //   ...................fffff........
   152 //   ...lllll...................iiiii
   153 // make
   154 //   ...........ccccc................
   155 //   000...........ccccc.............
   156 //   ...................fffff........
   157 //   ...............fffff........0000
   158 //   ...lllll...................iiiii
   159 //   .lllll...................iiiii00
   160 //
   162 // QUADGRAM
   163 // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
   164 // OVERSHOOTS up to 3 bytes
   165 // For runtime use of tables
   166 // Does X86 unaligned loads
   167 uint32 QuadHashV2Mix(const char* word_ptr, int bytecount, uint32 prepost) {
   168   const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
   169   uint32 word0, word1, word2;
   170   if (bytecount <= 4) {
   171     word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3];
   172     word0 = word0 ^ (word0 >> 3);
   173     return word0 ^ prepost;
   174   } else if (bytecount <= 8) {
   175     word0 = UNALIGNED_LOAD32(word_ptr32);
   176     word0 = word0 ^ (word0 >> 3);
   177     word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3];
   178     word1 = word1 ^ (word1 << 4);
   179     return (word0 ^ prepost) + word1;
   180   }
   181   // else do 12 bytes
   182   word0 = UNALIGNED_LOAD32(word_ptr32);
   183   word0 = word0 ^ (word0 >> 3);
   184   word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
   185   word1 = word1 ^ (word1 << 4);
   186   word2 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3];
   187   word2 = word2 ^ (word2 << 2);
   188   return (word0 ^ prepost) + word1 + word2;
   189 }
   192 // QUADGRAM wrapper with surrounding spaces
   193 // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
   194 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
   195 // For runtime use of tables
   196 uint32 QuadHashV2(const char* word_ptr, int bytecount) {
   197   if (bytecount == 0) {return 0;}
   198   uint32 prepost = 0;
   199   if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
   200   if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
   201   return QuadHashV2Mix(word_ptr, bytecount, prepost);
   202 }
   204 // QUADGRAM wrapper with surrounding underscores (offline use)
   205 // Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
   206 // OVERSHOOTS up to 3 bytes
   207 // For offline construction of tables
   208 uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount) {
   209   if (bytecount == 0) {return 0;}
   210   const char* local_word_ptr = word_ptr;
   211   int local_bytecount = bytecount;
   212   uint32 prepost = 0;
   213   if (local_word_ptr[0] == '_') {
   214     prepost |= kPreSpaceIndicator;
   215     ++local_word_ptr;
   216     --local_bytecount;
   217   }
   218   if (local_word_ptr[local_bytecount - 1] == '_') {
   219     prepost |= kPostSpaceIndicator;
   220     --local_bytecount;
   221   }
   222   return QuadHashV2Mix(local_word_ptr, local_bytecount, prepost);
   223 }
   226 // OCTAGRAM
   227 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
   228 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
   229 //
   230 // The low 32 bits follow the pattern from above, tuned to different scripts
   231 // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
   232 // For runtime use of tables V3
   233 // Does X86 unaligned loads
   234 uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) {
   235   const uint32* word_ptr32 = reinterpret_cast<const uint32*>(word_ptr);
   236   uint64 word0;
   237   uint64 word1;
   238   uint64 sum;
   240   if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
   241   if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
   242   switch ((bytecount - 1) >> 2) {
   243   case 0:       // 1..4 bytes
   244     word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3];
   245     sum = word0;
   246     word0 = word0 ^ (word0 >> 3);
   247     break;
   248   case 1:       // 5..8 bytes
   249     word0 = UNALIGNED_LOAD32(word_ptr32);
   250     sum = word0;
   251     word0 = word0 ^ (word0 >> 3);
   252     word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3];
   253     sum += word1;
   254     word1 = word1 ^ (word1 << 4);
   255     word0 += word1;
   256     break;
   257   case 2:       // 9..12 bytes
   258     word0 = UNALIGNED_LOAD32(word_ptr32);
   259     sum = word0;
   260     word0 = word0 ^ (word0 >> 3);
   261     word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
   262     sum += word1;
   263     word1 = word1 ^ (word1 << 4);
   264     word0 += word1;
   265     word1 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3];
   266     sum += word1;
   267     word1 = word1 ^ (word1 << 2);
   268     word0 += word1;
   269     break;
   270   case 3:       // 13..16 bytes
   271     word0 =UNALIGNED_LOAD32(word_ptr32);
   272     sum = word0;
   273     word0 = word0 ^ (word0 >> 3);
   274     word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
   275     sum += word1;
   276     word1 = word1 ^ (word1 << 4);
   277     word0 += word1;
   278     word1 = UNALIGNED_LOAD32(word_ptr32 + 2);
   279     sum += word1;
   280     word1 = word1 ^ (word1 << 2);
   281     word0 += word1;
   282     word1 = UNALIGNED_LOAD32(word_ptr32 + 3) & kWordMask0[bytecount & 3];
   283     sum += word1;
   284     word1 = word1 ^ (word1 >> 8);
   285     word0 += word1;
   286     break;
   287   case 4:       // 17..20 bytes
   288     word0 = UNALIGNED_LOAD32(word_ptr32);
   289     sum = word0;
   290     word0 = word0 ^ (word0 >> 3);
   291     word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
   292     sum += word1;
   293     word1 = word1 ^ (word1 << 4);
   294     word0 += word1;
   295     word1 = UNALIGNED_LOAD32(word_ptr32 + 2);
   296     sum += word1;
   297     word1 = word1 ^ (word1 << 2);
   298     word0 += word1;
   299     word1 = UNALIGNED_LOAD32(word_ptr32 + 3);
   300     sum += word1;
   301     word1 = word1 ^ (word1 >> 8);
   302     word0 += word1;
   303     word1 = UNALIGNED_LOAD32(word_ptr32 + 4) & kWordMask0[bytecount & 3];
   304     sum += word1;
   305     word1 = word1 ^ (word1 >> 4);
   306     word0 += word1;
   307     break;
   308   default:      // 21..24 bytes and higher (ignores beyond 24)
   309     word0 = UNALIGNED_LOAD32(word_ptr32);
   310     sum = word0;
   311     word0 = word0 ^ (word0 >> 3);
   312     word1 = UNALIGNED_LOAD32(word_ptr32 + 1);
   313     sum += word1;
   314     word1 = word1 ^ (word1 << 4);
   315     word0 += word1;
   316     word1 = UNALIGNED_LOAD32(word_ptr32 + 2);
   317     sum += word1;
   318     word1 = word1 ^ (word1 << 2);
   319     word0 += word1;
   320     word1 = UNALIGNED_LOAD32(word_ptr32 + 3);
   321     sum += word1;
   322     word1 = word1 ^ (word1 >> 8);
   323     word0 += word1;
   324     word1 = UNALIGNED_LOAD32(word_ptr32 + 4);
   325     sum += word1;
   326     word1 = word1 ^ (word1 >> 4);
   327     word0 += word1;
   328     word1 = UNALIGNED_LOAD32(word_ptr32 + 5) & kWordMask0[bytecount & 3];
   329     sum += word1;
   330     word1 = word1 ^ (word1 >> 6);
   331     word0 += word1;
   332     break;
   333   }
   335   sum += (sum >> 17);             // extra 1-bit shift for bytes 2 & 3
   336   sum += (sum >> 9);              // extra 1-bit shift for bytes 1 & 3
   337   sum = (sum & 0xff) << 32;
   338   return (word0 ^ prepost) + sum;
   339 }
   341 // OCTAGRAM wrapper with surrounding spaces
   342 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
   343 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
   344 //
   345 // The low 32 bits follow the pattern from above, tuned to different scripts
   346 // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
   347 // For runtime use of tables V3
   348 uint64 OctaHash40(const char* word_ptr, int bytecount) {
   349   if (bytecount == 0) {return 0;}
   350   uint64 prepost = 0;
   351   if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
   352   if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
   353   return OctaHash40Mix(word_ptr, bytecount, prepost);
   354 }
   357 // OCTAGRAM wrapper with surrounding underscores (offline use)
   358 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
   359 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
   360 //
   361 // The low 32 bits follow the pattern from above, tuned to different scripts
   362 // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
   363 // For offline construction of tables
   364 uint64 OctaHash40underscore(const char* word_ptr, int bytecount) {
   365   if (bytecount == 0) {return 0;}
   366   const char* local_word_ptr = word_ptr;
   367   int local_bytecount = bytecount;
   368   uint64 prepost = 0;
   369   if (local_word_ptr[0] == '_') {
   370     prepost |= kPreSpaceIndicator;
   371     ++local_word_ptr;
   372     --local_bytecount;
   373   }
   374   if (local_word_ptr[local_bytecount - 1] == '_') {
   375     prepost |= kPostSpaceIndicator;
   376     --local_bytecount;
   377   }
   378   return OctaHash40Mix(local_word_ptr, local_bytecount, prepost);
   379 }
   381 // Hash a consecutive pair of tokens/words A B
   382 // Old: hash is B - A, which gives too many false hits on one-char diffs
   383 // Now: rotate(A,13) + B
   384 uint64 PairHash(uint64 worda_hash, uint64 wordb_hash) {
   385    return ((worda_hash >> 13) | (worda_hash << (64 - 13))) + wordb_hash;
   386 }
   391 //----------------------------------------------------------------------------//
   392 // Finding groups of 1/2/4/8 letters                                          //
   393 //----------------------------------------------------------------------------//
   395 // src points to a letter. Find the byte length of a unigram starting there.
   396 int UniLen(const char* src) {
   397   const char* src_end = src;
   398   src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
   399   return src_end - src;
   400 }
   402 // src points to a letter. Find the byte length of a bigram starting there.
   403 int BiLen(const char* src) {
   404   const char* src_end = src;
   405   src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
   406   src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
   407   return src_end - src;
   408 }
   410 // src points to a letter. Find the byte length of a quadgram starting there.
   411 int QuadLen(const char* src) {
   412   const char* src_end = src;
   413   src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
   414   src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
   415   src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
   416   src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
   417   return src_end - src;
   418 }
   420 // src points to a letter. Find the byte length of an octagram starting there.
   421 int OctaLen(const char* src) {
   422   const char* src_end = src;
   423   int charcount = 0;
   424   while (src_end[0] != ' ') {
   425     src_end += UTF8OneCharLen(src);
   426     ++charcount;
   427     if (charcount == 8) {break;}
   428   }
   429   return src_end - src;
   430 }
   432 }       // End namespace CLD2

mercurial