browser/components/translation/cld2/internal/cldutil.cc

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/browser/components/translation/cld2/internal/cldutil.cc	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,620 @@
     1.4 +// Copyright 2013 Google Inc. All Rights Reserved.
     1.5 +//
     1.6 +// Licensed under the Apache License, Version 2.0 (the "License");
     1.7 +// you may not use this file except in compliance with the License.
     1.8 +// You may obtain a copy of the License at
     1.9 +//
    1.10 +//     http://www.apache.org/licenses/LICENSE-2.0
    1.11 +//
    1.12 +// Unless required by applicable law or agreed to in writing, software
    1.13 +// distributed under the License is distributed on an "AS IS" BASIS,
    1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    1.15 +// See the License for the specific language governing permissions and
    1.16 +// limitations under the License.
    1.17 +
    1.18 +//
    1.19 +// Author: dsites@google.com (Dick Sites)
    1.20 +// Updated 2014.01 for dual table lookup
    1.21 +//
    1.22 +
    1.23 +#include "cldutil.h"
    1.24 +#include <string>
    1.25 +
    1.26 +#include "cld2tablesummary.h"
    1.27 +#include "integral_types.h"
    1.28 +#include "port.h"
    1.29 +#include "utf8statetable.h"
    1.30 +
    1.31 +namespace CLD2 {
    1.32 +
    1.33 +// Caller supplies the right tables in scoringcontext
    1.34 +
    1.35 +// Runtime routines for hashing, looking up, and scoring
    1.36 +// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
    1.37 +// Unigrams and bigrams are for CJK languages only, including simplified/
    1.38 +// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
    1.39 +// Zhuang Han characters. Surrounding spaces are not considered.
    1.40 +// Quadgrams and octagrams for for non-CJK and include two bits indicating
    1.41 +// preceding and trailing spaces (word boundaries).
    1.42 +
    1.43 +
    1.44 +static const int kMinCJKUTF8CharBytes = 3;
    1.45 +
    1.46 +static const int kMinGramCount = 3;
    1.47 +static const int kMaxGramCount = 16;
    1.48 +
    1.49 +static const int UTFmax = 4;        // Max number of bytes in a UTF-8 character
    1.50 +
    1.51 +  // 1 to skip ASCII space, vowels AEIOU aeiou and UTF-8 continuation bytes 80-BF
    1.52 +  static const uint8 kSkipSpaceVowelContinue[256] = {
    1.53 +    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    1.54 +    1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    1.55 +    0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
    1.56 +    0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
    1.57 +
    1.58 +    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1.59 +    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1.60 +    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    1.61 +    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    1.62 +  };
    1.63 +
    1.64 +  // 1 to skip ASCII space, and UTF-8 continuation bytes 80-BF
    1.65 +  static const uint8 kSkipSpaceContinue[256] = {
    1.66 +    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    1.67 +    1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    1.68 +    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    1.69 +    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    1.70 +
    1.71 +    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1.72 +    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1.73 +    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    1.74 +    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    1.75 +  };
    1.76 +
    1.77 +
    1.78 +  // Always advances one UTF-8 character
    1.79 +  static const uint8 kAdvanceOneChar[256] = {
    1.80 +    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1.81 +    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1.82 +    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1.83 +    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1.84 +
    1.85 +    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1.86 +    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1.87 +    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
    1.88 +    3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4,
    1.89 +  };
    1.90 +
    1.91 +  // Advances *only* on space (or illegal byte)
    1.92 +  static const uint8 kAdvanceOneCharSpace[256] = {
    1.93 +    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1.94 +    1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    1.95 +    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    1.96 +    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    1.97 +
    1.98 +    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1.99 +    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
   1.100 +    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
   1.101 +    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
   1.102 +  };
   1.103 +
   1.104 +
   1.105 +// Routines to access a hash table of <key:wordhash, value:probs> pairs
   1.106 +// Buckets have 4-byte wordhash for sizes < 32K buckets, but only
   1.107 +// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
   1.108 +// bucket subscript.
   1.109 +// Probs is a packed: three languages plus a subscript for probability table
   1.110 +// Buckets have all the keys together, then all the values.Key array never
   1.111 +// crosses a cache-line boundary, so no-match case takes exactly one cache miss.
   1.112 +// Match case may sometimes take an additional cache miss on value access.
   1.113 +//
   1.114 +// Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64
   1.115 +// byte buckets with single cache miss.
   1.116 +// Or 2-byte key and 6-byte value, allowing 5 languages instead  of three.
   1.117 +//------------------------------------------------------------------------------
   1.118 +
   1.119 +//----------------------------------------------------------------------------//
   1.120 +// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores      //
   1.121 +//----------------------------------------------------------------------------//
   1.122 +
   1.123 +//----------------------------------------------------------------------------//
   1.124 +// Scoring single groups of letters                                           //
   1.125 +//----------------------------------------------------------------------------//
   1.126 +
   1.127 +// BIGRAM, QUADGRAM, OCTAGRAM score one => tote
   1.128 +// Input: 4-byte entry of 3 language numbers and one probability subscript, plus
   1.129 +//  an accumulator tote. (language 0 means unused entry)
   1.130 +// Output: running sums in tote updated
   1.131 +void ProcessProbV2Tote(uint32 probs, Tote* tote) {
   1.132 +  uint8 prob123 = (probs >> 0) & 0xff;
   1.133 +  const uint8* prob123_entry = LgProb2TblEntry(prob123);
   1.134 +
   1.135 +  uint8 top1 = (probs >> 8) & 0xff;
   1.136 +  if (top1 > 0) {tote->Add(top1, LgProb3(prob123_entry, 0));}
   1.137 +  uint8 top2 = (probs >> 16) & 0xff;
   1.138 +  if (top2 > 0) {tote->Add(top2, LgProb3(prob123_entry, 1));}
   1.139 +  uint8 top3 = (probs >> 24) & 0xff;
   1.140 +  if (top3 > 0) {tote->Add(top3, LgProb3(prob123_entry, 2));}
   1.141 +}
   1.142 +
   1.143 +// Return score for a particular per-script language, or zero
   1.144 +int GetLangScore(uint32 probs, uint8 pslang) {
   1.145 +  uint8 prob123 = (probs >> 0) & 0xff;
   1.146 +  const uint8* prob123_entry = LgProb2TblEntry(prob123);
   1.147 +  int retval = 0;
   1.148 +  uint8 top1 = (probs >> 8) & 0xff;
   1.149 +  if (top1 == pslang) {retval += LgProb3(prob123_entry, 0);}
   1.150 +  uint8 top2 = (probs >> 16) & 0xff;
   1.151 +  if (top2 == pslang) {retval += LgProb3(prob123_entry, 1);}
   1.152 +  uint8 top3 = (probs >> 24) & 0xff;
   1.153 +  if (top3 == pslang) {retval += LgProb3(prob123_entry, 2);}
   1.154 +  return retval;
   1.155 +}
   1.156 +
   1.157 +//----------------------------------------------------------------------------//
   1.158 +// Routines to accumulate probabilities                                       //
   1.159 +//----------------------------------------------------------------------------//
   1.160 +
   1.161 +
   1.162 +// BIGRAM, using hash table, always advancing by 1 char
   1.163 +// Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj
   1.164 +// Score all bigrams in isrc, using languages that have bigrams (CJK)
   1.165 +// Return number of bigrams that hit in the hash table
   1.166 +int DoBigramScoreV3(const CLD2TableSummary* bigram_obj,
   1.167 +                         const char* isrc, int srclen, Tote* chunk_tote) {
   1.168 +  int hit_count = 0;
   1.169 +  const char* src = isrc;
   1.170 +
   1.171 +  // Hashtable-based CJK bigram lookup
   1.172 +  const uint8* usrc = reinterpret_cast<const uint8*>(src);
   1.173 +  const uint8* usrclimit1 = usrc + srclen - UTFmax;
   1.174 +
   1.175 +  while (usrc < usrclimit1) {
   1.176 +    int len = kAdvanceOneChar[usrc[0]];
   1.177 +    int len2 = kAdvanceOneChar[usrc[len]] + len;
   1.178 +
   1.179 +    if ((kMinCJKUTF8CharBytes * 2) <= len2) {      // Two CJK chars possible
   1.180 +      // Lookup and score this bigram
   1.181 +      // Always ignore pre/post spaces
   1.182 +      uint32 bihash = BiHashV2(reinterpret_cast<const char*>(usrc), len2);
   1.183 +      uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash);
   1.184 +      // Now go indirect on the subscript
   1.185 +      probs = bigram_obj->kCLDTableInd[probs &
   1.186 +        ~bigram_obj->kCLDTableKeyMask];
   1.187 +
   1.188 +      // Process the bigram
   1.189 +      if (probs != 0) {
   1.190 +        ProcessProbV2Tote(probs, chunk_tote);
   1.191 +        ++hit_count;
   1.192 +      }
   1.193 +    }
   1.194 +    usrc += len;  // Advance by one char
   1.195 +  }
   1.196 +
   1.197 +  return hit_count;
   1.198 +}
   1.199 +
   1.200 +
   1.201 +// Score up to 64KB of a single script span in one pass
   1.202 +// Make a dummy entry off the end to calc length of last span
   1.203 +// Return offset of first unused input byte
   1.204 +int GetUniHits(const char* text,
   1.205 +                     int letter_offset, int letter_limit,
   1.206 +                     ScoringContext* scoringcontext,
   1.207 +                     ScoringHitBuffer* hitbuffer) {
   1.208 +  const char* isrc = &text[letter_offset];
   1.209 +  const char* src = isrc;
   1.210 +  // Limit is end, which has extra 20 20 20 00 past len
   1.211 +  const char* srclimit = &text[letter_limit];
   1.212 +
   1.213 +  // Local copies
   1.214 +  const UTF8PropObj* unigram_obj =
   1.215 +    scoringcontext->scoringtables->unigram_obj;
   1.216 +  int next_base = hitbuffer->next_base;
   1.217 +  int next_base_limit = hitbuffer->maxscoringhits;
   1.218 +
   1.219 +  // Visit all unigrams
   1.220 +  if (src[0] == ' ') {++src;}   // skip any initial space
   1.221 +  while (src < srclimit) {
   1.222 +    const uint8* usrc = reinterpret_cast<const uint8*>(src);
   1.223 +    int len = kAdvanceOneChar[usrc[0]];
   1.224 +    src += len;
   1.225 +    // Look up property of one UTF-8 character and advance over it.
   1.226 +    // Updates usrc and len (bad interface design), hence increment above
   1.227 +    int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &len);
   1.228 +    if (propval > 0) {
   1.229 +      // Save indirect subscript for later scoring; 1 or 2 langprobs
   1.230 +      int indirect_subscr = propval;
   1.231 +      hitbuffer->base[next_base].offset = src - text;     // Offset in text
   1.232 +      hitbuffer->base[next_base].indirect = indirect_subscr;
   1.233 +      ++next_base;
   1.234 +    }
   1.235 +
   1.236 +    if (next_base >= next_base_limit) {break;}
   1.237 +  }
   1.238 +
   1.239 +  hitbuffer->next_base = next_base;
   1.240 +
   1.241 +  // Make a dummy entry off the end to calc length of last span
   1.242 +  int dummy_offset = src - text;
   1.243 +  hitbuffer->base[hitbuffer->next_base].offset = dummy_offset;
   1.244 +  hitbuffer->base[hitbuffer->next_base].indirect = 0;
   1.245 +
   1.246 +  return src - text;
   1.247 +}
   1.248 +
   1.249 +// Score up to 64KB of a single script span, doing both delta-bi and
   1.250 +// distinct bis in one pass
   1.251 +void GetBiHits(const char* text,
   1.252 +                     int letter_offset, int letter_limit,
   1.253 +                     ScoringContext* scoringcontext,
   1.254 +                     ScoringHitBuffer* hitbuffer) {
   1.255 +  const char* isrc = &text[letter_offset];
   1.256 +  const char* src = isrc;
   1.257 +  // Limit is end
   1.258 +  const char* srclimit1 = &text[letter_limit];
   1.259 +
   1.260 +  // Local copies
   1.261 +  const CLD2TableSummary* deltabi_obj =
   1.262 +    scoringcontext->scoringtables->deltabi_obj;
   1.263 +  const CLD2TableSummary* distinctbi_obj =
   1.264 +    scoringcontext->scoringtables->distinctbi_obj;
   1.265 +  int next_delta = hitbuffer->next_delta;
   1.266 +  int next_delta_limit = hitbuffer->maxscoringhits;
   1.267 +  int next_distinct = hitbuffer->next_distinct;
   1.268 +  // We can do 2 inserts per loop, so -1
   1.269 +  int next_distinct_limit = hitbuffer->maxscoringhits - 1;
   1.270 +
   1.271 +  while (src < srclimit1) {
   1.272 +    const uint8* usrc = reinterpret_cast<const uint8*>(src);
   1.273 +    int len = kAdvanceOneChar[usrc[0]];
   1.274 +    int len2 = kAdvanceOneChar[usrc[len]] + len;
   1.275 +
   1.276 +    if ((kMinCJKUTF8CharBytes * 2) <= len2) {      // Two CJK chars possible
   1.277 +      // Lookup and this bigram and save <offset, indirect>
   1.278 +      uint32 bihash = BiHashV2(src, len2);
   1.279 +      uint32 probs = QuadHashV3Lookup4(deltabi_obj, bihash);
   1.280 +      // Now go indirect on the subscript
   1.281 +      if (probs != 0) {
   1.282 +        // Save indirect subscript for later scoring; 1 langprob
   1.283 +        int indirect_subscr = probs & ~deltabi_obj->kCLDTableKeyMask;
   1.284 +        hitbuffer->delta[next_delta].offset = src - text;
   1.285 +        hitbuffer->delta[next_delta].indirect = indirect_subscr;
   1.286 +        ++next_delta;
   1.287 +      }
   1.288 +      // Lookup this distinct bigram and save <offset, indirect>
   1.289 +      probs = QuadHashV3Lookup4(distinctbi_obj, bihash);
   1.290 +      if (probs != 0) {
   1.291 +        int indirect_subscr = probs & ~distinctbi_obj->kCLDTableKeyMask;
   1.292 +        hitbuffer->distinct[next_distinct].offset = src - text;
   1.293 +        hitbuffer->distinct[next_distinct].indirect = indirect_subscr;
   1.294 +        ++next_distinct;
   1.295 +      }
   1.296 +    }
   1.297 +    src += len;  // Advance by one char (not two)
   1.298 +
   1.299 +    // Almost always srclimit hit first
   1.300 +    if (next_delta >= next_delta_limit) {break;}
   1.301 +    if (next_distinct >= next_distinct_limit) {break;}
   1.302 +  }
   1.303 +
   1.304 +  hitbuffer->next_delta = next_delta;
   1.305 +  hitbuffer->next_distinct = next_distinct;
   1.306 +
   1.307 +  // Make a dummy entry off the end to calc length of last span
   1.308 +  int dummy_offset = src - text;
   1.309 +  hitbuffer->delta[hitbuffer->next_delta].offset = dummy_offset;
   1.310 +  hitbuffer->delta[hitbuffer->next_delta].indirect = 0;
   1.311 +  hitbuffer->distinct[hitbuffer->next_distinct].offset = dummy_offset;
   1.312 +  hitbuffer->distinct[hitbuffer->next_distinct].indirect = 0;
   1.313 +}
   1.314 +
   1.315 +// Score up to 64KB of a single script span in one pass
   1.316 +// Make a dummy entry off the end to calc length of last span
   1.317 +// Return offset of first unused input byte
   1.318 +int GetQuadHits(const char* text,
   1.319 +                     int letter_offset, int letter_limit,
   1.320 +                     ScoringContext* scoringcontext,
   1.321 +                     ScoringHitBuffer* hitbuffer) {
   1.322 +  const char* isrc = &text[letter_offset];
   1.323 +  const char* src = isrc;
   1.324 +  // Limit is end, which has extra 20 20 20 00 past len
   1.325 +  const char* srclimit = &text[letter_limit];
   1.326 +
   1.327 +  // Local copies
   1.328 +  const CLD2TableSummary* quadgram_obj =
   1.329 +    scoringcontext->scoringtables->quadgram_obj;
   1.330 +  const CLD2TableSummary* quadgram_obj2 =
   1.331 +    scoringcontext->scoringtables->quadgram_obj2;
   1.332 +  int next_base = hitbuffer->next_base;
   1.333 +  int next_base_limit = hitbuffer->maxscoringhits;
   1.334 +
   1.335 +  // Run a little cache of last quad hits to catch overly-repetitive "text"
   1.336 +  // We don't care if we miss a couple repetitions at scriptspan boundaries
   1.337 +  int next_prior_quadhash = 0;
   1.338 +  uint32 prior_quadhash[2] = {0, 0};
   1.339 +
   1.340 +  // Visit all quadgrams
   1.341 +  if (src[0] == ' ') {++src;}   // skip any initial space
   1.342 +  while (src < srclimit) {
   1.343 +    // Find one quadgram
   1.344 +    const char* src_end = src;
   1.345 +    src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
   1.346 +    src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
   1.347 +    const char* src_mid = src_end;
   1.348 +    src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
   1.349 +    src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
   1.350 +    int len = src_end - src;
   1.351 +    // Hash the quadgram
   1.352 +    uint32 quadhash = QuadHashV2(src, len);
   1.353 +
   1.354 +    // Filter out recent repeats
   1.355 +    if ((quadhash != prior_quadhash[0]) && (quadhash != prior_quadhash[1])) {
   1.356 +      // Look up this quadgram and save <offset, indirect>
   1.357 +      uint32 indirect_flag = 0;   // For dual tables
   1.358 +      const CLD2TableSummary* hit_obj = quadgram_obj;
   1.359 +      uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash);
   1.360 +      if ((probs == 0) && (quadgram_obj2->kCLDTableSize != 0)) {
   1.361 +        // Try lookup in dual table if not found in first one
   1.362 +        // Note: we need to know later which of two indirect tables to use.
   1.363 +        indirect_flag = 0x80000000u;
   1.364 +        hit_obj = quadgram_obj2;
   1.365 +        probs = QuadHashV3Lookup4(quadgram_obj2, quadhash);
   1.366 +      }
   1.367 +      if (probs != 0) {
   1.368 +        // Round-robin two entries of actual hits
   1.369 +        prior_quadhash[next_prior_quadhash] = quadhash;
   1.370 +        next_prior_quadhash = (next_prior_quadhash + 1) & 1;
   1.371 +
   1.372 +        // Save indirect subscript for later scoring; 1 or 2 langprobs
   1.373 +        int indirect_subscr = probs & ~hit_obj->kCLDTableKeyMask;
   1.374 +        hitbuffer->base[next_base].offset = src - text;     // Offset in text
   1.375 +        // Flip the high bit for table2
   1.376 +        hitbuffer->base[next_base].indirect = indirect_subscr | indirect_flag;
   1.377 +        ++next_base;
   1.378 +      }
   1.379 +    }
   1.380 +
   1.381 +    // Advance: all the way past word if at end-of-word, else 2 chars
   1.382 +    if (src_end[0] == ' ') {
   1.383 +      src = src_end;
   1.384 +    } else {
   1.385 +      src = src_mid;
   1.386 +    }
   1.387 +
   1.388 +    // Skip over space at end of word, or ASCII vowel in middle of word
   1.389 +    // Use kAdvanceOneCharSpace instead to get rid of vowel hack
   1.390 +    if (src < srclimit) {
   1.391 +      src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
   1.392 +    } else {
   1.393 +      // Advancing by 4/8/16 can overshoot, but we are about to exit anyway
   1.394 +      src = srclimit;
   1.395 +    }
   1.396 +
   1.397 +    if (next_base >= next_base_limit) {break;}
   1.398 +  }
   1.399 +
   1.400 +  hitbuffer->next_base = next_base;
   1.401 +
   1.402 +  // Make a dummy entry off the end to calc length of last span
   1.403 +  int dummy_offset = src - text;
   1.404 +  hitbuffer->base[hitbuffer->next_base].offset = dummy_offset;
   1.405 +  hitbuffer->base[hitbuffer->next_base].indirect = 0;
   1.406 +
   1.407 +  return src - text;
   1.408 +}
   1.409 +
   1.410 +// inputs:
   1.411 +//  const tables
   1.412 +//  const char* isrc, int srclen (in sscriptbuffer)
   1.413 +// intermediates:
   1.414 +//  vector of octa <offset, probs>   (which need indirect table to decode)
   1.415 +//  vector of distinct <offset, probs>   (which need indirect table to decode)
   1.416 +
   1.417 +// Score up to 64KB of a single script span, doing both delta-octa and
   1.418 +// distinct words in one pass
   1.419 +void GetOctaHits(const char* text,
   1.420 +                     int letter_offset, int letter_limit,
   1.421 +                     ScoringContext* scoringcontext,
   1.422 +                     ScoringHitBuffer* hitbuffer) {
   1.423 +  const char* isrc = &text[letter_offset];
   1.424 +  const char* src = isrc;
   1.425 +  // Limit is end+1, to include extra space char (0x20) off the end
   1.426 +  const char* srclimit = &text[letter_limit + 1];
   1.427 +
   1.428 +  // Local copies
   1.429 +  const CLD2TableSummary* deltaocta_obj =
   1.430 +    scoringcontext->scoringtables->deltaocta_obj;
   1.431 +  int next_delta = hitbuffer->next_delta;
   1.432 +  int next_delta_limit = hitbuffer->maxscoringhits;
   1.433 +
   1.434 +  const CLD2TableSummary* distinctocta_obj =
   1.435 +    scoringcontext->scoringtables->distinctocta_obj;
   1.436 +  int next_distinct = hitbuffer->next_distinct;
   1.437 +  // We can do 2 inserts per loop, so -1
   1.438 +  int next_distinct_limit = hitbuffer->maxscoringhits - 1;
   1.439 +
   1.440 +  // Run a little cache of last octa hits to catch overly-repetitive "text"
   1.441 +  // We don't care if we miss a couple repetitions at scriptspan boundaries
   1.442 +  int next_prior_octahash = 0;
   1.443 +  uint64 prior_octahash[2] = {0, 0};
   1.444 +
   1.445 +  // Score all words truncated to 8 characters
   1.446 +  int charcount = 0;
   1.447 +  // Skip any initial space
   1.448 +  if (src[0] == ' ') {++src;}
   1.449 +
   1.450 +  // Begin the first word
   1.451 +  const char* prior_word_start = src;
   1.452 +  const char* word_start = src;
   1.453 +  const char* word_end = word_start;
   1.454 +  while (src < srclimit) {
   1.455 +    // Terminate previous word or continue current word
   1.456 +    if (src[0] == ' ') {
   1.457 +      int len = word_end - word_start;
   1.458 +      // Hash the word
   1.459 +      uint64 wordhash40 = OctaHash40(word_start, len);
   1.460 +      uint32 probs;
   1.461 +
   1.462 +      // Filter out recent repeats. Unlike quads, we update even if no hit,
   1.463 +      // so we can get hits on same word if separated by non-hit words
   1.464 +      if ((wordhash40 != prior_octahash[0]) &&
   1.465 +          (wordhash40 != prior_octahash[1])) {
   1.466 +        // Round-robin two entries of words
   1.467 +        prior_octahash[next_prior_octahash] = wordhash40;
   1.468 +        next_prior_octahash = 1 - next_prior_octahash;    // Alternates 0,1,0,1
   1.469 +
   1.470 +        // (1) Lookup distinct word PAIR. For a pair, we want an asymmetrical
   1.471 +        // function of the two word hashs. For words A B C, B-A and C-B are good
   1.472 +        // enough and fast. We use the same table as distinct single words
   1.473 +        // Do not look up a pair of identical words -- all pairs hash to zero
   1.474 +        // Both 1- and 2-word distinct lookups are in distinctocta_obj now
   1.475 +        // Do this first, because it has the lowest offset
   1.476 +        uint64 tmp_prior_hash = prior_octahash[next_prior_octahash];
   1.477 +        if ((tmp_prior_hash != 0) && (tmp_prior_hash != wordhash40)) {
   1.478 +          uint64 pair_hash = PairHash(tmp_prior_hash, wordhash40);
   1.479 +          probs = OctaHashV3Lookup4(distinctocta_obj, pair_hash);
   1.480 +          if (probs != 0) {
   1.481 +            int indirect_subscr = probs & ~distinctocta_obj->kCLDTableKeyMask;
   1.482 +            hitbuffer->distinct[next_distinct].offset = prior_word_start - text;
   1.483 +            hitbuffer->distinct[next_distinct].indirect = indirect_subscr;
   1.484 +            ++next_distinct;
   1.485 +          }
   1.486 +        }
   1.487 +
   1.488 +        // (2) Lookup this distinct word and save <offset, indirect>
   1.489 +        probs = OctaHashV3Lookup4(distinctocta_obj, wordhash40);
   1.490 +        if (probs != 0) {
   1.491 +          int indirect_subscr = probs & ~distinctocta_obj->kCLDTableKeyMask;
   1.492 +          hitbuffer->distinct[next_distinct].offset = word_start - text;
   1.493 +          hitbuffer->distinct[next_distinct].indirect = indirect_subscr;
   1.494 +          ++next_distinct;
   1.495 +        }
   1.496 +
   1.497 +        // (3) Lookup this word and save <offset, indirect>
   1.498 +        probs = OctaHashV3Lookup4(deltaocta_obj, wordhash40);
   1.499 +        if (probs != 0) {
   1.500 +          // Save indirect subscript for later scoring; 1 langprob
   1.501 +          int indirect_subscr = probs & ~deltaocta_obj->kCLDTableKeyMask;
   1.502 +          hitbuffer->delta[next_delta].offset = word_start - text;
   1.503 +          hitbuffer->delta[next_delta].indirect = indirect_subscr;
   1.504 +          ++next_delta;
   1.505 +        }
   1.506 +      }
   1.507 +
   1.508 +      // Begin the next word
   1.509 +      charcount = 0;
   1.510 +      prior_word_start = word_start;
   1.511 +      word_start = src + 1;   // Over the space
   1.512 +      word_end = word_start;
   1.513 +    } else {
   1.514 +      ++charcount;
   1.515 +    }
   1.516 +
   1.517 +    // Advance to next char
   1.518 +    src += UTF8OneCharLen(src);
   1.519 +    if (charcount <= 8) {
   1.520 +      word_end = src;
   1.521 +    }
   1.522 +    // Almost always srclimit hit first
   1.523 +    if (next_delta >= next_delta_limit) {break;}
   1.524 +    if (next_distinct >= next_distinct_limit) {break;}
   1.525 +  }
   1.526 +
   1.527 +  hitbuffer->next_delta = next_delta;
   1.528 +  hitbuffer->next_distinct = next_distinct;
   1.529 +
   1.530 +  // Make a dummy entry off the end to calc length of last span
   1.531 +  int dummy_offset = src - text;
   1.532 +  hitbuffer->delta[hitbuffer->next_delta].offset = dummy_offset;
   1.533 +  hitbuffer->delta[hitbuffer->next_delta].indirect = 0;
   1.534 +  hitbuffer->distinct[hitbuffer->next_distinct].offset = dummy_offset;
   1.535 +  hitbuffer->distinct[hitbuffer->next_distinct].indirect = 0;
   1.536 +}
   1.537 +
   1.538 +
   1.539 +//----------------------------------------------------------------------------//
   1.540 +// Reliability calculations, for single language and between languages        //
   1.541 +//----------------------------------------------------------------------------//
   1.542 +
   1.543 +// Return reliablity of result 0..100 for top two scores
   1.544 +// delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable
   1.545 +// (on a scale where +1 is a factor of  2 ** 1.6 = 3.02)
   1.546 +// Threshold is uni/quadgram increment count, bounded above and below.
   1.547 +//
   1.548 +// Requiring a factor of 3 improvement (e.g. +1 log base 3)
   1.549 +// for each scored quadgram is too stringent, so I've backed this off to a
   1.550 +// factor of 2 (e.g. +5/8 log base 3).
   1.551 +//
   1.552 +// I also somewhat lowered the Min/MaxGramCount limits above
   1.553 +//
   1.554 +// Added: if fewer than 8 quads/unis, max reliability is 12*n percent
   1.555 +//
   1.556 +int ReliabilityDelta(int value1, int value2, int gramcount) {
   1.557 +  int max_reliability_percent = 100;
   1.558 +  if (gramcount < 8) {
   1.559 +    max_reliability_percent = 12 * gramcount;
   1.560 +  }
   1.561 +  int fully_reliable_thresh = (gramcount * 5) >> 3;     // see note above
   1.562 +  if (fully_reliable_thresh < kMinGramCount) {          // Fully = 3..16
   1.563 +    fully_reliable_thresh = kMinGramCount;
   1.564 +  } else if (fully_reliable_thresh > kMaxGramCount) {
   1.565 +    fully_reliable_thresh = kMaxGramCount;
   1.566 +  }
   1.567 +
   1.568 +  int delta = value1 - value2;
   1.569 +  if (delta >= fully_reliable_thresh) {return max_reliability_percent;}
   1.570 +  if (delta <= 0) {return 0;}
   1.571 +  return minint(max_reliability_percent,
   1.572 +                     (100 * delta) / fully_reliable_thresh);
   1.573 +}
   1.574 +
   1.575 +// Return reliablity of result 0..100 for top score vs. expected mainsteam score
   1.576 +// Values are score per 1024 bytes of input
   1.577 +// ratio = max(top/mainstream, mainstream/top)
   1.578 +// ratio > 4.0 is 0% reliable, <= 2.0 is 100% reliable
   1.579 +// Change: short-text word scoring can give unusually good results.
   1.580 +//  Let top exceed mainstream by 4x at 50% reliable
   1.581 +//
   1.582 +// dsites April 2010: These could be tightened up. It would be
   1.583 +// reasonable with newer data and round-robin table allocation to start ramping
   1.584 +// down at mean * 1.5 and mean/1.5, while letting mean*2 and mean/2 pass,
   1.585 +// but just barely.
   1.586 +//
   1.587 +// dsites March 2013: Tightened up a bit.
   1.588 +static const double kRatio100 = 1.5;
   1.589 +static const double kRatio0 = 4.0;
   1.590 +int ReliabilityExpected(int actual_score_1kb, int expected_score_1kb) {
   1.591 +  if (expected_score_1kb == 0) {return 100;}    // No reliability data available yet
   1.592 +  if (actual_score_1kb == 0) {return 0;}        // zero score = unreliable
   1.593 +  double ratio;
   1.594 +  if (expected_score_1kb > actual_score_1kb) {
   1.595 +    ratio = (1.0 * expected_score_1kb) / actual_score_1kb;
   1.596 +  } else {
   1.597 +    ratio = (1.0 * actual_score_1kb) / expected_score_1kb;
   1.598 +  }
   1.599 +  // Ratio 1.0 .. 1.5 scores 100%
   1.600 +  // Ratio 2.0 scores 80%
   1.601 +  // Linear decline, to ratio 4.0 scores 0%
   1.602 +  if (ratio <= kRatio100) {return 100;}
   1.603 +  if (ratio > kRatio0) {return 0;}
   1.604 +
   1.605 +  int percent_good = 100.0 * (kRatio0 - ratio) / (kRatio0 - kRatio100);
   1.606 +  return percent_good;
   1.607 +}
   1.608 +
   1.609 +// Create a langprob packed value from its parts.
   1.610 +// qprob is quantized [0..12]
   1.611 +// We use Latn script to represent any RTypeMany language
   1.612 +uint32 MakeLangProb(Language lang, int qprob) {
   1.613 +  uint32 pslang = PerScriptNumber(ULScript_Latin, lang);
   1.614 +  uint32 retval = (pslang << 8) | kLgProbV2TblBackmap[qprob];
   1.615 +  return retval;
   1.616 +}
   1.617 +
   1.618 +}       // End namespace CLD2
   1.619 +
   1.620 +
   1.621 +
   1.622 +
   1.623 +

mercurial