1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/browser/components/translation/cld2/internal/cldutil.cc Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,620 @@ 1.4 +// Copyright 2013 Google Inc. All Rights Reserved. 1.5 +// 1.6 +// Licensed under the Apache License, Version 2.0 (the "License"); 1.7 +// you may not use this file except in compliance with the License. 1.8 +// You may obtain a copy of the License at 1.9 +// 1.10 +// http://www.apache.org/licenses/LICENSE-2.0 1.11 +// 1.12 +// Unless required by applicable law or agreed to in writing, software 1.13 +// distributed under the License is distributed on an "AS IS" BASIS, 1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1.15 +// See the License for the specific language governing permissions and 1.16 +// limitations under the License. 1.17 + 1.18 +// 1.19 +// Author: dsites@google.com (Dick Sites) 1.20 +// Updated 2014.01 for dual table lookup 1.21 +// 1.22 + 1.23 +#include "cldutil.h" 1.24 +#include <string> 1.25 + 1.26 +#include "cld2tablesummary.h" 1.27 +#include "integral_types.h" 1.28 +#include "port.h" 1.29 +#include "utf8statetable.h" 1.30 + 1.31 +namespace CLD2 { 1.32 + 1.33 +// Caller supplies the right tables in scoringcontext 1.34 + 1.35 +// Runtime routines for hashing, looking up, and scoring 1.36 +// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams. 1.37 +// Unigrams and bigrams are for CJK languages only, including simplified/ 1.38 +// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and 1.39 +// Zhuang Han characters. Surrounding spaces are not considered. 1.40 +// Quadgrams and octagrams for for non-CJK and include two bits indicating 1.41 +// preceding and trailing spaces (word boundaries). 1.42 + 1.43 + 1.44 +static const int kMinCJKUTF8CharBytes = 3; 1.45 + 1.46 +static const int kMinGramCount = 3; 1.47 +static const int kMaxGramCount = 16; 1.48 + 1.49 +static const int UTFmax = 4; // Max number of bytes in a UTF-8 character 1.50 + 1.51 + // 1 to skip ASCII space, vowels AEIOU aeiou and UTF-8 continuation bytes 80-BF 1.52 + static const uint8 kSkipSpaceVowelContinue[256] = { 1.53 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.54 + 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.55 + 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, 1.56 + 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, 1.57 + 1.58 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.59 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.60 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.61 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.62 + }; 1.63 + 1.64 + // 1 to skip ASCII space, and UTF-8 continuation bytes 80-BF 1.65 + static const uint8 kSkipSpaceContinue[256] = { 1.66 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.67 + 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.68 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.69 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.70 + 1.71 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.72 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.73 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.74 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.75 + }; 1.76 + 1.77 + 1.78 + // Always advances one UTF-8 character 1.79 + static const uint8 kAdvanceOneChar[256] = { 1.80 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.81 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.82 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.83 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.84 + 1.85 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.86 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.87 + 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 1.88 + 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 1.89 + }; 1.90 + 1.91 + // Advances *only* on space (or illegal byte) 1.92 + static const uint8 kAdvanceOneCharSpace[256] = { 1.93 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.94 + 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.95 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.96 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.97 + 1.98 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.99 + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1.100 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.101 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.102 + }; 1.103 + 1.104 + 1.105 +// Routines to access a hash table of <key:wordhash, value:probs> pairs 1.106 +// Buckets have 4-byte wordhash for sizes < 32K buckets, but only 1.107 +// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as 1.108 +// bucket subscript. 1.109 +// Probs is a packed: three languages plus a subscript for probability table 1.110 +// Buckets have all the keys together, then all the values.Key array never 1.111 +// crosses a cache-line boundary, so no-match case takes exactly one cache miss. 1.112 +// Match case may sometimes take an additional cache miss on value access. 1.113 +// 1.114 +// Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64 1.115 +// byte buckets with single cache miss. 1.116 +// Or 2-byte key and 6-byte value, allowing 5 languages instead of three. 1.117 +//------------------------------------------------------------------------------ 1.118 + 1.119 +//----------------------------------------------------------------------------// 1.120 +// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores // 1.121 +//----------------------------------------------------------------------------// 1.122 + 1.123 +//----------------------------------------------------------------------------// 1.124 +// Scoring single groups of letters // 1.125 +//----------------------------------------------------------------------------// 1.126 + 1.127 +// BIGRAM, QUADGRAM, OCTAGRAM score one => tote 1.128 +// Input: 4-byte entry of 3 language numbers and one probability subscript, plus 1.129 +// an accumulator tote. (language 0 means unused entry) 1.130 +// Output: running sums in tote updated 1.131 +void ProcessProbV2Tote(uint32 probs, Tote* tote) { 1.132 + uint8 prob123 = (probs >> 0) & 0xff; 1.133 + const uint8* prob123_entry = LgProb2TblEntry(prob123); 1.134 + 1.135 + uint8 top1 = (probs >> 8) & 0xff; 1.136 + if (top1 > 0) {tote->Add(top1, LgProb3(prob123_entry, 0));} 1.137 + uint8 top2 = (probs >> 16) & 0xff; 1.138 + if (top2 > 0) {tote->Add(top2, LgProb3(prob123_entry, 1));} 1.139 + uint8 top3 = (probs >> 24) & 0xff; 1.140 + if (top3 > 0) {tote->Add(top3, LgProb3(prob123_entry, 2));} 1.141 +} 1.142 + 1.143 +// Return score for a particular per-script language, or zero 1.144 +int GetLangScore(uint32 probs, uint8 pslang) { 1.145 + uint8 prob123 = (probs >> 0) & 0xff; 1.146 + const uint8* prob123_entry = LgProb2TblEntry(prob123); 1.147 + int retval = 0; 1.148 + uint8 top1 = (probs >> 8) & 0xff; 1.149 + if (top1 == pslang) {retval += LgProb3(prob123_entry, 0);} 1.150 + uint8 top2 = (probs >> 16) & 0xff; 1.151 + if (top2 == pslang) {retval += LgProb3(prob123_entry, 1);} 1.152 + uint8 top3 = (probs >> 24) & 0xff; 1.153 + if (top3 == pslang) {retval += LgProb3(prob123_entry, 2);} 1.154 + return retval; 1.155 +} 1.156 + 1.157 +//----------------------------------------------------------------------------// 1.158 +// Routines to accumulate probabilities // 1.159 +//----------------------------------------------------------------------------// 1.160 + 1.161 + 1.162 +// BIGRAM, using hash table, always advancing by 1 char 1.163 +// Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj 1.164 +// Score all bigrams in isrc, using languages that have bigrams (CJK) 1.165 +// Return number of bigrams that hit in the hash table 1.166 +int DoBigramScoreV3(const CLD2TableSummary* bigram_obj, 1.167 + const char* isrc, int srclen, Tote* chunk_tote) { 1.168 + int hit_count = 0; 1.169 + const char* src = isrc; 1.170 + 1.171 + // Hashtable-based CJK bigram lookup 1.172 + const uint8* usrc = reinterpret_cast<const uint8*>(src); 1.173 + const uint8* usrclimit1 = usrc + srclen - UTFmax; 1.174 + 1.175 + while (usrc < usrclimit1) { 1.176 + int len = kAdvanceOneChar[usrc[0]]; 1.177 + int len2 = kAdvanceOneChar[usrc[len]] + len; 1.178 + 1.179 + if ((kMinCJKUTF8CharBytes * 2) <= len2) { // Two CJK chars possible 1.180 + // Lookup and score this bigram 1.181 + // Always ignore pre/post spaces 1.182 + uint32 bihash = BiHashV2(reinterpret_cast<const char*>(usrc), len2); 1.183 + uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash); 1.184 + // Now go indirect on the subscript 1.185 + probs = bigram_obj->kCLDTableInd[probs & 1.186 + ~bigram_obj->kCLDTableKeyMask]; 1.187 + 1.188 + // Process the bigram 1.189 + if (probs != 0) { 1.190 + ProcessProbV2Tote(probs, chunk_tote); 1.191 + ++hit_count; 1.192 + } 1.193 + } 1.194 + usrc += len; // Advance by one char 1.195 + } 1.196 + 1.197 + return hit_count; 1.198 +} 1.199 + 1.200 + 1.201 +// Score up to 64KB of a single script span in one pass 1.202 +// Make a dummy entry off the end to calc length of last span 1.203 +// Return offset of first unused input byte 1.204 +int GetUniHits(const char* text, 1.205 + int letter_offset, int letter_limit, 1.206 + ScoringContext* scoringcontext, 1.207 + ScoringHitBuffer* hitbuffer) { 1.208 + const char* isrc = &text[letter_offset]; 1.209 + const char* src = isrc; 1.210 + // Limit is end, which has extra 20 20 20 00 past len 1.211 + const char* srclimit = &text[letter_limit]; 1.212 + 1.213 + // Local copies 1.214 + const UTF8PropObj* unigram_obj = 1.215 + scoringcontext->scoringtables->unigram_obj; 1.216 + int next_base = hitbuffer->next_base; 1.217 + int next_base_limit = hitbuffer->maxscoringhits; 1.218 + 1.219 + // Visit all unigrams 1.220 + if (src[0] == ' ') {++src;} // skip any initial space 1.221 + while (src < srclimit) { 1.222 + const uint8* usrc = reinterpret_cast<const uint8*>(src); 1.223 + int len = kAdvanceOneChar[usrc[0]]; 1.224 + src += len; 1.225 + // Look up property of one UTF-8 character and advance over it. 1.226 + // Updates usrc and len (bad interface design), hence increment above 1.227 + int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &len); 1.228 + if (propval > 0) { 1.229 + // Save indirect subscript for later scoring; 1 or 2 langprobs 1.230 + int indirect_subscr = propval; 1.231 + hitbuffer->base[next_base].offset = src - text; // Offset in text 1.232 + hitbuffer->base[next_base].indirect = indirect_subscr; 1.233 + ++next_base; 1.234 + } 1.235 + 1.236 + if (next_base >= next_base_limit) {break;} 1.237 + } 1.238 + 1.239 + hitbuffer->next_base = next_base; 1.240 + 1.241 + // Make a dummy entry off the end to calc length of last span 1.242 + int dummy_offset = src - text; 1.243 + hitbuffer->base[hitbuffer->next_base].offset = dummy_offset; 1.244 + hitbuffer->base[hitbuffer->next_base].indirect = 0; 1.245 + 1.246 + return src - text; 1.247 +} 1.248 + 1.249 +// Score up to 64KB of a single script span, doing both delta-bi and 1.250 +// distinct bis in one pass 1.251 +void GetBiHits(const char* text, 1.252 + int letter_offset, int letter_limit, 1.253 + ScoringContext* scoringcontext, 1.254 + ScoringHitBuffer* hitbuffer) { 1.255 + const char* isrc = &text[letter_offset]; 1.256 + const char* src = isrc; 1.257 + // Limit is end 1.258 + const char* srclimit1 = &text[letter_limit]; 1.259 + 1.260 + // Local copies 1.261 + const CLD2TableSummary* deltabi_obj = 1.262 + scoringcontext->scoringtables->deltabi_obj; 1.263 + const CLD2TableSummary* distinctbi_obj = 1.264 + scoringcontext->scoringtables->distinctbi_obj; 1.265 + int next_delta = hitbuffer->next_delta; 1.266 + int next_delta_limit = hitbuffer->maxscoringhits; 1.267 + int next_distinct = hitbuffer->next_distinct; 1.268 + // We can do 2 inserts per loop, so -1 1.269 + int next_distinct_limit = hitbuffer->maxscoringhits - 1; 1.270 + 1.271 + while (src < srclimit1) { 1.272 + const uint8* usrc = reinterpret_cast<const uint8*>(src); 1.273 + int len = kAdvanceOneChar[usrc[0]]; 1.274 + int len2 = kAdvanceOneChar[usrc[len]] + len; 1.275 + 1.276 + if ((kMinCJKUTF8CharBytes * 2) <= len2) { // Two CJK chars possible 1.277 + // Lookup and this bigram and save <offset, indirect> 1.278 + uint32 bihash = BiHashV2(src, len2); 1.279 + uint32 probs = QuadHashV3Lookup4(deltabi_obj, bihash); 1.280 + // Now go indirect on the subscript 1.281 + if (probs != 0) { 1.282 + // Save indirect subscript for later scoring; 1 langprob 1.283 + int indirect_subscr = probs & ~deltabi_obj->kCLDTableKeyMask; 1.284 + hitbuffer->delta[next_delta].offset = src - text; 1.285 + hitbuffer->delta[next_delta].indirect = indirect_subscr; 1.286 + ++next_delta; 1.287 + } 1.288 + // Lookup this distinct bigram and save <offset, indirect> 1.289 + probs = QuadHashV3Lookup4(distinctbi_obj, bihash); 1.290 + if (probs != 0) { 1.291 + int indirect_subscr = probs & ~distinctbi_obj->kCLDTableKeyMask; 1.292 + hitbuffer->distinct[next_distinct].offset = src - text; 1.293 + hitbuffer->distinct[next_distinct].indirect = indirect_subscr; 1.294 + ++next_distinct; 1.295 + } 1.296 + } 1.297 + src += len; // Advance by one char (not two) 1.298 + 1.299 + // Almost always srclimit hit first 1.300 + if (next_delta >= next_delta_limit) {break;} 1.301 + if (next_distinct >= next_distinct_limit) {break;} 1.302 + } 1.303 + 1.304 + hitbuffer->next_delta = next_delta; 1.305 + hitbuffer->next_distinct = next_distinct; 1.306 + 1.307 + // Make a dummy entry off the end to calc length of last span 1.308 + int dummy_offset = src - text; 1.309 + hitbuffer->delta[hitbuffer->next_delta].offset = dummy_offset; 1.310 + hitbuffer->delta[hitbuffer->next_delta].indirect = 0; 1.311 + hitbuffer->distinct[hitbuffer->next_distinct].offset = dummy_offset; 1.312 + hitbuffer->distinct[hitbuffer->next_distinct].indirect = 0; 1.313 +} 1.314 + 1.315 +// Score up to 64KB of a single script span in one pass 1.316 +// Make a dummy entry off the end to calc length of last span 1.317 +// Return offset of first unused input byte 1.318 +int GetQuadHits(const char* text, 1.319 + int letter_offset, int letter_limit, 1.320 + ScoringContext* scoringcontext, 1.321 + ScoringHitBuffer* hitbuffer) { 1.322 + const char* isrc = &text[letter_offset]; 1.323 + const char* src = isrc; 1.324 + // Limit is end, which has extra 20 20 20 00 past len 1.325 + const char* srclimit = &text[letter_limit]; 1.326 + 1.327 + // Local copies 1.328 + const CLD2TableSummary* quadgram_obj = 1.329 + scoringcontext->scoringtables->quadgram_obj; 1.330 + const CLD2TableSummary* quadgram_obj2 = 1.331 + scoringcontext->scoringtables->quadgram_obj2; 1.332 + int next_base = hitbuffer->next_base; 1.333 + int next_base_limit = hitbuffer->maxscoringhits; 1.334 + 1.335 + // Run a little cache of last quad hits to catch overly-repetitive "text" 1.336 + // We don't care if we miss a couple repetitions at scriptspan boundaries 1.337 + int next_prior_quadhash = 0; 1.338 + uint32 prior_quadhash[2] = {0, 0}; 1.339 + 1.340 + // Visit all quadgrams 1.341 + if (src[0] == ' ') {++src;} // skip any initial space 1.342 + while (src < srclimit) { 1.343 + // Find one quadgram 1.344 + const char* src_end = src; 1.345 + src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; 1.346 + src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; 1.347 + const char* src_mid = src_end; 1.348 + src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; 1.349 + src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; 1.350 + int len = src_end - src; 1.351 + // Hash the quadgram 1.352 + uint32 quadhash = QuadHashV2(src, len); 1.353 + 1.354 + // Filter out recent repeats 1.355 + if ((quadhash != prior_quadhash[0]) && (quadhash != prior_quadhash[1])) { 1.356 + // Look up this quadgram and save <offset, indirect> 1.357 + uint32 indirect_flag = 0; // For dual tables 1.358 + const CLD2TableSummary* hit_obj = quadgram_obj; 1.359 + uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash); 1.360 + if ((probs == 0) && (quadgram_obj2->kCLDTableSize != 0)) { 1.361 + // Try lookup in dual table if not found in first one 1.362 + // Note: we need to know later which of two indirect tables to use. 1.363 + indirect_flag = 0x80000000u; 1.364 + hit_obj = quadgram_obj2; 1.365 + probs = QuadHashV3Lookup4(quadgram_obj2, quadhash); 1.366 + } 1.367 + if (probs != 0) { 1.368 + // Round-robin two entries of actual hits 1.369 + prior_quadhash[next_prior_quadhash] = quadhash; 1.370 + next_prior_quadhash = (next_prior_quadhash + 1) & 1; 1.371 + 1.372 + // Save indirect subscript for later scoring; 1 or 2 langprobs 1.373 + int indirect_subscr = probs & ~hit_obj->kCLDTableKeyMask; 1.374 + hitbuffer->base[next_base].offset = src - text; // Offset in text 1.375 + // Flip the high bit for table2 1.376 + hitbuffer->base[next_base].indirect = indirect_subscr | indirect_flag; 1.377 + ++next_base; 1.378 + } 1.379 + } 1.380 + 1.381 + // Advance: all the way past word if at end-of-word, else 2 chars 1.382 + if (src_end[0] == ' ') { 1.383 + src = src_end; 1.384 + } else { 1.385 + src = src_mid; 1.386 + } 1.387 + 1.388 + // Skip over space at end of word, or ASCII vowel in middle of word 1.389 + // Use kAdvanceOneCharSpace instead to get rid of vowel hack 1.390 + if (src < srclimit) { 1.391 + src += kAdvanceOneCharSpaceVowel[(uint8)src[0]]; 1.392 + } else { 1.393 + // Advancing by 4/8/16 can overshoot, but we are about to exit anyway 1.394 + src = srclimit; 1.395 + } 1.396 + 1.397 + if (next_base >= next_base_limit) {break;} 1.398 + } 1.399 + 1.400 + hitbuffer->next_base = next_base; 1.401 + 1.402 + // Make a dummy entry off the end to calc length of last span 1.403 + int dummy_offset = src - text; 1.404 + hitbuffer->base[hitbuffer->next_base].offset = dummy_offset; 1.405 + hitbuffer->base[hitbuffer->next_base].indirect = 0; 1.406 + 1.407 + return src - text; 1.408 +} 1.409 + 1.410 +// inputs: 1.411 +// const tables 1.412 +// const char* isrc, int srclen (in sscriptbuffer) 1.413 +// intermediates: 1.414 +// vector of octa <offset, probs> (which need indirect table to decode) 1.415 +// vector of distinct <offset, probs> (which need indirect table to decode) 1.416 + 1.417 +// Score up to 64KB of a single script span, doing both delta-octa and 1.418 +// distinct words in one pass 1.419 +void GetOctaHits(const char* text, 1.420 + int letter_offset, int letter_limit, 1.421 + ScoringContext* scoringcontext, 1.422 + ScoringHitBuffer* hitbuffer) { 1.423 + const char* isrc = &text[letter_offset]; 1.424 + const char* src = isrc; 1.425 + // Limit is end+1, to include extra space char (0x20) off the end 1.426 + const char* srclimit = &text[letter_limit + 1]; 1.427 + 1.428 + // Local copies 1.429 + const CLD2TableSummary* deltaocta_obj = 1.430 + scoringcontext->scoringtables->deltaocta_obj; 1.431 + int next_delta = hitbuffer->next_delta; 1.432 + int next_delta_limit = hitbuffer->maxscoringhits; 1.433 + 1.434 + const CLD2TableSummary* distinctocta_obj = 1.435 + scoringcontext->scoringtables->distinctocta_obj; 1.436 + int next_distinct = hitbuffer->next_distinct; 1.437 + // We can do 2 inserts per loop, so -1 1.438 + int next_distinct_limit = hitbuffer->maxscoringhits - 1; 1.439 + 1.440 + // Run a little cache of last octa hits to catch overly-repetitive "text" 1.441 + // We don't care if we miss a couple repetitions at scriptspan boundaries 1.442 + int next_prior_octahash = 0; 1.443 + uint64 prior_octahash[2] = {0, 0}; 1.444 + 1.445 + // Score all words truncated to 8 characters 1.446 + int charcount = 0; 1.447 + // Skip any initial space 1.448 + if (src[0] == ' ') {++src;} 1.449 + 1.450 + // Begin the first word 1.451 + const char* prior_word_start = src; 1.452 + const char* word_start = src; 1.453 + const char* word_end = word_start; 1.454 + while (src < srclimit) { 1.455 + // Terminate previous word or continue current word 1.456 + if (src[0] == ' ') { 1.457 + int len = word_end - word_start; 1.458 + // Hash the word 1.459 + uint64 wordhash40 = OctaHash40(word_start, len); 1.460 + uint32 probs; 1.461 + 1.462 + // Filter out recent repeats. Unlike quads, we update even if no hit, 1.463 + // so we can get hits on same word if separated by non-hit words 1.464 + if ((wordhash40 != prior_octahash[0]) && 1.465 + (wordhash40 != prior_octahash[1])) { 1.466 + // Round-robin two entries of words 1.467 + prior_octahash[next_prior_octahash] = wordhash40; 1.468 + next_prior_octahash = 1 - next_prior_octahash; // Alternates 0,1,0,1 1.469 + 1.470 + // (1) Lookup distinct word PAIR. For a pair, we want an asymmetrical 1.471 + // function of the two word hashs. For words A B C, B-A and C-B are good 1.472 + // enough and fast. We use the same table as distinct single words 1.473 + // Do not look up a pair of identical words -- all pairs hash to zero 1.474 + // Both 1- and 2-word distinct lookups are in distinctocta_obj now 1.475 + // Do this first, because it has the lowest offset 1.476 + uint64 tmp_prior_hash = prior_octahash[next_prior_octahash]; 1.477 + if ((tmp_prior_hash != 0) && (tmp_prior_hash != wordhash40)) { 1.478 + uint64 pair_hash = PairHash(tmp_prior_hash, wordhash40); 1.479 + probs = OctaHashV3Lookup4(distinctocta_obj, pair_hash); 1.480 + if (probs != 0) { 1.481 + int indirect_subscr = probs & ~distinctocta_obj->kCLDTableKeyMask; 1.482 + hitbuffer->distinct[next_distinct].offset = prior_word_start - text; 1.483 + hitbuffer->distinct[next_distinct].indirect = indirect_subscr; 1.484 + ++next_distinct; 1.485 + } 1.486 + } 1.487 + 1.488 + // (2) Lookup this distinct word and save <offset, indirect> 1.489 + probs = OctaHashV3Lookup4(distinctocta_obj, wordhash40); 1.490 + if (probs != 0) { 1.491 + int indirect_subscr = probs & ~distinctocta_obj->kCLDTableKeyMask; 1.492 + hitbuffer->distinct[next_distinct].offset = word_start - text; 1.493 + hitbuffer->distinct[next_distinct].indirect = indirect_subscr; 1.494 + ++next_distinct; 1.495 + } 1.496 + 1.497 + // (3) Lookup this word and save <offset, indirect> 1.498 + probs = OctaHashV3Lookup4(deltaocta_obj, wordhash40); 1.499 + if (probs != 0) { 1.500 + // Save indirect subscript for later scoring; 1 langprob 1.501 + int indirect_subscr = probs & ~deltaocta_obj->kCLDTableKeyMask; 1.502 + hitbuffer->delta[next_delta].offset = word_start - text; 1.503 + hitbuffer->delta[next_delta].indirect = indirect_subscr; 1.504 + ++next_delta; 1.505 + } 1.506 + } 1.507 + 1.508 + // Begin the next word 1.509 + charcount = 0; 1.510 + prior_word_start = word_start; 1.511 + word_start = src + 1; // Over the space 1.512 + word_end = word_start; 1.513 + } else { 1.514 + ++charcount; 1.515 + } 1.516 + 1.517 + // Advance to next char 1.518 + src += UTF8OneCharLen(src); 1.519 + if (charcount <= 8) { 1.520 + word_end = src; 1.521 + } 1.522 + // Almost always srclimit hit first 1.523 + if (next_delta >= next_delta_limit) {break;} 1.524 + if (next_distinct >= next_distinct_limit) {break;} 1.525 + } 1.526 + 1.527 + hitbuffer->next_delta = next_delta; 1.528 + hitbuffer->next_distinct = next_distinct; 1.529 + 1.530 + // Make a dummy entry off the end to calc length of last span 1.531 + int dummy_offset = src - text; 1.532 + hitbuffer->delta[hitbuffer->next_delta].offset = dummy_offset; 1.533 + hitbuffer->delta[hitbuffer->next_delta].indirect = 0; 1.534 + hitbuffer->distinct[hitbuffer->next_distinct].offset = dummy_offset; 1.535 + hitbuffer->distinct[hitbuffer->next_distinct].indirect = 0; 1.536 +} 1.537 + 1.538 + 1.539 +//----------------------------------------------------------------------------// 1.540 +// Reliability calculations, for single language and between languages // 1.541 +//----------------------------------------------------------------------------// 1.542 + 1.543 +// Return reliablity of result 0..100 for top two scores 1.544 +// delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable 1.545 +// (on a scale where +1 is a factor of 2 ** 1.6 = 3.02) 1.546 +// Threshold is uni/quadgram increment count, bounded above and below. 1.547 +// 1.548 +// Requiring a factor of 3 improvement (e.g. +1 log base 3) 1.549 +// for each scored quadgram is too stringent, so I've backed this off to a 1.550 +// factor of 2 (e.g. +5/8 log base 3). 1.551 +// 1.552 +// I also somewhat lowered the Min/MaxGramCount limits above 1.553 +// 1.554 +// Added: if fewer than 8 quads/unis, max reliability is 12*n percent 1.555 +// 1.556 +int ReliabilityDelta(int value1, int value2, int gramcount) { 1.557 + int max_reliability_percent = 100; 1.558 + if (gramcount < 8) { 1.559 + max_reliability_percent = 12 * gramcount; 1.560 + } 1.561 + int fully_reliable_thresh = (gramcount * 5) >> 3; // see note above 1.562 + if (fully_reliable_thresh < kMinGramCount) { // Fully = 3..16 1.563 + fully_reliable_thresh = kMinGramCount; 1.564 + } else if (fully_reliable_thresh > kMaxGramCount) { 1.565 + fully_reliable_thresh = kMaxGramCount; 1.566 + } 1.567 + 1.568 + int delta = value1 - value2; 1.569 + if (delta >= fully_reliable_thresh) {return max_reliability_percent;} 1.570 + if (delta <= 0) {return 0;} 1.571 + return minint(max_reliability_percent, 1.572 + (100 * delta) / fully_reliable_thresh); 1.573 +} 1.574 + 1.575 +// Return reliablity of result 0..100 for top score vs. expected mainsteam score 1.576 +// Values are score per 1024 bytes of input 1.577 +// ratio = max(top/mainstream, mainstream/top) 1.578 +// ratio > 4.0 is 0% reliable, <= 2.0 is 100% reliable 1.579 +// Change: short-text word scoring can give unusually good results. 1.580 +// Let top exceed mainstream by 4x at 50% reliable 1.581 +// 1.582 +// dsites April 2010: These could be tightened up. It would be 1.583 +// reasonable with newer data and round-robin table allocation to start ramping 1.584 +// down at mean * 1.5 and mean/1.5, while letting mean*2 and mean/2 pass, 1.585 +// but just barely. 1.586 +// 1.587 +// dsites March 2013: Tightened up a bit. 1.588 +static const double kRatio100 = 1.5; 1.589 +static const double kRatio0 = 4.0; 1.590 +int ReliabilityExpected(int actual_score_1kb, int expected_score_1kb) { 1.591 + if (expected_score_1kb == 0) {return 100;} // No reliability data available yet 1.592 + if (actual_score_1kb == 0) {return 0;} // zero score = unreliable 1.593 + double ratio; 1.594 + if (expected_score_1kb > actual_score_1kb) { 1.595 + ratio = (1.0 * expected_score_1kb) / actual_score_1kb; 1.596 + } else { 1.597 + ratio = (1.0 * actual_score_1kb) / expected_score_1kb; 1.598 + } 1.599 + // Ratio 1.0 .. 1.5 scores 100% 1.600 + // Ratio 2.0 scores 80% 1.601 + // Linear decline, to ratio 4.0 scores 0% 1.602 + if (ratio <= kRatio100) {return 100;} 1.603 + if (ratio > kRatio0) {return 0;} 1.604 + 1.605 + int percent_good = 100.0 * (kRatio0 - ratio) / (kRatio0 - kRatio100); 1.606 + return percent_good; 1.607 +} 1.608 + 1.609 +// Create a langprob packed value from its parts. 1.610 +// qprob is quantized [0..12] 1.611 +// We use Latn script to represent any RTypeMany language 1.612 +uint32 MakeLangProb(Language lang, int qprob) { 1.613 + uint32 pslang = PerScriptNumber(ULScript_Latin, lang); 1.614 + uint32 retval = (pslang << 8) | kLgProbV2TblBackmap[qprob]; 1.615 + return retval; 1.616 +} 1.617 + 1.618 +} // End namespace CLD2 1.619 + 1.620 + 1.621 + 1.622 + 1.623 +