browser/components/translation/cld2/internal/cldutil.cc

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 // Copyright 2013 Google Inc. All Rights Reserved.
michael@0 2 //
michael@0 3 // Licensed under the Apache License, Version 2.0 (the "License");
michael@0 4 // you may not use this file except in compliance with the License.
michael@0 5 // You may obtain a copy of the License at
michael@0 6 //
michael@0 7 // http://www.apache.org/licenses/LICENSE-2.0
michael@0 8 //
michael@0 9 // Unless required by applicable law or agreed to in writing, software
michael@0 10 // distributed under the License is distributed on an "AS IS" BASIS,
michael@0 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
michael@0 12 // See the License for the specific language governing permissions and
michael@0 13 // limitations under the License.
michael@0 14
michael@0 15 //
michael@0 16 // Author: dsites@google.com (Dick Sites)
michael@0 17 // Updated 2014.01 for dual table lookup
michael@0 18 //
michael@0 19
michael@0 20 #include "cldutil.h"
michael@0 21 #include <string>
michael@0 22
michael@0 23 #include "cld2tablesummary.h"
michael@0 24 #include "integral_types.h"
michael@0 25 #include "port.h"
michael@0 26 #include "utf8statetable.h"
michael@0 27
michael@0 28 namespace CLD2 {
michael@0 29
michael@0 30 // Caller supplies the right tables in scoringcontext
michael@0 31
michael@0 32 // Runtime routines for hashing, looking up, and scoring
michael@0 33 // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
michael@0 34 // Unigrams and bigrams are for CJK languages only, including simplified/
michael@0 35 // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
michael@0 36 // Zhuang Han characters. Surrounding spaces are not considered.
michael@0 37 // Quadgrams and octagrams for for non-CJK and include two bits indicating
michael@0 38 // preceding and trailing spaces (word boundaries).
michael@0 39
michael@0 40
michael@0 41 static const int kMinCJKUTF8CharBytes = 3;
michael@0 42
michael@0 43 static const int kMinGramCount = 3;
michael@0 44 static const int kMaxGramCount = 16;
michael@0 45
michael@0 46 static const int UTFmax = 4; // Max number of bytes in a UTF-8 character
michael@0 47
michael@0 48 // 1 to skip ASCII space, vowels AEIOU aeiou and UTF-8 continuation bytes 80-BF
michael@0 49 static const uint8 kSkipSpaceVowelContinue[256] = {
michael@0 50 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
michael@0 51 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
michael@0 52 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
michael@0 53 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
michael@0 54
michael@0 55 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
michael@0 56 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
michael@0 57 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
michael@0 58 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
michael@0 59 };
michael@0 60
michael@0 61 // 1 to skip ASCII space, and UTF-8 continuation bytes 80-BF
michael@0 62 static const uint8 kSkipSpaceContinue[256] = {
michael@0 63 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
michael@0 64 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
michael@0 65 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
michael@0 66 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
michael@0 67
michael@0 68 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
michael@0 69 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
michael@0 70 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
michael@0 71 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
michael@0 72 };
michael@0 73
michael@0 74
michael@0 75 // Always advances one UTF-8 character
michael@0 76 static const uint8 kAdvanceOneChar[256] = {
michael@0 77 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
michael@0 78 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
michael@0 79 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
michael@0 80 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
michael@0 81
michael@0 82 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
michael@0 83 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
michael@0 84 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
michael@0 85 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4,
michael@0 86 };
michael@0 87
michael@0 88 // Advances *only* on space (or illegal byte)
michael@0 89 static const uint8 kAdvanceOneCharSpace[256] = {
michael@0 90 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
michael@0 91 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
michael@0 92 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
michael@0 93 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
michael@0 94
michael@0 95 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
michael@0 96 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
michael@0 97 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
michael@0 98 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
michael@0 99 };
michael@0 100
michael@0 101
michael@0 102 // Routines to access a hash table of <key:wordhash, value:probs> pairs
michael@0 103 // Buckets have 4-byte wordhash for sizes < 32K buckets, but only
michael@0 104 // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
michael@0 105 // bucket subscript.
michael@0 106 // Probs is a packed: three languages plus a subscript for probability table
michael@0 107 // Buckets have all the keys together, then all the values.Key array never
michael@0 108 // crosses a cache-line boundary, so no-match case takes exactly one cache miss.
michael@0 109 // Match case may sometimes take an additional cache miss on value access.
michael@0 110 //
michael@0 111 // Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64
michael@0 112 // byte buckets with single cache miss.
michael@0 113 // Or 2-byte key and 6-byte value, allowing 5 languages instead of three.
michael@0 114 //------------------------------------------------------------------------------
michael@0 115
michael@0 116 //----------------------------------------------------------------------------//
michael@0 117 // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores //
michael@0 118 //----------------------------------------------------------------------------//
michael@0 119
michael@0 120 //----------------------------------------------------------------------------//
michael@0 121 // Scoring single groups of letters //
michael@0 122 //----------------------------------------------------------------------------//
michael@0 123
michael@0 124 // BIGRAM, QUADGRAM, OCTAGRAM score one => tote
michael@0 125 // Input: 4-byte entry of 3 language numbers and one probability subscript, plus
michael@0 126 // an accumulator tote. (language 0 means unused entry)
michael@0 127 // Output: running sums in tote updated
michael@0 128 void ProcessProbV2Tote(uint32 probs, Tote* tote) {
michael@0 129 uint8 prob123 = (probs >> 0) & 0xff;
michael@0 130 const uint8* prob123_entry = LgProb2TblEntry(prob123);
michael@0 131
michael@0 132 uint8 top1 = (probs >> 8) & 0xff;
michael@0 133 if (top1 > 0) {tote->Add(top1, LgProb3(prob123_entry, 0));}
michael@0 134 uint8 top2 = (probs >> 16) & 0xff;
michael@0 135 if (top2 > 0) {tote->Add(top2, LgProb3(prob123_entry, 1));}
michael@0 136 uint8 top3 = (probs >> 24) & 0xff;
michael@0 137 if (top3 > 0) {tote->Add(top3, LgProb3(prob123_entry, 2));}
michael@0 138 }
michael@0 139
michael@0 140 // Return score for a particular per-script language, or zero
michael@0 141 int GetLangScore(uint32 probs, uint8 pslang) {
michael@0 142 uint8 prob123 = (probs >> 0) & 0xff;
michael@0 143 const uint8* prob123_entry = LgProb2TblEntry(prob123);
michael@0 144 int retval = 0;
michael@0 145 uint8 top1 = (probs >> 8) & 0xff;
michael@0 146 if (top1 == pslang) {retval += LgProb3(prob123_entry, 0);}
michael@0 147 uint8 top2 = (probs >> 16) & 0xff;
michael@0 148 if (top2 == pslang) {retval += LgProb3(prob123_entry, 1);}
michael@0 149 uint8 top3 = (probs >> 24) & 0xff;
michael@0 150 if (top3 == pslang) {retval += LgProb3(prob123_entry, 2);}
michael@0 151 return retval;
michael@0 152 }
michael@0 153
michael@0 154 //----------------------------------------------------------------------------//
michael@0 155 // Routines to accumulate probabilities //
michael@0 156 //----------------------------------------------------------------------------//
michael@0 157
michael@0 158
michael@0 159 // BIGRAM, using hash table, always advancing by 1 char
michael@0 160 // Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj
michael@0 161 // Score all bigrams in isrc, using languages that have bigrams (CJK)
michael@0 162 // Return number of bigrams that hit in the hash table
michael@0 163 int DoBigramScoreV3(const CLD2TableSummary* bigram_obj,
michael@0 164 const char* isrc, int srclen, Tote* chunk_tote) {
michael@0 165 int hit_count = 0;
michael@0 166 const char* src = isrc;
michael@0 167
michael@0 168 // Hashtable-based CJK bigram lookup
michael@0 169 const uint8* usrc = reinterpret_cast<const uint8*>(src);
michael@0 170 const uint8* usrclimit1 = usrc + srclen - UTFmax;
michael@0 171
michael@0 172 while (usrc < usrclimit1) {
michael@0 173 int len = kAdvanceOneChar[usrc[0]];
michael@0 174 int len2 = kAdvanceOneChar[usrc[len]] + len;
michael@0 175
michael@0 176 if ((kMinCJKUTF8CharBytes * 2) <= len2) { // Two CJK chars possible
michael@0 177 // Lookup and score this bigram
michael@0 178 // Always ignore pre/post spaces
michael@0 179 uint32 bihash = BiHashV2(reinterpret_cast<const char*>(usrc), len2);
michael@0 180 uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash);
michael@0 181 // Now go indirect on the subscript
michael@0 182 probs = bigram_obj->kCLDTableInd[probs &
michael@0 183 ~bigram_obj->kCLDTableKeyMask];
michael@0 184
michael@0 185 // Process the bigram
michael@0 186 if (probs != 0) {
michael@0 187 ProcessProbV2Tote(probs, chunk_tote);
michael@0 188 ++hit_count;
michael@0 189 }
michael@0 190 }
michael@0 191 usrc += len; // Advance by one char
michael@0 192 }
michael@0 193
michael@0 194 return hit_count;
michael@0 195 }
michael@0 196
michael@0 197
michael@0 198 // Score up to 64KB of a single script span in one pass
michael@0 199 // Make a dummy entry off the end to calc length of last span
michael@0 200 // Return offset of first unused input byte
michael@0 201 int GetUniHits(const char* text,
michael@0 202 int letter_offset, int letter_limit,
michael@0 203 ScoringContext* scoringcontext,
michael@0 204 ScoringHitBuffer* hitbuffer) {
michael@0 205 const char* isrc = &text[letter_offset];
michael@0 206 const char* src = isrc;
michael@0 207 // Limit is end, which has extra 20 20 20 00 past len
michael@0 208 const char* srclimit = &text[letter_limit];
michael@0 209
michael@0 210 // Local copies
michael@0 211 const UTF8PropObj* unigram_obj =
michael@0 212 scoringcontext->scoringtables->unigram_obj;
michael@0 213 int next_base = hitbuffer->next_base;
michael@0 214 int next_base_limit = hitbuffer->maxscoringhits;
michael@0 215
michael@0 216 // Visit all unigrams
michael@0 217 if (src[0] == ' ') {++src;} // skip any initial space
michael@0 218 while (src < srclimit) {
michael@0 219 const uint8* usrc = reinterpret_cast<const uint8*>(src);
michael@0 220 int len = kAdvanceOneChar[usrc[0]];
michael@0 221 src += len;
michael@0 222 // Look up property of one UTF-8 character and advance over it.
michael@0 223 // Updates usrc and len (bad interface design), hence increment above
michael@0 224 int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &len);
michael@0 225 if (propval > 0) {
michael@0 226 // Save indirect subscript for later scoring; 1 or 2 langprobs
michael@0 227 int indirect_subscr = propval;
michael@0 228 hitbuffer->base[next_base].offset = src - text; // Offset in text
michael@0 229 hitbuffer->base[next_base].indirect = indirect_subscr;
michael@0 230 ++next_base;
michael@0 231 }
michael@0 232
michael@0 233 if (next_base >= next_base_limit) {break;}
michael@0 234 }
michael@0 235
michael@0 236 hitbuffer->next_base = next_base;
michael@0 237
michael@0 238 // Make a dummy entry off the end to calc length of last span
michael@0 239 int dummy_offset = src - text;
michael@0 240 hitbuffer->base[hitbuffer->next_base].offset = dummy_offset;
michael@0 241 hitbuffer->base[hitbuffer->next_base].indirect = 0;
michael@0 242
michael@0 243 return src - text;
michael@0 244 }
michael@0 245
michael@0 246 // Score up to 64KB of a single script span, doing both delta-bi and
michael@0 247 // distinct bis in one pass
michael@0 248 void GetBiHits(const char* text,
michael@0 249 int letter_offset, int letter_limit,
michael@0 250 ScoringContext* scoringcontext,
michael@0 251 ScoringHitBuffer* hitbuffer) {
michael@0 252 const char* isrc = &text[letter_offset];
michael@0 253 const char* src = isrc;
michael@0 254 // Limit is end
michael@0 255 const char* srclimit1 = &text[letter_limit];
michael@0 256
michael@0 257 // Local copies
michael@0 258 const CLD2TableSummary* deltabi_obj =
michael@0 259 scoringcontext->scoringtables->deltabi_obj;
michael@0 260 const CLD2TableSummary* distinctbi_obj =
michael@0 261 scoringcontext->scoringtables->distinctbi_obj;
michael@0 262 int next_delta = hitbuffer->next_delta;
michael@0 263 int next_delta_limit = hitbuffer->maxscoringhits;
michael@0 264 int next_distinct = hitbuffer->next_distinct;
michael@0 265 // We can do 2 inserts per loop, so -1
michael@0 266 int next_distinct_limit = hitbuffer->maxscoringhits - 1;
michael@0 267
michael@0 268 while (src < srclimit1) {
michael@0 269 const uint8* usrc = reinterpret_cast<const uint8*>(src);
michael@0 270 int len = kAdvanceOneChar[usrc[0]];
michael@0 271 int len2 = kAdvanceOneChar[usrc[len]] + len;
michael@0 272
michael@0 273 if ((kMinCJKUTF8CharBytes * 2) <= len2) { // Two CJK chars possible
michael@0 274 // Lookup and this bigram and save <offset, indirect>
michael@0 275 uint32 bihash = BiHashV2(src, len2);
michael@0 276 uint32 probs = QuadHashV3Lookup4(deltabi_obj, bihash);
michael@0 277 // Now go indirect on the subscript
michael@0 278 if (probs != 0) {
michael@0 279 // Save indirect subscript for later scoring; 1 langprob
michael@0 280 int indirect_subscr = probs & ~deltabi_obj->kCLDTableKeyMask;
michael@0 281 hitbuffer->delta[next_delta].offset = src - text;
michael@0 282 hitbuffer->delta[next_delta].indirect = indirect_subscr;
michael@0 283 ++next_delta;
michael@0 284 }
michael@0 285 // Lookup this distinct bigram and save <offset, indirect>
michael@0 286 probs = QuadHashV3Lookup4(distinctbi_obj, bihash);
michael@0 287 if (probs != 0) {
michael@0 288 int indirect_subscr = probs & ~distinctbi_obj->kCLDTableKeyMask;
michael@0 289 hitbuffer->distinct[next_distinct].offset = src - text;
michael@0 290 hitbuffer->distinct[next_distinct].indirect = indirect_subscr;
michael@0 291 ++next_distinct;
michael@0 292 }
michael@0 293 }
michael@0 294 src += len; // Advance by one char (not two)
michael@0 295
michael@0 296 // Almost always srclimit hit first
michael@0 297 if (next_delta >= next_delta_limit) {break;}
michael@0 298 if (next_distinct >= next_distinct_limit) {break;}
michael@0 299 }
michael@0 300
michael@0 301 hitbuffer->next_delta = next_delta;
michael@0 302 hitbuffer->next_distinct = next_distinct;
michael@0 303
michael@0 304 // Make a dummy entry off the end to calc length of last span
michael@0 305 int dummy_offset = src - text;
michael@0 306 hitbuffer->delta[hitbuffer->next_delta].offset = dummy_offset;
michael@0 307 hitbuffer->delta[hitbuffer->next_delta].indirect = 0;
michael@0 308 hitbuffer->distinct[hitbuffer->next_distinct].offset = dummy_offset;
michael@0 309 hitbuffer->distinct[hitbuffer->next_distinct].indirect = 0;
michael@0 310 }
michael@0 311
michael@0 312 // Score up to 64KB of a single script span in one pass
michael@0 313 // Make a dummy entry off the end to calc length of last span
michael@0 314 // Return offset of first unused input byte
michael@0 315 int GetQuadHits(const char* text,
michael@0 316 int letter_offset, int letter_limit,
michael@0 317 ScoringContext* scoringcontext,
michael@0 318 ScoringHitBuffer* hitbuffer) {
michael@0 319 const char* isrc = &text[letter_offset];
michael@0 320 const char* src = isrc;
michael@0 321 // Limit is end, which has extra 20 20 20 00 past len
michael@0 322 const char* srclimit = &text[letter_limit];
michael@0 323
michael@0 324 // Local copies
michael@0 325 const CLD2TableSummary* quadgram_obj =
michael@0 326 scoringcontext->scoringtables->quadgram_obj;
michael@0 327 const CLD2TableSummary* quadgram_obj2 =
michael@0 328 scoringcontext->scoringtables->quadgram_obj2;
michael@0 329 int next_base = hitbuffer->next_base;
michael@0 330 int next_base_limit = hitbuffer->maxscoringhits;
michael@0 331
michael@0 332 // Run a little cache of last quad hits to catch overly-repetitive "text"
michael@0 333 // We don't care if we miss a couple repetitions at scriptspan boundaries
michael@0 334 int next_prior_quadhash = 0;
michael@0 335 uint32 prior_quadhash[2] = {0, 0};
michael@0 336
michael@0 337 // Visit all quadgrams
michael@0 338 if (src[0] == ' ') {++src;} // skip any initial space
michael@0 339 while (src < srclimit) {
michael@0 340 // Find one quadgram
michael@0 341 const char* src_end = src;
michael@0 342 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
michael@0 343 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
michael@0 344 const char* src_mid = src_end;
michael@0 345 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
michael@0 346 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
michael@0 347 int len = src_end - src;
michael@0 348 // Hash the quadgram
michael@0 349 uint32 quadhash = QuadHashV2(src, len);
michael@0 350
michael@0 351 // Filter out recent repeats
michael@0 352 if ((quadhash != prior_quadhash[0]) && (quadhash != prior_quadhash[1])) {
michael@0 353 // Look up this quadgram and save <offset, indirect>
michael@0 354 uint32 indirect_flag = 0; // For dual tables
michael@0 355 const CLD2TableSummary* hit_obj = quadgram_obj;
michael@0 356 uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash);
michael@0 357 if ((probs == 0) && (quadgram_obj2->kCLDTableSize != 0)) {
michael@0 358 // Try lookup in dual table if not found in first one
michael@0 359 // Note: we need to know later which of two indirect tables to use.
michael@0 360 indirect_flag = 0x80000000u;
michael@0 361 hit_obj = quadgram_obj2;
michael@0 362 probs = QuadHashV3Lookup4(quadgram_obj2, quadhash);
michael@0 363 }
michael@0 364 if (probs != 0) {
michael@0 365 // Round-robin two entries of actual hits
michael@0 366 prior_quadhash[next_prior_quadhash] = quadhash;
michael@0 367 next_prior_quadhash = (next_prior_quadhash + 1) & 1;
michael@0 368
michael@0 369 // Save indirect subscript for later scoring; 1 or 2 langprobs
michael@0 370 int indirect_subscr = probs & ~hit_obj->kCLDTableKeyMask;
michael@0 371 hitbuffer->base[next_base].offset = src - text; // Offset in text
michael@0 372 // Flip the high bit for table2
michael@0 373 hitbuffer->base[next_base].indirect = indirect_subscr | indirect_flag;
michael@0 374 ++next_base;
michael@0 375 }
michael@0 376 }
michael@0 377
michael@0 378 // Advance: all the way past word if at end-of-word, else 2 chars
michael@0 379 if (src_end[0] == ' ') {
michael@0 380 src = src_end;
michael@0 381 } else {
michael@0 382 src = src_mid;
michael@0 383 }
michael@0 384
michael@0 385 // Skip over space at end of word, or ASCII vowel in middle of word
michael@0 386 // Use kAdvanceOneCharSpace instead to get rid of vowel hack
michael@0 387 if (src < srclimit) {
michael@0 388 src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
michael@0 389 } else {
michael@0 390 // Advancing by 4/8/16 can overshoot, but we are about to exit anyway
michael@0 391 src = srclimit;
michael@0 392 }
michael@0 393
michael@0 394 if (next_base >= next_base_limit) {break;}
michael@0 395 }
michael@0 396
michael@0 397 hitbuffer->next_base = next_base;
michael@0 398
michael@0 399 // Make a dummy entry off the end to calc length of last span
michael@0 400 int dummy_offset = src - text;
michael@0 401 hitbuffer->base[hitbuffer->next_base].offset = dummy_offset;
michael@0 402 hitbuffer->base[hitbuffer->next_base].indirect = 0;
michael@0 403
michael@0 404 return src - text;
michael@0 405 }
michael@0 406
michael@0 407 // inputs:
michael@0 408 // const tables
michael@0 409 // const char* isrc, int srclen (in sscriptbuffer)
michael@0 410 // intermediates:
michael@0 411 // vector of octa <offset, probs> (which need indirect table to decode)
michael@0 412 // vector of distinct <offset, probs> (which need indirect table to decode)
michael@0 413
michael@0 414 // Score up to 64KB of a single script span, doing both delta-octa and
michael@0 415 // distinct words in one pass
michael@0 416 void GetOctaHits(const char* text,
michael@0 417 int letter_offset, int letter_limit,
michael@0 418 ScoringContext* scoringcontext,
michael@0 419 ScoringHitBuffer* hitbuffer) {
michael@0 420 const char* isrc = &text[letter_offset];
michael@0 421 const char* src = isrc;
michael@0 422 // Limit is end+1, to include extra space char (0x20) off the end
michael@0 423 const char* srclimit = &text[letter_limit + 1];
michael@0 424
michael@0 425 // Local copies
michael@0 426 const CLD2TableSummary* deltaocta_obj =
michael@0 427 scoringcontext->scoringtables->deltaocta_obj;
michael@0 428 int next_delta = hitbuffer->next_delta;
michael@0 429 int next_delta_limit = hitbuffer->maxscoringhits;
michael@0 430
michael@0 431 const CLD2TableSummary* distinctocta_obj =
michael@0 432 scoringcontext->scoringtables->distinctocta_obj;
michael@0 433 int next_distinct = hitbuffer->next_distinct;
michael@0 434 // We can do 2 inserts per loop, so -1
michael@0 435 int next_distinct_limit = hitbuffer->maxscoringhits - 1;
michael@0 436
michael@0 437 // Run a little cache of last octa hits to catch overly-repetitive "text"
michael@0 438 // We don't care if we miss a couple repetitions at scriptspan boundaries
michael@0 439 int next_prior_octahash = 0;
michael@0 440 uint64 prior_octahash[2] = {0, 0};
michael@0 441
michael@0 442 // Score all words truncated to 8 characters
michael@0 443 int charcount = 0;
michael@0 444 // Skip any initial space
michael@0 445 if (src[0] == ' ') {++src;}
michael@0 446
michael@0 447 // Begin the first word
michael@0 448 const char* prior_word_start = src;
michael@0 449 const char* word_start = src;
michael@0 450 const char* word_end = word_start;
michael@0 451 while (src < srclimit) {
michael@0 452 // Terminate previous word or continue current word
michael@0 453 if (src[0] == ' ') {
michael@0 454 int len = word_end - word_start;
michael@0 455 // Hash the word
michael@0 456 uint64 wordhash40 = OctaHash40(word_start, len);
michael@0 457 uint32 probs;
michael@0 458
michael@0 459 // Filter out recent repeats. Unlike quads, we update even if no hit,
michael@0 460 // so we can get hits on same word if separated by non-hit words
michael@0 461 if ((wordhash40 != prior_octahash[0]) &&
michael@0 462 (wordhash40 != prior_octahash[1])) {
michael@0 463 // Round-robin two entries of words
michael@0 464 prior_octahash[next_prior_octahash] = wordhash40;
michael@0 465 next_prior_octahash = 1 - next_prior_octahash; // Alternates 0,1,0,1
michael@0 466
michael@0 467 // (1) Lookup distinct word PAIR. For a pair, we want an asymmetrical
michael@0 468 // function of the two word hashs. For words A B C, B-A and C-B are good
michael@0 469 // enough and fast. We use the same table as distinct single words
michael@0 470 // Do not look up a pair of identical words -- all pairs hash to zero
michael@0 471 // Both 1- and 2-word distinct lookups are in distinctocta_obj now
michael@0 472 // Do this first, because it has the lowest offset
michael@0 473 uint64 tmp_prior_hash = prior_octahash[next_prior_octahash];
michael@0 474 if ((tmp_prior_hash != 0) && (tmp_prior_hash != wordhash40)) {
michael@0 475 uint64 pair_hash = PairHash(tmp_prior_hash, wordhash40);
michael@0 476 probs = OctaHashV3Lookup4(distinctocta_obj, pair_hash);
michael@0 477 if (probs != 0) {
michael@0 478 int indirect_subscr = probs & ~distinctocta_obj->kCLDTableKeyMask;
michael@0 479 hitbuffer->distinct[next_distinct].offset = prior_word_start - text;
michael@0 480 hitbuffer->distinct[next_distinct].indirect = indirect_subscr;
michael@0 481 ++next_distinct;
michael@0 482 }
michael@0 483 }
michael@0 484
michael@0 485 // (2) Lookup this distinct word and save <offset, indirect>
michael@0 486 probs = OctaHashV3Lookup4(distinctocta_obj, wordhash40);
michael@0 487 if (probs != 0) {
michael@0 488 int indirect_subscr = probs & ~distinctocta_obj->kCLDTableKeyMask;
michael@0 489 hitbuffer->distinct[next_distinct].offset = word_start - text;
michael@0 490 hitbuffer->distinct[next_distinct].indirect = indirect_subscr;
michael@0 491 ++next_distinct;
michael@0 492 }
michael@0 493
michael@0 494 // (3) Lookup this word and save <offset, indirect>
michael@0 495 probs = OctaHashV3Lookup4(deltaocta_obj, wordhash40);
michael@0 496 if (probs != 0) {
michael@0 497 // Save indirect subscript for later scoring; 1 langprob
michael@0 498 int indirect_subscr = probs & ~deltaocta_obj->kCLDTableKeyMask;
michael@0 499 hitbuffer->delta[next_delta].offset = word_start - text;
michael@0 500 hitbuffer->delta[next_delta].indirect = indirect_subscr;
michael@0 501 ++next_delta;
michael@0 502 }
michael@0 503 }
michael@0 504
michael@0 505 // Begin the next word
michael@0 506 charcount = 0;
michael@0 507 prior_word_start = word_start;
michael@0 508 word_start = src + 1; // Over the space
michael@0 509 word_end = word_start;
michael@0 510 } else {
michael@0 511 ++charcount;
michael@0 512 }
michael@0 513
michael@0 514 // Advance to next char
michael@0 515 src += UTF8OneCharLen(src);
michael@0 516 if (charcount <= 8) {
michael@0 517 word_end = src;
michael@0 518 }
michael@0 519 // Almost always srclimit hit first
michael@0 520 if (next_delta >= next_delta_limit) {break;}
michael@0 521 if (next_distinct >= next_distinct_limit) {break;}
michael@0 522 }
michael@0 523
michael@0 524 hitbuffer->next_delta = next_delta;
michael@0 525 hitbuffer->next_distinct = next_distinct;
michael@0 526
michael@0 527 // Make a dummy entry off the end to calc length of last span
michael@0 528 int dummy_offset = src - text;
michael@0 529 hitbuffer->delta[hitbuffer->next_delta].offset = dummy_offset;
michael@0 530 hitbuffer->delta[hitbuffer->next_delta].indirect = 0;
michael@0 531 hitbuffer->distinct[hitbuffer->next_distinct].offset = dummy_offset;
michael@0 532 hitbuffer->distinct[hitbuffer->next_distinct].indirect = 0;
michael@0 533 }
michael@0 534
michael@0 535
michael@0 536 //----------------------------------------------------------------------------//
michael@0 537 // Reliability calculations, for single language and between languages //
michael@0 538 //----------------------------------------------------------------------------//
michael@0 539
michael@0 540 // Return reliablity of result 0..100 for top two scores
michael@0 541 // delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable
michael@0 542 // (on a scale where +1 is a factor of 2 ** 1.6 = 3.02)
michael@0 543 // Threshold is uni/quadgram increment count, bounded above and below.
michael@0 544 //
michael@0 545 // Requiring a factor of 3 improvement (e.g. +1 log base 3)
michael@0 546 // for each scored quadgram is too stringent, so I've backed this off to a
michael@0 547 // factor of 2 (e.g. +5/8 log base 3).
michael@0 548 //
michael@0 549 // I also somewhat lowered the Min/MaxGramCount limits above
michael@0 550 //
michael@0 551 // Added: if fewer than 8 quads/unis, max reliability is 12*n percent
michael@0 552 //
michael@0 553 int ReliabilityDelta(int value1, int value2, int gramcount) {
michael@0 554 int max_reliability_percent = 100;
michael@0 555 if (gramcount < 8) {
michael@0 556 max_reliability_percent = 12 * gramcount;
michael@0 557 }
michael@0 558 int fully_reliable_thresh = (gramcount * 5) >> 3; // see note above
michael@0 559 if (fully_reliable_thresh < kMinGramCount) { // Fully = 3..16
michael@0 560 fully_reliable_thresh = kMinGramCount;
michael@0 561 } else if (fully_reliable_thresh > kMaxGramCount) {
michael@0 562 fully_reliable_thresh = kMaxGramCount;
michael@0 563 }
michael@0 564
michael@0 565 int delta = value1 - value2;
michael@0 566 if (delta >= fully_reliable_thresh) {return max_reliability_percent;}
michael@0 567 if (delta <= 0) {return 0;}
michael@0 568 return minint(max_reliability_percent,
michael@0 569 (100 * delta) / fully_reliable_thresh);
michael@0 570 }
michael@0 571
michael@0 572 // Return reliablity of result 0..100 for top score vs. expected mainsteam score
michael@0 573 // Values are score per 1024 bytes of input
michael@0 574 // ratio = max(top/mainstream, mainstream/top)
michael@0 575 // ratio > 4.0 is 0% reliable, <= 2.0 is 100% reliable
michael@0 576 // Change: short-text word scoring can give unusually good results.
michael@0 577 // Let top exceed mainstream by 4x at 50% reliable
michael@0 578 //
michael@0 579 // dsites April 2010: These could be tightened up. It would be
michael@0 580 // reasonable with newer data and round-robin table allocation to start ramping
michael@0 581 // down at mean * 1.5 and mean/1.5, while letting mean*2 and mean/2 pass,
michael@0 582 // but just barely.
michael@0 583 //
michael@0 584 // dsites March 2013: Tightened up a bit.
michael@0 585 static const double kRatio100 = 1.5;
michael@0 586 static const double kRatio0 = 4.0;
michael@0 587 int ReliabilityExpected(int actual_score_1kb, int expected_score_1kb) {
michael@0 588 if (expected_score_1kb == 0) {return 100;} // No reliability data available yet
michael@0 589 if (actual_score_1kb == 0) {return 0;} // zero score = unreliable
michael@0 590 double ratio;
michael@0 591 if (expected_score_1kb > actual_score_1kb) {
michael@0 592 ratio = (1.0 * expected_score_1kb) / actual_score_1kb;
michael@0 593 } else {
michael@0 594 ratio = (1.0 * actual_score_1kb) / expected_score_1kb;
michael@0 595 }
michael@0 596 // Ratio 1.0 .. 1.5 scores 100%
michael@0 597 // Ratio 2.0 scores 80%
michael@0 598 // Linear decline, to ratio 4.0 scores 0%
michael@0 599 if (ratio <= kRatio100) {return 100;}
michael@0 600 if (ratio > kRatio0) {return 0;}
michael@0 601
michael@0 602 int percent_good = 100.0 * (kRatio0 - ratio) / (kRatio0 - kRatio100);
michael@0 603 return percent_good;
michael@0 604 }
michael@0 605
michael@0 606 // Create a langprob packed value from its parts.
michael@0 607 // qprob is quantized [0..12]
michael@0 608 // We use Latn script to represent any RTypeMany language
michael@0 609 uint32 MakeLangProb(Language lang, int qprob) {
michael@0 610 uint32 pslang = PerScriptNumber(ULScript_Latin, lang);
michael@0 611 uint32 retval = (pslang << 8) | kLgProbV2TblBackmap[qprob];
michael@0 612 return retval;
michael@0 613 }
michael@0 614
michael@0 615 } // End namespace CLD2
michael@0 616
michael@0 617
michael@0 618
michael@0 619
michael@0 620

mercurial