Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
michael@0 | 1 | // Copyright 2013 Google Inc. All Rights Reserved. |
michael@0 | 2 | // |
michael@0 | 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
michael@0 | 4 | // you may not use this file except in compliance with the License. |
michael@0 | 5 | // You may obtain a copy of the License at |
michael@0 | 6 | // |
michael@0 | 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
michael@0 | 8 | // |
michael@0 | 9 | // Unless required by applicable law or agreed to in writing, software |
michael@0 | 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
michael@0 | 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
michael@0 | 12 | // See the License for the specific language governing permissions and |
michael@0 | 13 | // limitations under the License. |
michael@0 | 14 | |
michael@0 | 15 | // |
michael@0 | 16 | // Author: dsites@google.com (Dick Sites) |
michael@0 | 17 | // Updated 2014.01 for dual table lookup |
michael@0 | 18 | // |
michael@0 | 19 | |
michael@0 | 20 | #include "cldutil.h" |
michael@0 | 21 | #include <string> |
michael@0 | 22 | |
michael@0 | 23 | #include "cld2tablesummary.h" |
michael@0 | 24 | #include "integral_types.h" |
michael@0 | 25 | #include "port.h" |
michael@0 | 26 | #include "utf8statetable.h" |
michael@0 | 27 | |
michael@0 | 28 | namespace CLD2 { |
michael@0 | 29 | |
michael@0 | 30 | // Caller supplies the right tables in scoringcontext |
michael@0 | 31 | |
michael@0 | 32 | // Runtime routines for hashing, looking up, and scoring |
michael@0 | 33 | // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams. |
michael@0 | 34 | // Unigrams and bigrams are for CJK languages only, including simplified/ |
michael@0 | 35 | // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and |
michael@0 | 36 | // Zhuang Han characters. Surrounding spaces are not considered. |
michael@0 | 37 | // Quadgrams and octagrams for for non-CJK and include two bits indicating |
michael@0 | 38 | // preceding and trailing spaces (word boundaries). |
michael@0 | 39 | |
michael@0 | 40 | |
michael@0 | 41 | static const int kMinCJKUTF8CharBytes = 3; |
michael@0 | 42 | |
michael@0 | 43 | static const int kMinGramCount = 3; |
michael@0 | 44 | static const int kMaxGramCount = 16; |
michael@0 | 45 | |
michael@0 | 46 | static const int UTFmax = 4; // Max number of bytes in a UTF-8 character |
michael@0 | 47 | |
michael@0 | 48 | // 1 to skip ASCII space, vowels AEIOU aeiou and UTF-8 continuation bytes 80-BF |
michael@0 | 49 | static const uint8 kSkipSpaceVowelContinue[256] = { |
michael@0 | 50 | 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
michael@0 | 51 | 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
michael@0 | 52 | 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, |
michael@0 | 53 | 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, |
michael@0 | 54 | |
michael@0 | 55 | 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
michael@0 | 56 | 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
michael@0 | 57 | 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
michael@0 | 58 | 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
michael@0 | 59 | }; |
michael@0 | 60 | |
michael@0 | 61 | // 1 to skip ASCII space, and UTF-8 continuation bytes 80-BF |
michael@0 | 62 | static const uint8 kSkipSpaceContinue[256] = { |
michael@0 | 63 | 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
michael@0 | 64 | 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
michael@0 | 65 | 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
michael@0 | 66 | 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
michael@0 | 67 | |
michael@0 | 68 | 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
michael@0 | 69 | 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
michael@0 | 70 | 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
michael@0 | 71 | 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
michael@0 | 72 | }; |
michael@0 | 73 | |
michael@0 | 74 | |
michael@0 | 75 | // Always advances one UTF-8 character |
michael@0 | 76 | static const uint8 kAdvanceOneChar[256] = { |
michael@0 | 77 | 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
michael@0 | 78 | 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
michael@0 | 79 | 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
michael@0 | 80 | 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
michael@0 | 81 | |
michael@0 | 82 | 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
michael@0 | 83 | 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
michael@0 | 84 | 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, |
michael@0 | 85 | 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, |
michael@0 | 86 | }; |
michael@0 | 87 | |
michael@0 | 88 | // Advances *only* on space (or illegal byte) |
michael@0 | 89 | static const uint8 kAdvanceOneCharSpace[256] = { |
michael@0 | 90 | 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
michael@0 | 91 | 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
michael@0 | 92 | 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
michael@0 | 93 | 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
michael@0 | 94 | |
michael@0 | 95 | 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
michael@0 | 96 | 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, |
michael@0 | 97 | 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
michael@0 | 98 | 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
michael@0 | 99 | }; |
michael@0 | 100 | |
michael@0 | 101 | |
michael@0 | 102 | // Routines to access a hash table of <key:wordhash, value:probs> pairs |
michael@0 | 103 | // Buckets have 4-byte wordhash for sizes < 32K buckets, but only |
michael@0 | 104 | // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as |
michael@0 | 105 | // bucket subscript. |
michael@0 | 106 | // Probs is a packed: three languages plus a subscript for probability table |
michael@0 | 107 | // Buckets have all the keys together, then all the values.Key array never |
michael@0 | 108 | // crosses a cache-line boundary, so no-match case takes exactly one cache miss. |
michael@0 | 109 | // Match case may sometimes take an additional cache miss on value access. |
michael@0 | 110 | // |
michael@0 | 111 | // Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64 |
michael@0 | 112 | // byte buckets with single cache miss. |
michael@0 | 113 | // Or 2-byte key and 6-byte value, allowing 5 languages instead of three. |
michael@0 | 114 | //------------------------------------------------------------------------------ |
michael@0 | 115 | |
michael@0 | 116 | //----------------------------------------------------------------------------// |
michael@0 | 117 | // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores // |
michael@0 | 118 | //----------------------------------------------------------------------------// |
michael@0 | 119 | |
michael@0 | 120 | //----------------------------------------------------------------------------// |
michael@0 | 121 | // Scoring single groups of letters // |
michael@0 | 122 | //----------------------------------------------------------------------------// |
michael@0 | 123 | |
michael@0 | 124 | // BIGRAM, QUADGRAM, OCTAGRAM score one => tote |
michael@0 | 125 | // Input: 4-byte entry of 3 language numbers and one probability subscript, plus |
michael@0 | 126 | // an accumulator tote. (language 0 means unused entry) |
michael@0 | 127 | // Output: running sums in tote updated |
michael@0 | 128 | void ProcessProbV2Tote(uint32 probs, Tote* tote) { |
michael@0 | 129 | uint8 prob123 = (probs >> 0) & 0xff; |
michael@0 | 130 | const uint8* prob123_entry = LgProb2TblEntry(prob123); |
michael@0 | 131 | |
michael@0 | 132 | uint8 top1 = (probs >> 8) & 0xff; |
michael@0 | 133 | if (top1 > 0) {tote->Add(top1, LgProb3(prob123_entry, 0));} |
michael@0 | 134 | uint8 top2 = (probs >> 16) & 0xff; |
michael@0 | 135 | if (top2 > 0) {tote->Add(top2, LgProb3(prob123_entry, 1));} |
michael@0 | 136 | uint8 top3 = (probs >> 24) & 0xff; |
michael@0 | 137 | if (top3 > 0) {tote->Add(top3, LgProb3(prob123_entry, 2));} |
michael@0 | 138 | } |
michael@0 | 139 | |
michael@0 | 140 | // Return score for a particular per-script language, or zero |
michael@0 | 141 | int GetLangScore(uint32 probs, uint8 pslang) { |
michael@0 | 142 | uint8 prob123 = (probs >> 0) & 0xff; |
michael@0 | 143 | const uint8* prob123_entry = LgProb2TblEntry(prob123); |
michael@0 | 144 | int retval = 0; |
michael@0 | 145 | uint8 top1 = (probs >> 8) & 0xff; |
michael@0 | 146 | if (top1 == pslang) {retval += LgProb3(prob123_entry, 0);} |
michael@0 | 147 | uint8 top2 = (probs >> 16) & 0xff; |
michael@0 | 148 | if (top2 == pslang) {retval += LgProb3(prob123_entry, 1);} |
michael@0 | 149 | uint8 top3 = (probs >> 24) & 0xff; |
michael@0 | 150 | if (top3 == pslang) {retval += LgProb3(prob123_entry, 2);} |
michael@0 | 151 | return retval; |
michael@0 | 152 | } |
michael@0 | 153 | |
michael@0 | 154 | //----------------------------------------------------------------------------// |
michael@0 | 155 | // Routines to accumulate probabilities // |
michael@0 | 156 | //----------------------------------------------------------------------------// |
michael@0 | 157 | |
michael@0 | 158 | |
michael@0 | 159 | // BIGRAM, using hash table, always advancing by 1 char |
michael@0 | 160 | // Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj |
michael@0 | 161 | // Score all bigrams in isrc, using languages that have bigrams (CJK) |
michael@0 | 162 | // Return number of bigrams that hit in the hash table |
michael@0 | 163 | int DoBigramScoreV3(const CLD2TableSummary* bigram_obj, |
michael@0 | 164 | const char* isrc, int srclen, Tote* chunk_tote) { |
michael@0 | 165 | int hit_count = 0; |
michael@0 | 166 | const char* src = isrc; |
michael@0 | 167 | |
michael@0 | 168 | // Hashtable-based CJK bigram lookup |
michael@0 | 169 | const uint8* usrc = reinterpret_cast<const uint8*>(src); |
michael@0 | 170 | const uint8* usrclimit1 = usrc + srclen - UTFmax; |
michael@0 | 171 | |
michael@0 | 172 | while (usrc < usrclimit1) { |
michael@0 | 173 | int len = kAdvanceOneChar[usrc[0]]; |
michael@0 | 174 | int len2 = kAdvanceOneChar[usrc[len]] + len; |
michael@0 | 175 | |
michael@0 | 176 | if ((kMinCJKUTF8CharBytes * 2) <= len2) { // Two CJK chars possible |
michael@0 | 177 | // Lookup and score this bigram |
michael@0 | 178 | // Always ignore pre/post spaces |
michael@0 | 179 | uint32 bihash = BiHashV2(reinterpret_cast<const char*>(usrc), len2); |
michael@0 | 180 | uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash); |
michael@0 | 181 | // Now go indirect on the subscript |
michael@0 | 182 | probs = bigram_obj->kCLDTableInd[probs & |
michael@0 | 183 | ~bigram_obj->kCLDTableKeyMask]; |
michael@0 | 184 | |
michael@0 | 185 | // Process the bigram |
michael@0 | 186 | if (probs != 0) { |
michael@0 | 187 | ProcessProbV2Tote(probs, chunk_tote); |
michael@0 | 188 | ++hit_count; |
michael@0 | 189 | } |
michael@0 | 190 | } |
michael@0 | 191 | usrc += len; // Advance by one char |
michael@0 | 192 | } |
michael@0 | 193 | |
michael@0 | 194 | return hit_count; |
michael@0 | 195 | } |
michael@0 | 196 | |
michael@0 | 197 | |
michael@0 | 198 | // Score up to 64KB of a single script span in one pass |
michael@0 | 199 | // Make a dummy entry off the end to calc length of last span |
michael@0 | 200 | // Return offset of first unused input byte |
michael@0 | 201 | int GetUniHits(const char* text, |
michael@0 | 202 | int letter_offset, int letter_limit, |
michael@0 | 203 | ScoringContext* scoringcontext, |
michael@0 | 204 | ScoringHitBuffer* hitbuffer) { |
michael@0 | 205 | const char* isrc = &text[letter_offset]; |
michael@0 | 206 | const char* src = isrc; |
michael@0 | 207 | // Limit is end, which has extra 20 20 20 00 past len |
michael@0 | 208 | const char* srclimit = &text[letter_limit]; |
michael@0 | 209 | |
michael@0 | 210 | // Local copies |
michael@0 | 211 | const UTF8PropObj* unigram_obj = |
michael@0 | 212 | scoringcontext->scoringtables->unigram_obj; |
michael@0 | 213 | int next_base = hitbuffer->next_base; |
michael@0 | 214 | int next_base_limit = hitbuffer->maxscoringhits; |
michael@0 | 215 | |
michael@0 | 216 | // Visit all unigrams |
michael@0 | 217 | if (src[0] == ' ') {++src;} // skip any initial space |
michael@0 | 218 | while (src < srclimit) { |
michael@0 | 219 | const uint8* usrc = reinterpret_cast<const uint8*>(src); |
michael@0 | 220 | int len = kAdvanceOneChar[usrc[0]]; |
michael@0 | 221 | src += len; |
michael@0 | 222 | // Look up property of one UTF-8 character and advance over it. |
michael@0 | 223 | // Updates usrc and len (bad interface design), hence increment above |
michael@0 | 224 | int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &len); |
michael@0 | 225 | if (propval > 0) { |
michael@0 | 226 | // Save indirect subscript for later scoring; 1 or 2 langprobs |
michael@0 | 227 | int indirect_subscr = propval; |
michael@0 | 228 | hitbuffer->base[next_base].offset = src - text; // Offset in text |
michael@0 | 229 | hitbuffer->base[next_base].indirect = indirect_subscr; |
michael@0 | 230 | ++next_base; |
michael@0 | 231 | } |
michael@0 | 232 | |
michael@0 | 233 | if (next_base >= next_base_limit) {break;} |
michael@0 | 234 | } |
michael@0 | 235 | |
michael@0 | 236 | hitbuffer->next_base = next_base; |
michael@0 | 237 | |
michael@0 | 238 | // Make a dummy entry off the end to calc length of last span |
michael@0 | 239 | int dummy_offset = src - text; |
michael@0 | 240 | hitbuffer->base[hitbuffer->next_base].offset = dummy_offset; |
michael@0 | 241 | hitbuffer->base[hitbuffer->next_base].indirect = 0; |
michael@0 | 242 | |
michael@0 | 243 | return src - text; |
michael@0 | 244 | } |
michael@0 | 245 | |
michael@0 | 246 | // Score up to 64KB of a single script span, doing both delta-bi and |
michael@0 | 247 | // distinct bis in one pass |
michael@0 | 248 | void GetBiHits(const char* text, |
michael@0 | 249 | int letter_offset, int letter_limit, |
michael@0 | 250 | ScoringContext* scoringcontext, |
michael@0 | 251 | ScoringHitBuffer* hitbuffer) { |
michael@0 | 252 | const char* isrc = &text[letter_offset]; |
michael@0 | 253 | const char* src = isrc; |
michael@0 | 254 | // Limit is end |
michael@0 | 255 | const char* srclimit1 = &text[letter_limit]; |
michael@0 | 256 | |
michael@0 | 257 | // Local copies |
michael@0 | 258 | const CLD2TableSummary* deltabi_obj = |
michael@0 | 259 | scoringcontext->scoringtables->deltabi_obj; |
michael@0 | 260 | const CLD2TableSummary* distinctbi_obj = |
michael@0 | 261 | scoringcontext->scoringtables->distinctbi_obj; |
michael@0 | 262 | int next_delta = hitbuffer->next_delta; |
michael@0 | 263 | int next_delta_limit = hitbuffer->maxscoringhits; |
michael@0 | 264 | int next_distinct = hitbuffer->next_distinct; |
michael@0 | 265 | // We can do 2 inserts per loop, so -1 |
michael@0 | 266 | int next_distinct_limit = hitbuffer->maxscoringhits - 1; |
michael@0 | 267 | |
michael@0 | 268 | while (src < srclimit1) { |
michael@0 | 269 | const uint8* usrc = reinterpret_cast<const uint8*>(src); |
michael@0 | 270 | int len = kAdvanceOneChar[usrc[0]]; |
michael@0 | 271 | int len2 = kAdvanceOneChar[usrc[len]] + len; |
michael@0 | 272 | |
michael@0 | 273 | if ((kMinCJKUTF8CharBytes * 2) <= len2) { // Two CJK chars possible |
michael@0 | 274 | // Lookup and this bigram and save <offset, indirect> |
michael@0 | 275 | uint32 bihash = BiHashV2(src, len2); |
michael@0 | 276 | uint32 probs = QuadHashV3Lookup4(deltabi_obj, bihash); |
michael@0 | 277 | // Now go indirect on the subscript |
michael@0 | 278 | if (probs != 0) { |
michael@0 | 279 | // Save indirect subscript for later scoring; 1 langprob |
michael@0 | 280 | int indirect_subscr = probs & ~deltabi_obj->kCLDTableKeyMask; |
michael@0 | 281 | hitbuffer->delta[next_delta].offset = src - text; |
michael@0 | 282 | hitbuffer->delta[next_delta].indirect = indirect_subscr; |
michael@0 | 283 | ++next_delta; |
michael@0 | 284 | } |
michael@0 | 285 | // Lookup this distinct bigram and save <offset, indirect> |
michael@0 | 286 | probs = QuadHashV3Lookup4(distinctbi_obj, bihash); |
michael@0 | 287 | if (probs != 0) { |
michael@0 | 288 | int indirect_subscr = probs & ~distinctbi_obj->kCLDTableKeyMask; |
michael@0 | 289 | hitbuffer->distinct[next_distinct].offset = src - text; |
michael@0 | 290 | hitbuffer->distinct[next_distinct].indirect = indirect_subscr; |
michael@0 | 291 | ++next_distinct; |
michael@0 | 292 | } |
michael@0 | 293 | } |
michael@0 | 294 | src += len; // Advance by one char (not two) |
michael@0 | 295 | |
michael@0 | 296 | // Almost always srclimit hit first |
michael@0 | 297 | if (next_delta >= next_delta_limit) {break;} |
michael@0 | 298 | if (next_distinct >= next_distinct_limit) {break;} |
michael@0 | 299 | } |
michael@0 | 300 | |
michael@0 | 301 | hitbuffer->next_delta = next_delta; |
michael@0 | 302 | hitbuffer->next_distinct = next_distinct; |
michael@0 | 303 | |
michael@0 | 304 | // Make a dummy entry off the end to calc length of last span |
michael@0 | 305 | int dummy_offset = src - text; |
michael@0 | 306 | hitbuffer->delta[hitbuffer->next_delta].offset = dummy_offset; |
michael@0 | 307 | hitbuffer->delta[hitbuffer->next_delta].indirect = 0; |
michael@0 | 308 | hitbuffer->distinct[hitbuffer->next_distinct].offset = dummy_offset; |
michael@0 | 309 | hitbuffer->distinct[hitbuffer->next_distinct].indirect = 0; |
michael@0 | 310 | } |
michael@0 | 311 | |
michael@0 | 312 | // Score up to 64KB of a single script span in one pass |
michael@0 | 313 | // Make a dummy entry off the end to calc length of last span |
michael@0 | 314 | // Return offset of first unused input byte |
michael@0 | 315 | int GetQuadHits(const char* text, |
michael@0 | 316 | int letter_offset, int letter_limit, |
michael@0 | 317 | ScoringContext* scoringcontext, |
michael@0 | 318 | ScoringHitBuffer* hitbuffer) { |
michael@0 | 319 | const char* isrc = &text[letter_offset]; |
michael@0 | 320 | const char* src = isrc; |
michael@0 | 321 | // Limit is end, which has extra 20 20 20 00 past len |
michael@0 | 322 | const char* srclimit = &text[letter_limit]; |
michael@0 | 323 | |
michael@0 | 324 | // Local copies |
michael@0 | 325 | const CLD2TableSummary* quadgram_obj = |
michael@0 | 326 | scoringcontext->scoringtables->quadgram_obj; |
michael@0 | 327 | const CLD2TableSummary* quadgram_obj2 = |
michael@0 | 328 | scoringcontext->scoringtables->quadgram_obj2; |
michael@0 | 329 | int next_base = hitbuffer->next_base; |
michael@0 | 330 | int next_base_limit = hitbuffer->maxscoringhits; |
michael@0 | 331 | |
michael@0 | 332 | // Run a little cache of last quad hits to catch overly-repetitive "text" |
michael@0 | 333 | // We don't care if we miss a couple repetitions at scriptspan boundaries |
michael@0 | 334 | int next_prior_quadhash = 0; |
michael@0 | 335 | uint32 prior_quadhash[2] = {0, 0}; |
michael@0 | 336 | |
michael@0 | 337 | // Visit all quadgrams |
michael@0 | 338 | if (src[0] == ' ') {++src;} // skip any initial space |
michael@0 | 339 | while (src < srclimit) { |
michael@0 | 340 | // Find one quadgram |
michael@0 | 341 | const char* src_end = src; |
michael@0 | 342 | src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; |
michael@0 | 343 | src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; |
michael@0 | 344 | const char* src_mid = src_end; |
michael@0 | 345 | src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; |
michael@0 | 346 | src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; |
michael@0 | 347 | int len = src_end - src; |
michael@0 | 348 | // Hash the quadgram |
michael@0 | 349 | uint32 quadhash = QuadHashV2(src, len); |
michael@0 | 350 | |
michael@0 | 351 | // Filter out recent repeats |
michael@0 | 352 | if ((quadhash != prior_quadhash[0]) && (quadhash != prior_quadhash[1])) { |
michael@0 | 353 | // Look up this quadgram and save <offset, indirect> |
michael@0 | 354 | uint32 indirect_flag = 0; // For dual tables |
michael@0 | 355 | const CLD2TableSummary* hit_obj = quadgram_obj; |
michael@0 | 356 | uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash); |
michael@0 | 357 | if ((probs == 0) && (quadgram_obj2->kCLDTableSize != 0)) { |
michael@0 | 358 | // Try lookup in dual table if not found in first one |
michael@0 | 359 | // Note: we need to know later which of two indirect tables to use. |
michael@0 | 360 | indirect_flag = 0x80000000u; |
michael@0 | 361 | hit_obj = quadgram_obj2; |
michael@0 | 362 | probs = QuadHashV3Lookup4(quadgram_obj2, quadhash); |
michael@0 | 363 | } |
michael@0 | 364 | if (probs != 0) { |
michael@0 | 365 | // Round-robin two entries of actual hits |
michael@0 | 366 | prior_quadhash[next_prior_quadhash] = quadhash; |
michael@0 | 367 | next_prior_quadhash = (next_prior_quadhash + 1) & 1; |
michael@0 | 368 | |
michael@0 | 369 | // Save indirect subscript for later scoring; 1 or 2 langprobs |
michael@0 | 370 | int indirect_subscr = probs & ~hit_obj->kCLDTableKeyMask; |
michael@0 | 371 | hitbuffer->base[next_base].offset = src - text; // Offset in text |
michael@0 | 372 | // Flip the high bit for table2 |
michael@0 | 373 | hitbuffer->base[next_base].indirect = indirect_subscr | indirect_flag; |
michael@0 | 374 | ++next_base; |
michael@0 | 375 | } |
michael@0 | 376 | } |
michael@0 | 377 | |
michael@0 | 378 | // Advance: all the way past word if at end-of-word, else 2 chars |
michael@0 | 379 | if (src_end[0] == ' ') { |
michael@0 | 380 | src = src_end; |
michael@0 | 381 | } else { |
michael@0 | 382 | src = src_mid; |
michael@0 | 383 | } |
michael@0 | 384 | |
michael@0 | 385 | // Skip over space at end of word, or ASCII vowel in middle of word |
michael@0 | 386 | // Use kAdvanceOneCharSpace instead to get rid of vowel hack |
michael@0 | 387 | if (src < srclimit) { |
michael@0 | 388 | src += kAdvanceOneCharSpaceVowel[(uint8)src[0]]; |
michael@0 | 389 | } else { |
michael@0 | 390 | // Advancing by 4/8/16 can overshoot, but we are about to exit anyway |
michael@0 | 391 | src = srclimit; |
michael@0 | 392 | } |
michael@0 | 393 | |
michael@0 | 394 | if (next_base >= next_base_limit) {break;} |
michael@0 | 395 | } |
michael@0 | 396 | |
michael@0 | 397 | hitbuffer->next_base = next_base; |
michael@0 | 398 | |
michael@0 | 399 | // Make a dummy entry off the end to calc length of last span |
michael@0 | 400 | int dummy_offset = src - text; |
michael@0 | 401 | hitbuffer->base[hitbuffer->next_base].offset = dummy_offset; |
michael@0 | 402 | hitbuffer->base[hitbuffer->next_base].indirect = 0; |
michael@0 | 403 | |
michael@0 | 404 | return src - text; |
michael@0 | 405 | } |
michael@0 | 406 | |
michael@0 | 407 | // inputs: |
michael@0 | 408 | // const tables |
michael@0 | 409 | // const char* isrc, int srclen (in sscriptbuffer) |
michael@0 | 410 | // intermediates: |
michael@0 | 411 | // vector of octa <offset, probs> (which need indirect table to decode) |
michael@0 | 412 | // vector of distinct <offset, probs> (which need indirect table to decode) |
michael@0 | 413 | |
michael@0 | 414 | // Score up to 64KB of a single script span, doing both delta-octa and |
michael@0 | 415 | // distinct words in one pass |
michael@0 | 416 | void GetOctaHits(const char* text, |
michael@0 | 417 | int letter_offset, int letter_limit, |
michael@0 | 418 | ScoringContext* scoringcontext, |
michael@0 | 419 | ScoringHitBuffer* hitbuffer) { |
michael@0 | 420 | const char* isrc = &text[letter_offset]; |
michael@0 | 421 | const char* src = isrc; |
michael@0 | 422 | // Limit is end+1, to include extra space char (0x20) off the end |
michael@0 | 423 | const char* srclimit = &text[letter_limit + 1]; |
michael@0 | 424 | |
michael@0 | 425 | // Local copies |
michael@0 | 426 | const CLD2TableSummary* deltaocta_obj = |
michael@0 | 427 | scoringcontext->scoringtables->deltaocta_obj; |
michael@0 | 428 | int next_delta = hitbuffer->next_delta; |
michael@0 | 429 | int next_delta_limit = hitbuffer->maxscoringhits; |
michael@0 | 430 | |
michael@0 | 431 | const CLD2TableSummary* distinctocta_obj = |
michael@0 | 432 | scoringcontext->scoringtables->distinctocta_obj; |
michael@0 | 433 | int next_distinct = hitbuffer->next_distinct; |
michael@0 | 434 | // We can do 2 inserts per loop, so -1 |
michael@0 | 435 | int next_distinct_limit = hitbuffer->maxscoringhits - 1; |
michael@0 | 436 | |
michael@0 | 437 | // Run a little cache of last octa hits to catch overly-repetitive "text" |
michael@0 | 438 | // We don't care if we miss a couple repetitions at scriptspan boundaries |
michael@0 | 439 | int next_prior_octahash = 0; |
michael@0 | 440 | uint64 prior_octahash[2] = {0, 0}; |
michael@0 | 441 | |
michael@0 | 442 | // Score all words truncated to 8 characters |
michael@0 | 443 | int charcount = 0; |
michael@0 | 444 | // Skip any initial space |
michael@0 | 445 | if (src[0] == ' ') {++src;} |
michael@0 | 446 | |
michael@0 | 447 | // Begin the first word |
michael@0 | 448 | const char* prior_word_start = src; |
michael@0 | 449 | const char* word_start = src; |
michael@0 | 450 | const char* word_end = word_start; |
michael@0 | 451 | while (src < srclimit) { |
michael@0 | 452 | // Terminate previous word or continue current word |
michael@0 | 453 | if (src[0] == ' ') { |
michael@0 | 454 | int len = word_end - word_start; |
michael@0 | 455 | // Hash the word |
michael@0 | 456 | uint64 wordhash40 = OctaHash40(word_start, len); |
michael@0 | 457 | uint32 probs; |
michael@0 | 458 | |
michael@0 | 459 | // Filter out recent repeats. Unlike quads, we update even if no hit, |
michael@0 | 460 | // so we can get hits on same word if separated by non-hit words |
michael@0 | 461 | if ((wordhash40 != prior_octahash[0]) && |
michael@0 | 462 | (wordhash40 != prior_octahash[1])) { |
michael@0 | 463 | // Round-robin two entries of words |
michael@0 | 464 | prior_octahash[next_prior_octahash] = wordhash40; |
michael@0 | 465 | next_prior_octahash = 1 - next_prior_octahash; // Alternates 0,1,0,1 |
michael@0 | 466 | |
michael@0 | 467 | // (1) Lookup distinct word PAIR. For a pair, we want an asymmetrical |
michael@0 | 468 | // function of the two word hashs. For words A B C, B-A and C-B are good |
michael@0 | 469 | // enough and fast. We use the same table as distinct single words |
michael@0 | 470 | // Do not look up a pair of identical words -- all pairs hash to zero |
michael@0 | 471 | // Both 1- and 2-word distinct lookups are in distinctocta_obj now |
michael@0 | 472 | // Do this first, because it has the lowest offset |
michael@0 | 473 | uint64 tmp_prior_hash = prior_octahash[next_prior_octahash]; |
michael@0 | 474 | if ((tmp_prior_hash != 0) && (tmp_prior_hash != wordhash40)) { |
michael@0 | 475 | uint64 pair_hash = PairHash(tmp_prior_hash, wordhash40); |
michael@0 | 476 | probs = OctaHashV3Lookup4(distinctocta_obj, pair_hash); |
michael@0 | 477 | if (probs != 0) { |
michael@0 | 478 | int indirect_subscr = probs & ~distinctocta_obj->kCLDTableKeyMask; |
michael@0 | 479 | hitbuffer->distinct[next_distinct].offset = prior_word_start - text; |
michael@0 | 480 | hitbuffer->distinct[next_distinct].indirect = indirect_subscr; |
michael@0 | 481 | ++next_distinct; |
michael@0 | 482 | } |
michael@0 | 483 | } |
michael@0 | 484 | |
michael@0 | 485 | // (2) Lookup this distinct word and save <offset, indirect> |
michael@0 | 486 | probs = OctaHashV3Lookup4(distinctocta_obj, wordhash40); |
michael@0 | 487 | if (probs != 0) { |
michael@0 | 488 | int indirect_subscr = probs & ~distinctocta_obj->kCLDTableKeyMask; |
michael@0 | 489 | hitbuffer->distinct[next_distinct].offset = word_start - text; |
michael@0 | 490 | hitbuffer->distinct[next_distinct].indirect = indirect_subscr; |
michael@0 | 491 | ++next_distinct; |
michael@0 | 492 | } |
michael@0 | 493 | |
michael@0 | 494 | // (3) Lookup this word and save <offset, indirect> |
michael@0 | 495 | probs = OctaHashV3Lookup4(deltaocta_obj, wordhash40); |
michael@0 | 496 | if (probs != 0) { |
michael@0 | 497 | // Save indirect subscript for later scoring; 1 langprob |
michael@0 | 498 | int indirect_subscr = probs & ~deltaocta_obj->kCLDTableKeyMask; |
michael@0 | 499 | hitbuffer->delta[next_delta].offset = word_start - text; |
michael@0 | 500 | hitbuffer->delta[next_delta].indirect = indirect_subscr; |
michael@0 | 501 | ++next_delta; |
michael@0 | 502 | } |
michael@0 | 503 | } |
michael@0 | 504 | |
michael@0 | 505 | // Begin the next word |
michael@0 | 506 | charcount = 0; |
michael@0 | 507 | prior_word_start = word_start; |
michael@0 | 508 | word_start = src + 1; // Over the space |
michael@0 | 509 | word_end = word_start; |
michael@0 | 510 | } else { |
michael@0 | 511 | ++charcount; |
michael@0 | 512 | } |
michael@0 | 513 | |
michael@0 | 514 | // Advance to next char |
michael@0 | 515 | src += UTF8OneCharLen(src); |
michael@0 | 516 | if (charcount <= 8) { |
michael@0 | 517 | word_end = src; |
michael@0 | 518 | } |
michael@0 | 519 | // Almost always srclimit hit first |
michael@0 | 520 | if (next_delta >= next_delta_limit) {break;} |
michael@0 | 521 | if (next_distinct >= next_distinct_limit) {break;} |
michael@0 | 522 | } |
michael@0 | 523 | |
michael@0 | 524 | hitbuffer->next_delta = next_delta; |
michael@0 | 525 | hitbuffer->next_distinct = next_distinct; |
michael@0 | 526 | |
michael@0 | 527 | // Make a dummy entry off the end to calc length of last span |
michael@0 | 528 | int dummy_offset = src - text; |
michael@0 | 529 | hitbuffer->delta[hitbuffer->next_delta].offset = dummy_offset; |
michael@0 | 530 | hitbuffer->delta[hitbuffer->next_delta].indirect = 0; |
michael@0 | 531 | hitbuffer->distinct[hitbuffer->next_distinct].offset = dummy_offset; |
michael@0 | 532 | hitbuffer->distinct[hitbuffer->next_distinct].indirect = 0; |
michael@0 | 533 | } |
michael@0 | 534 | |
michael@0 | 535 | |
michael@0 | 536 | //----------------------------------------------------------------------------// |
michael@0 | 537 | // Reliability calculations, for single language and between languages // |
michael@0 | 538 | //----------------------------------------------------------------------------// |
michael@0 | 539 | |
michael@0 | 540 | // Return reliablity of result 0..100 for top two scores |
michael@0 | 541 | // delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable |
michael@0 | 542 | // (on a scale where +1 is a factor of 2 ** 1.6 = 3.02) |
michael@0 | 543 | // Threshold is uni/quadgram increment count, bounded above and below. |
michael@0 | 544 | // |
michael@0 | 545 | // Requiring a factor of 3 improvement (e.g. +1 log base 3) |
michael@0 | 546 | // for each scored quadgram is too stringent, so I've backed this off to a |
michael@0 | 547 | // factor of 2 (e.g. +5/8 log base 3). |
michael@0 | 548 | // |
michael@0 | 549 | // I also somewhat lowered the Min/MaxGramCount limits above |
michael@0 | 550 | // |
michael@0 | 551 | // Added: if fewer than 8 quads/unis, max reliability is 12*n percent |
michael@0 | 552 | // |
michael@0 | 553 | int ReliabilityDelta(int value1, int value2, int gramcount) { |
michael@0 | 554 | int max_reliability_percent = 100; |
michael@0 | 555 | if (gramcount < 8) { |
michael@0 | 556 | max_reliability_percent = 12 * gramcount; |
michael@0 | 557 | } |
michael@0 | 558 | int fully_reliable_thresh = (gramcount * 5) >> 3; // see note above |
michael@0 | 559 | if (fully_reliable_thresh < kMinGramCount) { // Fully = 3..16 |
michael@0 | 560 | fully_reliable_thresh = kMinGramCount; |
michael@0 | 561 | } else if (fully_reliable_thresh > kMaxGramCount) { |
michael@0 | 562 | fully_reliable_thresh = kMaxGramCount; |
michael@0 | 563 | } |
michael@0 | 564 | |
michael@0 | 565 | int delta = value1 - value2; |
michael@0 | 566 | if (delta >= fully_reliable_thresh) {return max_reliability_percent;} |
michael@0 | 567 | if (delta <= 0) {return 0;} |
michael@0 | 568 | return minint(max_reliability_percent, |
michael@0 | 569 | (100 * delta) / fully_reliable_thresh); |
michael@0 | 570 | } |
michael@0 | 571 | |
michael@0 | 572 | // Return reliablity of result 0..100 for top score vs. expected mainsteam score |
michael@0 | 573 | // Values are score per 1024 bytes of input |
michael@0 | 574 | // ratio = max(top/mainstream, mainstream/top) |
michael@0 | 575 | // ratio > 4.0 is 0% reliable, <= 2.0 is 100% reliable |
michael@0 | 576 | // Change: short-text word scoring can give unusually good results. |
michael@0 | 577 | // Let top exceed mainstream by 4x at 50% reliable |
michael@0 | 578 | // |
michael@0 | 579 | // dsites April 2010: These could be tightened up. It would be |
michael@0 | 580 | // reasonable with newer data and round-robin table allocation to start ramping |
michael@0 | 581 | // down at mean * 1.5 and mean/1.5, while letting mean*2 and mean/2 pass, |
michael@0 | 582 | // but just barely. |
michael@0 | 583 | // |
michael@0 | 584 | // dsites March 2013: Tightened up a bit. |
michael@0 | 585 | static const double kRatio100 = 1.5; |
michael@0 | 586 | static const double kRatio0 = 4.0; |
michael@0 | 587 | int ReliabilityExpected(int actual_score_1kb, int expected_score_1kb) { |
michael@0 | 588 | if (expected_score_1kb == 0) {return 100;} // No reliability data available yet |
michael@0 | 589 | if (actual_score_1kb == 0) {return 0;} // zero score = unreliable |
michael@0 | 590 | double ratio; |
michael@0 | 591 | if (expected_score_1kb > actual_score_1kb) { |
michael@0 | 592 | ratio = (1.0 * expected_score_1kb) / actual_score_1kb; |
michael@0 | 593 | } else { |
michael@0 | 594 | ratio = (1.0 * actual_score_1kb) / expected_score_1kb; |
michael@0 | 595 | } |
michael@0 | 596 | // Ratio 1.0 .. 1.5 scores 100% |
michael@0 | 597 | // Ratio 2.0 scores 80% |
michael@0 | 598 | // Linear decline, to ratio 4.0 scores 0% |
michael@0 | 599 | if (ratio <= kRatio100) {return 100;} |
michael@0 | 600 | if (ratio > kRatio0) {return 0;} |
michael@0 | 601 | |
michael@0 | 602 | int percent_good = 100.0 * (kRatio0 - ratio) / (kRatio0 - kRatio100); |
michael@0 | 603 | return percent_good; |
michael@0 | 604 | } |
michael@0 | 605 | |
michael@0 | 606 | // Create a langprob packed value from its parts. |
michael@0 | 607 | // qprob is quantized [0..12] |
michael@0 | 608 | // We use Latn script to represent any RTypeMany language |
michael@0 | 609 | uint32 MakeLangProb(Language lang, int qprob) { |
michael@0 | 610 | uint32 pslang = PerScriptNumber(ULScript_Latin, lang); |
michael@0 | 611 | uint32 retval = (pslang << 8) | kLgProbV2TblBackmap[qprob]; |
michael@0 | 612 | return retval; |
michael@0 | 613 | } |
michael@0 | 614 | |
michael@0 | 615 | } // End namespace CLD2 |
michael@0 | 616 | |
michael@0 | 617 | |
michael@0 | 618 | |
michael@0 | 619 | |
michael@0 | 620 |