michael@0: // Copyright 2013 Google Inc. All Rights Reserved. michael@0: // michael@0: // Licensed under the Apache License, Version 2.0 (the "License"); michael@0: // you may not use this file except in compliance with the License. michael@0: // You may obtain a copy of the License at michael@0: // michael@0: // http://www.apache.org/licenses/LICENSE-2.0 michael@0: // michael@0: // Unless required by applicable law or agreed to in writing, software michael@0: // distributed under the License is distributed on an "AS IS" BASIS, michael@0: // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. michael@0: // See the License for the specific language governing permissions and michael@0: // limitations under the License. michael@0: michael@0: // michael@0: // Author: dsites@google.com (Dick Sites) michael@0: // Updated 2014.01 for dual table lookup michael@0: // michael@0: michael@0: #include "cldutil.h" michael@0: #include michael@0: michael@0: #include "cld2tablesummary.h" michael@0: #include "integral_types.h" michael@0: #include "port.h" michael@0: #include "utf8statetable.h" michael@0: michael@0: namespace CLD2 { michael@0: michael@0: // Caller supplies the right tables in scoringcontext michael@0: michael@0: // Runtime routines for hashing, looking up, and scoring michael@0: // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams. michael@0: // Unigrams and bigrams are for CJK languages only, including simplified/ michael@0: // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and michael@0: // Zhuang Han characters. Surrounding spaces are not considered. michael@0: // Quadgrams and octagrams for for non-CJK and include two bits indicating michael@0: // preceding and trailing spaces (word boundaries). michael@0: michael@0: michael@0: static const int kMinCJKUTF8CharBytes = 3; michael@0: michael@0: static const int kMinGramCount = 3; michael@0: static const int kMaxGramCount = 16; michael@0: michael@0: static const int UTFmax = 4; // Max number of bytes in a UTF-8 character michael@0: michael@0: // 1 to skip ASCII space, vowels AEIOU aeiou and UTF-8 continuation bytes 80-BF michael@0: static const uint8 kSkipSpaceVowelContinue[256] = { michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, michael@0: 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, michael@0: michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: }; michael@0: michael@0: // 1 to skip ASCII space, and UTF-8 continuation bytes 80-BF michael@0: static const uint8 kSkipSpaceContinue[256] = { michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: }; michael@0: michael@0: michael@0: // Always advances one UTF-8 character michael@0: static const uint8 kAdvanceOneChar[256] = { michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, michael@0: 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, michael@0: }; michael@0: michael@0: // Advances *only* on space (or illegal byte) michael@0: static const uint8 kAdvanceOneCharSpace[256] = { michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: }; michael@0: michael@0: michael@0: // Routines to access a hash table of pairs michael@0: // Buckets have 4-byte wordhash for sizes < 32K buckets, but only michael@0: // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as michael@0: // bucket subscript. michael@0: // Probs is a packed: three languages plus a subscript for probability table michael@0: // Buckets have all the keys together, then all the values.Key array never michael@0: // crosses a cache-line boundary, so no-match case takes exactly one cache miss. michael@0: // Match case may sometimes take an additional cache miss on value access. michael@0: // michael@0: // Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64 michael@0: // byte buckets with single cache miss. michael@0: // Or 2-byte key and 6-byte value, allowing 5 languages instead of three. michael@0: //------------------------------------------------------------------------------ michael@0: michael@0: //----------------------------------------------------------------------------// michael@0: // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores // michael@0: //----------------------------------------------------------------------------// michael@0: michael@0: //----------------------------------------------------------------------------// michael@0: // Scoring single groups of letters // michael@0: //----------------------------------------------------------------------------// michael@0: michael@0: // BIGRAM, QUADGRAM, OCTAGRAM score one => tote michael@0: // Input: 4-byte entry of 3 language numbers and one probability subscript, plus michael@0: // an accumulator tote. (language 0 means unused entry) michael@0: // Output: running sums in tote updated michael@0: void ProcessProbV2Tote(uint32 probs, Tote* tote) { michael@0: uint8 prob123 = (probs >> 0) & 0xff; michael@0: const uint8* prob123_entry = LgProb2TblEntry(prob123); michael@0: michael@0: uint8 top1 = (probs >> 8) & 0xff; michael@0: if (top1 > 0) {tote->Add(top1, LgProb3(prob123_entry, 0));} michael@0: uint8 top2 = (probs >> 16) & 0xff; michael@0: if (top2 > 0) {tote->Add(top2, LgProb3(prob123_entry, 1));} michael@0: uint8 top3 = (probs >> 24) & 0xff; michael@0: if (top3 > 0) {tote->Add(top3, LgProb3(prob123_entry, 2));} michael@0: } michael@0: michael@0: // Return score for a particular per-script language, or zero michael@0: int GetLangScore(uint32 probs, uint8 pslang) { michael@0: uint8 prob123 = (probs >> 0) & 0xff; michael@0: const uint8* prob123_entry = LgProb2TblEntry(prob123); michael@0: int retval = 0; michael@0: uint8 top1 = (probs >> 8) & 0xff; michael@0: if (top1 == pslang) {retval += LgProb3(prob123_entry, 0);} michael@0: uint8 top2 = (probs >> 16) & 0xff; michael@0: if (top2 == pslang) {retval += LgProb3(prob123_entry, 1);} michael@0: uint8 top3 = (probs >> 24) & 0xff; michael@0: if (top3 == pslang) {retval += LgProb3(prob123_entry, 2);} michael@0: return retval; michael@0: } michael@0: michael@0: //----------------------------------------------------------------------------// michael@0: // Routines to accumulate probabilities // michael@0: //----------------------------------------------------------------------------// michael@0: michael@0: michael@0: // BIGRAM, using hash table, always advancing by 1 char michael@0: // Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj michael@0: // Score all bigrams in isrc, using languages that have bigrams (CJK) michael@0: // Return number of bigrams that hit in the hash table michael@0: int DoBigramScoreV3(const CLD2TableSummary* bigram_obj, michael@0: const char* isrc, int srclen, Tote* chunk_tote) { michael@0: int hit_count = 0; michael@0: const char* src = isrc; michael@0: michael@0: // Hashtable-based CJK bigram lookup michael@0: const uint8* usrc = reinterpret_cast(src); michael@0: const uint8* usrclimit1 = usrc + srclen - UTFmax; michael@0: michael@0: while (usrc < usrclimit1) { michael@0: int len = kAdvanceOneChar[usrc[0]]; michael@0: int len2 = kAdvanceOneChar[usrc[len]] + len; michael@0: michael@0: if ((kMinCJKUTF8CharBytes * 2) <= len2) { // Two CJK chars possible michael@0: // Lookup and score this bigram michael@0: // Always ignore pre/post spaces michael@0: uint32 bihash = BiHashV2(reinterpret_cast(usrc), len2); michael@0: uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash); michael@0: // Now go indirect on the subscript michael@0: probs = bigram_obj->kCLDTableInd[probs & michael@0: ~bigram_obj->kCLDTableKeyMask]; michael@0: michael@0: // Process the bigram michael@0: if (probs != 0) { michael@0: ProcessProbV2Tote(probs, chunk_tote); michael@0: ++hit_count; michael@0: } michael@0: } michael@0: usrc += len; // Advance by one char michael@0: } michael@0: michael@0: return hit_count; michael@0: } michael@0: michael@0: michael@0: // Score up to 64KB of a single script span in one pass michael@0: // Make a dummy entry off the end to calc length of last span michael@0: // Return offset of first unused input byte michael@0: int GetUniHits(const char* text, michael@0: int letter_offset, int letter_limit, michael@0: ScoringContext* scoringcontext, michael@0: ScoringHitBuffer* hitbuffer) { michael@0: const char* isrc = &text[letter_offset]; michael@0: const char* src = isrc; michael@0: // Limit is end, which has extra 20 20 20 00 past len michael@0: const char* srclimit = &text[letter_limit]; michael@0: michael@0: // Local copies michael@0: const UTF8PropObj* unigram_obj = michael@0: scoringcontext->scoringtables->unigram_obj; michael@0: int next_base = hitbuffer->next_base; michael@0: int next_base_limit = hitbuffer->maxscoringhits; michael@0: michael@0: // Visit all unigrams michael@0: if (src[0] == ' ') {++src;} // skip any initial space michael@0: while (src < srclimit) { michael@0: const uint8* usrc = reinterpret_cast(src); michael@0: int len = kAdvanceOneChar[usrc[0]]; michael@0: src += len; michael@0: // Look up property of one UTF-8 character and advance over it. michael@0: // Updates usrc and len (bad interface design), hence increment above michael@0: int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &len); michael@0: if (propval > 0) { michael@0: // Save indirect subscript for later scoring; 1 or 2 langprobs michael@0: int indirect_subscr = propval; michael@0: hitbuffer->base[next_base].offset = src - text; // Offset in text michael@0: hitbuffer->base[next_base].indirect = indirect_subscr; michael@0: ++next_base; michael@0: } michael@0: michael@0: if (next_base >= next_base_limit) {break;} michael@0: } michael@0: michael@0: hitbuffer->next_base = next_base; michael@0: michael@0: // Make a dummy entry off the end to calc length of last span michael@0: int dummy_offset = src - text; michael@0: hitbuffer->base[hitbuffer->next_base].offset = dummy_offset; michael@0: hitbuffer->base[hitbuffer->next_base].indirect = 0; michael@0: michael@0: return src - text; michael@0: } michael@0: michael@0: // Score up to 64KB of a single script span, doing both delta-bi and michael@0: // distinct bis in one pass michael@0: void GetBiHits(const char* text, michael@0: int letter_offset, int letter_limit, michael@0: ScoringContext* scoringcontext, michael@0: ScoringHitBuffer* hitbuffer) { michael@0: const char* isrc = &text[letter_offset]; michael@0: const char* src = isrc; michael@0: // Limit is end michael@0: const char* srclimit1 = &text[letter_limit]; michael@0: michael@0: // Local copies michael@0: const CLD2TableSummary* deltabi_obj = michael@0: scoringcontext->scoringtables->deltabi_obj; michael@0: const CLD2TableSummary* distinctbi_obj = michael@0: scoringcontext->scoringtables->distinctbi_obj; michael@0: int next_delta = hitbuffer->next_delta; michael@0: int next_delta_limit = hitbuffer->maxscoringhits; michael@0: int next_distinct = hitbuffer->next_distinct; michael@0: // We can do 2 inserts per loop, so -1 michael@0: int next_distinct_limit = hitbuffer->maxscoringhits - 1; michael@0: michael@0: while (src < srclimit1) { michael@0: const uint8* usrc = reinterpret_cast(src); michael@0: int len = kAdvanceOneChar[usrc[0]]; michael@0: int len2 = kAdvanceOneChar[usrc[len]] + len; michael@0: michael@0: if ((kMinCJKUTF8CharBytes * 2) <= len2) { // Two CJK chars possible michael@0: // Lookup and this bigram and save michael@0: uint32 bihash = BiHashV2(src, len2); michael@0: uint32 probs = QuadHashV3Lookup4(deltabi_obj, bihash); michael@0: // Now go indirect on the subscript michael@0: if (probs != 0) { michael@0: // Save indirect subscript for later scoring; 1 langprob michael@0: int indirect_subscr = probs & ~deltabi_obj->kCLDTableKeyMask; michael@0: hitbuffer->delta[next_delta].offset = src - text; michael@0: hitbuffer->delta[next_delta].indirect = indirect_subscr; michael@0: ++next_delta; michael@0: } michael@0: // Lookup this distinct bigram and save michael@0: probs = QuadHashV3Lookup4(distinctbi_obj, bihash); michael@0: if (probs != 0) { michael@0: int indirect_subscr = probs & ~distinctbi_obj->kCLDTableKeyMask; michael@0: hitbuffer->distinct[next_distinct].offset = src - text; michael@0: hitbuffer->distinct[next_distinct].indirect = indirect_subscr; michael@0: ++next_distinct; michael@0: } michael@0: } michael@0: src += len; // Advance by one char (not two) michael@0: michael@0: // Almost always srclimit hit first michael@0: if (next_delta >= next_delta_limit) {break;} michael@0: if (next_distinct >= next_distinct_limit) {break;} michael@0: } michael@0: michael@0: hitbuffer->next_delta = next_delta; michael@0: hitbuffer->next_distinct = next_distinct; michael@0: michael@0: // Make a dummy entry off the end to calc length of last span michael@0: int dummy_offset = src - text; michael@0: hitbuffer->delta[hitbuffer->next_delta].offset = dummy_offset; michael@0: hitbuffer->delta[hitbuffer->next_delta].indirect = 0; michael@0: hitbuffer->distinct[hitbuffer->next_distinct].offset = dummy_offset; michael@0: hitbuffer->distinct[hitbuffer->next_distinct].indirect = 0; michael@0: } michael@0: michael@0: // Score up to 64KB of a single script span in one pass michael@0: // Make a dummy entry off the end to calc length of last span michael@0: // Return offset of first unused input byte michael@0: int GetQuadHits(const char* text, michael@0: int letter_offset, int letter_limit, michael@0: ScoringContext* scoringcontext, michael@0: ScoringHitBuffer* hitbuffer) { michael@0: const char* isrc = &text[letter_offset]; michael@0: const char* src = isrc; michael@0: // Limit is end, which has extra 20 20 20 00 past len michael@0: const char* srclimit = &text[letter_limit]; michael@0: michael@0: // Local copies michael@0: const CLD2TableSummary* quadgram_obj = michael@0: scoringcontext->scoringtables->quadgram_obj; michael@0: const CLD2TableSummary* quadgram_obj2 = michael@0: scoringcontext->scoringtables->quadgram_obj2; michael@0: int next_base = hitbuffer->next_base; michael@0: int next_base_limit = hitbuffer->maxscoringhits; michael@0: michael@0: // Run a little cache of last quad hits to catch overly-repetitive "text" michael@0: // We don't care if we miss a couple repetitions at scriptspan boundaries michael@0: int next_prior_quadhash = 0; michael@0: uint32 prior_quadhash[2] = {0, 0}; michael@0: michael@0: // Visit all quadgrams michael@0: if (src[0] == ' ') {++src;} // skip any initial space michael@0: while (src < srclimit) { michael@0: // Find one quadgram michael@0: const char* src_end = src; michael@0: src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; michael@0: src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; michael@0: const char* src_mid = src_end; michael@0: src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; michael@0: src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; michael@0: int len = src_end - src; michael@0: // Hash the quadgram michael@0: uint32 quadhash = QuadHashV2(src, len); michael@0: michael@0: // Filter out recent repeats michael@0: if ((quadhash != prior_quadhash[0]) && (quadhash != prior_quadhash[1])) { michael@0: // Look up this quadgram and save michael@0: uint32 indirect_flag = 0; // For dual tables michael@0: const CLD2TableSummary* hit_obj = quadgram_obj; michael@0: uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash); michael@0: if ((probs == 0) && (quadgram_obj2->kCLDTableSize != 0)) { michael@0: // Try lookup in dual table if not found in first one michael@0: // Note: we need to know later which of two indirect tables to use. michael@0: indirect_flag = 0x80000000u; michael@0: hit_obj = quadgram_obj2; michael@0: probs = QuadHashV3Lookup4(quadgram_obj2, quadhash); michael@0: } michael@0: if (probs != 0) { michael@0: // Round-robin two entries of actual hits michael@0: prior_quadhash[next_prior_quadhash] = quadhash; michael@0: next_prior_quadhash = (next_prior_quadhash + 1) & 1; michael@0: michael@0: // Save indirect subscript for later scoring; 1 or 2 langprobs michael@0: int indirect_subscr = probs & ~hit_obj->kCLDTableKeyMask; michael@0: hitbuffer->base[next_base].offset = src - text; // Offset in text michael@0: // Flip the high bit for table2 michael@0: hitbuffer->base[next_base].indirect = indirect_subscr | indirect_flag; michael@0: ++next_base; michael@0: } michael@0: } michael@0: michael@0: // Advance: all the way past word if at end-of-word, else 2 chars michael@0: if (src_end[0] == ' ') { michael@0: src = src_end; michael@0: } else { michael@0: src = src_mid; michael@0: } michael@0: michael@0: // Skip over space at end of word, or ASCII vowel in middle of word michael@0: // Use kAdvanceOneCharSpace instead to get rid of vowel hack michael@0: if (src < srclimit) { michael@0: src += kAdvanceOneCharSpaceVowel[(uint8)src[0]]; michael@0: } else { michael@0: // Advancing by 4/8/16 can overshoot, but we are about to exit anyway michael@0: src = srclimit; michael@0: } michael@0: michael@0: if (next_base >= next_base_limit) {break;} michael@0: } michael@0: michael@0: hitbuffer->next_base = next_base; michael@0: michael@0: // Make a dummy entry off the end to calc length of last span michael@0: int dummy_offset = src - text; michael@0: hitbuffer->base[hitbuffer->next_base].offset = dummy_offset; michael@0: hitbuffer->base[hitbuffer->next_base].indirect = 0; michael@0: michael@0: return src - text; michael@0: } michael@0: michael@0: // inputs: michael@0: // const tables michael@0: // const char* isrc, int srclen (in sscriptbuffer) michael@0: // intermediates: michael@0: // vector of octa (which need indirect table to decode) michael@0: // vector of distinct (which need indirect table to decode) michael@0: michael@0: // Score up to 64KB of a single script span, doing both delta-octa and michael@0: // distinct words in one pass michael@0: void GetOctaHits(const char* text, michael@0: int letter_offset, int letter_limit, michael@0: ScoringContext* scoringcontext, michael@0: ScoringHitBuffer* hitbuffer) { michael@0: const char* isrc = &text[letter_offset]; michael@0: const char* src = isrc; michael@0: // Limit is end+1, to include extra space char (0x20) off the end michael@0: const char* srclimit = &text[letter_limit + 1]; michael@0: michael@0: // Local copies michael@0: const CLD2TableSummary* deltaocta_obj = michael@0: scoringcontext->scoringtables->deltaocta_obj; michael@0: int next_delta = hitbuffer->next_delta; michael@0: int next_delta_limit = hitbuffer->maxscoringhits; michael@0: michael@0: const CLD2TableSummary* distinctocta_obj = michael@0: scoringcontext->scoringtables->distinctocta_obj; michael@0: int next_distinct = hitbuffer->next_distinct; michael@0: // We can do 2 inserts per loop, so -1 michael@0: int next_distinct_limit = hitbuffer->maxscoringhits - 1; michael@0: michael@0: // Run a little cache of last octa hits to catch overly-repetitive "text" michael@0: // We don't care if we miss a couple repetitions at scriptspan boundaries michael@0: int next_prior_octahash = 0; michael@0: uint64 prior_octahash[2] = {0, 0}; michael@0: michael@0: // Score all words truncated to 8 characters michael@0: int charcount = 0; michael@0: // Skip any initial space michael@0: if (src[0] == ' ') {++src;} michael@0: michael@0: // Begin the first word michael@0: const char* prior_word_start = src; michael@0: const char* word_start = src; michael@0: const char* word_end = word_start; michael@0: while (src < srclimit) { michael@0: // Terminate previous word or continue current word michael@0: if (src[0] == ' ') { michael@0: int len = word_end - word_start; michael@0: // Hash the word michael@0: uint64 wordhash40 = OctaHash40(word_start, len); michael@0: uint32 probs; michael@0: michael@0: // Filter out recent repeats. Unlike quads, we update even if no hit, michael@0: // so we can get hits on same word if separated by non-hit words michael@0: if ((wordhash40 != prior_octahash[0]) && michael@0: (wordhash40 != prior_octahash[1])) { michael@0: // Round-robin two entries of words michael@0: prior_octahash[next_prior_octahash] = wordhash40; michael@0: next_prior_octahash = 1 - next_prior_octahash; // Alternates 0,1,0,1 michael@0: michael@0: // (1) Lookup distinct word PAIR. For a pair, we want an asymmetrical michael@0: // function of the two word hashs. For words A B C, B-A and C-B are good michael@0: // enough and fast. We use the same table as distinct single words michael@0: // Do not look up a pair of identical words -- all pairs hash to zero michael@0: // Both 1- and 2-word distinct lookups are in distinctocta_obj now michael@0: // Do this first, because it has the lowest offset michael@0: uint64 tmp_prior_hash = prior_octahash[next_prior_octahash]; michael@0: if ((tmp_prior_hash != 0) && (tmp_prior_hash != wordhash40)) { michael@0: uint64 pair_hash = PairHash(tmp_prior_hash, wordhash40); michael@0: probs = OctaHashV3Lookup4(distinctocta_obj, pair_hash); michael@0: if (probs != 0) { michael@0: int indirect_subscr = probs & ~distinctocta_obj->kCLDTableKeyMask; michael@0: hitbuffer->distinct[next_distinct].offset = prior_word_start - text; michael@0: hitbuffer->distinct[next_distinct].indirect = indirect_subscr; michael@0: ++next_distinct; michael@0: } michael@0: } michael@0: michael@0: // (2) Lookup this distinct word and save michael@0: probs = OctaHashV3Lookup4(distinctocta_obj, wordhash40); michael@0: if (probs != 0) { michael@0: int indirect_subscr = probs & ~distinctocta_obj->kCLDTableKeyMask; michael@0: hitbuffer->distinct[next_distinct].offset = word_start - text; michael@0: hitbuffer->distinct[next_distinct].indirect = indirect_subscr; michael@0: ++next_distinct; michael@0: } michael@0: michael@0: // (3) Lookup this word and save michael@0: probs = OctaHashV3Lookup4(deltaocta_obj, wordhash40); michael@0: if (probs != 0) { michael@0: // Save indirect subscript for later scoring; 1 langprob michael@0: int indirect_subscr = probs & ~deltaocta_obj->kCLDTableKeyMask; michael@0: hitbuffer->delta[next_delta].offset = word_start - text; michael@0: hitbuffer->delta[next_delta].indirect = indirect_subscr; michael@0: ++next_delta; michael@0: } michael@0: } michael@0: michael@0: // Begin the next word michael@0: charcount = 0; michael@0: prior_word_start = word_start; michael@0: word_start = src + 1; // Over the space michael@0: word_end = word_start; michael@0: } else { michael@0: ++charcount; michael@0: } michael@0: michael@0: // Advance to next char michael@0: src += UTF8OneCharLen(src); michael@0: if (charcount <= 8) { michael@0: word_end = src; michael@0: } michael@0: // Almost always srclimit hit first michael@0: if (next_delta >= next_delta_limit) {break;} michael@0: if (next_distinct >= next_distinct_limit) {break;} michael@0: } michael@0: michael@0: hitbuffer->next_delta = next_delta; michael@0: hitbuffer->next_distinct = next_distinct; michael@0: michael@0: // Make a dummy entry off the end to calc length of last span michael@0: int dummy_offset = src - text; michael@0: hitbuffer->delta[hitbuffer->next_delta].offset = dummy_offset; michael@0: hitbuffer->delta[hitbuffer->next_delta].indirect = 0; michael@0: hitbuffer->distinct[hitbuffer->next_distinct].offset = dummy_offset; michael@0: hitbuffer->distinct[hitbuffer->next_distinct].indirect = 0; michael@0: } michael@0: michael@0: michael@0: //----------------------------------------------------------------------------// michael@0: // Reliability calculations, for single language and between languages // michael@0: //----------------------------------------------------------------------------// michael@0: michael@0: // Return reliablity of result 0..100 for top two scores michael@0: // delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable michael@0: // (on a scale where +1 is a factor of 2 ** 1.6 = 3.02) michael@0: // Threshold is uni/quadgram increment count, bounded above and below. michael@0: // michael@0: // Requiring a factor of 3 improvement (e.g. +1 log base 3) michael@0: // for each scored quadgram is too stringent, so I've backed this off to a michael@0: // factor of 2 (e.g. +5/8 log base 3). michael@0: // michael@0: // I also somewhat lowered the Min/MaxGramCount limits above michael@0: // michael@0: // Added: if fewer than 8 quads/unis, max reliability is 12*n percent michael@0: // michael@0: int ReliabilityDelta(int value1, int value2, int gramcount) { michael@0: int max_reliability_percent = 100; michael@0: if (gramcount < 8) { michael@0: max_reliability_percent = 12 * gramcount; michael@0: } michael@0: int fully_reliable_thresh = (gramcount * 5) >> 3; // see note above michael@0: if (fully_reliable_thresh < kMinGramCount) { // Fully = 3..16 michael@0: fully_reliable_thresh = kMinGramCount; michael@0: } else if (fully_reliable_thresh > kMaxGramCount) { michael@0: fully_reliable_thresh = kMaxGramCount; michael@0: } michael@0: michael@0: int delta = value1 - value2; michael@0: if (delta >= fully_reliable_thresh) {return max_reliability_percent;} michael@0: if (delta <= 0) {return 0;} michael@0: return minint(max_reliability_percent, michael@0: (100 * delta) / fully_reliable_thresh); michael@0: } michael@0: michael@0: // Return reliablity of result 0..100 for top score vs. expected mainsteam score michael@0: // Values are score per 1024 bytes of input michael@0: // ratio = max(top/mainstream, mainstream/top) michael@0: // ratio > 4.0 is 0% reliable, <= 2.0 is 100% reliable michael@0: // Change: short-text word scoring can give unusually good results. michael@0: // Let top exceed mainstream by 4x at 50% reliable michael@0: // michael@0: // dsites April 2010: These could be tightened up. It would be michael@0: // reasonable with newer data and round-robin table allocation to start ramping michael@0: // down at mean * 1.5 and mean/1.5, while letting mean*2 and mean/2 pass, michael@0: // but just barely. michael@0: // michael@0: // dsites March 2013: Tightened up a bit. michael@0: static const double kRatio100 = 1.5; michael@0: static const double kRatio0 = 4.0; michael@0: int ReliabilityExpected(int actual_score_1kb, int expected_score_1kb) { michael@0: if (expected_score_1kb == 0) {return 100;} // No reliability data available yet michael@0: if (actual_score_1kb == 0) {return 0;} // zero score = unreliable michael@0: double ratio; michael@0: if (expected_score_1kb > actual_score_1kb) { michael@0: ratio = (1.0 * expected_score_1kb) / actual_score_1kb; michael@0: } else { michael@0: ratio = (1.0 * actual_score_1kb) / expected_score_1kb; michael@0: } michael@0: // Ratio 1.0 .. 1.5 scores 100% michael@0: // Ratio 2.0 scores 80% michael@0: // Linear decline, to ratio 4.0 scores 0% michael@0: if (ratio <= kRatio100) {return 100;} michael@0: if (ratio > kRatio0) {return 0;} michael@0: michael@0: int percent_good = 100.0 * (kRatio0 - ratio) / (kRatio0 - kRatio100); michael@0: return percent_good; michael@0: } michael@0: michael@0: // Create a langprob packed value from its parts. michael@0: // qprob is quantized [0..12] michael@0: // We use Latn script to represent any RTypeMany language michael@0: uint32 MakeLangProb(Language lang, int qprob) { michael@0: uint32 pslang = PerScriptNumber(ULScript_Latin, lang); michael@0: uint32 retval = (pslang << 8) | kLgProbV2TblBackmap[qprob]; michael@0: return retval; michael@0: } michael@0: michael@0: } // End namespace CLD2 michael@0: michael@0: michael@0: michael@0: michael@0: