michael@0: // Copyright 2013 Google Inc. All Rights Reserved. michael@0: // michael@0: // Licensed under the Apache License, Version 2.0 (the "License"); michael@0: // you may not use this file except in compliance with the License. michael@0: // You may obtain a copy of the License at michael@0: // michael@0: // http://www.apache.org/licenses/LICENSE-2.0 michael@0: // michael@0: // Unless required by applicable law or agreed to in writing, software michael@0: // distributed under the License is distributed on an "AS IS" BASIS, michael@0: // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. michael@0: // See the License for the specific language governing permissions and michael@0: // limitations under the License. michael@0: michael@0: // michael@0: // Author: dsites@google.com (Dick Sites) michael@0: // michael@0: michael@0: #include "cldutil_shared.h" michael@0: #include michael@0: michael@0: #include "cld2tablesummary.h" michael@0: #include "integral_types.h" michael@0: #include "port.h" michael@0: #include "utf8statetable.h" michael@0: michael@0: namespace CLD2 { michael@0: michael@0: // Runtime routines for hashing, looking up, and scoring michael@0: // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams. michael@0: // Unigrams and bigrams are for CJK languages only, including simplified/ michael@0: // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and michael@0: // Zhuang Han characters. Surrounding spaces are not considered. michael@0: // Quadgrams and octagrams for for non-CJK and include two bits indicating michael@0: // preceding and trailing spaces (word boundaries). michael@0: michael@0: michael@0: // Indicator bits for leading/trailing space around quad/octagram michael@0: // NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of michael@0: // 1-, 2-, or 3-bytes each. michael@0: static const uint32 kPreSpaceIndicator = 0x00004444; michael@0: static const uint32 kPostSpaceIndicator = 0x44440000; michael@0: michael@0: // Little-endian masks for 0..24 bytes picked up as uint32's michael@0: static const uint32 kWordMask0[4] = { michael@0: 0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF michael@0: }; michael@0: michael@0: static const int kMinCJKUTF8CharBytes = 3; michael@0: michael@0: static const int kMinGramCount = 3; michael@0: static const int kMaxGramCount = 16; michael@0: michael@0: static const int UTFmax = 4; // Max number of bytes in a UTF-8 character michael@0: michael@0: michael@0: // Routines to access a hash table of pairs michael@0: // Buckets have 4-byte wordhash for sizes < 32K buckets, but only michael@0: // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as michael@0: // bucket subscript. michael@0: // Probs is a packed: three languages plus a subscript for probability table michael@0: // Buckets have all the keys together, then all the values.Key array never michael@0: // crosses a cache-line boundary, so no-match case takes exactly one cache miss. michael@0: // Match case may sometimes take an additional cache miss on value access. michael@0: // michael@0: // Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64 michael@0: // byte buckets with single cache miss. michael@0: // Or 2-byte key and 6-byte value, allowing 5 languages instead of three. michael@0: michael@0: michael@0: //----------------------------------------------------------------------------// michael@0: // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores // michael@0: //----------------------------------------------------------------------------// michael@0: michael@0: // Design principles for these hash functions michael@0: // - Few operations michael@0: // - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in michael@0: // Latin script expect 1- and 2-byte mixtures. michael@0: // - Last byte of each character has about 5 bits of information michael@0: // - Spread good bits around so they can interact in at least two ways michael@0: // with other characters michael@0: // - Use add for additional mixing thorugh carries michael@0: michael@0: // CJK Three-byte bigram michael@0: // ....dddd..cccccc..bbbbbb....aaaa michael@0: // ..................ffffff..eeeeee michael@0: // make michael@0: // ....dddd..cccccc..bbbbbb....aaaa michael@0: // 000....dddd..cccccc..bbbbbb....a michael@0: // ..................ffffff..eeeeee michael@0: // ffffff..eeeeee000000000000000000 michael@0: // michael@0: // CJK Four-byte bigram michael@0: // ..dddddd..cccccc....bbbb....aaaa michael@0: // ..hhhhhh..gggggg....ffff....eeee michael@0: // make michael@0: // ..dddddd..cccccc....bbbb....aaaa michael@0: // 000..dddddd..cccccc....bbbb....a michael@0: // ..hhhhhh..gggggg....ffff....eeee michael@0: // ..ffff....eeee000000000000000000 michael@0: michael@0: // BIGRAM michael@0: // Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post michael@0: // OVERSHOOTS up to 3 bytes michael@0: // For runtime use of tables michael@0: // Does X86 unaligned loads michael@0: uint32 BiHashV2(const char* word_ptr, int bytecount) { michael@0: if (bytecount == 0) {return 0;} michael@0: const uint32* word_ptr32 = reinterpret_cast(word_ptr); michael@0: uint32 word0, word1; michael@0: if (bytecount <= 4) { michael@0: word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3]; michael@0: word0 = word0 ^ (word0 >> 3); michael@0: return word0; michael@0: } michael@0: // Else do 8 bytes michael@0: word0 = UNALIGNED_LOAD32(word_ptr32); michael@0: word0 = word0 ^ (word0 >> 3); michael@0: word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3]; michael@0: word1 = word1 ^ (word1 << 18); michael@0: return word0 + word1; michael@0: } michael@0: michael@0: // michael@0: // Ascii-7 One-byte chars michael@0: // ...ddddd...ccccc...bbbbb...aaaaa michael@0: // make michael@0: // ...ddddd...ccccc...bbbbb...aaaaa michael@0: // 000...ddddd...ccccc...bbbbb...aa michael@0: // michael@0: // Latin 1- and 2-byte chars michael@0: // ...ddddd...ccccc...bbbbb...aaaaa michael@0: // ...................fffff...eeeee michael@0: // make michael@0: // ...ddddd...ccccc...bbbbb...aaaaa michael@0: // 000...ddddd...ccccc...bbbbb...aa michael@0: // ...................fffff...eeeee michael@0: // ...............fffff...eeeee0000 michael@0: // michael@0: // Non-CJK Two-byte chars michael@0: // ...ddddd...........bbbbb........ michael@0: // ...hhhhh...........fffff........ michael@0: // make michael@0: // ...ddddd...........bbbbb........ michael@0: // 000...ddddd...........bbbbb..... michael@0: // ...hhhhh...........fffff........ michael@0: // hhhh...........fffff........0000 michael@0: // michael@0: // Non-CJK Three-byte chars michael@0: // ...........ccccc................ michael@0: // ...................fffff........ michael@0: // ...lllll...................iiiii michael@0: // make michael@0: // ...........ccccc................ michael@0: // 000...........ccccc............. michael@0: // ...................fffff........ michael@0: // ...............fffff........0000 michael@0: // ...lllll...................iiiii michael@0: // .lllll...................iiiii00 michael@0: // michael@0: michael@0: // QUADGRAM michael@0: // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add michael@0: // OVERSHOOTS up to 3 bytes michael@0: // For runtime use of tables michael@0: // Does X86 unaligned loads michael@0: uint32 QuadHashV2Mix(const char* word_ptr, int bytecount, uint32 prepost) { michael@0: const uint32* word_ptr32 = reinterpret_cast(word_ptr); michael@0: uint32 word0, word1, word2; michael@0: if (bytecount <= 4) { michael@0: word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3]; michael@0: word0 = word0 ^ (word0 >> 3); michael@0: return word0 ^ prepost; michael@0: } else if (bytecount <= 8) { michael@0: word0 = UNALIGNED_LOAD32(word_ptr32); michael@0: word0 = word0 ^ (word0 >> 3); michael@0: word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3]; michael@0: word1 = word1 ^ (word1 << 4); michael@0: return (word0 ^ prepost) + word1; michael@0: } michael@0: // else do 12 bytes michael@0: word0 = UNALIGNED_LOAD32(word_ptr32); michael@0: word0 = word0 ^ (word0 >> 3); michael@0: word1 = UNALIGNED_LOAD32(word_ptr32 + 1); michael@0: word1 = word1 ^ (word1 << 4); michael@0: word2 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3]; michael@0: word2 = word2 ^ (word2 << 2); michael@0: return (word0 ^ prepost) + word1 + word2; michael@0: } michael@0: michael@0: michael@0: // QUADGRAM wrapper with surrounding spaces michael@0: // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add michael@0: // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes michael@0: // For runtime use of tables michael@0: uint32 QuadHashV2(const char* word_ptr, int bytecount) { michael@0: if (bytecount == 0) {return 0;} michael@0: uint32 prepost = 0; michael@0: if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;} michael@0: if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;} michael@0: return QuadHashV2Mix(word_ptr, bytecount, prepost); michael@0: } michael@0: michael@0: // QUADGRAM wrapper with surrounding underscores (offline use) michael@0: // Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add michael@0: // OVERSHOOTS up to 3 bytes michael@0: // For offline construction of tables michael@0: uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount) { michael@0: if (bytecount == 0) {return 0;} michael@0: const char* local_word_ptr = word_ptr; michael@0: int local_bytecount = bytecount; michael@0: uint32 prepost = 0; michael@0: if (local_word_ptr[0] == '_') { michael@0: prepost |= kPreSpaceIndicator; michael@0: ++local_word_ptr; michael@0: --local_bytecount; michael@0: } michael@0: if (local_word_ptr[local_bytecount - 1] == '_') { michael@0: prepost |= kPostSpaceIndicator; michael@0: --local_bytecount; michael@0: } michael@0: return QuadHashV2Mix(local_word_ptr, local_bytecount, prepost); michael@0: } michael@0: michael@0: michael@0: // OCTAGRAM michael@0: // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add michael@0: // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes michael@0: // michael@0: // The low 32 bits follow the pattern from above, tuned to different scripts michael@0: // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each michael@0: // For runtime use of tables V3 michael@0: // Does X86 unaligned loads michael@0: uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) { michael@0: const uint32* word_ptr32 = reinterpret_cast(word_ptr); michael@0: uint64 word0; michael@0: uint64 word1; michael@0: uint64 sum; michael@0: michael@0: if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;} michael@0: if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;} michael@0: switch ((bytecount - 1) >> 2) { michael@0: case 0: // 1..4 bytes michael@0: word0 = UNALIGNED_LOAD32(word_ptr32) & kWordMask0[bytecount & 3]; michael@0: sum = word0; michael@0: word0 = word0 ^ (word0 >> 3); michael@0: break; michael@0: case 1: // 5..8 bytes michael@0: word0 = UNALIGNED_LOAD32(word_ptr32); michael@0: sum = word0; michael@0: word0 = word0 ^ (word0 >> 3); michael@0: word1 = UNALIGNED_LOAD32(word_ptr32 + 1) & kWordMask0[bytecount & 3]; michael@0: sum += word1; michael@0: word1 = word1 ^ (word1 << 4); michael@0: word0 += word1; michael@0: break; michael@0: case 2: // 9..12 bytes michael@0: word0 = UNALIGNED_LOAD32(word_ptr32); michael@0: sum = word0; michael@0: word0 = word0 ^ (word0 >> 3); michael@0: word1 = UNALIGNED_LOAD32(word_ptr32 + 1); michael@0: sum += word1; michael@0: word1 = word1 ^ (word1 << 4); michael@0: word0 += word1; michael@0: word1 = UNALIGNED_LOAD32(word_ptr32 + 2) & kWordMask0[bytecount & 3]; michael@0: sum += word1; michael@0: word1 = word1 ^ (word1 << 2); michael@0: word0 += word1; michael@0: break; michael@0: case 3: // 13..16 bytes michael@0: word0 =UNALIGNED_LOAD32(word_ptr32); michael@0: sum = word0; michael@0: word0 = word0 ^ (word0 >> 3); michael@0: word1 = UNALIGNED_LOAD32(word_ptr32 + 1); michael@0: sum += word1; michael@0: word1 = word1 ^ (word1 << 4); michael@0: word0 += word1; michael@0: word1 = UNALIGNED_LOAD32(word_ptr32 + 2); michael@0: sum += word1; michael@0: word1 = word1 ^ (word1 << 2); michael@0: word0 += word1; michael@0: word1 = UNALIGNED_LOAD32(word_ptr32 + 3) & kWordMask0[bytecount & 3]; michael@0: sum += word1; michael@0: word1 = word1 ^ (word1 >> 8); michael@0: word0 += word1; michael@0: break; michael@0: case 4: // 17..20 bytes michael@0: word0 = UNALIGNED_LOAD32(word_ptr32); michael@0: sum = word0; michael@0: word0 = word0 ^ (word0 >> 3); michael@0: word1 = UNALIGNED_LOAD32(word_ptr32 + 1); michael@0: sum += word1; michael@0: word1 = word1 ^ (word1 << 4); michael@0: word0 += word1; michael@0: word1 = UNALIGNED_LOAD32(word_ptr32 + 2); michael@0: sum += word1; michael@0: word1 = word1 ^ (word1 << 2); michael@0: word0 += word1; michael@0: word1 = UNALIGNED_LOAD32(word_ptr32 + 3); michael@0: sum += word1; michael@0: word1 = word1 ^ (word1 >> 8); michael@0: word0 += word1; michael@0: word1 = UNALIGNED_LOAD32(word_ptr32 + 4) & kWordMask0[bytecount & 3]; michael@0: sum += word1; michael@0: word1 = word1 ^ (word1 >> 4); michael@0: word0 += word1; michael@0: break; michael@0: default: // 21..24 bytes and higher (ignores beyond 24) michael@0: word0 = UNALIGNED_LOAD32(word_ptr32); michael@0: sum = word0; michael@0: word0 = word0 ^ (word0 >> 3); michael@0: word1 = UNALIGNED_LOAD32(word_ptr32 + 1); michael@0: sum += word1; michael@0: word1 = word1 ^ (word1 << 4); michael@0: word0 += word1; michael@0: word1 = UNALIGNED_LOAD32(word_ptr32 + 2); michael@0: sum += word1; michael@0: word1 = word1 ^ (word1 << 2); michael@0: word0 += word1; michael@0: word1 = UNALIGNED_LOAD32(word_ptr32 + 3); michael@0: sum += word1; michael@0: word1 = word1 ^ (word1 >> 8); michael@0: word0 += word1; michael@0: word1 = UNALIGNED_LOAD32(word_ptr32 + 4); michael@0: sum += word1; michael@0: word1 = word1 ^ (word1 >> 4); michael@0: word0 += word1; michael@0: word1 = UNALIGNED_LOAD32(word_ptr32 + 5) & kWordMask0[bytecount & 3]; michael@0: sum += word1; michael@0: word1 = word1 ^ (word1 >> 6); michael@0: word0 += word1; michael@0: break; michael@0: } michael@0: michael@0: sum += (sum >> 17); // extra 1-bit shift for bytes 2 & 3 michael@0: sum += (sum >> 9); // extra 1-bit shift for bytes 1 & 3 michael@0: sum = (sum & 0xff) << 32; michael@0: return (word0 ^ prepost) + sum; michael@0: } michael@0: michael@0: // OCTAGRAM wrapper with surrounding spaces michael@0: // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add michael@0: // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes michael@0: // michael@0: // The low 32 bits follow the pattern from above, tuned to different scripts michael@0: // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each michael@0: // For runtime use of tables V3 michael@0: uint64 OctaHash40(const char* word_ptr, int bytecount) { michael@0: if (bytecount == 0) {return 0;} michael@0: uint64 prepost = 0; michael@0: if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;} michael@0: if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;} michael@0: return OctaHash40Mix(word_ptr, bytecount, prepost); michael@0: } michael@0: michael@0: michael@0: // OCTAGRAM wrapper with surrounding underscores (offline use) michael@0: // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add michael@0: // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes michael@0: // michael@0: // The low 32 bits follow the pattern from above, tuned to different scripts michael@0: // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each michael@0: // For offline construction of tables michael@0: uint64 OctaHash40underscore(const char* word_ptr, int bytecount) { michael@0: if (bytecount == 0) {return 0;} michael@0: const char* local_word_ptr = word_ptr; michael@0: int local_bytecount = bytecount; michael@0: uint64 prepost = 0; michael@0: if (local_word_ptr[0] == '_') { michael@0: prepost |= kPreSpaceIndicator; michael@0: ++local_word_ptr; michael@0: --local_bytecount; michael@0: } michael@0: if (local_word_ptr[local_bytecount - 1] == '_') { michael@0: prepost |= kPostSpaceIndicator; michael@0: --local_bytecount; michael@0: } michael@0: return OctaHash40Mix(local_word_ptr, local_bytecount, prepost); michael@0: } michael@0: michael@0: // Hash a consecutive pair of tokens/words A B michael@0: // Old: hash is B - A, which gives too many false hits on one-char diffs michael@0: // Now: rotate(A,13) + B michael@0: uint64 PairHash(uint64 worda_hash, uint64 wordb_hash) { michael@0: return ((worda_hash >> 13) | (worda_hash << (64 - 13))) + wordb_hash; michael@0: } michael@0: michael@0: michael@0: michael@0: michael@0: //----------------------------------------------------------------------------// michael@0: // Finding groups of 1/2/4/8 letters // michael@0: //----------------------------------------------------------------------------// michael@0: michael@0: // src points to a letter. Find the byte length of a unigram starting there. michael@0: int UniLen(const char* src) { michael@0: const char* src_end = src; michael@0: src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; michael@0: return src_end - src; michael@0: } michael@0: michael@0: // src points to a letter. Find the byte length of a bigram starting there. michael@0: int BiLen(const char* src) { michael@0: const char* src_end = src; michael@0: src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; michael@0: src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; michael@0: return src_end - src; michael@0: } michael@0: michael@0: // src points to a letter. Find the byte length of a quadgram starting there. michael@0: int QuadLen(const char* src) { michael@0: const char* src_end = src; michael@0: src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; michael@0: src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; michael@0: src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; michael@0: src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; michael@0: return src_end - src; michael@0: } michael@0: michael@0: // src points to a letter. Find the byte length of an octagram starting there. michael@0: int OctaLen(const char* src) { michael@0: const char* src_end = src; michael@0: int charcount = 0; michael@0: while (src_end[0] != ' ') { michael@0: src_end += UTF8OneCharLen(src); michael@0: ++charcount; michael@0: if (charcount == 8) {break;} michael@0: } michael@0: return src_end - src; michael@0: } michael@0: michael@0: } // End namespace CLD2 michael@0: michael@0: michael@0: michael@0: michael@0: