michael@0: // Copyright 2013 Google Inc. All Rights Reserved. michael@0: // michael@0: // Licensed under the Apache License, Version 2.0 (the "License"); michael@0: // you may not use this file except in compliance with the License. michael@0: // You may obtain a copy of the License at michael@0: // michael@0: // http://www.apache.org/licenses/LICENSE-2.0 michael@0: // michael@0: // Unless required by applicable law or agreed to in writing, software michael@0: // distributed under the License is distributed on an "AS IS" BASIS, michael@0: // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. michael@0: // See the License for the specific language governing permissions and michael@0: // limitations under the License. michael@0: michael@0: // michael@0: // Author: dsites@google.com (Dick Sites) michael@0: // michael@0: // Just the stuff shared between offline table builder and online detector michael@0: // michael@0: michael@0: #ifndef I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__ michael@0: #define I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__ michael@0: michael@0: #include "integral_types.h" michael@0: #include "cld2tablesummary.h" michael@0: michael@0: namespace CLD2 { michael@0: michael@0: // Runtime routines for hashing, looking up, and scoring michael@0: // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams. michael@0: // Unigrams and bigrams are for CJK languages only, including simplified/ michael@0: // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and michael@0: // Zhuang Han characters. Surrounding spaces are not considered. michael@0: // Quadgrams and octagrams are for non-CJK and include two bits indicating michael@0: // preceding and trailing spaces (word boundaries). 
michael@0: michael@0: michael@0: //----------------------------------------------------------------------------// michael@0: // Main quantized probability table // michael@0: //----------------------------------------------------------------------------// michael@0: michael@0: // Table has 240 eight-byte entries. Each entry has a five-byte array and michael@0: // a three-byte array of log base 2 probabilities in the range 1..12. michael@0: // The intended use is to express five or three probabilities in a single-byte michael@0: // subscript, then decode via this table. These probabilities are michael@0: // intended to go with an array of five or three language numbers. michael@0: // michael@0: // The corresponding language numbers will have to be sorted by descending michael@0: // probability, then the actual probability subscript chosen to match the michael@0: // closest available entry in this table. michael@0: // michael@0: // Pattern of probability values: michael@0: // hi 3/4 1/2 1/4 lo hi mid lo michael@0: // where "3/4" is (hi*3+lo)/4, "1/2" is (hi+lo)/2, and "1/4" is (hi+lo*3)/4 michael@0: // and mid is one of 3/4 1/2 or 1/4. michael@0: // There are three groups of 78 (=12*13/2) entries, with hi running 1..12 and michael@0: // lo running 1..hi. Only the first group is used for five-entry lookups. michael@0: // The mid value in the first group is 1/2, the second group 3/4, and the michael@0: // third group 1/4. For three-entry lookups, this allows the mid entry to be michael@0: // somewhat higher or lower than the midpoint, to allow a better match to the michael@0: // original probabilities. 
michael@0: static const int kLgProbV2TblSize = 240; michael@0: static const uint8 kLgProbV2Tbl[kLgProbV2TblSize * 8] = { michael@0: 1,1,1,1,1, 1,1,1, // [0] michael@0: 2,2,2,1,1, 2,2,1, // [1] michael@0: 2,2,2,2,2, 2,2,2, michael@0: 3,3,2,2,1, 3,2,1, // [3] michael@0: 3,3,3,2,2, 3,3,2, michael@0: 3,3,3,3,3, 3,3,3, michael@0: 4,3,3,2,1, 4,3,1, // [6] michael@0: 4,4,3,3,2, 4,3,2, michael@0: 4,4,4,3,3, 4,4,3, michael@0: 4,4,4,4,4, 4,4,4, michael@0: 5,4,3,2,1, 5,3,1, // [10] michael@0: 5,4,4,3,2, 5,4,2, michael@0: 5,5,4,4,3, 5,4,3, michael@0: 5,5,5,4,4, 5,5,4, michael@0: 5,5,5,5,5, 5,5,5, michael@0: 6,5,4,2,1, 6,4,1, // [15] michael@0: 6,5,4,3,2, 6,4,2, michael@0: 6,5,5,4,3, 6,5,3, michael@0: 6,6,5,5,4, 6,5,4, michael@0: 6,6,6,5,5, 6,6,5, michael@0: 6,6,6,6,6, 6,6,6, michael@0: 7,6,4,3,1, 7,4,1, // [21] michael@0: 7,6,5,3,2, 7,5,2, michael@0: 7,6,5,4,3, 7,5,3, michael@0: 7,6,6,5,4, 7,6,4, michael@0: 7,7,6,6,5, 7,6,5, michael@0: 7,7,7,6,6, 7,7,6, michael@0: 7,7,7,7,7, 7,7,7, michael@0: 8,6,5,3,1, 8,5,1, // [28] michael@0: 8,7,5,4,2, 8,5,2, michael@0: 8,7,6,4,3, 8,6,3, michael@0: 8,7,6,5,4, 8,6,4, michael@0: 8,7,7,6,5, 8,7,5, michael@0: 8,8,7,7,6, 8,7,6, michael@0: 8,8,8,7,7, 8,8,7, michael@0: 8,8,8,8,8, 8,8,8, michael@0: 9,7,5,3,1, 9,5,1, // [36] michael@0: 9,7,6,4,2, 9,6,2, michael@0: 9,8,6,5,3, 9,6,3, michael@0: 9,8,7,5,4, 9,7,4, michael@0: 9,8,7,6,5, 9,7,5, michael@0: 9,8,8,7,6, 9,8,6, michael@0: 9,9,8,8,7, 9,8,7, michael@0: 9,9,9,8,8, 9,9,8, michael@0: 9,9,9,9,9, 9,9,9, michael@0: 10,8,6,3,1, 10,6,1, // [45] michael@0: 10,8,6,4,2, 10,6,2, michael@0: 10,8,7,5,3, 10,7,3, michael@0: 10,9,7,6,4, 10,7,4, michael@0: 10,9,8,6,5, 10,8,5, michael@0: 10,9,8,7,6, 10,8,6, michael@0: 10,9,9,8,7, 10,9,7, michael@0: 10,10,9,9,8, 10,9,8, michael@0: 10,10,10,9,9, 10,10,9, michael@0: 10,10,10,10,10, 10,10,10, michael@0: 11,9,6,4,1, 11,6,1, // [55] michael@0: 11,9,7,4,2, 11,7,2, michael@0: 11,9,7,5,3, 11,7,3, michael@0: 11,9,8,6,4, 11,8,4, michael@0: 11,10,8,7,5, 11,8,5, michael@0: 
11,10,9,7,6, 11,9,6, michael@0: 11,10,9,8,7, 11,9,7, michael@0: 11,10,10,9,8, 11,10,8, michael@0: 11,11,10,10,9, 11,10,9, michael@0: 11,11,11,10,10, 11,11,10, michael@0: 11,11,11,11,11, 11,11,11, michael@0: 12,9,7,4,1, 12,7,1, // [66] michael@0: 12,10,7,5,2, 12,7,2, michael@0: 12,10,8,5,3, 12,8,3, michael@0: 12,10,8,6,4, 12,8,4, michael@0: 12,10,9,7,5, 12,9,5, michael@0: 12,11,9,8,6, 12,9,6, michael@0: 12,11,10,8,7, 12,10,7, michael@0: 12,11,10,9,8, 12,10,8, michael@0: 12,11,11,10,9, 12,11,9, michael@0: 12,12,11,11,10, 12,11,10, michael@0: 12,12,12,11,11, 12,12,11, michael@0: 12,12,12,12,12, 12,12,12, michael@0: michael@0: 1,1,1,1,1, 1,1,1, michael@0: 2,2,2,1,1, 2,2,1, michael@0: 2,2,2,2,2, 2,2,2, michael@0: 3,3,2,2,1, 3,3,1, michael@0: 3,3,3,2,2, 3,3,2, michael@0: 3,3,3,3,3, 3,3,3, michael@0: 4,3,3,2,1, 4,3,1, michael@0: 4,4,3,3,2, 4,4,2, michael@0: 4,4,4,3,3, 4,4,3, michael@0: 4,4,4,4,4, 4,4,4, michael@0: 5,4,3,2,1, 5,4,1, michael@0: 5,4,4,3,2, 5,4,2, michael@0: 5,5,4,4,3, 5,5,3, michael@0: 5,5,5,4,4, 5,5,4, michael@0: 5,5,5,5,5, 5,5,5, michael@0: 6,5,4,2,1, 6,5,1, michael@0: 6,5,4,3,2, 6,5,2, michael@0: 6,5,5,4,3, 6,5,3, michael@0: 6,6,5,5,4, 6,6,4, michael@0: 6,6,6,5,5, 6,6,5, michael@0: 6,6,6,6,6, 6,6,6, michael@0: 7,6,4,3,1, 7,6,1, michael@0: 7,6,5,3,2, 7,6,2, michael@0: 7,6,5,4,3, 7,6,3, michael@0: 7,6,6,5,4, 7,6,4, michael@0: 7,7,6,6,5, 7,7,5, michael@0: 7,7,7,6,6, 7,7,6, michael@0: 7,7,7,7,7, 7,7,7, michael@0: 8,6,5,3,1, 8,6,1, michael@0: 8,7,5,4,2, 8,7,2, michael@0: 8,7,6,4,3, 8,7,3, michael@0: 8,7,6,5,4, 8,7,4, michael@0: 8,7,7,6,5, 8,7,5, michael@0: 8,8,7,7,6, 8,8,6, michael@0: 8,8,8,7,7, 8,8,7, michael@0: 8,8,8,8,8, 8,8,8, michael@0: 9,7,5,3,1, 9,7,1, michael@0: 9,7,6,4,2, 9,7,2, michael@0: 9,8,6,5,3, 9,8,3, michael@0: 9,8,7,5,4, 9,8,4, michael@0: 9,8,7,6,5, 9,8,5, michael@0: 9,8,8,7,6, 9,8,6, michael@0: 9,9,8,8,7, 9,9,7, michael@0: 9,9,9,8,8, 9,9,8, michael@0: 9,9,9,9,9, 9,9,9, michael@0: 10,8,6,3,1, 10,8,1, michael@0: 10,8,6,4,2, 10,8,2, michael@0: 
10,8,7,5,3, 10,8,3, michael@0: 10,9,7,6,4, 10,9,4, michael@0: 10,9,8,6,5, 10,9,5, michael@0: 10,9,8,7,6, 10,9,6, michael@0: 10,9,9,8,7, 10,9,7, michael@0: 10,10,9,9,8, 10,10,8, michael@0: 10,10,10,9,9, 10,10,9, michael@0: 10,10,10,10,10, 10,10,10, michael@0: 11,9,6,4,1, 11,9,1, michael@0: 11,9,7,4,2, 11,9,2, michael@0: 11,9,7,5,3, 11,9,3, michael@0: 11,9,8,6,4, 11,9,4, michael@0: 11,10,8,7,5, 11,10,5, michael@0: 11,10,9,7,6, 11,10,6, michael@0: 11,10,9,8,7, 11,10,7, michael@0: 11,10,10,9,8, 11,10,8, michael@0: 11,11,10,10,9, 11,11,9, michael@0: 11,11,11,10,10, 11,11,10, michael@0: 11,11,11,11,11, 11,11,11, michael@0: 12,9,7,4,1, 12,9,1, michael@0: 12,10,7,5,2, 12,10,2, michael@0: 12,10,8,5,3, 12,10,3, michael@0: 12,10,8,6,4, 12,10,4, michael@0: 12,10,9,7,5, 12,10,5, michael@0: 12,11,9,8,6, 12,11,6, michael@0: 12,11,10,8,7, 12,11,7, michael@0: 12,11,10,9,8, 12,11,8, michael@0: 12,11,11,10,9, 12,11,9, michael@0: 12,12,11,11,10, 12,12,10, michael@0: 12,12,12,11,11, 12,12,11, michael@0: 12,12,12,12,12, 12,12,12, michael@0: michael@0: 1,1,1,1,1, 1,1,1, michael@0: 2,2,2,1,1, 2,1,1, michael@0: 2,2,2,2,2, 2,2,2, michael@0: 3,3,2,2,1, 3,2,1, michael@0: 3,3,3,2,2, 3,2,2, michael@0: 3,3,3,3,3, 3,3,3, michael@0: 4,3,3,2,1, 4,2,1, michael@0: 4,4,3,3,2, 4,3,2, michael@0: 4,4,4,3,3, 4,3,3, michael@0: 4,4,4,4,4, 4,4,4, michael@0: 5,4,3,2,1, 5,2,1, michael@0: 5,4,4,3,2, 5,3,2, michael@0: 5,5,4,4,3, 5,4,3, michael@0: 5,5,5,4,4, 5,4,4, michael@0: 5,5,5,5,5, 5,5,5, michael@0: 6,5,4,2,1, 6,2,1, michael@0: 6,5,4,3,2, 6,3,2, michael@0: 6,5,5,4,3, 6,4,3, michael@0: 6,6,5,5,4, 6,5,4, michael@0: 6,6,6,5,5, 6,5,5, michael@0: 6,6,6,6,6, 6,6,6, michael@0: 7,6,4,3,1, 7,3,1, michael@0: 7,6,5,3,2, 7,3,2, michael@0: 7,6,5,4,3, 7,4,3, michael@0: 7,6,6,5,4, 7,5,4, michael@0: 7,7,6,6,5, 7,6,5, michael@0: 7,7,7,6,6, 7,6,6, michael@0: 7,7,7,7,7, 7,7,7, michael@0: 8,6,5,3,1, 8,3,1, michael@0: 8,7,5,4,2, 8,4,2, michael@0: 8,7,6,4,3, 8,4,3, michael@0: 8,7,6,5,4, 8,5,4, michael@0: 8,7,7,6,5, 8,6,5, 
michael@0: 8,8,7,7,6, 8,7,6, michael@0: 8,8,8,7,7, 8,7,7, michael@0: 8,8,8,8,8, 8,8,8, michael@0: 9,7,5,3,1, 9,3,1, michael@0: 9,7,6,4,2, 9,4,2, michael@0: 9,8,6,5,3, 9,5,3, michael@0: 9,8,7,5,4, 9,5,4, michael@0: 9,8,7,6,5, 9,6,5, michael@0: 9,8,8,7,6, 9,7,6, michael@0: 9,9,8,8,7, 9,8,7, michael@0: 9,9,9,8,8, 9,8,8, michael@0: 9,9,9,9,9, 9,9,9, michael@0: 10,8,6,3,1, 10,3,1, michael@0: 10,8,6,4,2, 10,4,2, michael@0: 10,8,7,5,3, 10,5,3, michael@0: 10,9,7,6,4, 10,6,4, michael@0: 10,9,8,6,5, 10,6,5, michael@0: 10,9,8,7,6, 10,7,6, michael@0: 10,9,9,8,7, 10,8,7, michael@0: 10,10,9,9,8, 10,9,8, michael@0: 10,10,10,9,9, 10,9,9, michael@0: 10,10,10,10,10, 10,10,10, michael@0: 11,9,6,4,1, 11,4,1, michael@0: 11,9,7,4,2, 11,4,2, michael@0: 11,9,7,5,3, 11,5,3, michael@0: 11,9,8,6,4, 11,6,4, michael@0: 11,10,8,7,5, 11,7,5, michael@0: 11,10,9,7,6, 11,7,6, michael@0: 11,10,9,8,7, 11,8,7, michael@0: 11,10,10,9,8, 11,9,8, michael@0: 11,11,10,10,9, 11,10,9, michael@0: 11,11,11,10,10, 11,10,10, michael@0: 11,11,11,11,11, 11,11,11, michael@0: 12,9,7,4,1, 12,4,1, michael@0: 12,10,7,5,2, 12,5,2, michael@0: 12,10,8,5,3, 12,5,3, michael@0: 12,10,8,6,4, 12,6,4, michael@0: 12,10,9,7,5, 12,7,5, michael@0: 12,11,9,8,6, 12,8,6, michael@0: 12,11,10,8,7, 12,8,7, michael@0: 12,11,10,9,8, 12,9,8, michael@0: 12,11,11,10,9, 12,10,9, michael@0: 12,12,11,11,10, 12,11,10, michael@0: 12,12,12,11,11, 12,11,11, michael@0: 12,12,12,12,12, 12,12,12, michael@0: michael@0: // Added 2013.01.28 for CJK compatible mapping michael@0: 8,5,2,2,2, 8,2,2, michael@0: 6,6,6,4,2, 6,6,2, michael@0: 6,5,4,4,4, 6,4,4, michael@0: 6,4,2,2,2, 6,2,2, michael@0: 4,3,2,2,2, 4,2,2, michael@0: 2,2,2,2,2, 2,2,2, michael@0: }; michael@0: michael@0: // Backmap a single desired probability into an entry in kLgProbV2Tbl michael@0: static const uint8 kLgProbV2TblBackmap[13] = { michael@0: 0, michael@0: 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, michael@0: }; michael@0: michael@0: // Return address of 8-byte entry[i] michael@0: inline 
const uint8* LgProb2TblEntry(int i) { michael@0: return &kLgProbV2Tbl[i * 8]; michael@0: } michael@0: michael@0: // Return one of three probabilities in an entry michael@0: inline uint8 LgProb3(const uint8* entry, int j) { michael@0: return entry[j + 5]; michael@0: } michael@0: michael@0: michael@0: // Routines to access a hash table of pairs michael@0: // Buckets have 4-byte wordhash for sizes < 32K buckets, but only michael@0: // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as michael@0: // bucket subscript. michael@0: // Probs is a packed: three languages plus a subscript for probability table michael@0: // Buckets have all the keys together, then all the values.Key array never michael@0: // crosses a cache-line boundary, so no-match case takes exactly one cache miss. michael@0: // Match case may sometimes take an additional cache miss on value access. michael@0: // michael@0: // Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64 michael@0: // byte buckets with single cache miss. michael@0: // Or 2-byte key and 6-byte value, allowing 5 languages instead of three. michael@0: michael@0: michael@0: //----------------------------------------------------------------------------// michael@0: // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores // michael@0: //----------------------------------------------------------------------------// michael@0: michael@0: // BIGRAM michael@0: // Pick up 1..8 bytes and hash them via mask/shift/add. 
NO pre/post michael@0: // OVERSHOOTS up to 3 bytes michael@0: // For runtime use of tables michael@0: // Does X86 unaligned loads if !defined(NEED_ALIGNED_LOADS)UNALIGNED_LOAD32(_p) michael@0: uint32 BiHashV2(const char* word_ptr, int bytecount); michael@0: michael@0: // QUADGRAM wrapper with surrounding spaces michael@0: // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add michael@0: // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes michael@0: // For runtime use of tables michael@0: uint32 QuadHashV2(const char* word_ptr, int bytecount); michael@0: michael@0: // QUADGRAM wrapper with surrounding underscores (offline use) michael@0: // Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add michael@0: // OVERSHOOTS up to 3 bytes michael@0: // For offline construction of tables michael@0: uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount); michael@0: michael@0: // OCTAGRAM wrapper with surrounding spaces michael@0: // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add michael@0: // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes michael@0: uint64 OctaHash40(const char* word_ptr, int bytecount); michael@0: michael@0: michael@0: // OCTAGRAM wrapper with surrounding underscores (offline use) michael@0: // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add michael@0: // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes michael@0: uint64 OctaHash40underscore(const char* word_ptr, int bytecount); michael@0: michael@0: // Hash a consecutive pair of tokens/words A B michael@0: uint64 PairHash(uint64 worda_hash, uint64 wordb_hash); michael@0: michael@0: michael@0: // From 32-bit gram FP, return hash table subscript and remaining key michael@0: inline void QuadFPJustHash(uint32 quadhash, michael@0: uint32 keymask, michael@0: int bucketcount, michael@0: uint32* subscr, uint32* hashkey) { michael@0: *subscr = (quadhash + (quadhash >> 12)) & (bucketcount - 1); michael@0: *hashkey = quadhash & 
keymask; michael@0: } michael@0: michael@0: // From 40-bit gram FP, return hash table subscript and remaining key michael@0: inline void OctaFPJustHash(uint64 longwordhash, michael@0: uint32 keymask, michael@0: int bucketcount, michael@0: uint32* subscr, uint32* hashkey) { michael@0: uint32 temp = (longwordhash + (longwordhash >> 12)) & (bucketcount - 1); michael@0: *subscr = temp; michael@0: temp = longwordhash >> 4; michael@0: *hashkey = temp & keymask; michael@0: } michael@0: michael@0: michael@0: // Look up 32-bit gram FP in caller-passed table michael@0: // Typical size 256K entries (1.5MB) michael@0: // Two-byte hashkey michael@0: inline const uint32 QuadHashV3Lookup4(const CLD2TableSummary* gram_obj, michael@0: uint32 quadhash) { michael@0: uint32 subscr, hashkey; michael@0: const IndirectProbBucket4* quadtable = gram_obj->kCLDTable; michael@0: uint32 keymask = gram_obj->kCLDTableKeyMask; michael@0: int bucketcount = gram_obj->kCLDTableSize; michael@0: QuadFPJustHash(quadhash, keymask, bucketcount, &subscr, &hashkey); michael@0: const IndirectProbBucket4* bucket_ptr = &quadtable[subscr]; michael@0: // Four-way associative, 4 compares michael@0: if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) { michael@0: return bucket_ptr->keyvalue[0]; michael@0: } michael@0: if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) { michael@0: return bucket_ptr->keyvalue[1]; michael@0: } michael@0: if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) { michael@0: return bucket_ptr->keyvalue[2]; michael@0: } michael@0: if (((hashkey ^ bucket_ptr->keyvalue[3]) & keymask) == 0) { michael@0: return bucket_ptr->keyvalue[3]; michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: // Look up 40-bit gram FP in caller-passed table michael@0: // Typical size 256K-4M entries (1-16MB) michael@0: // 24-12 bit hashkey packed with 8-20 bit indirect lang/probs michael@0: // keymask is 0xfffff000 for 20-bit hashkey and 12-bit indirect michael@0: inline const 
uint32 OctaHashV3Lookup4(const CLD2TableSummary* gram_obj, michael@0: uint64 longwordhash) { michael@0: uint32 subscr, hashkey; michael@0: const IndirectProbBucket4* octatable = gram_obj->kCLDTable; michael@0: uint32 keymask = gram_obj->kCLDTableKeyMask; michael@0: int bucketcount = gram_obj->kCLDTableSize; michael@0: OctaFPJustHash(longwordhash, keymask, bucketcount, michael@0: &subscr, &hashkey); michael@0: const IndirectProbBucket4* bucket_ptr = &octatable[subscr]; michael@0: // Four-way associative, 4 compares michael@0: if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) { michael@0: return bucket_ptr->keyvalue[0]; michael@0: } michael@0: if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) { michael@0: return bucket_ptr->keyvalue[1]; michael@0: } michael@0: if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) { michael@0: return bucket_ptr->keyvalue[2]; michael@0: } michael@0: if (((hashkey ^ bucket_ptr->keyvalue[3]) & keymask) == 0) { michael@0: return bucket_ptr->keyvalue[3]; michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: michael@0: //----------------------------------------------------------------------------// michael@0: // Finding groups of 1/2/4/8 letters // michael@0: //----------------------------------------------------------------------------// michael@0: michael@0: // Does not advance past space or tab/cr/lf/nul michael@0: static const uint8 kAdvanceOneCharButSpace[256] = { michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: 0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 
2,2,2,2,2,2,2,2, michael@0: 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, michael@0: }; michael@0: michael@0: michael@0: // Advances *only* on space or ASCII vowel (or illegal byte) michael@0: static const uint8 kAdvanceOneCharSpaceVowel[256] = { michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, michael@0: 0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0, michael@0: michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: }; michael@0: michael@0: michael@0: // src points to a letter. Find the byte length of a unigram starting there. michael@0: int UniLen(const char* src); michael@0: michael@0: // src points to a letter. Find the byte length of a bigram starting there. michael@0: int BiLen(const char* src); michael@0: michael@0: // src points to a letter. Find the byte length of a quadgram starting there. michael@0: int QuadLen(const char* src); michael@0: michael@0: // src points to a letter. Find the byte length of an octagram starting there. michael@0: int OctaLen(const char* src); michael@0: michael@0: } // End namespace CLD2 michael@0: michael@0: #endif // I18N_ENCODINGS_CLD2_INTERNAL_NEW_CLDUTIL_SHARED_H__ michael@0: michael@0: michael@0: michael@0: michael@0: michael@0: