// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//
// Author: dsites@google.com (Dick Sites)
//
//
// Terminology:
// Incoming original text has HTML tags and entities removed, all but letters
// removed, and letters lowercased. Strings of non-letters are mapped to a
// single ASCII space.
//
// One scriptspan has a run of letters/spaces in a single script. This is the
// fundamental text unit that is scored. There is an optional backmap from
// scriptspan text to the original document text, so that the language ranges
// reported in ResultChunkVector refer to byte ranges in the original text.
//
// Scripts come in two forms, the full Unicode scripts described by
//   http://www.unicode.org/Public/UNIDATA/Scripts.txt
// and a modified list used exclusively in CLD2. The modified form maps all
// the CJK scripts to one, Hani. The current version description is in
//   i18n/encodings/cld2/builddata/script_summary.txt
// In addition, all non-letters are mapped to the Common script.
//
// ULScript describes this Unicode Letter script.
//
// Scoring uses text nil-grams, uni-grams, bi-grams, quad-grams, and octa-grams.
// Nilgrams (no text lookup at all) are for script-based pseudo-languages and
// for languages that are 1:1 with a given script. Unigrams and bigrams are
// used to score the CJK languages, all in the Hani script. Quadgrams and
// octagrams are used to score all other languages.
//
// RType is the Recognition Type per ulscript.
//
// The scoring tables map various grams to language-probability scores.
// A given gram that hits in a scoring table maps to an indirect subscript into
// a list of packed languages and log probabilities.
//
// Languages are stored in two forms: 10-bit values in the Language enum, and
// shorter 8-bit per-ulscript values in the scoring tables.
//
// Language refers to the full 10-bit range.
// pslang refers to the per-ulscript shorter values.
//
// Log probabilities also come in two forms. The full range uses values 0..255
// to represent minus log base 10th-root-of-2, covering 1 .. 1/2**25.5 or about
// TODO BOGUS description, 24 vs 12
// 1/47.5M. The second form quantizes these into multiples of 8 that can be
// added together to represent probability products. The quantized form uses
// values 24..0 with 0 now least likely instead of most likely, thus making
// larger sums for more probable results. 24 maps to original 1/2**4.8 (~1/28)
// and 0 maps to original 1/2**24.0 (~1/16M).
//
// qprob refers to quantized log probabilities.
//
// langprob is a uint32 holding three 1-byte pslangs and a 1-byte subscript to
// a list of three qprobs.
It always nees a companion ulscript michael@0: // michael@0: // A scriptspan is scored via one or more hitbuffers michael@0: michael@0: michael@0: #ifndef I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__ michael@0: #define I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__ michael@0: michael@0: #include michael@0: michael@0: #include "integral_types.h" // for uint8 etc. michael@0: michael@0: #include "cld2tablesummary.h" michael@0: #include "compact_lang_det_impl.h" // for ResultChunkVector michael@0: #include "getonescriptspan.h" michael@0: #include "langspan.h" michael@0: #include "tote.h" michael@0: #include "utf8statetable.h" michael@0: michael@0: namespace CLD2 { michael@0: michael@0: static const int kMaxBoosts = 4; // For each of PerScriptLangBoosts michael@0: // must be power of two for wrap() michael@0: static const int kChunksizeQuads = 20; // For non-CJK michael@0: static const int kChunksizeUnis = 50; // For CJK michael@0: static const int kMaxScoringHits = 1000; michael@0: static const int kMaxSummaries = kMaxScoringHits / kChunksizeQuads; michael@0: michael@0: michael@0: // The first four tables are for CJK languages, michael@0: // the next three for quadgram languages, and michael@0: // the last for expected scores. 
michael@0: typedef struct { michael@0: const UTF8PropObj* unigram_obj; // 80K CJK characters michael@0: const CLD2TableSummary* unigram_compat_obj; // 256 CJK lookup probabilities michael@0: const CLD2TableSummary* deltabi_obj; michael@0: const CLD2TableSummary* distinctbi_obj; michael@0: michael@0: const CLD2TableSummary* quadgram_obj; // Primary quadgram lookup table michael@0: const CLD2TableSummary* quadgram_obj2; // Secondary " michael@0: const CLD2TableSummary* deltaocta_obj; michael@0: const CLD2TableSummary* distinctocta_obj; michael@0: michael@0: const short* kExpectedScore; // Expected base + delta + distinct score michael@0: // per 1KB input michael@0: // Subscripted by language and script4 michael@0: } ScoringTables; michael@0: michael@0: // Context for boosting several languages michael@0: typedef struct { michael@0: int32 n; michael@0: uint32 langprob[kMaxBoosts]; michael@0: int wrap(int32 n) {return n & (kMaxBoosts - 1);} michael@0: } LangBoosts; michael@0: michael@0: typedef struct { michael@0: LangBoosts latn; michael@0: LangBoosts othr; michael@0: } PerScriptLangBoosts; michael@0: michael@0: michael@0: michael@0: // ScoringContext carries state across scriptspans michael@0: // ScoringContext also has read-only scoring tables mapping grams to qprobs michael@0: typedef struct { michael@0: FILE* debug_file; // Non-NULL if debug output wanted michael@0: bool flags_cld2_score_as_quads; michael@0: bool flags_cld2_html; michael@0: bool flags_cld2_cr; michael@0: bool flags_cld2_verbose; michael@0: ULScript ulscript; // langprobs below are with respect to this script michael@0: Language prior_chunk_lang; // Mostly for debug output michael@0: // boost has a packed set of per-script langs and probabilites michael@0: // whack has a per-script lang to be suppressed from ever scoring (zeroed) michael@0: // When a language in a close set is given as an explicit hint, others in michael@0: // that set will be whacked. 
michael@0: PerScriptLangBoosts langprior_boost; // From http content-lang or meta lang= michael@0: PerScriptLangBoosts langprior_whack; // From http content-lang or meta lang= michael@0: PerScriptLangBoosts distinct_boost; // From distinctive letter groups michael@0: int oldest_distinct_boost; // Subscript in hitbuffer of oldest michael@0: // distinct score to use michael@0: const ScoringTables* scoringtables; // Probability lookup tables michael@0: ScriptScanner* scanner; // For ResultChunkVector backmap michael@0: michael@0: // Inits boosts michael@0: void init() { michael@0: memset(&langprior_boost, 0, sizeof(langprior_boost)); michael@0: memset(&langprior_whack, 0, sizeof(langprior_whack)); michael@0: memset(&distinct_boost, 0, sizeof(distinct_boost)); michael@0: }; michael@0: } ScoringContext; michael@0: michael@0: michael@0: michael@0: // Begin private michael@0: michael@0: // Holds one scoring-table lookup hit. We hold indirect subscript instead of michael@0: // langprob to allow a single hit to use a variable number of langprobs. michael@0: typedef struct { michael@0: int offset; // First byte of quad/octa etc. in scriptspan michael@0: int indirect; // subscript of langprobs in scoring table michael@0: } ScoringHit; michael@0: michael@0: typedef enum { michael@0: UNIHIT = 0, michael@0: QUADHIT = 1, michael@0: DELTAHIT = 2, michael@0: DISTINCTHIT = 3 michael@0: } LinearHitType; michael@0: michael@0: // Holds one scoring-table lookup hit resolved into a langprob. michael@0: typedef struct { michael@0: uint16 offset; // First byte of quad/octa etc. 
in scriptspan michael@0: uint16 type; // LinearHitType michael@0: uint32 langprob; // langprob from scoring table michael@0: } LangprobHit; michael@0: michael@0: // Holds arrays of scoring-table lookup hits for (part of) a scriptspan michael@0: typedef struct { michael@0: ULScript ulscript; // langprobs below are with respect to this script michael@0: int maxscoringhits; // determines size of arrays below michael@0: int next_base; // First unused entry in each array michael@0: int next_delta; // " michael@0: int next_distinct; // " michael@0: int next_linear; // " michael@0: int next_chunk_start; // First unused chunk_start entry michael@0: int lowest_offset; // First byte of text span used to fill hitbuffer michael@0: // Dummy entry at the end of each giving offset of first unused text byte michael@0: ScoringHit base[kMaxScoringHits + 1]; // Uni/quad hits michael@0: ScoringHit delta[kMaxScoringHits + 1]; // delta-bi/delta-octa hits michael@0: ScoringHit distinct[kMaxScoringHits + 1]; // distinct-word hits michael@0: LangprobHit linear[4 * kMaxScoringHits + 1]; // Above three merge-sorted michael@0: // (4: some bases => 2 linear) michael@0: int chunk_start[kMaxSummaries + 1]; // First linear[] subscr of michael@0: // each scored chunk michael@0: int chunk_offset[kMaxSummaries + 1]; // First text subscr of michael@0: // each scored chunk michael@0: michael@0: void init() { michael@0: ulscript = ULScript_Common; michael@0: maxscoringhits = kMaxScoringHits; michael@0: next_base = 0; michael@0: next_delta = 0; michael@0: next_distinct = 0; michael@0: next_linear = 0; michael@0: next_chunk_start = 0; michael@0: lowest_offset = 0; michael@0: base[0].offset = 0; michael@0: base[0].indirect = 0; michael@0: delta[0].offset = 0; michael@0: delta[0].indirect = 0; michael@0: distinct[0].offset = 0; michael@0: distinct[0].indirect = 0; michael@0: linear[0].offset = 0; michael@0: linear[0].langprob = 0; michael@0: chunk_start[0] = 0; michael@0: chunk_offset[0] = 0; michael@0: }; 
michael@0: } ScoringHitBuffer; michael@0: michael@0: // TODO: Explain here why we need both ChunkSpan and ChunkSummary michael@0: typedef struct { michael@0: int chunk_base; // Subscript of first hitbuffer.base[] in chunk michael@0: int chunk_delta; // Subscript of first hitbuffer.delta[] michael@0: int chunk_distinct; // Subscript of first hitbuffer.distinct[] michael@0: int base_len; // Number of hitbuffer.base[] in chunk michael@0: int delta_len; // Number of hitbuffer.delta[] in chunk michael@0: int distinct_len; // Number of hitbuffer.distinct[] in chunk michael@0: } ChunkSpan; michael@0: michael@0: michael@0: // Packed into 20 bytes for space michael@0: typedef struct { michael@0: uint16 offset; // Text offset within current scriptspan.text michael@0: uint16 chunk_start; // Scoring subscr within hitbuffer->linear[] michael@0: uint16 lang1; // Top lang, mapped to full Language michael@0: uint16 lang2; // Second lang, mapped to full Language michael@0: uint16 score1; // Top lang raw score michael@0: uint16 score2; // Second lang raw score michael@0: uint16 bytes; // Number of lower letters bytes in chunk michael@0: uint16 grams; // Number of scored base quad- uni-grams in chunk michael@0: uint16 ulscript; // ULScript of chunk michael@0: uint8 reliability_delta; // Reliability 0..100, delta top:second scores michael@0: uint8 reliability_score; // Reliability 0..100, top:expected score michael@0: } ChunkSummary; michael@0: michael@0: michael@0: // We buffer up ~50 chunk summaries, corresponding to chunks of 20 quads in a michael@0: // 1000-quad hit buffer, so we can do boundary adjustment on them michael@0: // when adjacent entries are different languages. After that, we add them michael@0: // all into the document score michael@0: // michael@0: // About 50 * 20 = 1000 bytes. 
OK for stack alloc michael@0: typedef struct { michael@0: int n; michael@0: ChunkSummary chunksummary[kMaxSummaries + 1]; michael@0: } SummaryBuffer; michael@0: michael@0: // End private michael@0: michael@0: michael@0: // Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating michael@0: // scoringcontext michael@0: void ScoreEntireScriptSpan(const LangSpan& scriptspan, michael@0: ScoringContext* scoringcontext, michael@0: DocTote* doc_tote, michael@0: ResultChunkVector* vec); michael@0: michael@0: // Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext michael@0: void ScoreCJKScriptSpan(const LangSpan& scriptspan, michael@0: ScoringContext* scoringcontext, michael@0: DocTote* doc_tote, michael@0: ResultChunkVector* vec); michael@0: michael@0: // Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext michael@0: void ScoreQuadScriptSpan(const LangSpan& scriptspan, michael@0: ScoringContext* scoringcontext, michael@0: DocTote* doc_tote, michael@0: ResultChunkVector* vec); michael@0: michael@0: // Score one scriptspan into doc_tote and vec, updating scoringcontext michael@0: void ScoreOneScriptSpan(const LangSpan& scriptspan, michael@0: ScoringContext* scoringcontext, michael@0: DocTote* doc_tote, michael@0: ResultChunkVector* vec); michael@0: michael@0: } // End namespace CLD2 michael@0: michael@0: #endif // I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__ michael@0: