michael@0: // Copyright 2013 Google Inc. All Rights Reserved.
michael@0: //
michael@0: // Licensed under the Apache License, Version 2.0 (the "License");
michael@0: // you may not use this file except in compliance with the License.
michael@0: // You may obtain a copy of the License at
michael@0: //
michael@0: //     http://www.apache.org/licenses/LICENSE-2.0
michael@0: //
michael@0: // Unless required by applicable law or agreed to in writing, software
michael@0: // distributed under the License is distributed on an "AS IS" BASIS,
michael@0: // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
michael@0: // See the License for the specific language governing permissions and
michael@0: // limitations under the License.
michael@0: 
michael@0: //
michael@0: // Author: dsites@google.com (Dick Sites)
michael@0: //
michael@0: //
michael@0: // Terminology:
michael@0: // Incoming original text has HTML tags and entities removed, all but letters
michael@0: // removed, and letters lowercased. Strings of non-letters are mapped to a
michael@0: // single ASCII space.
michael@0: //
michael@0: // One scriptspan has a run of letters/spaces in a single script. This is the
michael@0: // fundamental text unit that is scored. There is an optional backmap from
michael@0: // scriptspan text to the original document text, so that the language ranges
michael@0: // reported in ResultChunkVector refer to byte ranges in the original text.
michael@0: //
michael@0: // Scripts come in two forms, the full Unicode scripts described by
michael@0: //   http://www.unicode.org/Public/UNIDATA/Scripts.txt
michael@0: // and a modified list used exclusively in CLD2. The modified form maps all
michael@0: // the CJK scripts to one, Hani. The current version description is in
michael@0: //  i18n/encodings/cld2/builddata/script_summary.txt
michael@0: // In addition, all non-letters are mapped to the Common script.
michael@0: //
michael@0: // ULScript describes this Unicode Letter script.
michael@0: //
michael@0: // Scoring uses text nil-grams, uni-grams, bi-grams, quad-grams, and octa-grams.
michael@0: // Nilgrams (no text lookup at all) are for script-based pseudo-languages and
michael@0: // for languages that are 1:1 with a given script. Unigrams and bigrams are
michael@0: // used to score the CJK languages, all in the Hani script. Quadgrams and
michael@0: // octagrams are used to score all other languages.
michael@0: //
michael@0: // RType is the Recognition Type per ulscript.
michael@0: //
michael@0: // The scoring tables map various grams to language-probability scores.
michael@0: // A given gram that hits in a scoring table maps to an indirect subscript
michael@0: // into a list of packed languages and log probabilities.
michael@0: //
michael@0: // Languages are stored in two forms: 10-bit values in the Language enum, and
michael@0: // shorter 8-bit per-ulscript values in the scoring tables.
michael@0: //
michael@0: // Language refers to the full 10-bit range.
michael@0: // pslang refers to the per-ulscript shorter values.
michael@0: //
michael@0: // Log probabilities also come in two forms. The full range uses values 0..255
michael@0: // to represent minus log base 10th-root-of-2, covering 1 .. 1/2**25.5 or about
michael@0: // TODO BOGUS description, 24 vs 12
michael@0: // 1/47.5M. The second form quantizes these into multiples of 8 that can be
michael@0: // added together to represent probability products. The quantized form uses
michael@0: // values 24..0 with 0 now least likely instead of most likely, thus making
michael@0: // larger sums for more probable results. 24 maps to original 1/2**4.8 (~1/28)
michael@0: // and 0 maps to original 1/2**24.0 (~1/16M).
michael@0: //
michael@0: // qprob refers to quantized log probabilities.
michael@0: //
michael@0: // langprob is a uint32 holding three 1-byte pslangs and a 1-byte subscript to
michael@0: // a list of three qprobs. It always needs a companion ulscript.
michael@0: //
michael@0: // A scriptspan is scored via one or more hitbuffers
michael@0: 
michael@0: 
michael@0: #ifndef I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__
michael@0: #define I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__
michael@0: 
michael@0: #include <stdio.h>
michael@0: #include <string.h>                   // for memset
michael@0: 
michael@0: #include "integral_types.h"           // for uint8 etc.
michael@0: 
michael@0: #include "cld2tablesummary.h"
michael@0: #include "compact_lang_det_impl.h"    // for ResultChunkVector
michael@0: #include "getonescriptspan.h"
michael@0: #include "langspan.h"
michael@0: #include "tote.h"
michael@0: #include "utf8statetable.h"
michael@0: 
michael@0: namespace CLD2 {
michael@0: 
michael@0: static const int kMaxBoosts = 4;              // Entries in each LangBoosts.langprob[];
michael@0:                                               // must be power of two for wrap()
michael@0: static const int kChunksizeQuads = 20;        // Quads per chunk, for non-CJK
michael@0: static const int kChunksizeUnis = 50;         // Unis per chunk, for CJK
michael@0: static const int kMaxScoringHits = 1000;      // Sizes the ScoringHitBuffer arrays
michael@0: static const int kMaxSummaries = kMaxScoringHits / kChunksizeQuads;  // = 50
michael@0: 
michael@0: 
michael@0: // Set of pointers to the const lookup tables used for scoring.
michael@0: // The first four tables are for CJK languages,
michael@0: // the next four for quadgram languages, and
michael@0: // the last for expected scores.
michael@0: struct ScoringTables {
michael@0:   // --- CJK ---
michael@0:   const UTF8PropObj* unigram_obj;               // 80K CJK characters
michael@0:   const CLD2TableSummary* unigram_compat_obj;   // 256 CJK lookup probabilities
michael@0:   const CLD2TableSummary* deltabi_obj;
michael@0:   const CLD2TableSummary* distinctbi_obj;
michael@0: 
michael@0:   // --- Quadgram languages ---
michael@0:   const CLD2TableSummary* quadgram_obj;         // Primary quadgram lookup table
michael@0:   const CLD2TableSummary* quadgram_obj2;        // Secondary quadgram lookup table
michael@0:   const CLD2TableSummary* deltaocta_obj;
michael@0:   const CLD2TableSummary* distinctocta_obj;
michael@0: 
michael@0:   // --- Expected scores ---
michael@0:   // Expected base + delta + distinct score per 1KB input.
michael@0:   // Subscripted by language and script4.
michael@0:   const short* kExpectedScore;
michael@0: };
michael@0: 
michael@0: // Context for boosting several languages.
michael@0: // Holds kMaxBoosts packed langprobs plus a counter n.
michael@0: //
michael@0: // The struct is declared with a tag name rather than as an unnamed struct
michael@0: // named only through a typedef: it has a member function, and C++20 does not
michael@0: // allow member functions in an unnamed class named for linkage by a typedef.
michael@0: struct LangBoosts {
michael@0:    int32 n;                       // NOTE(review): presumably the running
michael@0:                                   // insert position -- confirm at callers
michael@0:    uint32 langprob[kMaxBoosts];
michael@0: 
michael@0:    // Returns the argument masked with kMaxBoosts - 1, i.e. only its low bits.
michael@0:    // This is why kMaxBoosts must be a power of two.
michael@0:    // The parameter n shadows the member n; the member is not read here, so
michael@0:    // the function is const.
michael@0:    int wrap(int32 n) const {return n & (kMaxBoosts - 1);}
michael@0: };
michael@0: 
michael@0: // A pair of LangBoosts, held in members latn and othr.
michael@0: // NOTE(review): presumably latn is used for Latin-script text and othr for
michael@0: // all other scripts -- confirm at the call sites.
michael@0: struct PerScriptLangBoosts {
michael@0:    LangBoosts latn;
michael@0:    LangBoosts othr;
michael@0: };
michael@0: 
michael@0: 
michael@0: 
michael@0: // ScoringContext carries state across scriptspans
michael@0: // ScoringContext also has read-only scoring tables mapping grams to qprobs
michael@0: //
michael@0: // Declared with a tag name rather than as an unnamed struct named only
michael@0: // through a typedef: it has a member function, and C++20 does not allow
michael@0: // member functions in an unnamed class named for linkage by a typedef.
michael@0: struct ScoringContext {
michael@0:   FILE* debug_file;                   // Non-NULL if debug output wanted
michael@0:   bool flags_cld2_score_as_quads;
michael@0:   bool flags_cld2_html;
michael@0:   bool flags_cld2_cr;
michael@0:   bool flags_cld2_verbose;
michael@0:   ULScript ulscript;        // langprobs below are with respect to this script
michael@0:   Language prior_chunk_lang;          // Mostly for debug output
michael@0:   // boost has a packed set of per-script langs and probabilities
michael@0:   // whack has a per-script lang to be suppressed from ever scoring (zeroed)
michael@0:   // When a language in a close set is given as an explicit hint, others in
michael@0:   //  that set will be whacked.
michael@0:   PerScriptLangBoosts langprior_boost;  // From http content-lang or meta lang=
michael@0:   PerScriptLangBoosts langprior_whack;  // From http content-lang or meta lang=
michael@0:   PerScriptLangBoosts distinct_boost;   // From distinctive letter groups
michael@0:   int oldest_distinct_boost;          // Subscript in hitbuffer of oldest
michael@0:                                       // distinct score to use
michael@0:   const ScoringTables* scoringtables; // Probability lookup tables
michael@0:   ScriptScanner* scanner;             // For ResultChunkVector backmap
michael@0: 
michael@0:   // Inits boosts: zero-fills langprior_boost, langprior_whack and
michael@0:   // distinct_boost. No other member is modified here (debug_file, the flags,
michael@0:   // ulscript, prior_chunk_lang, oldest_distinct_boost, scoringtables and
michael@0:   // scanner keep whatever value they had).
michael@0:   // memset comes from <string.h>.
michael@0:   void init() {
michael@0:     memset(&langprior_boost, 0, sizeof(langprior_boost));
michael@0:     memset(&langprior_whack, 0, sizeof(langprior_whack));
michael@0:     memset(&distinct_boost, 0, sizeof(distinct_boost));
michael@0:   }
michael@0: };
michael@0: 
michael@0: 
michael@0: 
michael@0: // Begin private
michael@0: 
michael@0: // One scoring-table lookup hit. The indirect subscript is stored rather than
michael@0: // a langprob so that a single hit can use a variable number of langprobs.
michael@0: struct ScoringHit {
michael@0:   int offset;         // First byte of quad/octa etc. in scriptspan
michael@0:   int indirect;       // Subscript of langprobs in scoring table
michael@0: };
michael@0: 
michael@0: // Kind of hit; this is the value stored in LangprobHit.type.
michael@0: enum LinearHitType {
michael@0:   UNIHIT      = 0,
michael@0:   QUADHIT     = 1,
michael@0:   DELTAHIT    = 2,
michael@0:   DISTINCTHIT = 3
michael@0: };
michael@0: 
michael@0: // One scoring-table lookup hit after it has been resolved into a langprob.
michael@0: struct LangprobHit {
michael@0:   uint16 offset;      // First byte of quad/octa etc. in scriptspan
michael@0:   uint16 type;        // A LinearHitType value
michael@0:   uint32 langprob;    // langprob from scoring table
michael@0: };
michael@0: 
michael@0: // Holds arrays of scoring-table lookup hits for (part of) a scriptspan
michael@0: //
michael@0: // Declared with a tag name rather than as an unnamed struct named only
michael@0: // through a typedef: it has a member function, and C++20 does not allow
michael@0: // member functions in an unnamed class named for linkage by a typedef.
michael@0: struct ScoringHitBuffer {
michael@0:   ULScript ulscript;        // langprobs below are with respect to this script
michael@0:   int maxscoringhits;       // determines size of arrays below
michael@0:   int next_base;            // First unused entry in each array
michael@0:   int next_delta;           //   "
michael@0:   int next_distinct;        //   "
michael@0:   int next_linear;          //   "
michael@0:   int next_chunk_start;     // First unused chunk_start entry
michael@0:   int lowest_offset;        // First byte of text span used to fill hitbuffer
michael@0:   // Dummy entry at the end of each giving offset of first unused text byte
michael@0:   ScoringHit base[kMaxScoringHits + 1];         // Uni/quad hits
michael@0:   ScoringHit delta[kMaxScoringHits + 1];        // delta-bi/delta-octa hits
michael@0:   ScoringHit distinct[kMaxScoringHits + 1];     // distinct-word hits
michael@0:   LangprobHit linear[4 * kMaxScoringHits + 1];  // Above three merge-sorted
michael@0:                                                 // (4: some bases => 2 linear)
michael@0:   int chunk_start[kMaxSummaries + 1];           // First linear[] subscr of
michael@0:                                                 //  each scored chunk
michael@0:   int chunk_offset[kMaxSummaries + 1];          // First text subscr of
michael@0:                                                 //  each scored chunk
michael@0: 
michael@0:   // Resets the buffer to empty: ulscript becomes ULScript_Common,
michael@0:   // maxscoringhits becomes kMaxScoringHits, every next_* subscript and
michael@0:   // lowest_offset become 0, and entry 0 of each array is cleared.
michael@0:   // Array entries past subscript 0 are not touched.
michael@0:   void init() {
michael@0:     ulscript = ULScript_Common;
michael@0:     maxscoringhits = kMaxScoringHits;
michael@0:     next_base = 0;
michael@0:     next_delta = 0;
michael@0:     next_distinct = 0;
michael@0:     next_linear = 0;
michael@0:     next_chunk_start = 0;
michael@0:     lowest_offset = 0;
michael@0:     base[0].offset = 0;
michael@0:     base[0].indirect = 0;
michael@0:     delta[0].offset = 0;
michael@0:     delta[0].indirect = 0;
michael@0:     distinct[0].offset = 0;
michael@0:     distinct[0].indirect = 0;
michael@0:     linear[0].offset = 0;
michael@0:     linear[0].type = UNIHIT;    // Previously left unset; UNIHIT is 0
michael@0:     linear[0].langprob = 0;
michael@0:     chunk_start[0] = 0;
michael@0:     chunk_offset[0] = 0;
michael@0:   }
michael@0: };
michael@0: 
michael@0: // TODO: Explain here why we need both ChunkSpan and ChunkSummary
michael@0: //
michael@0: // Describes one chunk as three (first subscript, length) pairs, one pair for
michael@0: // each of hitbuffer.base[], hitbuffer.delta[] and hitbuffer.distinct[].
michael@0: struct ChunkSpan {
michael@0:   // First subscript of the chunk in each hitbuffer array
michael@0:   int chunk_base;       // ... in hitbuffer.base[]
michael@0:   int chunk_delta;      // ... in hitbuffer.delta[]
michael@0:   int chunk_distinct;   // ... in hitbuffer.distinct[]
michael@0:   // Number of entries of each hitbuffer array that belong to the chunk
michael@0:   int base_len;         // ... of hitbuffer.base[]
michael@0:   int delta_len;        // ... of hitbuffer.delta[]
michael@0:   int distinct_len;     // ... of hitbuffer.distinct[]
michael@0: };
michael@0: 
michael@0: 
michael@0: // Summary of one scored chunk.
michael@0: // Packed into 20 bytes for space: nine uint16 fields plus two uint8 fields.
michael@0: struct ChunkSummary {
michael@0:   uint16 offset;              // Text offset within current scriptspan.text
michael@0:   uint16 chunk_start;         // Scoring subscr within hitbuffer->linear[]
michael@0:   uint16 lang1;               // Top lang, mapped to full Language
michael@0:   uint16 lang2;               // Second lang, mapped to full Language
michael@0:   uint16 score1;              // Raw score of top lang
michael@0:   uint16 score2;              // Raw score of second lang
michael@0:   uint16 bytes;               // Number of lower letters bytes in chunk
michael@0:   uint16 grams;               // Number of scored base quad- uni-grams in chunk
michael@0:   uint16 ulscript;            // ULScript of chunk
michael@0:   uint8 reliability_delta;    // Reliability 0..100, delta top:second scores
michael@0:   uint8 reliability_score;    // Reliability 0..100, top:expected score
michael@0: };
michael@0: 
michael@0: 
michael@0: // Buffer of chunk summaries.
michael@0: // About 50 summaries are buffered up, corresponding to chunks of 20 quads in
michael@0: // a 1000-quad hit buffer, so that boundary adjustment can be done on them
michael@0: // when adjacent entries are different languages. After that, they are all
michael@0: // added into the document score.
michael@0: //
michael@0: // About 50 * 20 = 1000 bytes. OK for stack alloc
michael@0: struct SummaryBuffer {
michael@0:   int n;    // NOTE(review): presumably the count of entries in use -- confirm
michael@0:   ChunkSummary chunksummary[kMaxSummaries + 1];
michael@0: };
michael@0: 
michael@0: // End private
michael@0: 
michael@0: 
michael@0: // Score a scriptspan of recognition type RTypeNone or RTypeOne into doc_tote
michael@0: // and vec, updating scoringcontext
michael@0: void ScoreEntireScriptSpan(const LangSpan& scriptspan,      // span to score
michael@0:                            ScoringContext* scoringcontext,  // updated
michael@0:                            DocTote* doc_tote,               // scored into
michael@0:                            ResultChunkVector* vec);         // scored into
michael@0: 
michael@0: // Score a scriptspan of recognition type RTypeCJK into doc_tote and vec, updating scoringcontext
michael@0: void ScoreCJKScriptSpan(const LangSpan& scriptspan,      // span to score
michael@0:                         ScoringContext* scoringcontext,  // updated
michael@0:                         DocTote* doc_tote,               // scored into
michael@0:                         ResultChunkVector* vec);         // scored into
michael@0: 
michael@0: // Score a scriptspan of recognition type RTypeMany into doc_tote and vec, updating scoringcontext
michael@0: void ScoreQuadScriptSpan(const LangSpan& scriptspan,      // span to score
michael@0:                          ScoringContext* scoringcontext,  // updated
michael@0:                          DocTote* doc_tote,               // scored into
michael@0:                          ResultChunkVector* vec);         // scored into
michael@0: 
michael@0: // Score one scriptspan into doc_tote and vec, updating scoringcontext. NOTE(review): presumably dispatches on the span's RType -- confirm in the .cc
michael@0: void ScoreOneScriptSpan(const LangSpan& scriptspan,      // span to score
michael@0:                         ScoringContext* scoringcontext,  // updated
michael@0:                         DocTote* doc_tote,               // scored into
michael@0:                         ResultChunkVector* vec);         // scored into
michael@0: 
michael@0: }       // End namespace CLD2
michael@0: 
michael@0: #endif  // I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__
michael@0: