The Tor Browser: browser/components/translation/cld2/internal/scoreonescriptspan.h@6474c204b198

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 // Copyright 2013 Google Inc. All Rights Reserved.

     2 //

     3 // Licensed under the Apache License, Version 2.0 (the "License");

     4 // you may not use this file except in compliance with the License.

     5 // You may obtain a copy of the License at

     6 //

     7 //     http://www.apache.org/licenses/LICENSE-2.0

     8 //

     9 // Unless required by applicable law or agreed to in writing, software

    10 // distributed under the License is distributed on an "AS IS" BASIS,

    11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

    12 // See the License for the specific language governing permissions and

    13 // limitations under the License.

    15 //

    16 // Author: dsites@google.com (Dick Sites)

    17 //

    18 //

    19 // Terminology:

    20 // Incoming original text has HTML tags and entities removed, all but letters

    21 // removed, and letters lowercased. Strings of non-letters are mapped to a

    22 // single ASCII space.

    23 //

    24 // One scriptspan has a run of letters/spaces  in a single script. This is the

    25 // fundamental text unit that is scored. There is an optional backmap from

    26 // scriptspan text to the original document text, so that the language ranges

    27 // reported in ResultChunkVector refer to byte ranges in the original text.

    28 //

    29 // Scripts come in two forms, the full Unicode scripts described by

    30 //   http://www.unicode.org/Public/UNIDATA/Scripts.txt

    31 // and a modified list used exclusively in CLD2. The modified form maps all

    32 // the CJK scripts to one, Hani. The current version description is in

    33 //  i18n/encodings/cld2/builddata/script_summary.txt

    34 // In addition, all non-letters are mapped to the Common script.

    35 //

    36 // ULScript describes this Unicode Letter script.

    37 //

    38 // Scoring uses text nil-grams, uni-grams, bi-grams, quad-grams, and octa-grams.

    39 // Nilgrams (no text lookup at all) are for script-based pseudo-languages and

    40 // for languages that are 1:1 with a given script. Unigrams and bigrams are

    41 // used to score the CJK languages, all in the Hani script. Quadgrams and

    42 // octagrams are used to score all other languages.

    43 //

    44 // RType is the Recognition Type per ulscript.

    45 //

    46 // The scoring tables map various grams to language-probability scores.

    47 // A given gram that hits in scoring table maps to an indirect subscript into

    48 // a list of packed languages and log probabilities.

    49 //

    50 // Languages are stored in two forms: 10-bit values in the Language enum, and

    51 // shorter 8-bit per-ulscript values in the scoring tables.

    52 //

    53 // Language refers to the full 10-bit range.

    54 // pslang refers to the per-ulscript shorter values.

    55 //

    56 // Log probabilities also come in two forms. The full range uses values 0..255

    57 // to represent minus log base 10th-root-of-2, covering 1 .. 1/2**25.5 or about

    58 // TODO BOGUS description, 24 vs 12

    59 // 1/47.5M. The second form quantizes these into multiples of 8 that can be

    60 // added together to represent probability products. The quantized form uses

    61 // values 24..0 with 0 now least likely instead of most likely, thus making

    62 // larger sums for more probable results. 24 maps to original 1/2**4.8 (~1/28)

    63 // and 0 maps to original 1/2**24.0 (~1/16M).

    64 //

    65 // qprob refers to quantized log probabilities.

    66 //

    67 // langprob is a uint32 holding three 1-byte pslangs and a 1-byte subscript to

    68 // a list of three qprobs. It always needs a companion ulscript.

    69 //

    70 // A scriptspan is scored via one or more hitbuffers

    73 #ifndef I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__

    74 #define I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__

    76 #include <stdio.h>

    78 #include "integral_types.h"           // for uint8 etc.

    80 #include "cld2tablesummary.h"

    81 #include "compact_lang_det_impl.h"    // for ResultChunkVector

    82 #include "getonescriptspan.h"

    83 #include "langspan.h"

    84 #include "tote.h"

    85 #include "utf8statetable.h"

    87 namespace CLD2 {

    89 static const int kMaxBoosts = 4;              // Entries in each LangBoosts.langprob[];

    90                                               // must be power of two for wrap(), which masks with (kMaxBoosts - 1)

    91 static const int kChunksizeQuads = 20;        // For non-CJK; also the divisor that sizes kMaxSummaries

    92 static const int kChunksizeUnis = 50;         // For CJK

    93 static const int kMaxScoringHits = 1000;      // Sizes base[], delta[], distinct[] (and 4x for linear[]) in ScoringHitBuffer

    94 static const int kMaxSummaries = kMaxScoringHits / kChunksizeQuads;  // 1000 / 20 = 50 chunk summaries

    97 // Read-only lookup tables used for scoring. The first four tables are for CJK languages,

    98 // the next four for quadgram languages, and

    99 // the last for expected scores.

   100 typedef struct {

   101   const UTF8PropObj* unigram_obj;               // 80K CJK characters

   102   const CLD2TableSummary* unigram_compat_obj;   // 256 CJK lookup probabilities

   103   const CLD2TableSummary* deltabi_obj;          // Delta-bigram table (per name; TODO confirm)

   104   const CLD2TableSummary* distinctbi_obj;       // Distinct-bigram table (per name; TODO confirm)

   106   const CLD2TableSummary* quadgram_obj;         // Primary quadgram lookup table

   107   const CLD2TableSummary* quadgram_obj2;        // Secondary  "

   108   const CLD2TableSummary* deltaocta_obj;        // Delta-octagram table (per name; TODO confirm)

   109   const CLD2TableSummary* distinctocta_obj;     // Distinct-octagram table (per name; TODO confirm)

   111   const short* kExpectedScore;      // Expected base + delta + distinct score

   112                                     // per 1KB input

   113                                     // Subscripted by language and script4

   114 } ScoringTables;

   116 // Context for boosting several languages: a fixed array of kMaxBoosts langprob values

   117 typedef struct {

   118    int32 n;                     // presumably the running insert position into langprob[] -- TODO confirm against callers

   119    uint32 langprob[kMaxBoosts]; // Packed langprob values (see langprob description in the file header)

   120    int wrap(int32 n) {return n & (kMaxBoosts - 1);}  // Masks n into 0..kMaxBoosts-1; the parameter n shadows member n

   121 } LangBoosts;

   123 typedef struct {          // Pair of LangBoosts, one per script group

   124    LangBoosts latn;       // presumably boosts for Latin-script text -- TODO confirm

   125    LangBoosts othr;       // presumably boosts for all other scripts -- TODO confirm

   126 } PerScriptLangBoosts;

   130 // ScoringContext carries state across scriptspans

   131 // ScoringContext also has read-only scoring tables mapping grams to qprobs

   132 typedef struct {

   133   FILE* debug_file;                   // Non-NULL if debug output wanted

   134   bool flags_cld2_score_as_quads;

   135   bool flags_cld2_html;

   136   bool flags_cld2_cr;

   137   bool flags_cld2_verbose;

   138   ULScript ulscript;        // langprobs below are with respect to this script

   139   Language prior_chunk_lang;          // Mostly for debug output

   140   // boost has a packed set of per-script langs and probabilities

   141   // whack has a per-script lang to be suppressed from ever scoring (zeroed)

   142   // When a language in a close set is given as an explicit hint, others in

   143   //  that set will be whacked.

   144   PerScriptLangBoosts langprior_boost;  // From http content-lang or meta lang=

   145   PerScriptLangBoosts langprior_whack;  // From http content-lang or meta lang=

   146   PerScriptLangBoosts distinct_boost;   // From distinctive letter groups

   147   int oldest_distinct_boost;          // Subscript in hitbuffer of oldest

   148                                       // distinct score to use

   149   const ScoringTables* scoringtables; // Probability lookup tables

   150   ScriptScanner* scanner;             // For ResultChunkVector backmap

   152   // Inits boosts: zero-fills langprior_boost, langprior_whack and

         // distinct_boost. No other member (debug_file, flags, ulscript,

         // prior_chunk_lang, oldest_distinct_boost, scoringtables, scanner) is

         // touched, so the caller must set those separately.

         // NOTE(review): memset is used here but this header directly includes

         // only <stdio.h>; presumably <string.h> arrives through one of the

         // project headers -- confirm.

   153   void init() {

   154     memset(&langprior_boost, 0, sizeof(langprior_boost));

   155     memset(&langprior_whack, 0, sizeof(langprior_whack));

   156     memset(&distinct_boost, 0, sizeof(distinct_boost));

   157   };

   158 } ScoringContext;

   162 // Begin private

   164 // Holds one scoring-table lookup hit. We hold indirect subscript instead of

   165 // langprob to allow a single hit to use a variable number of langprobs.

   166 // Element type of ScoringHitBuffer's base[], delta[] and distinct[] arrays.

   167 typedef struct {

   168   int offset;         // First byte of quad/octa etc. in scriptspan

   169   int indirect;       // subscript of langprobs in scoring table

   170 } ScoringHit;

   171 typedef enum {                       // Kind of hit; stored in LangprobHit.type

   172   UNIHIT                       = 0,  // presumably a unigram (CJK) base hit -- TODO confirm

   173   QUADHIT                      = 1,  // presumably a quadgram base hit -- TODO confirm

   174   DELTAHIT                     = 2,  // presumably from the delta[] hits -- TODO confirm

   175   DISTINCTHIT                  = 3   // presumably from the distinct[] hits -- TODO confirm

   176 } LinearHitType;

   178 // Holds one scoring-table lookup hit resolved into a langprob.

   179 // Element type of ScoringHitBuffer.linear[]. Note the offset is 16-bit here,

   180 // whereas ScoringHit.offset is a full int.

   181 typedef struct {

   182   uint16 offset;      // First byte of quad/octa etc. in scriptspan

   183   uint16 type;        // LinearHitType

   184   uint32 langprob;    // langprob from scoring table

   185 } LangprobHit;

   185 // Holds arrays of scoring-table lookup hits for (part of) a scriptspan

   186 typedef struct {

   187   ULScript ulscript;        // langprobs below are with respect to this script

   188   int maxscoringhits;       // determines size of arrays below

   189   int next_base;            // First unused entry in each array

   190   int next_delta;           //   "

   191   int next_distinct;        //   "

   192   int next_linear;          //   "

   193   int next_chunk_start;     // First unused chunk_start entry

   194   int lowest_offset;        // First byte of text span used to fill hitbuffer

   195   // Dummy entry at the end of each giving offset of first unused text byte

         // (hence the "+ 1" in every array size below)

   196   ScoringHit base[kMaxScoringHits + 1];         // Uni/quad hits

   197   ScoringHit delta[kMaxScoringHits + 1];        // delta-bi/delta-octa hits

   198   ScoringHit distinct[kMaxScoringHits + 1];     // distinct-word hits

   199   LangprobHit linear[4 * kMaxScoringHits + 1];  // Above three merge-sorted

   200                                                 // (4: some bases => 2 linear)

   201   int chunk_start[kMaxSummaries + 1];           // First linear[] subscr of

   202                                                 //  each scored chunk

   203   int chunk_offset[kMaxSummaries + 1];          // First text subscr of

   204                                                 //  each scored chunk

         // Resets the buffer to empty: script set to ULScript_Common, capacity set

         // to kMaxScoringHits, every next_* counter and lowest_offset set to 0, and

         // element [0] of each array cleared. Entries past [0] are left as-is.

         // NOTE(review): linear[0].type is not assigned here, only offset and

         // langprob -- confirm nothing reads it before it is written.

   206   void init() {

   207     ulscript = ULScript_Common;

   208     maxscoringhits = kMaxScoringHits;

   209     next_base = 0;

   210     next_delta = 0;

   211     next_distinct = 0;

   212     next_linear = 0;

   213     next_chunk_start = 0;

   214     lowest_offset = 0;

   215     base[0].offset = 0;

   216     base[0].indirect = 0;

   217     delta[0].offset = 0;

   218     delta[0].indirect = 0;

   219     distinct[0].offset = 0;

   220     distinct[0].indirect = 0;

   221     linear[0].offset = 0;

   222     linear[0].langprob = 0;

   223     chunk_start[0] = 0;

   224     chunk_offset[0] = 0;

   225   };

   226 } ScoringHitBuffer;

   228 // Describes one chunk as a start subscript plus a length into each of the

         // three ScoringHitBuffer hit arrays (base, delta, distinct).

         // TODO: Explain here why we need both ChunkSpan and ChunkSummary

   229 typedef struct {

   230   int chunk_base;       // Subscript of first hitbuffer.base[] in chunk

   231   int chunk_delta;      // Subscript of first hitbuffer.delta[]

   232   int chunk_distinct;   // Subscript of first hitbuffer.distinct[]

   233   int base_len;         // Number of hitbuffer.base[] in chunk

   234   int delta_len;        // Number of hitbuffer.delta[] in chunk

   235   int distinct_len;     // Number of hitbuffer.distinct[] in chunk

   236 } ChunkSpan;

   239 // Per-chunk scoring result. Packed into 20 bytes for space

         // (nine uint16 fields = 18 bytes, plus two uint8 fields = 2 bytes).

   240 typedef struct {

   241   uint16 offset;              // Text offset within current scriptspan.text

   242   uint16 chunk_start;         // Scoring subscr within hitbuffer->linear[]

   243   uint16 lang1;               // Top lang, mapped to full Language

   244   uint16 lang2;               // Second lang, mapped to full Language

   245   uint16 score1;              // Top lang raw score

   246   uint16 score2;              // Second lang raw score

   247   uint16 bytes;               // Number of lower letters bytes in chunk

   248   uint16 grams;               // Number of scored base quad- uni-grams in chunk

   249   uint16 ulscript;            // ULScript of chunk

   250   uint8 reliability_delta;    // Reliability 0..100, delta top:second scores

   251   uint8 reliability_score;    // Reliability 0..100, top:expected score

   252 } ChunkSummary;

   255 // We buffer up ~50 chunk summaries, corresponding to chunks of 20 quads in a

   256 // 1000-quad hit buffer, so we can do boundary adjustment on them

   257 // when adjacent entries are different languages. After that, we add them

   258 // all into the document score

   259 //

   260 // About 50 * 20 = 1000 bytes. OK for stack alloc

   261 typedef struct {

   262   int n;                                        // presumably the count of chunksummary[] entries in use -- TODO confirm

   263   ChunkSummary chunksummary[kMaxSummaries + 1]; // kMaxSummaries (50) entries plus one extra slot

   264 } SummaryBuffer;

   266 // End private

   269 // Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating

   270 // scoringcontext. scriptspan is read-only (const reference); the other three

         // arguments are pointers to caller-owned objects. Declaration only; the

         // definition is not in this header.

   271 void ScoreEntireScriptSpan(const LangSpan& scriptspan,

   272                            ScoringContext* scoringcontext,

   273                            DocTote* doc_tote,

   274                            ResultChunkVector* vec);

   276 // Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext.

         // scriptspan is read-only (const reference). Declaration only; the

         // definition is not in this header.

   277 void ScoreCJKScriptSpan(const LangSpan& scriptspan,

   278                         ScoringContext* scoringcontext,

   279                         DocTote* doc_tote,

   280                         ResultChunkVector* vec);

   282 // Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext.

         // scriptspan is read-only (const reference). Declaration only; the

         // definition is not in this header.

   283 void ScoreQuadScriptSpan(const LangSpan& scriptspan,

   284                          ScoringContext* scoringcontext,

   285                          DocTote* doc_tote,

   286                          ResultChunkVector* vec);

   288 // Score one scriptspan into doc_tote and vec, updating scoringcontext.

         // Same signature as the three RType-specific functions above; presumably

         // selects among them by the span's RType -- TODO confirm in the .cc file.

   289 void ScoreOneScriptSpan(const LangSpan& scriptspan,

   290                         ScoringContext* scoringcontext,

   291                         DocTote* doc_tote,

   292                         ResultChunkVector* vec);

   294 }       // End namespace CLD2

   296 #endif  // I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__

The Tor Browser / file revision

browser/components/translation/cld2/internal/scoreonescriptspan.h@6474c204b198

browser/components/translation/cld2/internal/scoreonescriptspan.h