browser/components/translation/cld2/internal/scoreonescriptspan.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/browser/components/translation/cld2/internal/scoreonescriptspan.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,297 @@
     1.4 +// Copyright 2013 Google Inc. All Rights Reserved.
     1.5 +//
     1.6 +// Licensed under the Apache License, Version 2.0 (the "License");
     1.7 +// you may not use this file except in compliance with the License.
     1.8 +// You may obtain a copy of the License at
     1.9 +//
    1.10 +//     http://www.apache.org/licenses/LICENSE-2.0
    1.11 +//
    1.12 +// Unless required by applicable law or agreed to in writing, software
    1.13 +// distributed under the License is distributed on an "AS IS" BASIS,
    1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    1.15 +// See the License for the specific language governing permissions and
    1.16 +// limitations under the License.
    1.17 +
    1.18 +//
    1.19 +// Author: dsites@google.com (Dick Sites)
    1.20 +//
    1.21 +//
    1.22 +// Terminology:
    1.23 +// Incoming original text has HTML tags and entities removed, all but letters
    1.24 +// removed, and letters lowercased. Strings of non-letters are mapped to a
    1.25 +// single ASCII space.
    1.26 +//
    1.27 +// One scriptspan has a run of letters/spaces in a single script. This is the
    1.28 +// fundamental text unit that is scored. There is an optional backmap from
    1.29 +// scriptspan text to the original document text, so that the language ranges
    1.30 +// reported in ResultChunkVector refer to byte ranges in the original text.
    1.31 +//
    1.32 +// Scripts come in two forms, the full Unicode scripts described by
    1.33 +//   http://www.unicode.org/Public/UNIDATA/Scripts.txt
    1.34 +// and a modified list used exclusively in CLD2. The modified form maps all
    1.35 +// the CJK scripts to one, Hani. The current version description is in
    1.36 +//  i18n/encodings/cld2/builddata/script_summary.txt
    1.37 +// In addition, all non-letters are mapped to the Common script.
    1.38 +//
    1.39 +// ULScript describes this Unicode Letter script.
    1.40 +//
    1.41 +// Scoring uses text nil-grams, uni-grams, bi-grams, quad-grams, and octa-grams.
    1.42 +// Nilgrams (no text lookup at all) are for script-based pseudo-languages and
    1.43 +// for languages that are 1:1 with a given script. Unigrams and bigrams are
    1.44 +// used to score the CJK languages, all in the Hani script. Quadgrams and
    1.45 +// octagrams are used to score all other languages.
    1.46 +//
    1.47 +// RType is the Recognition Type per ulscript.
    1.48 +//
    1.49 +// The scoring tables map various grams to language-probability scores.
    1.50 +// A given gram that hits in a scoring table maps to an indirect subscript into
    1.51 +// a list of packed languages and log probabilities.
    1.52 +//
    1.53 +// Languages are stored in two forms: 10-bit values in the Language enum, and
    1.54 +// shorter 8-bit per-ulscript values in the scoring tables.
    1.55 +//
    1.56 +// Language refers to the full 10-bit range.
    1.57 +// pslang refers to the per-ulscript shorter values.
    1.58 +//
    1.59 +// Log probabilities also come in two forms. The full range uses values 0..255
    1.60 +// to represent minus log base 10th-root-of-2, covering 1 .. 1/2**25.5 or about
    1.61 +// TODO BOGUS description, 24 vs 12
    1.62 +// 1/47.5M. The second form quantizes these into multiples of 8 that can be
    1.63 +// added together to represent probability products. The quantized form uses
    1.64 +// values 24..0 with 0 now least likely instead of most likely, thus making
    1.65 +// larger sums for more probable results. 24 maps to original 1/2**4.8 (~1/28)
    1.66 +// and 0 maps to original 1/2**24.0 (~1/16M).
    1.67 +//
    1.68 +// qprob refers to quantized log probabilities.
    1.69 +//
    1.70 +// langprob is a uint32 holding three 1-byte pslangs and a 1-byte subscript to
    1.71 +// a list of three qprobs. It always needs a companion ulscript.
    1.72 +//
    1.73 +// A scriptspan is scored via one or more hitbuffers.
    1.74 +
    1.75 +
    1.76 +#ifndef I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__
    1.77 +#define I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__
    1.78 +
    1.79 +#include <stdio.h>
    1.80 +
    1.81 +#include "integral_types.h"           // for uint8 etc.
    1.82 +
    1.83 +#include "cld2tablesummary.h"
    1.84 +#include "compact_lang_det_impl.h"    // for ResultChunkVector
    1.85 +#include "getonescriptspan.h"
    1.86 +#include "langspan.h"
    1.87 +#include "tote.h"
    1.88 +#include "utf8statetable.h"
    1.89 +
    1.90 +namespace CLD2 {
    1.91 +
    1.92 +static const int kMaxBoosts = 4;              // Size of LangBoosts::langprob[];
    1.93 +                                              // must be power of two for wrap()
    1.94 +static const int kChunksizeQuads = 20;        // Quads per scored chunk (non-CJK)
    1.95 +static const int kChunksizeUnis = 50;         // Chunk size for CJK
    1.96 +static const int kMaxScoringHits = 1000;      // Sizes the ScoringHitBuffer hit arrays
    1.97 +static const int kMaxSummaries = kMaxScoringHits / kChunksizeQuads;  // = 50 chunks
    1.98 +
    1.99 +
   1.100 +// Bundle of lookup tables used for scoring. The first four tables are for
   1.101 +// CJK languages, the next four for quadgram languages, and
   1.102 +// the last for expected scores.
   1.103 +typedef struct {
   1.104 +  const UTF8PropObj* unigram_obj;               // 80K CJK characters
   1.105 +  const CLD2TableSummary* unigram_compat_obj;   // 256 CJK lookup probabilities
   1.106 +  const CLD2TableSummary* deltabi_obj;          // CJK delta-bigram table
   1.107 +  const CLD2TableSummary* distinctbi_obj;       // CJK distinct-bigram table
   1.108 +
   1.109 +  const CLD2TableSummary* quadgram_obj;         // Primary quadgram lookup table
   1.110 +  const CLD2TableSummary* quadgram_obj2;        // Secondary quadgram lookup table
   1.111 +  const CLD2TableSummary* deltaocta_obj;        // Delta-octagram table
   1.112 +  const CLD2TableSummary* distinctocta_obj;     // Distinct-octagram table
   1.113 +
   1.114 +  const short* kExpectedScore;      // Expected base + delta + distinct score
   1.115 +                                    // per 1KB input
   1.116 +                                    // Subscripted by language and script4
   1.117 +} ScoringTables;
   1.118 +
   1.119 +// Context for boosting several languages: up to kMaxBoosts packed langprobs
   1.120 +typedef struct {  // NOTE(review): unnamed typedef'd struct with a member function; newer C++ compilers may warn
   1.121 +   int32 n;                       // Subscript into langprob[] (presumably next slot to fill -- confirm)
   1.122 +   uint32 langprob[kMaxBoosts];   // Packed langprob entries
   1.123 +   int wrap(int32 n) {return n & (kMaxBoosts - 1);}  // Masks n into 0..kMaxBoosts-1; parameter shadows member n
   1.124 +} LangBoosts;
   1.125 +
   1.126 +typedef struct {      // Pair of LangBoosts sets, selected by script
   1.127 +   LangBoosts latn;   // Presumably boosts for Latin-script text -- confirm
   1.128 +   LangBoosts othr;   // Presumably boosts for all other scripts -- confirm
   1.129 +} PerScriptLangBoosts;
   1.130 +
   1.131 +
   1.132 +
   1.133 +// ScoringContext carries state across scriptspans.
   1.134 +// ScoringContext also has read-only scoring tables mapping grams to qprobs.
   1.135 +typedef struct {
   1.136 +  FILE* debug_file;                   // Non-NULL if debug output wanted
   1.137 +  bool flags_cld2_score_as_quads;     // flags_cld2_*: caller-supplied option flags
   1.138 +  bool flags_cld2_html;
   1.139 +  bool flags_cld2_cr;
   1.140 +  bool flags_cld2_verbose;
   1.141 +  ULScript ulscript;        // langprobs below are with respect to this script
   1.142 +  Language prior_chunk_lang;          // Mostly for debug output
   1.143 +  // boost has a packed set of per-script langs and probabilities
   1.144 +  // whack has a per-script lang to be suppressed from ever scoring (zeroed)
   1.145 +  // When a language in a close set is given as an explicit hint, others in
   1.146 +  //  that set will be whacked.
   1.147 +  PerScriptLangBoosts langprior_boost;  // From http content-lang or meta lang=
   1.148 +  PerScriptLangBoosts langprior_whack;  // From http content-lang or meta lang=
   1.149 +  PerScriptLangBoosts distinct_boost;   // From distinctive letter groups
   1.150 +  int oldest_distinct_boost;          // Subscript in hitbuffer of oldest
   1.151 +                                      // distinct score to use
   1.152 +  const ScoringTables* scoringtables; // Probability lookup tables
   1.153 +  ScriptScanner* scanner;             // For ResultChunkVector backmap
   1.154 +
   1.155 +  // Zeroes the three boost/whack sets only; no other member is touched.
   1.156 +  void init() {  // NOTE(review): memset needs <string.h>, not included directly here -- confirm
   1.157 +    memset(&langprior_boost, 0, sizeof(langprior_boost));
   1.158 +    memset(&langprior_whack, 0, sizeof(langprior_whack));
   1.159 +    memset(&distinct_boost, 0, sizeof(distinct_boost));
   1.160 +  };
   1.161 +} ScoringContext;
   1.162 +
   1.163 +
   1.164 +
   1.165 +// Begin private
   1.166 +
   1.167 +// Holds one scoring-table lookup hit. We hold the indirect subscript instead
   1.168 +// of the langprob so that a single hit can use a variable number of langprobs.
   1.169 +typedef struct {
   1.170 +  int offset;         // Offset of first byte of quad/octa etc. in scriptspan
   1.171 +  int indirect;       // Subscript of langprobs in scoring table
   1.172 +} ScoringHit;
   1.173 +
   1.174 +typedef enum {                        // Kind of hit; stored in LangprobHit.type
   1.175 +  UNIHIT                       = 0,   // Unigram base hit
   1.176 +  QUADHIT                      = 1,   // Quadgram base hit
   1.177 +  DELTAHIT                     = 2,   // Delta hit
   1.178 +  DISTINCTHIT                  = 3    // Distinct hit
   1.179 +} LinearHitType;
   1.180 +
   1.181 +// Holds one scoring-table lookup hit resolved into a langprob.
   1.182 +typedef struct {
   1.183 +  uint16 offset;      // First byte of quad/octa etc. in scriptspan. NOTE(review): 16-bit here, int in ScoringHit
   1.184 +  uint16 type;        // LinearHitType
   1.185 +  uint32 langprob;    // langprob from scoring table
   1.186 +} LangprobHit;
   1.187 +
   1.188 +// Holds arrays of scoring-table lookup hits for (part of) a scriptspan.
   1.189 +typedef struct {
   1.190 +  ULScript ulscript;        // langprobs below are with respect to this script
   1.191 +  int maxscoringhits;       // Hit limit; init() sets it to kMaxScoringHits
   1.192 +  int next_base;            // First unused entry in base[]
   1.193 +  int next_delta;           // First unused entry in delta[]
   1.194 +  int next_distinct;        // First unused entry in distinct[]
   1.195 +  int next_linear;          // First unused entry in linear[]
   1.196 +  int next_chunk_start;     // First unused chunk_start entry
   1.197 +  int lowest_offset;        // First byte of text span used to fill hitbuffer
   1.198 +  // Dummy entry at the end of each giving offset of first unused text byte
   1.199 +  ScoringHit base[kMaxScoringHits + 1];         // Uni/quad hits
   1.200 +  ScoringHit delta[kMaxScoringHits + 1];        // delta-bi/delta-octa hits
   1.201 +  ScoringHit distinct[kMaxScoringHits + 1];     // distinct-word hits
   1.202 +  LangprobHit linear[4 * kMaxScoringHits + 1];  // Above three merge-sorted
   1.203 +                                                // (4: some bases => 2 linear)
   1.204 +  int chunk_start[kMaxSummaries + 1];           // First linear[] subscr of
   1.205 +                                                //  each scored chunk
   1.206 +  int chunk_offset[kMaxSummaries + 1];          // First text subscr of
   1.207 +                                                //  each scored chunk
   1.208 +
   1.209 +  void init() {                     // Resets counters and entry 0 of each array only
   1.210 +    ulscript = ULScript_Common;
   1.211 +    maxscoringhits = kMaxScoringHits;
   1.212 +    next_base = 0;
   1.213 +    next_delta = 0;
   1.214 +    next_distinct = 0;
   1.215 +    next_linear = 0;
   1.216 +    next_chunk_start = 0;
   1.217 +    lowest_offset = 0;
   1.218 +    base[0].offset = 0;
   1.219 +    base[0].indirect = 0;
   1.220 +    delta[0].offset = 0;
   1.221 +    delta[0].indirect = 0;
   1.222 +    distinct[0].offset = 0;
   1.223 +    distinct[0].indirect = 0;
   1.224 +    linear[0].offset = 0;
   1.225 +    linear[0].langprob = 0;         // NOTE(review): linear[0].type is not set here
   1.226 +    chunk_start[0] = 0;
   1.227 +    chunk_offset[0] = 0;
   1.228 +  };
   1.229 +} ScoringHitBuffer;
   1.230 +
   1.231 +// One chunk as ranges of hitbuffer subscripts. TODO: Explain here why we need both ChunkSpan and ChunkSummary
   1.232 +typedef struct {
   1.233 +  int chunk_base;       // Subscript of first hitbuffer.base[] in chunk
   1.234 +  int chunk_delta;      // Subscript of first hitbuffer.delta[] in chunk
   1.235 +  int chunk_distinct;   // Subscript of first hitbuffer.distinct[] in chunk
   1.236 +  int base_len;         // Number of hitbuffer.base[] in chunk
   1.237 +  int delta_len;        // Number of hitbuffer.delta[] in chunk
   1.238 +  int distinct_len;     // Number of hitbuffer.distinct[] in chunk
   1.239 +} ChunkSpan;
   1.240 +
   1.241 +
   1.242 +// Scoring result for one chunk. Packed into 20 bytes for space (9 x uint16 + 2 x uint8)
   1.243 +typedef struct {
   1.244 +  uint16 offset;              // Text offset within current scriptspan.text
   1.245 +  uint16 chunk_start;         // Scoring subscr within hitbuffer->linear[]
   1.246 +  uint16 lang1;               // Top lang, mapped to full Language
   1.247 +  uint16 lang2;               // Second lang, mapped to full Language
   1.248 +  uint16 score1;              // Top lang raw score
   1.249 +  uint16 score2;              // Second lang raw score
   1.250 +  uint16 bytes;               // Number of lowercased-letter bytes in chunk
   1.251 +  uint16 grams;               // Number of scored base quad- uni-grams in chunk
   1.252 +  uint16 ulscript;            // ULScript of chunk
   1.253 +  uint8 reliability_delta;    // Reliability 0..100, delta top:second scores
   1.254 +  uint8 reliability_score;    // Reliability 0..100, top:expected score
   1.255 +} ChunkSummary;
   1.256 +
   1.257 +
   1.258 +// We buffer up ~50 chunk summaries, corresponding to chunks of 20 quads in a
   1.259 +// 1000-quad hit buffer, so we can do boundary adjustment on them
   1.260 +// when adjacent entries are different languages. After that, we add them
   1.261 +// all into the document score.
   1.262 +//
   1.263 +// About 50 * 20 = 1000 bytes. OK for stack alloc
   1.264 +typedef struct {
   1.265 +  int n;                                          // Presumably count of entries in use -- confirm
   1.266 +  ChunkSummary chunksummary[kMaxSummaries + 1];   // kMaxSummaries entries plus one extra slot
   1.267 +} SummaryBuffer;
   1.268 +
   1.269 +// End private
   1.270 +
   1.271 +
   1.272 +// Scores an RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating
   1.273 +// scoringcontext.
   1.274 +void ScoreEntireScriptSpan(const LangSpan& scriptspan,         // Span to score (read-only)
   1.275 +                           ScoringContext* scoringcontext,    // State carried across scriptspans
   1.276 +                           DocTote* doc_tote,                 // Receives the scores
   1.277 +                           ResultChunkVector* vec);           // Receives the language ranges
   1.278 +
   1.279 +// Scores an RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext.
   1.280 +void ScoreCJKScriptSpan(const LangSpan& scriptspan,         // Span to score (read-only)
   1.281 +                        ScoringContext* scoringcontext,    // State carried across scriptspans
   1.282 +                        DocTote* doc_tote,                 // Receives the scores
   1.283 +                        ResultChunkVector* vec);           // Receives the language ranges
   1.284 +
   1.285 +// Scores an RTypeMany scriptspan into doc_tote and vec, updating scoringcontext.
   1.286 +void ScoreQuadScriptSpan(const LangSpan& scriptspan,         // Span to score (read-only)
   1.287 +                         ScoringContext* scoringcontext,    // State carried across scriptspans
   1.288 +                         DocTote* doc_tote,                 // Receives the scores
   1.289 +                         ResultChunkVector* vec);           // Receives the language ranges
   1.290 +
   1.291 +// Scores one scriptspan into doc_tote and vec, updating scoringcontext.
   1.292 +void ScoreOneScriptSpan(const LangSpan& scriptspan,         // Span to score (read-only)
   1.293 +                        ScoringContext* scoringcontext,    // State carried across scriptspans
   1.294 +                        DocTote* doc_tote,                 // Receives the scores
   1.295 +                        ResultChunkVector* vec);           // Receives the language ranges
   1.296 +
   1.297 +}       // End namespace CLD2
   1.298 +
   1.299 +#endif  // I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__
   1.300 +

mercurial