browser/components/translation/cld2/internal/scoreonescriptspan.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 // Copyright 2013 Google Inc. All Rights Reserved.
michael@0 2 //
michael@0 3 // Licensed under the Apache License, Version 2.0 (the "License");
michael@0 4 // you may not use this file except in compliance with the License.
michael@0 5 // You may obtain a copy of the License at
michael@0 6 //
michael@0 7 // http://www.apache.org/licenses/LICENSE-2.0
michael@0 8 //
michael@0 9 // Unless required by applicable law or agreed to in writing, software
michael@0 10 // distributed under the License is distributed on an "AS IS" BASIS,
michael@0 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
michael@0 12 // See the License for the specific language governing permissions and
michael@0 13 // limitations under the License.
michael@0 14
michael@0 15 //
michael@0 16 // Author: dsites@google.com (Dick Sites)
michael@0 17 //
michael@0 18 //
michael@0 19 // Terminology:
michael@0 20 // Incoming original text has HTML tags and entities removed, all but letters
michael@0 21 // removed, and letters lowercased. Strings of non-letters are mapped to a
michael@0 22 // single ASCII space.
michael@0 23 //
michael@0 24 // One scriptspan has a run of letters/spaces in a single script. This is the
michael@0 25 // fundamental text unit that is scored. There is an optional backmap from
michael@0 26 // scriptspan text to the original document text, so that the language ranges
michael@0 27 // reported in ResultChunkVector refer to byte ranges in the original text.
michael@0 28 //
michael@0 29 // Scripts come in two forms, the full Unicode scripts described by
michael@0 30 // http://www.unicode.org/Public/UNIDATA/Scripts.txt
michael@0 31 // and a modified list used exclusively in CLD2. The modified form maps all
michael@0 32 // the CJK scripts to one, Hani. The current version description is in
michael@0 33 // i18n/encodings/cld2/builddata/script_summary.txt
michael@0 34 // In addition, all non-letters are mapped to the Common script.
michael@0 35 //
michael@0 36 // ULScript describes this Unicode Letter script.
michael@0 37 //
michael@0 38 // Scoring uses text nil-grams, uni-grams, bi-grams, quad-grams, and octa-grams.
michael@0 39 // Nilgrams (no text lookup at all) are for script-based pseudo-languages and
michael@0 40 // for languages that are 1:1 with a given script. Unigrams and bigrams are
michael@0 41 // used to score the CJK languages, all in the Hani script. Quadgrams and
michael@0 42 // octagrams are used to score all other languages.
michael@0 43 //
michael@0 44 // RType is the Recognition Type per ulscript.
michael@0 45 //
michael@0 46 // The scoring tables map various grams to language-probability scores.
michael@0 47 // A given gram that hits in scoring table maps to an indirect subscript into
michael@0 48 // a list of packed languages and log probabilities.
michael@0 49 //
michael@0 50 // Languages are stored in two forms: 10-bit values in the Language enum, and
michael@0 51 // shorter 8-bit per-ulscript values in the scoring tables.
michael@0 52 //
michael@0 53 // Language refers to the full 10-bit range.
michael@0 54 // pslang refers to the per-ulscript shorter values.
michael@0 55 //
michael@0 56 // Log probabilities also come in two forms. The full range uses values 0..255
michael@0 57 // to represent minus log base 10th-root-of-2, covering 1 .. 1/2**25.5 or about
michael@0 58 // TODO BOGUS description, 24 vs 12
michael@0 59 // 1/47.5M. The second form quantizes these into multiples of 8 that can be
michael@0 60 // added together to represent probability products. The quantized form uses
michael@0 61 // values 24..0 with 0 now least likely instead of most likely, thus making
michael@0 62 // larger sums for more probable results. 24 maps to original 1/2**4.8 (~1/28)
michael@0 63 // and 0 maps to original 1/2**24.0 (~1/16M).
michael@0 64 //
michael@0 65 // qprob refers to quantized log probabilities.
michael@0 66 //
michael@0 67 // langprob is a uint32 holding three 1-byte pslangs and a 1-byte subscript to
michael@0 68 // a list of three qprobs. It always needs a companion ulscript.
michael@0 69 //
michael@0 70 // A scriptspan is scored via one or more hitbuffers
michael@0 71
michael@0 72
michael@0 73 #ifndef I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__
michael@0 74 #define I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__
michael@0 75
michael@0 76 #include <stdio.h>
michael@0 77
michael@0 78 #include "integral_types.h" // for uint8 etc.
michael@0 79
michael@0 80 #include "cld2tablesummary.h"
michael@0 81 #include "compact_lang_det_impl.h" // for ResultChunkVector
michael@0 82 #include "getonescriptspan.h"
michael@0 83 #include "langspan.h"
michael@0 84 #include "tote.h"
michael@0 85 #include "utf8statetable.h"
michael@0 86
michael@0 87 namespace CLD2 {
michael@0 88
// Number of langprob[] entries in each LangBoosts of PerScriptLangBoosts.
// LangBoosts::wrap() masks with (kMaxBoosts - 1), so this value must remain
// a power of two.
static const int kMaxBoosts = 4;

// Chunk sizes, counted in scoring hits.
static const int kChunksizeQuads = 20;    // Non-CJK text
static const int kChunksizeUnis = 50;     // CJK text

// Capacity of the per-scriptspan hit arrays.
static const int kMaxScoringHits = 1000;

// Number of quad-sized chunks that fit in one full hit buffer.
static const int kMaxSummaries = kMaxScoringHits / kChunksizeQuads;
michael@0 95
michael@0 96
michael@0 97 // The first four tables are for CJK languages,
michael@0 98 // the next three for quadgram languages, and
michael@0 99 // the last for expected scores.
michael@0 100 typedef struct {
michael@0 101 const UTF8PropObj* unigram_obj; // 80K CJK characters
michael@0 102 const CLD2TableSummary* unigram_compat_obj; // 256 CJK lookup probabilities
michael@0 103 const CLD2TableSummary* deltabi_obj;
michael@0 104 const CLD2TableSummary* distinctbi_obj;
michael@0 105
michael@0 106 const CLD2TableSummary* quadgram_obj; // Primary quadgram lookup table
michael@0 107 const CLD2TableSummary* quadgram_obj2; // Secondary "
michael@0 108 const CLD2TableSummary* deltaocta_obj;
michael@0 109 const CLD2TableSummary* distinctocta_obj;
michael@0 110
michael@0 111 const short* kExpectedScore; // Expected base + delta + distinct score
michael@0 112 // per 1KB input
michael@0 113 // Subscripted by language and script4
michael@0 114 } ScoringTables;
michael@0 115
michael@0 116 // Context for boosting several languages
michael@0 117 typedef struct {
michael@0 118 int32 n;
michael@0 119 uint32 langprob[kMaxBoosts];
michael@0 120 int wrap(int32 n) {return n & (kMaxBoosts - 1);}
michael@0 121 } LangBoosts;
michael@0 122
michael@0 123 typedef struct {
michael@0 124 LangBoosts latn;
michael@0 125 LangBoosts othr;
michael@0 126 } PerScriptLangBoosts;
michael@0 127
michael@0 128
michael@0 129
michael@0 130 // ScoringContext carries state across scriptspans
michael@0 131 // ScoringContext also has read-only scoring tables mapping grams to qprobs
michael@0 132 typedef struct {
michael@0 133 FILE* debug_file; // Non-NULL if debug output wanted
michael@0 134 bool flags_cld2_score_as_quads;
michael@0 135 bool flags_cld2_html;
michael@0 136 bool flags_cld2_cr;
michael@0 137 bool flags_cld2_verbose;
michael@0 138 ULScript ulscript; // langprobs below are with respect to this script
michael@0 139 Language prior_chunk_lang; // Mostly for debug output
michael@0 140 // boost has a packed set of per-script langs and probabilites
michael@0 141 // whack has a per-script lang to be suppressed from ever scoring (zeroed)
michael@0 142 // When a language in a close set is given as an explicit hint, others in
michael@0 143 // that set will be whacked.
michael@0 144 PerScriptLangBoosts langprior_boost; // From http content-lang or meta lang=
michael@0 145 PerScriptLangBoosts langprior_whack; // From http content-lang or meta lang=
michael@0 146 PerScriptLangBoosts distinct_boost; // From distinctive letter groups
michael@0 147 int oldest_distinct_boost; // Subscript in hitbuffer of oldest
michael@0 148 // distinct score to use
michael@0 149 const ScoringTables* scoringtables; // Probability lookup tables
michael@0 150 ScriptScanner* scanner; // For ResultChunkVector backmap
michael@0 151
michael@0 152 // Inits boosts
michael@0 153 void init() {
michael@0 154 memset(&langprior_boost, 0, sizeof(langprior_boost));
michael@0 155 memset(&langprior_whack, 0, sizeof(langprior_whack));
michael@0 156 memset(&distinct_boost, 0, sizeof(distinct_boost));
michael@0 157 };
michael@0 158 } ScoringContext;
michael@0 159
michael@0 160
michael@0 161
michael@0 162 // Begin private
michael@0 163
// One hit from a scoring-table lookup.
// The hit stores an indirect subscript rather than a langprob so that a
// single hit can refer to a variable number of langprobs.
typedef struct {
  // First byte of quad/octa etc. in scriptspan
  int offset;
  // Subscript of langprobs in scoring table
  int indirect;
} ScoringHit;
michael@0 170
// Kind of a merged ("linear") hit; this is the value kept in LangprobHit.type.
// The numeric values are spelled out because they are stored in an integer
// field.
typedef enum {
  UNIHIT      = 0,
  QUADHIT     = 1,
  DELTAHIT    = 2,
  DISTINCTHIT = 3
} LinearHitType;
michael@0 177
michael@0 178 // Holds one scoring-table lookup hit resolved into a langprob.
michael@0 179 typedef struct {
michael@0 180 uint16 offset; // First byte of quad/octa etc. in scriptspan
michael@0 181 uint16 type; // LinearHitType
michael@0 182 uint32 langprob; // langprob from scoring table
michael@0 183 } LangprobHit;
michael@0 184
michael@0 185 // Holds arrays of scoring-table lookup hits for (part of) a scriptspan
michael@0 186 typedef struct {
michael@0 187 ULScript ulscript; // langprobs below are with respect to this script
michael@0 188 int maxscoringhits; // determines size of arrays below
michael@0 189 int next_base; // First unused entry in each array
michael@0 190 int next_delta; // "
michael@0 191 int next_distinct; // "
michael@0 192 int next_linear; // "
michael@0 193 int next_chunk_start; // First unused chunk_start entry
michael@0 194 int lowest_offset; // First byte of text span used to fill hitbuffer
michael@0 195 // Dummy entry at the end of each giving offset of first unused text byte
michael@0 196 ScoringHit base[kMaxScoringHits + 1]; // Uni/quad hits
michael@0 197 ScoringHit delta[kMaxScoringHits + 1]; // delta-bi/delta-octa hits
michael@0 198 ScoringHit distinct[kMaxScoringHits + 1]; // distinct-word hits
michael@0 199 LangprobHit linear[4 * kMaxScoringHits + 1]; // Above three merge-sorted
michael@0 200 // (4: some bases => 2 linear)
michael@0 201 int chunk_start[kMaxSummaries + 1]; // First linear[] subscr of
michael@0 202 // each scored chunk
michael@0 203 int chunk_offset[kMaxSummaries + 1]; // First text subscr of
michael@0 204 // each scored chunk
michael@0 205
michael@0 206 void init() {
michael@0 207 ulscript = ULScript_Common;
michael@0 208 maxscoringhits = kMaxScoringHits;
michael@0 209 next_base = 0;
michael@0 210 next_delta = 0;
michael@0 211 next_distinct = 0;
michael@0 212 next_linear = 0;
michael@0 213 next_chunk_start = 0;
michael@0 214 lowest_offset = 0;
michael@0 215 base[0].offset = 0;
michael@0 216 base[0].indirect = 0;
michael@0 217 delta[0].offset = 0;
michael@0 218 delta[0].indirect = 0;
michael@0 219 distinct[0].offset = 0;
michael@0 220 distinct[0].indirect = 0;
michael@0 221 linear[0].offset = 0;
michael@0 222 linear[0].langprob = 0;
michael@0 223 chunk_start[0] = 0;
michael@0 224 chunk_offset[0] = 0;
michael@0 225 };
michael@0 226 } ScoringHitBuffer;
michael@0 227
// Describes one chunk as three (start, length) ranges, one for each of the
// hitbuffer arrays base[], delta[] and distinct[].
// TODO: Explain here why we need both ChunkSpan and ChunkSummary
typedef struct {
  // Start subscripts
  int chunk_base;       // Subscript of first hitbuffer.base[] in chunk
  int chunk_delta;      // Subscript of first hitbuffer.delta[]
  int chunk_distinct;   // Subscript of first hitbuffer.distinct[]
  // Lengths
  int base_len;         // Number of hitbuffer.base[] in chunk
  int delta_len;        // Number of hitbuffer.delta[] in chunk
  int distinct_len;     // Number of hitbuffer.distinct[] in chunk
} ChunkSpan;
michael@0 237
michael@0 238
michael@0 239 // Packed into 20 bytes for space
michael@0 240 typedef struct {
michael@0 241 uint16 offset; // Text offset within current scriptspan.text
michael@0 242 uint16 chunk_start; // Scoring subscr within hitbuffer->linear[]
michael@0 243 uint16 lang1; // Top lang, mapped to full Language
michael@0 244 uint16 lang2; // Second lang, mapped to full Language
michael@0 245 uint16 score1; // Top lang raw score
michael@0 246 uint16 score2; // Second lang raw score
michael@0 247 uint16 bytes; // Number of lower letters bytes in chunk
michael@0 248 uint16 grams; // Number of scored base quad- uni-grams in chunk
michael@0 249 uint16 ulscript; // ULScript of chunk
michael@0 250 uint8 reliability_delta; // Reliability 0..100, delta top:second scores
michael@0 251 uint8 reliability_score; // Reliability 0..100, top:expected score
michael@0 252 } ChunkSummary;
michael@0 253
michael@0 254
michael@0 255 // We buffer up ~50 chunk summaries, corresponding to chunks of 20 quads in a
michael@0 256 // 1000-quad hit buffer, so we can do boundary adjustment on them
michael@0 257 // when adjacent entries are different languages. After that, we add them
michael@0 258 // all into the document score
michael@0 259 //
michael@0 260 // About 50 * 20 = 1000 bytes. OK for stack alloc
michael@0 261 typedef struct {
michael@0 262 int n;
michael@0 263 ChunkSummary chunksummary[kMaxSummaries + 1];
michael@0 264 } SummaryBuffer;
michael@0 265
michael@0 266 // End private
michael@0 267
michael@0 268
michael@0 269 // Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating
michael@0 270 // scoringcontext
michael@0 271 void ScoreEntireScriptSpan(const LangSpan& scriptspan,
michael@0 272 ScoringContext* scoringcontext,
michael@0 273 DocTote* doc_tote,
michael@0 274 ResultChunkVector* vec);
michael@0 275
michael@0 276 // Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext
michael@0 277 void ScoreCJKScriptSpan(const LangSpan& scriptspan,
michael@0 278 ScoringContext* scoringcontext,
michael@0 279 DocTote* doc_tote,
michael@0 280 ResultChunkVector* vec);
michael@0 281
michael@0 282 // Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext
michael@0 283 void ScoreQuadScriptSpan(const LangSpan& scriptspan,
michael@0 284 ScoringContext* scoringcontext,
michael@0 285 DocTote* doc_tote,
michael@0 286 ResultChunkVector* vec);
michael@0 287
michael@0 288 // Score one scriptspan into doc_tote and vec, updating scoringcontext
michael@0 289 void ScoreOneScriptSpan(const LangSpan& scriptspan,
michael@0 290 ScoringContext* scoringcontext,
michael@0 291 DocTote* doc_tote,
michael@0 292 ResultChunkVector* vec);
michael@0 293
michael@0 294 } // End namespace CLD2
michael@0 295
michael@0 296 #endif // I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__
michael@0 297

mercurial