1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/browser/components/translation/cld2/internal/scoreonescriptspan.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,297 @@ 1.4 +// Copyright 2013 Google Inc. All Rights Reserved. 1.5 +// 1.6 +// Licensed under the Apache License, Version 2.0 (the "License"); 1.7 +// you may not use this file except in compliance with the License. 1.8 +// You may obtain a copy of the License at 1.9 +// 1.10 +// http://www.apache.org/licenses/LICENSE-2.0 1.11 +// 1.12 +// Unless required by applicable law or agreed to in writing, software 1.13 +// distributed under the License is distributed on an "AS IS" BASIS, 1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1.15 +// See the License for the specific language governing permissions and 1.16 +// limitations under the License. 1.17 + 1.18 +// 1.19 +// Author: dsites@google.com (Dick Sites) 1.20 +// 1.21 +// 1.22 +// Terminology: 1.23 +// Incoming original text has HTML tags and entities removed, all but letters 1.24 +// removed, and letters lowercased. Strings of non-letters are mapped to a 1.25 +// single ASCII space. 1.26 +// 1.27 +// One scriptspan has a run of letters/spaces in a single script. This is the 1.28 +// fundamental text unit that is scored. There is an optional backmap from 1.29 +// scriptspan text to the original document text, so that the language ranges 1.30 +// reported in ResultChunkVector refer to byte ranges in the original text. 1.31 +// 1.32 +// Scripts come in two forms, the full Unicode scripts described by 1.33 +// http://www.unicode.org/Public/UNIDATA/Scripts.txt 1.34 +// and a modified list used exclusively in CLD2. The modified form maps all 1.35 +// the CJK scripts to one, Hani. The current version description is in 1.36 +// i18n/encodings/cld2/builddata/script_summary.txt 1.37 +// In addition, all non-letters are mapped to the Common script. 1.38 +// 1.39 +// ULScript describes this Unicode Letter script. 
1.40 +// 1.41 +// Scoring uses text nil-grams, uni-grams, bi-grams, quad-grams, and octa-grams. 1.42 +// Nilgrams (no text lookup at all) are for script-based pseudo-languages and 1.43 +// for languages that are 1:1 with a given script. Unigrams and bigrams are 1.44 +// used to score the CJK languages, all in the Hani script. Quadgrams and 1.45 +// octagrams are used to score all other languages. 1.46 +// 1.47 +// RType is the Recognition Type per ulscript. 1.48 +// 1.49 +// The scoring tables map various grams to language-probability scores. 1.50 +// A given gram that hits in a scoring table maps to an indirect subscript into 1.51 +// a list of packed languages and log probabilities. 1.52 +// 1.53 +// Languages are stored in two forms: 10-bit values in the Language enum, and 1.54 +// shorter 8-bit per-ulscript values in the scoring tables. 1.55 +// 1.56 +// Language refers to the full 10-bit range. 1.57 +// pslang refers to the per-ulscript shorter values. 1.58 +// 1.59 +// Log probabilities also come in two forms. The full range uses values 0..255 1.60 +// to represent minus log base 10th-root-of-2, covering 1 .. 1/2**25.5 or about 1.61 +// TODO BOGUS description, 24 vs 12 1.62 +// 1/47.5M. The second form quantizes these into multiples of 8 that can be 1.63 +// added together to represent probability products. The quantized form uses 1.64 +// values 24..0 with 0 now least likely instead of most likely, thus making 1.65 +// larger sums for more probable results. 24 maps to original 1/2**4.8 (~1/28) 1.66 +// and 0 maps to original 1/2**24.0 (~1/16M). 1.67 +// 1.68 +// qprob refers to quantized log probabilities. 1.69 +// 1.70 +// langprob is a uint32 holding three 1-byte pslangs and a 1-byte subscript to 1.71 +// a list of three qprobs. 
It always nees a companion ulscript 1.72 +// 1.73 +// A scriptspan is scored via one or more hitbuffers 1.74 + 1.75 + 1.76 +#ifndef I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__ 1.77 +#define I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__ 1.78 + 1.79 +#include <stdio.h> 1.80 + 1.81 +#include "integral_types.h" // for uint8 etc. 1.82 + 1.83 +#include "cld2tablesummary.h" 1.84 +#include "compact_lang_det_impl.h" // for ResultChunkVector 1.85 +#include "getonescriptspan.h" 1.86 +#include "langspan.h" 1.87 +#include "tote.h" 1.88 +#include "utf8statetable.h" 1.89 + 1.90 +namespace CLD2 { 1.91 + 1.92 +static const int kMaxBoosts = 4; // For each of PerScriptLangBoosts 1.93 + // must be power of two for wrap() 1.94 +static const int kChunksizeQuads = 20; // For non-CJK 1.95 +static const int kChunksizeUnis = 50; // For CJK 1.96 +static const int kMaxScoringHits = 1000; 1.97 +static const int kMaxSummaries = kMaxScoringHits / kChunksizeQuads; 1.98 + 1.99 + 1.100 +// The first four tables are for CJK languages, 1.101 +// the next three for quadgram languages, and 1.102 +// the last for expected scores. 
1.103 +typedef struct { 1.104 + const UTF8PropObj* unigram_obj; // 80K CJK characters 1.105 + const CLD2TableSummary* unigram_compat_obj; // 256 CJK lookup probabilities 1.106 + const CLD2TableSummary* deltabi_obj; 1.107 + const CLD2TableSummary* distinctbi_obj; 1.108 + 1.109 + const CLD2TableSummary* quadgram_obj; // Primary quadgram lookup table 1.110 + const CLD2TableSummary* quadgram_obj2; // Secondary " 1.111 + const CLD2TableSummary* deltaocta_obj; 1.112 + const CLD2TableSummary* distinctocta_obj; 1.113 + 1.114 + const short* kExpectedScore; // Expected base + delta + distinct score 1.115 + // per 1KB input 1.116 + // Subscripted by language and script4 1.117 +} ScoringTables; 1.118 + 1.119 +// Context for boosting several languages 1.120 +typedef struct { 1.121 + int32 n; 1.122 + uint32 langprob[kMaxBoosts]; 1.123 + int wrap(int32 n) {return n & (kMaxBoosts - 1);} 1.124 +} LangBoosts; 1.125 + 1.126 +typedef struct { 1.127 + LangBoosts latn; 1.128 + LangBoosts othr; 1.129 +} PerScriptLangBoosts; 1.130 + 1.131 + 1.132 + 1.133 +// ScoringContext carries state across scriptspans 1.134 +// ScoringContext also has read-only scoring tables mapping grams to qprobs 1.135 +typedef struct { 1.136 + FILE* debug_file; // Non-NULL if debug output wanted 1.137 + bool flags_cld2_score_as_quads; 1.138 + bool flags_cld2_html; 1.139 + bool flags_cld2_cr; 1.140 + bool flags_cld2_verbose; 1.141 + ULScript ulscript; // langprobs below are with respect to this script 1.142 + Language prior_chunk_lang; // Mostly for debug output 1.143 + // boost has a packed set of per-script langs and probabilites 1.144 + // whack has a per-script lang to be suppressed from ever scoring (zeroed) 1.145 + // When a language in a close set is given as an explicit hint, others in 1.146 + // that set will be whacked. 
1.147 + PerScriptLangBoosts langprior_boost; // From http content-lang or meta lang= 1.148 + PerScriptLangBoosts langprior_whack; // From http content-lang or meta lang= 1.149 + PerScriptLangBoosts distinct_boost; // From distinctive letter groups 1.150 + int oldest_distinct_boost; // Subscript in hitbuffer of oldest 1.151 + // distinct score to use 1.152 + const ScoringTables* scoringtables; // Probability lookup tables 1.153 + ScriptScanner* scanner; // For ResultChunkVector backmap 1.154 + 1.155 + // Inits boosts 1.156 + void init() { 1.157 + memset(&langprior_boost, 0, sizeof(langprior_boost)); 1.158 + memset(&langprior_whack, 0, sizeof(langprior_whack)); 1.159 + memset(&distinct_boost, 0, sizeof(distinct_boost)); 1.160 + }; 1.161 +} ScoringContext; 1.162 + 1.163 + 1.164 + 1.165 +// Begin private 1.166 + 1.167 +// Holds one scoring-table lookup hit. We hold indirect subscript instead of 1.168 +// langprob to allow a single hit to use a variable number of langprobs. 1.169 +typedef struct { 1.170 + int offset; // First byte of quad/octa etc. in scriptspan 1.171 + int indirect; // subscript of langprobs in scoring table 1.172 +} ScoringHit; 1.173 + 1.174 +typedef enum { 1.175 + UNIHIT = 0, 1.176 + QUADHIT = 1, 1.177 + DELTAHIT = 2, 1.178 + DISTINCTHIT = 3 1.179 +} LinearHitType; 1.180 + 1.181 +// Holds one scoring-table lookup hit resolved into a langprob. 1.182 +typedef struct { 1.183 + uint16 offset; // First byte of quad/octa etc. 
in scriptspan 1.184 + uint16 type; // LinearHitType 1.185 + uint32 langprob; // langprob from scoring table 1.186 +} LangprobHit; 1.187 + 1.188 +// Holds arrays of scoring-table lookup hits for (part of) a scriptspan 1.189 +typedef struct { 1.190 + ULScript ulscript; // langprobs below are with respect to this script 1.191 + int maxscoringhits; // determines size of arrays below 1.192 + int next_base; // First unused entry in each array 1.193 + int next_delta; // " 1.194 + int next_distinct; // " 1.195 + int next_linear; // " 1.196 + int next_chunk_start; // First unused chunk_start entry 1.197 + int lowest_offset; // First byte of text span used to fill hitbuffer 1.198 + // Dummy entry at the end of each giving offset of first unused text byte 1.199 + ScoringHit base[kMaxScoringHits + 1]; // Uni/quad hits 1.200 + ScoringHit delta[kMaxScoringHits + 1]; // delta-bi/delta-octa hits 1.201 + ScoringHit distinct[kMaxScoringHits + 1]; // distinct-word hits 1.202 + LangprobHit linear[4 * kMaxScoringHits + 1]; // Above three merge-sorted 1.203 + // (4: some bases => 2 linear) 1.204 + int chunk_start[kMaxSummaries + 1]; // First linear[] subscr of 1.205 + // each scored chunk 1.206 + int chunk_offset[kMaxSummaries + 1]; // First text subscr of 1.207 + // each scored chunk 1.208 + 1.209 + void init() { 1.210 + ulscript = ULScript_Common; 1.211 + maxscoringhits = kMaxScoringHits; 1.212 + next_base = 0; 1.213 + next_delta = 0; 1.214 + next_distinct = 0; 1.215 + next_linear = 0; 1.216 + next_chunk_start = 0; 1.217 + lowest_offset = 0; 1.218 + base[0].offset = 0; 1.219 + base[0].indirect = 0; 1.220 + delta[0].offset = 0; 1.221 + delta[0].indirect = 0; 1.222 + distinct[0].offset = 0; 1.223 + distinct[0].indirect = 0; 1.224 + linear[0].offset = 0; 1.225 + linear[0].langprob = 0; 1.226 + chunk_start[0] = 0; 1.227 + chunk_offset[0] = 0; 1.228 + }; 1.229 +} ScoringHitBuffer; 1.230 + 1.231 +// TODO: Explain here why we need both ChunkSpan and ChunkSummary 1.232 +typedef struct { 1.233 
+ int chunk_base; // Subscript of first hitbuffer.base[] in chunk 1.234 + int chunk_delta; // Subscript of first hitbuffer.delta[] 1.235 + int chunk_distinct; // Subscript of first hitbuffer.distinct[] 1.236 + int base_len; // Number of hitbuffer.base[] in chunk 1.237 + int delta_len; // Number of hitbuffer.delta[] in chunk 1.238 + int distinct_len; // Number of hitbuffer.distinct[] in chunk 1.239 +} ChunkSpan; 1.240 + 1.241 + 1.242 +// Packed into 20 bytes for space 1.243 +typedef struct { 1.244 + uint16 offset; // Text offset within current scriptspan.text 1.245 + uint16 chunk_start; // Scoring subscr within hitbuffer->linear[] 1.246 + uint16 lang1; // Top lang, mapped to full Language 1.247 + uint16 lang2; // Second lang, mapped to full Language 1.248 + uint16 score1; // Top lang raw score 1.249 + uint16 score2; // Second lang raw score 1.250 + uint16 bytes; // Number of lower letters bytes in chunk 1.251 + uint16 grams; // Number of scored base quad- uni-grams in chunk 1.252 + uint16 ulscript; // ULScript of chunk 1.253 + uint8 reliability_delta; // Reliability 0..100, delta top:second scores 1.254 + uint8 reliability_score; // Reliability 0..100, top:expected score 1.255 +} ChunkSummary; 1.256 + 1.257 + 1.258 +// We buffer up ~50 chunk summaries, corresponding to chunks of 20 quads in a 1.259 +// 1000-quad hit buffer, so we can do boundary adjustment on them 1.260 +// when adjacent entries are different languages. After that, we add them 1.261 +// all into the document score 1.262 +// 1.263 +// About 50 * 20 = 1000 bytes. 
OK for stack alloc 1.264 +typedef struct { 1.265 + int n; 1.266 + ChunkSummary chunksummary[kMaxSummaries + 1]; 1.267 +} SummaryBuffer; 1.268 + 1.269 +// End private 1.270 + 1.271 + 1.272 +// Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating 1.273 +// scoringcontext 1.274 +void ScoreEntireScriptSpan(const LangSpan& scriptspan, 1.275 + ScoringContext* scoringcontext, 1.276 + DocTote* doc_tote, 1.277 + ResultChunkVector* vec); 1.278 + 1.279 +// Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext 1.280 +void ScoreCJKScriptSpan(const LangSpan& scriptspan, 1.281 + ScoringContext* scoringcontext, 1.282 + DocTote* doc_tote, 1.283 + ResultChunkVector* vec); 1.284 + 1.285 +// Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext 1.286 +void ScoreQuadScriptSpan(const LangSpan& scriptspan, 1.287 + ScoringContext* scoringcontext, 1.288 + DocTote* doc_tote, 1.289 + ResultChunkVector* vec); 1.290 + 1.291 +// Score one scriptspan into doc_tote and vec, updating scoringcontext 1.292 +void ScoreOneScriptSpan(const LangSpan& scriptspan, 1.293 + ScoringContext* scoringcontext, 1.294 + DocTote* doc_tote, 1.295 + ResultChunkVector* vec); 1.296 + 1.297 +} // End namespace CLD2 1.298 + 1.299 +#endif // I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__ 1.300 +