Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
michael@0 | 1 | // Copyright 2013 Google Inc. All Rights Reserved. |
michael@0 | 2 | // |
michael@0 | 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
michael@0 | 4 | // you may not use this file except in compliance with the License. |
michael@0 | 5 | // You may obtain a copy of the License at |
michael@0 | 6 | // |
michael@0 | 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
michael@0 | 8 | // |
michael@0 | 9 | // Unless required by applicable law or agreed to in writing, software |
michael@0 | 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
michael@0 | 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
michael@0 | 12 | // See the License for the specific language governing permissions and |
michael@0 | 13 | // limitations under the License. |
michael@0 | 14 | |
michael@0 | 15 | // |
michael@0 | 16 | // Author: dsites@google.com (Dick Sites) |
michael@0 | 17 | // |
michael@0 | 18 | // |
michael@0 | 19 | // Terminology: |
michael@0 | 20 | // Incoming original text has HTML tags and entities removed, all but letters |
michael@0 | 21 | // removed, and letters lowercased. Strings of non-letters are mapped to a |
michael@0 | 22 | // single ASCII space. |
michael@0 | 23 | // |
michael@0 | 24 | // One scriptspan has a run of letters/spaces in a single script. This is the |
michael@0 | 25 | // fundamental text unit that is scored. There is an optional backmap from |
michael@0 | 26 | // scriptspan text to the original document text, so that the language ranges |
michael@0 | 27 | // reported in ResultChunkVector refer to byte ranges in the original text. |
michael@0 | 28 | // |
michael@0 | 29 | // Scripts come in two forms, the full Unicode scripts described by |
michael@0 | 30 | // http://www.unicode.org/Public/UNIDATA/Scripts.txt |
michael@0 | 31 | // and a modified list used exclusively in CLD2. The modified form maps all |
michael@0 | 32 | // the CJK scripts to one, Hani. The current version description is in |
michael@0 | 33 | // i18n/encodings/cld2/builddata/script_summary.txt |
michael@0 | 34 | // In addition, all non-letters are mapped to the Common script. |
michael@0 | 35 | // |
michael@0 | 36 | // ULScript describes this Unicode Letter script. |
michael@0 | 37 | // |
michael@0 | 38 | // Scoring uses text nil-grams, uni-grams, bi-grams, quad-grams, and octa-grams. |
michael@0 | 39 | // Nilgrams (no text lookup at all) are for script-based pseudo-languages and |
michael@0 | 40 | // for languages that are 1:1 with a given script. Unigrams and bigrams are |
michael@0 | 41 | // used to score the CJK languages, all in the Hani script. Quadgrams and |
michael@0 | 42 | // octagrams are used to score all other languages. |
michael@0 | 43 | // |
michael@0 | 44 | // RType is the Recognition Type per ulscript. |
michael@0 | 45 | // |
michael@0 | 46 | // The scoring tables map various grams to language-probability scores. |
michael@0 | 47 | // A given gram that hits in scoring table maps to an indirect subscript into |
michael@0 | 48 | // a list of packed languages and log probabilities. |
michael@0 | 49 | // |
michael@0 | 50 | // Languages are stored in two forms: 10-bit values in the Language enum, and |
michael@0 | 51 | // shorter 8-bit per-ulscript values in the scoring tables. |
michael@0 | 52 | // |
michael@0 | 53 | // Language refers to the full 10-bit range. |
michael@0 | 54 | // pslang refers to the per-ulscript shorter values. |
michael@0 | 55 | // |
michael@0 | 56 | // Log probabilities also come in two forms. The full range uses values 0..255 |
michael@0 | 57 | // to represent minus log base 10th-root-of-2, covering 1 .. 1/2**25.5 or about |
michael@0 | 58 | // TODO BOGUS description, 24 vs 12 |
michael@0 | 59 | // 1/47.5M. The second form quantizes these into multiples of 8 that can be |
michael@0 | 60 | // added together to represent probability products. The quantized form uses |
michael@0 | 61 | // values 24..0 with 0 now least likely instead of most likely, thus making |
michael@0 | 62 | // larger sums for more probable results. 24 maps to original 1/2**4.8 (~1/28) |
michael@0 | 63 | // and 0 maps to original 1/2**24.0 (~1/16M). |
michael@0 | 64 | // |
michael@0 | 65 | // qprob refers to quantized log probabilities. |
michael@0 | 66 | // |
michael@0 | 67 | // langprob is a uint32 holding three 1-byte pslangs and a 1-byte subscript to |
michael@0 | 68 | // a list of three qprobs. It always needs a companion ulscript. |
michael@0 | 69 | // |
michael@0 | 70 | // A scriptspan is scored via one or more hitbuffers |
michael@0 | 71 | |
michael@0 | 72 | |
michael@0 | 73 | #ifndef I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__ |
michael@0 | 74 | #define I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__ |
michael@0 | 75 | |
michael@0 | 76 | #include <stdio.h> |
michael@0 | 77 | |
michael@0 | 78 | #include "integral_types.h" // for uint8 etc. |
michael@0 | 79 | |
michael@0 | 80 | #include "cld2tablesummary.h" |
michael@0 | 81 | #include "compact_lang_det_impl.h" // for ResultChunkVector |
michael@0 | 82 | #include "getonescriptspan.h" |
michael@0 | 83 | #include "langspan.h" |
michael@0 | 84 | #include "tote.h" |
michael@0 | 85 | #include "utf8statetable.h" |
michael@0 | 86 | |
michael@0 | 87 | namespace CLD2 { |
michael@0 | 88 | |
michael@0 | 89 | static const int kMaxBoosts = 4; // Entries in each LangBoosts::langprob[] array |
michael@0 | 90 | // Must be a power of two: LangBoosts::wrap() masks with (kMaxBoosts - 1) |
michael@0 | 91 | static const int kChunksizeQuads = 20; // Chunk size in quadgram hits, for non-CJK |
michael@0 | 92 | static const int kChunksizeUnis = 50; // Chunk size for CJK (presumably in unigram hits; confirm in scoring code) |
michael@0 | 93 | static const int kMaxScoringHits = 1000; /* Capacity of each ScoringHitBuffer hit array, not counting its dummy end entry */ |
michael@0 | 94 | static const int kMaxSummaries = kMaxScoringHits / kChunksizeQuads; /* 1000 / 20 = 50 chunk summaries per hit buffer */ |
michael@0 | 95 | |
michael@0 | 96 | |
michael@0 | 97 | // Pointers to the scoring lookup tables. The first four tables (unigram, |
michael@0 | 98 | // unigram_compat, deltabi, distinctbi) are for CJK languages, the next four |
michael@0 | 99 | // (quadgram, quadgram2, deltaocta, distinctocta) for quadgram languages, and kExpectedScore for expected scores. |
michael@0 | 100 | typedef struct { |
michael@0 | 101 | const UTF8PropObj* unigram_obj; // 80K CJK characters |
michael@0 | 102 | const CLD2TableSummary* unigram_compat_obj; // 256 CJK lookup probabilities |
michael@0 | 103 | const CLD2TableSummary* deltabi_obj; |
michael@0 | 104 | const CLD2TableSummary* distinctbi_obj; |
michael@0 | 105 | |
michael@0 | 106 | const CLD2TableSummary* quadgram_obj; // Primary quadgram lookup table |
michael@0 | 107 | const CLD2TableSummary* quadgram_obj2; // Secondary quadgram lookup table |
michael@0 | 108 | const CLD2TableSummary* deltaocta_obj; |
michael@0 | 109 | const CLD2TableSummary* distinctocta_obj; |
michael@0 | 110 | |
michael@0 | 111 | const short* kExpectedScore; // Expected base + delta + distinct score |
michael@0 | 112 | // per 1KB input |
michael@0 | 113 | // Subscripted by language and script4 |
michael@0 | 114 | } ScoringTables; |
michael@0 | 115 | |
michael@0 | 116 | // Context for boosting several languages: holds up to kMaxBoosts langprobs. wrap() maps a subscript into 0..kMaxBoosts-1 by masking (its parameter n shadows the member n) |
michael@0 | 117 | typedef struct { |
michael@0 | 118 | int32 n; |
michael@0 | 119 | uint32 langprob[kMaxBoosts]; |
michael@0 | 120 | int wrap(int32 n) {return n & (kMaxBoosts - 1);} |
michael@0 | 121 | } LangBoosts; |
michael@0 | 122 | |
michael@0 | 123 | typedef struct { /* Pair of LangBoosts sets, selected by script group */ |
michael@0 | 124 | LangBoosts latn; /* presumably for Latin-script text -- TODO confirm against callers */ |
michael@0 | 125 | LangBoosts othr; /* presumably for all other scripts -- TODO confirm against callers */ |
michael@0 | 126 | } PerScriptLangBoosts; |
michael@0 | 127 | |
michael@0 | 128 | |
michael@0 | 129 | |
michael@0 | 130 | // ScoringContext carries state across scriptspans. |
michael@0 | 131 | // ScoringContext also points to the read-only scoring tables mapping grams to qprobs. |
michael@0 | 132 | typedef struct { |
michael@0 | 133 | FILE* debug_file; // Non-NULL if debug output wanted |
michael@0 | 134 | bool flags_cld2_score_as_quads; |
michael@0 | 135 | bool flags_cld2_html; |
michael@0 | 136 | bool flags_cld2_cr; |
michael@0 | 137 | bool flags_cld2_verbose; |
michael@0 | 138 | ULScript ulscript; // langprobs below are with respect to this script |
michael@0 | 139 | Language prior_chunk_lang; // Mostly for debug output |
michael@0 | 140 | // boost has a packed set of per-script langs and probabilities |
michael@0 | 141 | // whack has a per-script lang to be suppressed from ever scoring (zeroed) |
michael@0 | 142 | // When a language in a close set is given as an explicit hint, others in |
michael@0 | 143 | // that set will be whacked. |
michael@0 | 144 | PerScriptLangBoosts langprior_boost; // From http content-lang or meta lang= |
michael@0 | 145 | PerScriptLangBoosts langprior_whack; // From http content-lang or meta lang= |
michael@0 | 146 | PerScriptLangBoosts distinct_boost; // From distinctive letter groups |
michael@0 | 147 | int oldest_distinct_boost; // Subscript in hitbuffer of oldest |
michael@0 | 148 | // distinct score to use |
michael@0 | 149 | const ScoringTables* scoringtables; // Probability lookup tables |
michael@0 | 150 | ScriptScanner* scanner; // For ResultChunkVector backmap |
michael@0 | 151 | |
michael@0 | 152 | // Inits boosts: zeroes langprior_boost, langprior_whack and distinct_boost; no other member is set here. NOTE(review): memset is declared in <string.h>, which this header does not include directly |
michael@0 | 153 | void init() { |
michael@0 | 154 | memset(&langprior_boost, 0, sizeof(langprior_boost)); |
michael@0 | 155 | memset(&langprior_whack, 0, sizeof(langprior_whack)); |
michael@0 | 156 | memset(&distinct_boost, 0, sizeof(distinct_boost)); |
michael@0 | 157 | }; |
michael@0 | 158 | } ScoringContext; |
michael@0 | 159 | |
michael@0 | 160 | |
michael@0 | 161 | |
michael@0 | 162 | // Begin private |
michael@0 | 163 | |
michael@0 | 164 | // Holds one scoring-table lookup hit. We hold an indirect subscript instead of |
michael@0 | 165 | // a langprob so that a single hit can use a variable number of langprobs. |
michael@0 | 166 | typedef struct { |
michael@0 | 167 | int offset; // First byte of quad/octa etc. in scriptspan |
michael@0 | 168 | int indirect; // Subscript of langprobs in scoring table |
michael@0 | 169 | } ScoringHit; |
michael@0 | 170 | |
michael@0 | 171 | typedef enum { /* Kind of hit; stored in LangprobHit.type */ |
michael@0 | 172 | UNIHIT = 0, |
michael@0 | 173 | QUADHIT = 1, |
michael@0 | 174 | DELTAHIT = 2, |
michael@0 | 175 | DISTINCTHIT = 3 |
michael@0 | 176 | } LinearHitType; |
michael@0 | 177 | |
michael@0 | 178 | // Holds one scoring-table lookup hit resolved into a langprob (entries of ScoringHitBuffer.linear[]). |
michael@0 | 179 | typedef struct { |
michael@0 | 180 | uint16 offset; // First byte of quad/octa etc. in scriptspan (16 bits here, int in ScoringHit) |
michael@0 | 181 | uint16 type; // A LinearHitType value |
michael@0 | 182 | uint32 langprob; // langprob from scoring table |
michael@0 | 183 | } LangprobHit; |
michael@0 | 184 | |
michael@0 | 185 | // Holds arrays of scoring-table lookup hits for (part of) a scriptspan. init() resets the script to ULScript_Common, all next_* counters and lowest_offset to 0, and zeroes entry 0 of each array; note it does not set linear[0].type |
michael@0 | 186 | typedef struct { |
michael@0 | 187 | ULScript ulscript; // langprobs below are with respect to this script |
michael@0 | 188 | int maxscoringhits; // determines size of arrays below; init() sets it to kMaxScoringHits |
michael@0 | 189 | int next_base; // First unused entry in each array |
michael@0 | 190 | int next_delta; // (same, for delta[]) |
michael@0 | 191 | int next_distinct; // (same, for distinct[]) |
michael@0 | 192 | int next_linear; // (same, for linear[]) |
michael@0 | 193 | int next_chunk_start; // First unused chunk_start entry |
michael@0 | 194 | int lowest_offset; // First byte of text span used to fill hitbuffer |
michael@0 | 195 | // Dummy entry at the end of each giving offset of first unused text byte (hence the + 1 in each array size) |
michael@0 | 196 | ScoringHit base[kMaxScoringHits + 1]; // Uni/quad hits |
michael@0 | 197 | ScoringHit delta[kMaxScoringHits + 1]; // delta-bi/delta-octa hits |
michael@0 | 198 | ScoringHit distinct[kMaxScoringHits + 1]; // distinct-word hits |
michael@0 | 199 | LangprobHit linear[4 * kMaxScoringHits + 1]; // Above three merge-sorted |
michael@0 | 200 | // (4x: some base hits produce 2 linear entries) |
michael@0 | 201 | int chunk_start[kMaxSummaries + 1]; // First linear[] subscr of |
michael@0 | 202 | // each scored chunk |
michael@0 | 203 | int chunk_offset[kMaxSummaries + 1]; // First text subscr of |
michael@0 | 204 | // each scored chunk |
michael@0 | 205 | |
michael@0 | 206 | void init() { |
michael@0 | 207 | ulscript = ULScript_Common; |
michael@0 | 208 | maxscoringhits = kMaxScoringHits; |
michael@0 | 209 | next_base = 0; |
michael@0 | 210 | next_delta = 0; |
michael@0 | 211 | next_distinct = 0; |
michael@0 | 212 | next_linear = 0; |
michael@0 | 213 | next_chunk_start = 0; |
michael@0 | 214 | lowest_offset = 0; |
michael@0 | 215 | base[0].offset = 0; |
michael@0 | 216 | base[0].indirect = 0; |
michael@0 | 217 | delta[0].offset = 0; |
michael@0 | 218 | delta[0].indirect = 0; |
michael@0 | 219 | distinct[0].offset = 0; |
michael@0 | 220 | distinct[0].indirect = 0; |
michael@0 | 221 | linear[0].offset = 0; |
michael@0 | 222 | linear[0].langprob = 0; |
michael@0 | 223 | chunk_start[0] = 0; |
michael@0 | 224 | chunk_offset[0] = 0; |
michael@0 | 225 | }; |
michael@0 | 226 | } ScoringHitBuffer; |
michael@0 | 227 | |
michael@0 | 228 | // Locates one chunk's hits inside a hitbuffer: start subscript and count for each of base[], delta[] and distinct[]. TODO: Explain here why we need both ChunkSpan and ChunkSummary |
michael@0 | 229 | typedef struct { |
michael@0 | 230 | int chunk_base; // Subscript of first hitbuffer.base[] in chunk |
michael@0 | 231 | int chunk_delta; // Subscript of first hitbuffer.delta[] in chunk |
michael@0 | 232 | int chunk_distinct; // Subscript of first hitbuffer.distinct[] in chunk |
michael@0 | 233 | int base_len; // Number of hitbuffer.base[] in chunk |
michael@0 | 234 | int delta_len; // Number of hitbuffer.delta[] in chunk |
michael@0 | 235 | int distinct_len; // Number of hitbuffer.distinct[] in chunk |
michael@0 | 236 | } ChunkSpan; |
michael@0 | 237 | |
michael@0 | 238 | |
michael@0 | 239 | // Scoring result for one chunk. Packed into 20 bytes for space (nine uint16 fields plus two uint8 fields) |
michael@0 | 240 | typedef struct { |
michael@0 | 241 | uint16 offset; // Text offset within current scriptspan.text |
michael@0 | 242 | uint16 chunk_start; // Scoring subscr within hitbuffer->linear[] |
michael@0 | 243 | uint16 lang1; // Top lang, mapped to full Language |
michael@0 | 244 | uint16 lang2; // Second lang, mapped to full Language |
michael@0 | 245 | uint16 score1; // Top lang raw score |
michael@0 | 246 | uint16 score2; // Second lang raw score |
michael@0 | 247 | uint16 bytes; // Number of lowercase-letter bytes in chunk |
michael@0 | 248 | uint16 grams; // Number of scored base quadgrams/unigrams in chunk |
michael@0 | 249 | uint16 ulscript; // ULScript of chunk |
michael@0 | 250 | uint8 reliability_delta; // Reliability 0..100, delta top:second scores |
michael@0 | 251 | uint8 reliability_score; // Reliability 0..100, top:expected score |
michael@0 | 252 | } ChunkSummary; |
michael@0 | 253 | |
michael@0 | 254 | |
michael@0 | 255 | // We buffer up ~50 chunk summaries, corresponding to chunks of 20 quads in a |
michael@0 | 256 | // 1000-quad hit buffer, so we can do boundary adjustment on them |
michael@0 | 257 | // when adjacent entries are different languages. After that, we add them |
michael@0 | 258 | // all into the document score. |
michael@0 | 259 | // |
michael@0 | 260 | // chunksummary[] has kMaxSummaries + 1 = 51 entries of 20 bytes, about 1020 bytes, plus n. OK for stack alloc |
michael@0 | 261 | typedef struct { |
michael@0 | 262 | int n; |
michael@0 | 263 | ChunkSummary chunksummary[kMaxSummaries + 1]; |
michael@0 | 264 | } SummaryBuffer; |
michael@0 | 265 | |
michael@0 | 266 | // End private |
michael@0 | 267 | |
michael@0 | 268 | |
michael@0 | 269 | // Scores an RTypeNone or RTypeOne scriptspan as a whole into doc_tote and vec, |
michael@0 | 270 | // updating scoringcontext. Declaration only; the definition is not in this header. |
michael@0 | 271 | void ScoreEntireScriptSpan(const LangSpan& scriptspan, |
michael@0 | 272 | ScoringContext* scoringcontext, |
michael@0 | 273 | DocTote* doc_tote, |
michael@0 | 274 | ResultChunkVector* vec); |
michael@0 | 275 | |
michael@0 | 276 | // Scores an RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext. Declaration only |
michael@0 | 277 | void ScoreCJKScriptSpan(const LangSpan& scriptspan, |
michael@0 | 278 | ScoringContext* scoringcontext, |
michael@0 | 279 | DocTote* doc_tote, |
michael@0 | 280 | ResultChunkVector* vec); |
michael@0 | 281 | |
michael@0 | 282 | // Scores an RTypeMany scriptspan into doc_tote and vec, updating scoringcontext. Declaration only |
michael@0 | 283 | void ScoreQuadScriptSpan(const LangSpan& scriptspan, |
michael@0 | 284 | ScoringContext* scoringcontext, |
michael@0 | 285 | DocTote* doc_tote, |
michael@0 | 286 | ResultChunkVector* vec); |
michael@0 | 287 | |
michael@0 | 288 | // Scores one scriptspan into doc_tote and vec, updating scoringcontext (presumably dispatching on the script's RType to one of the three functions above -- TODO confirm in the definition) |
michael@0 | 289 | void ScoreOneScriptSpan(const LangSpan& scriptspan, |
michael@0 | 290 | ScoringContext* scoringcontext, |
michael@0 | 291 | DocTote* doc_tote, |
michael@0 | 292 | ResultChunkVector* vec); |
michael@0 | 293 | |
michael@0 | 294 | } // End namespace CLD2 |
michael@0 | 295 | |
michael@0 | 296 | #endif // I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__ |
michael@0 | 297 |