|
1 // Copyright 2013 Google Inc. All Rights Reserved. |
|
2 // |
|
3 // Licensed under the Apache License, Version 2.0 (the "License"); |
|
4 // you may not use this file except in compliance with the License. |
|
5 // You may obtain a copy of the License at |
|
6 // |
|
7 // http://www.apache.org/licenses/LICENSE-2.0 |
|
8 // |
|
9 // Unless required by applicable law or agreed to in writing, software |
|
10 // distributed under the License is distributed on an "AS IS" BASIS, |
|
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
12 // See the License for the specific language governing permissions and |
|
13 // limitations under the License. |
|
14 |
|
15 // |
|
16 // Author: dsites@google.com (Dick Sites) |
|
17 // |
|
18 // |
|
19 // Terminology: |
|
20 // Incoming original text has HTML tags and entities removed, all but letters |
|
21 // removed, and letters lowercased. Strings of non-letters are mapped to a |
|
22 // single ASCII space. |
|
23 // |
|
24 // One scriptspan has a run of letters/spaces in a single script. This is the |
|
25 // fundamental text unit that is scored. There is an optional backmap from |
|
26 // scriptspan text to the original document text, so that the language ranges |
|
27 // reported in ResultChunkVector refer to byte ranges in the original text. |
|
28 // |
|
29 // Scripts come in two forms, the full Unicode scripts described by |
|
30 // http://www.unicode.org/Public/UNIDATA/Scripts.txt |
|
31 // and a modified list used exclusively in CLD2. The modified form maps all |
|
32 // the CJK scripts to one, Hani. The current version description is in |
|
33 // i18n/encodings/cld2/builddata/script_summary.txt |
|
34 // In addition, all non-letters are mapped to the Common script. |
|
35 // |
|
36 // ULScript describes this Unicode Letter script. |
|
37 // |
|
38 // Scoring uses text nil-grams, uni-grams, bi-grams, quad-grams, and octa-grams. |
|
39 // Nilgrams (no text lookup at all) are for script-based pseudo-languages and |
|
40 // for languages that are 1:1 with a given script. Unigrams and bigrams are |
|
41 // used to score the CJK languages, all in the Hani script. Quadgrams and |
|
42 // octagrams are used to score all other languages. |
|
43 // |
|
44 // RType is the Recognition Type per ulscript. |
|
45 // |
|
46 // The scoring tables map various grams to language-probability scores. |
|
47 // A given gram that hits in scoring table maps to an indirect subscript into |
|
48 // a list of packed languages and log probabilities. |
|
49 // |
|
50 // Languages are stored in two forms: 10-bit values in the Language enum, and |
|
51 // shorter 8-bit per-ulscript values in the scoring tables. |
|
52 // |
|
53 // Language refers to the full 10-bit range. |
|
54 // pslang refers to the per-ulscript shorter values. |
|
55 // |
|
56 // Log probabilities also come in two forms. The full range uses values 0..255 |
|
57 // to represent minus log base 10th-root-of-2, covering 1 .. 1/2**25.5 or about |
|
58 // TODO BOGUS description, 24 vs 12 |
|
59 // 1/47.5M. The second form quantizes these into multiples of 8 that can be |
|
60 // added together to represent probability products. The quantized form uses |
|
61 // values 24..0 with 0 now least likely instead of most likely, thus making |
|
62 // larger sums for more probable results. 24 maps to original 1/2**4.8 (~1/28) |
|
63 // and 0 maps to original 1/2**24.0 (~1/16M). |
|
64 // |
|
65 // qprob refers to quantized log probabilities. |
|
66 // |
|
67 // langprob is a uint32 holding three 1-byte pslangs and a 1-byte subscript to |
|
68 // a list of three qprobs. It always needs a companion ulscript. |
|
69 // |
|
70 // A scriptspan is scored via one or more hitbuffers |
|
71 |
|
72 |
|
73 #ifndef I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__ |
|
74 #define I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__ |
|
75 |
|
#include <stdio.h>
#include <string.h>                 // for memset

#include "integral_types.h"         // for uint8 etc.

#include "cld2tablesummary.h"
#include "compact_lang_det_impl.h"  // for ResultChunkVector
#include "getonescriptspan.h"
#include "langspan.h"
#include "tote.h"
#include "utf8statetable.h"
|
86 |
|
87 namespace CLD2 { |
|
88 |
|
// Number of langprob slots in each LangBoosts of a PerScriptLangBoosts.
// Must be a power of two, because LangBoosts::wrap() masks with
// (kMaxBoosts - 1).
static const int kMaxBoosts = 4;

// Chunk size for non-CJK text.
static const int kChunksizeQuads = 20;

// Chunk size for CJK text.
static const int kChunksizeUnis = 50;

// Sizes the hit arrays in ScoringHitBuffer.
static const int kMaxScoringHits = 1000;

// Sizes the per-chunk arrays: one summary per kChunksizeQuads scoring hits.
static const int kMaxSummaries = kMaxScoringHits / kChunksizeQuads;
|
95 |
|
96 |
|
97 // The first four tables are for CJK languages, |
|
98 // the next three for quadgram languages, and |
|
99 // the last for expected scores. |
|
100 typedef struct { |
|
101 const UTF8PropObj* unigram_obj; // 80K CJK characters |
|
102 const CLD2TableSummary* unigram_compat_obj; // 256 CJK lookup probabilities |
|
103 const CLD2TableSummary* deltabi_obj; |
|
104 const CLD2TableSummary* distinctbi_obj; |
|
105 |
|
106 const CLD2TableSummary* quadgram_obj; // Primary quadgram lookup table |
|
107 const CLD2TableSummary* quadgram_obj2; // Secondary " |
|
108 const CLD2TableSummary* deltaocta_obj; |
|
109 const CLD2TableSummary* distinctocta_obj; |
|
110 |
|
111 const short* kExpectedScore; // Expected base + delta + distinct score |
|
112 // per 1KB input |
|
113 // Subscripted by language and script4 |
|
114 } ScoringTables; |
|
115 |
|
116 // Context for boosting several languages |
|
117 typedef struct { |
|
118 int32 n; |
|
119 uint32 langprob[kMaxBoosts]; |
|
120 int wrap(int32 n) {return n & (kMaxBoosts - 1);} |
|
121 } LangBoosts; |
|
122 |
|
123 typedef struct { |
|
124 LangBoosts latn; |
|
125 LangBoosts othr; |
|
126 } PerScriptLangBoosts; |
|
127 |
|
128 |
|
129 |
|
130 // ScoringContext carries state across scriptspans |
|
131 // ScoringContext also has read-only scoring tables mapping grams to qprobs |
|
132 typedef struct { |
|
133 FILE* debug_file; // Non-NULL if debug output wanted |
|
134 bool flags_cld2_score_as_quads; |
|
135 bool flags_cld2_html; |
|
136 bool flags_cld2_cr; |
|
137 bool flags_cld2_verbose; |
|
138 ULScript ulscript; // langprobs below are with respect to this script |
|
139 Language prior_chunk_lang; // Mostly for debug output |
|
140 // boost has a packed set of per-script langs and probabilites |
|
141 // whack has a per-script lang to be suppressed from ever scoring (zeroed) |
|
142 // When a language in a close set is given as an explicit hint, others in |
|
143 // that set will be whacked. |
|
144 PerScriptLangBoosts langprior_boost; // From http content-lang or meta lang= |
|
145 PerScriptLangBoosts langprior_whack; // From http content-lang or meta lang= |
|
146 PerScriptLangBoosts distinct_boost; // From distinctive letter groups |
|
147 int oldest_distinct_boost; // Subscript in hitbuffer of oldest |
|
148 // distinct score to use |
|
149 const ScoringTables* scoringtables; // Probability lookup tables |
|
150 ScriptScanner* scanner; // For ResultChunkVector backmap |
|
151 |
|
152 // Inits boosts |
|
153 void init() { |
|
154 memset(&langprior_boost, 0, sizeof(langprior_boost)); |
|
155 memset(&langprior_whack, 0, sizeof(langprior_whack)); |
|
156 memset(&distinct_boost, 0, sizeof(distinct_boost)); |
|
157 }; |
|
158 } ScoringContext; |
|
159 |
|
160 |
|
161 |
|
162 // Begin private |
|
163 |
|
// One scoring-table lookup hit. The indirect subscript is kept, rather than a
// langprob, so that a single hit can use a variable number of langprobs.
struct ScoringHit {
  int offset;    // First byte of quad/octa etc. in scriptspan
  int indirect;  // Subscript of langprobs in scoring table
};
|
170 |
|
// Kind of hit recorded in LangprobHit::type. The numeric values are spelled
// out because they are stored in an integer field.
enum LinearHitType {
  UNIHIT = 0,
  QUADHIT = 1,
  DELTAHIT = 2,
  DISTINCTHIT = 3
};
|
177 |
|
178 // Holds one scoring-table lookup hit resolved into a langprob. |
|
179 typedef struct { |
|
180 uint16 offset; // First byte of quad/octa etc. in scriptspan |
|
181 uint16 type; // LinearHitType |
|
182 uint32 langprob; // langprob from scoring table |
|
183 } LangprobHit; |
|
184 |
|
185 // Holds arrays of scoring-table lookup hits for (part of) a scriptspan |
|
186 typedef struct { |
|
187 ULScript ulscript; // langprobs below are with respect to this script |
|
188 int maxscoringhits; // determines size of arrays below |
|
189 int next_base; // First unused entry in each array |
|
190 int next_delta; // " |
|
191 int next_distinct; // " |
|
192 int next_linear; // " |
|
193 int next_chunk_start; // First unused chunk_start entry |
|
194 int lowest_offset; // First byte of text span used to fill hitbuffer |
|
195 // Dummy entry at the end of each giving offset of first unused text byte |
|
196 ScoringHit base[kMaxScoringHits + 1]; // Uni/quad hits |
|
197 ScoringHit delta[kMaxScoringHits + 1]; // delta-bi/delta-octa hits |
|
198 ScoringHit distinct[kMaxScoringHits + 1]; // distinct-word hits |
|
199 LangprobHit linear[4 * kMaxScoringHits + 1]; // Above three merge-sorted |
|
200 // (4: some bases => 2 linear) |
|
201 int chunk_start[kMaxSummaries + 1]; // First linear[] subscr of |
|
202 // each scored chunk |
|
203 int chunk_offset[kMaxSummaries + 1]; // First text subscr of |
|
204 // each scored chunk |
|
205 |
|
206 void init() { |
|
207 ulscript = ULScript_Common; |
|
208 maxscoringhits = kMaxScoringHits; |
|
209 next_base = 0; |
|
210 next_delta = 0; |
|
211 next_distinct = 0; |
|
212 next_linear = 0; |
|
213 next_chunk_start = 0; |
|
214 lowest_offset = 0; |
|
215 base[0].offset = 0; |
|
216 base[0].indirect = 0; |
|
217 delta[0].offset = 0; |
|
218 delta[0].indirect = 0; |
|
219 distinct[0].offset = 0; |
|
220 distinct[0].indirect = 0; |
|
221 linear[0].offset = 0; |
|
222 linear[0].langprob = 0; |
|
223 chunk_start[0] = 0; |
|
224 chunk_offset[0] = 0; |
|
225 }; |
|
226 } ScoringHitBuffer; |
|
227 |
|
// Locates one chunk inside the three hit arrays of a hitbuffer: a starting
// subscript and an entry count for each of base[], delta[] and distinct[].
// TODO: Explain here why we need both ChunkSpan and ChunkSummary
struct ChunkSpan {
  // Starting subscripts
  int chunk_base;      // Subscript of first hitbuffer.base[] in chunk
  int chunk_delta;     // Subscript of first hitbuffer.delta[]
  int chunk_distinct;  // Subscript of first hitbuffer.distinct[]
  // Entry counts
  int base_len;        // Number of hitbuffer.base[] in chunk
  int delta_len;       // Number of hitbuffer.delta[] in chunk
  int distinct_len;    // Number of hitbuffer.distinct[] in chunk
};
|
237 |
|
238 |
|
239 // Packed into 20 bytes for space |
|
240 typedef struct { |
|
241 uint16 offset; // Text offset within current scriptspan.text |
|
242 uint16 chunk_start; // Scoring subscr within hitbuffer->linear[] |
|
243 uint16 lang1; // Top lang, mapped to full Language |
|
244 uint16 lang2; // Second lang, mapped to full Language |
|
245 uint16 score1; // Top lang raw score |
|
246 uint16 score2; // Second lang raw score |
|
247 uint16 bytes; // Number of lower letters bytes in chunk |
|
248 uint16 grams; // Number of scored base quad- uni-grams in chunk |
|
249 uint16 ulscript; // ULScript of chunk |
|
250 uint8 reliability_delta; // Reliability 0..100, delta top:second scores |
|
251 uint8 reliability_score; // Reliability 0..100, top:expected score |
|
252 } ChunkSummary; |
|
253 |
|
254 |
|
255 // We buffer up ~50 chunk summaries, corresponding to chunks of 20 quads in a |
|
256 // 1000-quad hit buffer, so we can do boundary adjustment on them |
|
257 // when adjacent entries are different languages. After that, we add them |
|
258 // all into the document score |
|
259 // |
|
260 // About 50 * 20 = 1000 bytes. OK for stack alloc |
|
261 typedef struct { |
|
262 int n; |
|
263 ChunkSummary chunksummary[kMaxSummaries + 1]; |
|
264 } SummaryBuffer; |
|
265 |
|
266 // End private |
|
267 |
|
268 |
|
269 // Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating |
|
270 // scoringcontext |
|
271 void ScoreEntireScriptSpan(const LangSpan& scriptspan, |
|
272 ScoringContext* scoringcontext, |
|
273 DocTote* doc_tote, |
|
274 ResultChunkVector* vec); |
|
275 |
|
276 // Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext |
|
277 void ScoreCJKScriptSpan(const LangSpan& scriptspan, |
|
278 ScoringContext* scoringcontext, |
|
279 DocTote* doc_tote, |
|
280 ResultChunkVector* vec); |
|
281 |
|
282 // Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext |
|
283 void ScoreQuadScriptSpan(const LangSpan& scriptspan, |
|
284 ScoringContext* scoringcontext, |
|
285 DocTote* doc_tote, |
|
286 ResultChunkVector* vec); |
|
287 |
|
288 // Score one scriptspan into doc_tote and vec, updating scoringcontext |
|
289 void ScoreOneScriptSpan(const LangSpan& scriptspan, |
|
290 ScoringContext* scoringcontext, |
|
291 DocTote* doc_tote, |
|
292 ResultChunkVector* vec); |
|
293 |
|
294 } // End namespace CLD2 |
|
295 |
|
296 #endif // I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__ |
|
297 |