|
1 // Copyright 2013 Google Inc. All Rights Reserved. |
|
2 // |
|
3 // Licensed under the Apache License, Version 2.0 (the "License"); |
|
4 // you may not use this file except in compliance with the License. |
|
5 // You may obtain a copy of the License at |
|
6 // |
|
7 // http://www.apache.org/licenses/LICENSE-2.0 |
|
8 // |
|
9 // Unless required by applicable law or agreed to in writing, software |
|
10 // distributed under the License is distributed on an "AS IS" BASIS, |
|
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
12 // See the License for the specific language governing permissions and |
|
13 // limitations under the License. |
|
14 |
|
15 // |
|
16 // Author: dsites@google.com (Dick Sites) |
|
17 // Updated 2014.01 for dual table lookup |
|
18 // |
|
19 |
|
20 #include <stdio.h> |
|
21 #include <string.h> |
|
22 #include <string> |
|
23 #include <vector> |
|
24 |
|
25 #include "cldutil.h" |
|
26 #include "debug.h" |
|
27 #include "integral_types.h" |
|
28 #include "lang_script.h" |
|
29 #include "utf8statetable.h" |
|
30 |
|
31 #ifdef CLD2_DYNAMIC_MODE |
|
32 #include "cld2_dynamic_data.h" |
|
33 #include "cld2_dynamic_data_loader.h" |
|
34 #endif |
|
35 #include "cld2tablesummary.h" |
|
36 #include "compact_lang_det_impl.h" |
|
37 #include "compact_lang_det_hint_code.h" |
|
38 #include "getonescriptspan.h" |
|
39 #include "tote.h" |
|
40 |
|
41 |
|
42 namespace CLD2 { |
|
43 |
|
44 using namespace std; |
|
45 |
|
46 // Linker supplies the right tables, From files |
|
47 // cld_generated_cjk_uni_prop_80.cc cld2_generated_cjk_compatible.cc |
|
48 // cld_generated_cjk_delta_bi_32.cc generated_distinct_bi_0.cc |
|
49 // cld2_generated_quad*.cc cld2_generated_deltaocta*.cc |
|
50 // cld2_generated_distinctocta*.cc |
|
51 // cld_generated_score_quad_octa_1024_256.cc |
|
52 |
|
53 // 2014.01 Now implementing quadgram dual lookup tables, to allow main table |
|
54 // sizes that are 1/3/5 times a power of two, instead of just powers of two. |
|
55 // Gives more flexibility of total footprint for CLD2. |
|
56 |
|
57 extern const int kLanguageToPLangSize; |
|
58 extern const int kCloseSetSize; |
|
59 |
|
60 extern const UTF8PropObj cld_generated_CjkUni_obj; |
|
61 extern const CLD2TableSummary kCjkCompat_obj; |
|
62 extern const CLD2TableSummary kCjkDeltaBi_obj; |
|
63 extern const CLD2TableSummary kDistinctBiTable_obj; |
|
64 extern const CLD2TableSummary kQuad_obj; |
|
65 extern const CLD2TableSummary kQuad_obj2; // Dual lookup tables |
|
66 extern const CLD2TableSummary kDeltaOcta_obj; |
|
67 extern const CLD2TableSummary kDistinctOcta_obj; |
|
68 extern const short kAvgDeltaOctaScore[]; |
|
69 |
|
70 #ifdef CLD2_DYNAMIC_MODE |
|
71 // CLD2_DYNAMIC_MODE is defined: |
|
72 // Data will be read from an mmap opened at runtime. |
|
73 static ScoringTables kScoringtables = { |
|
74 NULL, //&cld_generated_CjkUni_obj, |
|
75 NULL, //&kCjkCompat_obj, |
|
76 NULL, //&kCjkDeltaBi_obj, |
|
77 NULL, //&kDistinctBiTable_obj, |
|
78 NULL, //&kQuad_obj, |
|
79 NULL, //&kQuad_obj2, |
|
80 NULL, //&kDeltaOcta_obj, |
|
81 NULL, //&kDistinctOcta_obj, |
|
82 NULL, //kAvgDeltaOctaScore, |
|
83 }; |
|
84 static bool dynamicDataLoaded = false; |
|
85 static ScoringTables* dynamicTables = NULL; |
|
86 static void* mmapAddress = NULL; |
|
87 static int mmapLength = 0; |
|
88 |
|
89 bool isDataLoaded() { return dynamicDataLoaded; } |
|
90 |
|
91 void loadData(const char* fileName) { |
|
92 if (isDataLoaded()) { |
|
93 unloadData(); |
|
94 } |
|
95 dynamicTables = CLD2DynamicDataLoader::loadDataFile(fileName, &mmapAddress, &mmapLength); |
|
96 kScoringtables = *dynamicTables; |
|
97 dynamicDataLoaded = true; |
|
98 }; |
|
99 |
|
100 void unloadData() { |
|
101 if (!dynamicDataLoaded) return; |
|
102 dynamicDataLoaded = false; |
|
103 // unloading will null all the pointers out. |
|
104 CLD2DynamicDataLoader::unloadData(&dynamicTables, &mmapAddress, &mmapLength); |
|
105 } |
|
106 #else |
|
107 // This initializes kScoringtables.quadgram_obj etc. |
|
108 static const ScoringTables kScoringtables = { |
|
109 &cld_generated_CjkUni_obj, |
|
110 &kCjkCompat_obj, |
|
111 &kCjkDeltaBi_obj, |
|
112 &kDistinctBiTable_obj, |
|
113 |
|
114 &kQuad_obj, |
|
115 &kQuad_obj2, // Dual lookup tables |
|
116 &kDeltaOcta_obj, |
|
117 &kDistinctOcta_obj, |
|
118 |
|
119 kAvgDeltaOctaScore, |
|
120 }; |
|
121 #endif // #ifdef CLD2_DYNAMIC_MODE |
|
122 |
|
123 |
|
124 static const bool FLAGS_cld_no_minimum_bytes = false; |
|
125 static const bool FLAGS_cld_forcewords = true; |
|
126 static const bool FLAGS_cld_showme = false; |
|
127 static const bool FLAGS_cld_echotext = true; |
|
128 static const int32 FLAGS_cld_textlimit = 160; |
|
129 static const int32 FLAGS_cld_smoothwidth = 20; |
|
130 static const bool FLAGS_cld_2011_hints = true; |
|
131 static const int32 FLAGS_cld_max_lang_tag_scan_kb = 8; |
|
132 |
|
133 static const bool FLAGS_dbgscore = false; |
|
134 |
|
135 |
|
136 static const int kLangHintInitial = 12; // Boost language by N initially |
|
137 static const int kLangHintBoost = 12; // Boost language by N/16 per quadgram |
|
138 |
|
139 static const int kShortSpanThresh = 32; // Bytes |
|
140 static const int kMaxSecondChanceLen = 1024; // Look at first 1K of short spans |
|
141 |
|
142 static const int kCheapSqueezeTestThresh = 4096; // Only look for squeezing |
|
143 // after this many text bytes |
|
144 static const int kCheapSqueezeTestLen = 256; // Bytes to test to trigger sqz |
|
145 static const int kSpacesTriggerPercent = 25; // Trigger sqz if >=25% spaces |
|
146 static const int kPredictTriggerPercent = 67; // Trigger sqz if >=67% predicted |
|
147 |
|
148 static const int kChunksizeDefault = 48; // Squeeze 48-byte chunks |
|
149 static const int kSpacesThreshPercent = 25; // Squeeze if >=25% spaces |
|
150 static const int kPredictThreshPercent = 40; // Squeeze if >=40% predicted |
|
151 |
|
152 static const int kMaxSpaceScan = 32; // Bytes |
|
153 |
|
154 static const int kGoodLang1Percent = 70; |
|
155 static const int kGoodLang1and2Percent = 93; |
|
156 static const int kShortTextThresh = 256; // Bytes |
|
157 |
|
158 static const int kMinChunkSizeQuads = 4; // Chunk is at least four quads |
|
159 static const int kMaxChunkSizeQuads = 1024; // Chunk is at most 1K quads |
|
160 |
|
161 static const int kDefaultWordSpan = 256; // Scan at least this many initial |
|
162 // bytes with word scoring |
|
163 static const int kReallyBigWordSpan = 9999999; // Forces word scoring all text |
|
164 |
|
165 static const int kMinReliableSeq = 50; // Record in seq if >= 50% reliable |
|
166 |
|
167 static const int kPredictionTableSize = 4096; // Must be exactly 4096 for |
|
168 // cheap compressor |
|
169 |
|
170 static const int kNonEnBoilerplateMinPercent = 17; // <this => no second |
|
171 static const int kNonFIGSBoilerplateMinPercent = 20; // <this => no second |
|
172 static const int kGoodFirstMinPercent = 26; // <this => UNK |
|
173 static const int kGoodFirstReliableMinPercent = 51; // <this => unreli |
|
174 static const int kIgnoreMaxPercent = 20; // >this => unreli |
|
175 static const int kKeepMinPercent = 2; // <this => unreli |
|
176 |
|
177 |
|
178 |
|
179 // Statistically closest language, based on quadgram table |
|
180 // Those that are far from other languages map to UNKNOWN_LANGUAGE |
|
181 // Subscripted by Language |
|
182 // |
|
183 // From lang_correlation.txt and hand-edits |
|
184 // sed 's/^\([^ ]*\) \([^ ]*\) coef=0\.\(..\).*$/ |
|
185 // (\3 >= kMinCorrPercent) ? \2 : UNKNOWN_LANGUAGE, |
|
186 // \/\/ \1/' lang_correlation.txt >/tmp/closest_lang_decl.txt |
|
187 // |
|
188 static const int kMinCorrPercent = 24; // Pick off how close you want |
|
189 // 24 catches PERSIAN <== ARABIC |
|
190 // but not SPANISH <== PORTUGUESE |
|
191 static Language Unknown = UNKNOWN_LANGUAGE; |
|
192 |
|
193 // Suspect idea |
|
194 // Subscripted by Language |
|
195 static const Language kClosestAltLanguage[] = { |
|
196 (28 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE, // ENGLISH |
|
197 (36 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // DANISH |
|
198 (31 >= kMinCorrPercent) ? AFRIKAANS : UNKNOWN_LANGUAGE, // DUTCH |
|
199 (15 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // FINNISH |
|
200 (11 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // FRENCH |
|
201 (17 >= kMinCorrPercent) ? LUXEMBOURGISH : UNKNOWN_LANGUAGE, // GERMAN |
|
202 (27 >= kMinCorrPercent) ? YIDDISH : UNKNOWN_LANGUAGE, // HEBREW |
|
203 (16 >= kMinCorrPercent) ? CORSICAN : UNKNOWN_LANGUAGE, // ITALIAN |
|
204 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Japanese |
|
205 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Korean |
|
206 (41 >= kMinCorrPercent) ? NORWEGIAN_N : UNKNOWN_LANGUAGE, // NORWEGIAN |
|
207 ( 5 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // POLISH |
|
208 (23 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // PORTUGUESE |
|
209 (33 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // RUSSIAN |
|
210 (28 >= kMinCorrPercent) ? GALICIAN : UNKNOWN_LANGUAGE, // SPANISH |
|
211 (17 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // SWEDISH |
|
212 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Chinese |
|
213 (42 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // CZECH |
|
214 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GREEK |
|
215 (35 >= kMinCorrPercent) ? FAROESE : UNKNOWN_LANGUAGE, // ICELANDIC |
|
216 ( 7 >= kMinCorrPercent) ? LITHUANIAN : UNKNOWN_LANGUAGE, // LATVIAN |
|
217 ( 7 >= kMinCorrPercent) ? LATVIAN : UNKNOWN_LANGUAGE, // LITHUANIAN |
|
218 ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ROMANIAN |
|
219 ( 4 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // HUNGARIAN |
|
220 (15 >= kMinCorrPercent) ? FINNISH : UNKNOWN_LANGUAGE, // ESTONIAN |
|
221 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Ignore |
|
222 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Unknown |
|
223 (33 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // BULGARIAN |
|
224 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CROATIAN |
|
225 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SERBIAN |
|
226 (24 >= kMinCorrPercent) ? SCOTS_GAELIC : UNKNOWN_LANGUAGE, // IRISH |
|
227 (28 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GALICIAN |
|
228 ( 8 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // TAGALOG |
|
229 (29 >= kMinCorrPercent) ? AZERBAIJANI : UNKNOWN_LANGUAGE, // TURKISH |
|
230 (28 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // UKRAINIAN |
|
231 (37 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // HINDI |
|
232 (29 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // MACEDONIAN |
|
233 (14 >= kMinCorrPercent) ? ASSAMESE : UNKNOWN_LANGUAGE, // BENGALI |
|
234 (46 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE, // INDONESIAN |
|
235 ( 9 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // LATIN |
|
236 (46 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // MALAY |
|
237 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MALAYALAM |
|
238 ( 4 >= kMinCorrPercent) ? BRETON : UNKNOWN_LANGUAGE, // WELSH |
|
239 ( 8 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // NEPALI |
|
240 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // TELUGU |
|
241 ( 3 >= kMinCorrPercent) ? ESPERANTO : UNKNOWN_LANGUAGE, // ALBANIAN |
|
242 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // TAMIL |
|
243 (22 >= kMinCorrPercent) ? UKRAINIAN : UNKNOWN_LANGUAGE, // BELARUSIAN |
|
244 (15 >= kMinCorrPercent) ? SUNDANESE : UNKNOWN_LANGUAGE, // JAVANESE |
|
245 (19 >= kMinCorrPercent) ? CATALAN : UNKNOWN_LANGUAGE, // OCCITAN |
|
246 (27 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // URDU |
|
247 (36 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // BIHARI |
|
248 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GUJARATI |
|
249 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // THAI |
|
250 (24 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // ARABIC |
|
251 (19 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // CATALAN |
|
252 ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ESPERANTO |
|
253 ( 3 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // BASQUE |
|
254 ( 9 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // INTERLINGUA |
|
255 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KANNADA |
|
256 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PUNJABI |
|
257 (24 >= kMinCorrPercent) ? IRISH : UNKNOWN_LANGUAGE, // SCOTS_GAELIC |
|
258 ( 7 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SWAHILI |
|
259 (28 >= kMinCorrPercent) ? SERBIAN : UNKNOWN_LANGUAGE, // SLOVENIAN |
|
260 (37 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // MARATHI |
|
261 ( 3 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // MALTESE |
|
262 ( 1 >= kMinCorrPercent) ? YORUBA : UNKNOWN_LANGUAGE, // VIETNAMESE |
|
263 (15 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // FRISIAN |
|
264 (42 >= kMinCorrPercent) ? CZECH : UNKNOWN_LANGUAGE, // SLOVAK |
|
265 // Original ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ChineseT |
|
266 (24 >= kMinCorrPercent) ? CHINESE : UNKNOWN_LANGUAGE, // ChineseT |
|
267 (35 >= kMinCorrPercent) ? ICELANDIC : UNKNOWN_LANGUAGE, // FAROESE |
|
268 (15 >= kMinCorrPercent) ? JAVANESE : UNKNOWN_LANGUAGE, // SUNDANESE |
|
269 (17 >= kMinCorrPercent) ? TAJIK : UNKNOWN_LANGUAGE, // UZBEK |
|
270 ( 7 >= kMinCorrPercent) ? TIGRINYA : UNKNOWN_LANGUAGE, // AMHARIC |
|
271 (29 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // AZERBAIJANI |
|
272 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GEORGIAN |
|
273 ( 7 >= kMinCorrPercent) ? AMHARIC : UNKNOWN_LANGUAGE, // TIGRINYA |
|
274 (27 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // PERSIAN |
|
275 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // BOSNIAN |
|
276 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SINHALESE |
|
277 (41 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // NORWEGIAN_N |
|
278 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_P |
|
279 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_B |
|
280 (37 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // XHOSA |
|
281 (37 >= kMinCorrPercent) ? XHOSA : UNKNOWN_LANGUAGE, // ZULU |
|
282 ( 2 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GUARANI |
|
283 (29 >= kMinCorrPercent) ? TSWANA : UNKNOWN_LANGUAGE, // SESOTHO |
|
284 ( 7 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // TURKMEN |
|
285 ( 8 >= kMinCorrPercent) ? KAZAKH : UNKNOWN_LANGUAGE, // KYRGYZ |
|
286 ( 5 >= kMinCorrPercent) ? FRENCH : UNKNOWN_LANGUAGE, // BRETON |
|
287 ( 3 >= kMinCorrPercent) ? GANDA : UNKNOWN_LANGUAGE, // TWI |
|
288 (27 >= kMinCorrPercent) ? HEBREW : UNKNOWN_LANGUAGE, // YIDDISH |
|
289 (28 >= kMinCorrPercent) ? SLOVENIAN : UNKNOWN_LANGUAGE, // SERBO_CROATIAN |
|
290 (12 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE, // SOMALI |
|
291 ( 9 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // UIGHUR |
|
292 (15 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // KURDISH |
|
293 ( 6 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // MONGOLIAN |
|
294 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ARMENIAN |
|
295 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // LAOTHIAN |
|
296 ( 8 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // SINDHI |
|
297 (10 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // RHAETO_ROMANCE |
|
298 (31 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // AFRIKAANS |
|
299 (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // LUXEMBOURGISH |
|
300 ( 2 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE, // BURMESE |
|
301 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KHMER |
|
302 (45 >= kMinCorrPercent) ? DZONGKHA : UNKNOWN_LANGUAGE, // TIBETAN |
|
303 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // DHIVEHI |
|
304 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CHEROKEE |
|
305 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SYRIAC |
|
306 ( 8 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // LIMBU |
|
307 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ORIYA |
|
308 (14 >= kMinCorrPercent) ? BENGALI : UNKNOWN_LANGUAGE, // ASSAMESE |
|
309 (16 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // CORSICAN |
|
310 ( 5 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // INTERLINGUE |
|
311 ( 8 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // KAZAKH |
|
312 ( 4 >= kMinCorrPercent) ? SWAHILI : UNKNOWN_LANGUAGE, // LINGALA |
|
313 (11 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // MOLDAVIAN |
|
314 (19 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // PASHTO |
|
315 ( 5 >= kMinCorrPercent) ? AYMARA : UNKNOWN_LANGUAGE, // QUECHUA |
|
316 ( 5 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SHONA |
|
317 (17 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // TAJIK |
|
318 (13 >= kMinCorrPercent) ? BASHKIR : UNKNOWN_LANGUAGE, // TATAR |
|
319 (11 >= kMinCorrPercent) ? SAMOAN : UNKNOWN_LANGUAGE, // TONGA |
|
320 ( 2 >= kMinCorrPercent) ? TWI : UNKNOWN_LANGUAGE, // YORUBA |
|
321 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_ENGLISH_BASED |
|
322 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_FRENCH_BASED |
|
323 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_PORTUGUESE_BASED |
|
324 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_OTHER |
|
325 ( 6 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // MAORI |
|
326 ( 3 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE, // WOLOF |
|
327 ( 1 >= kMinCorrPercent) ? MONGOLIAN : UNKNOWN_LANGUAGE, // ABKHAZIAN |
|
328 ( 8 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // AFAR |
|
329 ( 5 >= kMinCorrPercent) ? QUECHUA : UNKNOWN_LANGUAGE, // AYMARA |
|
330 (13 >= kMinCorrPercent) ? TATAR : UNKNOWN_LANGUAGE, // BASHKIR |
|
331 ( 3 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // BISLAMA |
|
332 (45 >= kMinCorrPercent) ? TIBETAN : UNKNOWN_LANGUAGE, // DZONGKHA |
|
333 ( 4 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // FIJIAN |
|
334 ( 7 >= kMinCorrPercent) ? INUPIAK : UNKNOWN_LANGUAGE, // GREENLANDIC |
|
335 ( 3 >= kMinCorrPercent) ? AFAR : UNKNOWN_LANGUAGE, // HAUSA |
|
336 ( 3 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // HAITIAN_CREOLE |
|
337 ( 7 >= kMinCorrPercent) ? GREENLANDIC : UNKNOWN_LANGUAGE, // INUPIAK |
|
338 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // INUKTITUT |
|
339 ( 4 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // KASHMIRI |
|
340 (30 >= kMinCorrPercent) ? RUNDI : UNKNOWN_LANGUAGE, // KINYARWANDA |
|
341 ( 2 >= kMinCorrPercent) ? TAGALOG : UNKNOWN_LANGUAGE, // MALAGASY |
|
342 (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // NAURU |
|
343 (12 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // OROMO |
|
344 (30 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // RUNDI |
|
345 (11 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // SAMOAN |
|
346 ( 1 >= kMinCorrPercent) ? LINGALA : UNKNOWN_LANGUAGE, // SANGO |
|
347 (32 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // SANSKRIT |
|
348 (16 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // SISWANT |
|
349 ( 5 >= kMinCorrPercent) ? SISWANT : UNKNOWN_LANGUAGE, // TSONGA |
|
350 (29 >= kMinCorrPercent) ? SESOTHO : UNKNOWN_LANGUAGE, // TSWANA |
|
351 ( 2 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // VOLAPUK |
|
352 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ZHUANG |
|
353 ( 1 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE, // KHASI |
|
354 (28 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // SCOTS |
|
355 (15 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // GANDA |
|
356 ( 7 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // MANX |
|
357 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MONTENEGRIN |
|
358 |
|
359 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // AKAN |
|
360 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // IGBO |
|
361 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MAURITIAN_CREOLE |
|
362 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // HAWAIIAN |
|
363 }; |
|
364 |
|
365 // COMPILE_ASSERT(arraysize(kClosestAltLanguage) == NUM_LANGUAGES, |
|
366 // kClosestAltLanguage_has_incorrect_size); |
|
367 |
|
368 |
|
369 inline bool FlagFinish(int flags) {return (flags & kCLDFlagFinish) != 0;} |
|
370 inline bool FlagSqueeze(int flags) {return (flags & kCLDFlagSqueeze) != 0;} |
|
371 inline bool FlagRepeats(int flags) {return (flags & kCLDFlagRepeats) != 0;} |
|
372 inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;} |
|
373 inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;} |
|
374 inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;} |
|
375 inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;} |
|
376 |
|
377 |
|
378 // Defines Top40 packed languages |
|
379 |
|
380 // Google top 40 languages |
|
381 // |
|
382 // Tier 0/1 Language enum list (16) |
|
383 // ENGLISH, /*no en_GB,*/ FRENCH, ITALIAN, GERMAN, SPANISH, // E - FIGS |
|
384 // DUTCH, CHINESE, CHINESE_T, JAPANESE, KOREAN, |
|
385 // PORTUGUESE, RUSSIAN, POLISH, TURKISH, THAI, |
|
386 // ARABIC, |
|
387 // |
|
388 // Tier 2 Language enum list (22) |
|
389 // SWEDISH, FINNISH, DANISH, /*no pt-PT,*/ ROMANIAN, HUNGARIAN, |
|
390 // HEBREW, INDONESIAN, CZECH, GREEK, NORWEGIAN, |
|
391 // VIETNAMESE, BULGARIAN, CROATIAN, LITHUANIAN, SLOVAK, |
|
392 // TAGALOG, SLOVENIAN, SERBIAN, CATALAN, LATVIAN, |
|
393 // UKRAINIAN, HINDI, |
|
394 // |
|
395 // use SERBO_CROATIAN instead of BOSNIAN, SERBIAN, CROATIAN, MONTENEGRIN(21) |
|
396 // |
|
397 // Include IgnoreMe (TG_UNKNOWN_LANGUAGE, 25+1) as a top 40 |
|
398 |
|
399 |
|
400 void DemoteNotTop40(Tote* chunk_tote, uint16 psplus_one) { |
|
401 // REVISIT |
|
402 } |
|
403 |
|
404 void PrintText(FILE* f, Language cur_lang, const string& temp) { |
|
405 if (temp.size() == 0) {return;} |
|
406 fprintf(f, "PrintText[%s]%s<br>\n", LanguageName(cur_lang), temp.c_str()); |
|
407 } |
|
408 |
|
409 |
|
410 //------------------------------------------------------------------------------ |
|
411 // For --cld_html debugging output. Not thread safe |
|
412 //------------------------------------------------------------------------------ |
|
413 static Language prior_lang = UNKNOWN_LANGUAGE; |
|
414 static bool prior_unreliable = false; |
|
415 |
|
416 //------------------------------------------------------------------------------ |
|
417 // End For --cld_html debugging output |
|
418 //------------------------------------------------------------------------------ |
|
419 |
|
420 |
|
421 // Backscan to word boundary, returning how many bytes n to go back |
|
422 // so that src - n is non-space ans src - n - 1 is space. |
|
423 // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary |
|
424 int BackscanToSpace(const char* src, int limit) { |
|
425 int n = 0; |
|
426 limit = minint(limit, kMaxSpaceScan); |
|
427 while (n < limit) { |
|
428 if (src[-n - 1] == ' ') {return n;} // We are at _X |
|
429 ++n; |
|
430 } |
|
431 n = 0; |
|
432 while (n < limit) { |
|
433 if ((src[-n] & 0xc0) != 0x80) {return n;} // We are at char begin |
|
434 ++n; |
|
435 } |
|
436 return 0; |
|
437 } |
|
438 |
|
439 // Forwardscan to word boundary, returning how many bytes n to go forward |
|
440 // so that src + n is non-space ans src + n - 1 is space. |
|
441 // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary |
|
442 int ForwardscanToSpace(const char* src, int limit) { |
|
443 int n = 0; |
|
444 limit = minint(limit, kMaxSpaceScan); |
|
445 while (n < limit) { |
|
446 if (src[n] == ' ') {return n + 1;} // We are at _X |
|
447 ++n; |
|
448 } |
|
449 n = 0; |
|
450 while (n < limit) { |
|
451 if ((src[n] & 0xc0) != 0x80) {return n;} // We are at char begin |
|
452 ++n; |
|
453 } |
|
454 return 0; |
|
455 } |
|
456 |
|
457 |
|
// Uses a cheap predictor to get a measure of compression, and hence a measure
// of repetitiveness. It works on complete UTF-8 characters instead of bytes,
// because three-byte UTF-8 text (Indic, etc.) looks highly predictable all the
// time when counted byte by byte.
//
// To allow running prediction across multiple chunks, the caller passes in
// the current 12-bit hash value and an int[4096] prediction table, and inits
// both to 0 before the first call. *hash is updated on return. Only the low
// 12 bits of the incoming *hash are used, so the table index always stays
// inside the 4096 entries.
//
// Returns the number of *bytes* correctly predicted; each correctly-predicted
// character adds its length, 1..4.
//
// A multi-byte character that is cut off by src_len is packed from the bytes
// actually present, so nothing at or beyond isrc + src_len is read. Results
// for text that ends on a character boundary are unchanged.
//

// TODO(dsites) make this use just one byte per UTF-8 char and incr by charlen

int CountPredictedBytes(const char* isrc, int src_len, int* hash, int* tbl) {
  int p_count = 0;
  const unsigned char* src = reinterpret_cast<const unsigned char*>(isrc);
  const unsigned char* srclimit = src + src_len;
  int local_hash = *hash & 0xfff;

  while (src < srclimit) {
    const unsigned int lead = src[0];

    // Character length implied by the lead byte. Bytes below 0xc0 (one-byte
    // chars and stray continuation bytes) are taken one at a time.
    int incr = 1;
    if (lead >= 0xc0) {
      if ((lead & 0xe0) == 0xc0) {
        incr = 2;                     // Two-byte
      } else if ((lead & 0xf0) == 0xe0) {
        incr = 3;                     // Three-byte
      } else {
        incr = 4;                     // Four-byte
      }
    }

    // Never reach past the end of the caller's range
    const int avail = static_cast<int>(srclimit - src);
    if (incr > avail) {incr = avail;}

    // Pack the character's bytes, lead byte in the most significant position
    unsigned int packed = lead;
    for (int i = 1; i < incr; ++i) {
      packed = (packed << 8) | src[i];
    }
    const int c = static_cast<int>(packed);
    src += incr;

    int p = tbl[local_hash];            // Prediction
    tbl[local_hash] = c;                // Update prediction
    if (c == p) {
      p_count += incr;                  // Count bytes of good predictions
    }

    local_hash = ((local_hash << 4) ^ c) & 0xfff;
  }
  *hash = local_hash;
  return p_count;
}
|
514 |
|
515 |
|
516 |
|
// Counts the spaces in src, four bytes per step; a little faster than going
// one byte at a time.
// Only whole groups of four are examined: the last (src_len % 4) bytes are
// not counted.
int CountSpaces4(const char* src, int src_len) {
  const int usable_len = src_len & ~3;    // Round down to a multiple of four
  int spaces = 0;
  int pos = 0;
  while (pos < usable_len) {
    const char* quad = src + pos;
    if (quad[0] == ' ') {++spaces;}
    if (quad[1] == ' ') {++spaces;}
    if (quad[2] == ' ') {++spaces;}
    if (quad[3] == ' ') {++spaces;}
    pos += 4;
  }
  return spaces;
}
|
529 |
|
530 |
|
531 // Remove words of text that have more than half their letters predicted |
|
532 // correctly by our cheap predictor, moving the remaining words in-place |
|
533 // to the front of the input buffer. |
|
534 // |
|
535 // To allow running prediction across multiple chunks, caller passes in current |
|
536 // 12-bit hash value and int[4096] prediction table. Caller inits these to 0. |
|
537 // |
|
538 // Return the new, possibly-shorter length |
|
539 // |
|
540 // Result Buffer ALWAYS has leading space and trailing space space space NUL, |
|
541 // if input does |
|
542 // |
|
543 int CheapRepWordsInplace(char* isrc, int src_len, int* hash, int* tbl) { |
|
544 const uint8* src = reinterpret_cast<const uint8*>(isrc); |
|
545 const uint8* srclimit = src + src_len; |
|
546 char* dst = isrc; |
|
547 int local_hash = *hash; |
|
548 char* word_dst = dst; // Start of next word |
|
549 int good_predict_bytes = 0; |
|
550 int word_length_bytes = 0; |
|
551 |
|
552 while (src < srclimit) { |
|
553 int c = src[0]; |
|
554 int incr = 1; |
|
555 *dst++ = c; |
|
556 |
|
557 if (c == ' ') { |
|
558 if ((good_predict_bytes * 2) > word_length_bytes) { |
|
559 // Word is well-predicted: backup to start of this word |
|
560 dst = word_dst; |
|
561 if (FLAGS_cld_showme) { |
|
562 // Mark the deletion point with period |
|
563 // Don't repeat multiple periods |
|
564 // Cannot mark with more bytes or may overwrite unseen input |
|
565 if ((isrc < (dst - 2)) && (dst[-2] != '.')) { |
|
566 *dst++ = '.'; |
|
567 *dst++ = ' '; |
|
568 } |
|
569 } |
|
570 } |
|
571 word_dst = dst; // Start of next word |
|
572 good_predict_bytes = 0; |
|
573 word_length_bytes = 0; |
|
574 } |
|
575 |
|
576 // Pick up one char and length |
|
577 if (c < 0xc0) { |
|
578 // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx |
|
579 // Do nothing more |
|
580 } else if ((c & 0xe0) == 0xc0) { |
|
581 // Two-byte |
|
582 *dst++ = src[1]; |
|
583 c = (c << 8) | src[1]; |
|
584 incr = 2; |
|
585 } else if ((c & 0xf0) == 0xe0) { |
|
586 // Three-byte |
|
587 *dst++ = src[1]; |
|
588 *dst++ = src[2]; |
|
589 c = (c << 16) | (src[1] << 8) | src[2]; |
|
590 incr = 3; |
|
591 } else { |
|
592 // Four-byte |
|
593 *dst++ = src[1]; |
|
594 *dst++ = src[2]; |
|
595 *dst++ = src[3]; |
|
596 c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3]; |
|
597 incr = 4; |
|
598 } |
|
599 src += incr; |
|
600 word_length_bytes += incr; |
|
601 |
|
602 int p = tbl[local_hash]; // Prediction |
|
603 tbl[local_hash] = c; // Update prediction |
|
604 if (c == p) { |
|
605 good_predict_bytes += incr; // Count good predictions |
|
606 } |
|
607 |
|
608 local_hash = ((local_hash << 4) ^ c) & 0xfff; |
|
609 } |
|
610 |
|
611 *hash = local_hash; |
|
612 |
|
613 if ((dst - isrc) < (src_len - 3)) { |
|
614 // Pad and make last char clean UTF-8 by putting following spaces |
|
615 dst[0] = ' '; |
|
616 dst[1] = ' '; |
|
617 dst[2] = ' '; |
|
618 dst[3] = '\0'; |
|
619 } else if ((dst - isrc) < src_len) { |
|
620 // Make last char clean UTF-8 by putting following space off the end |
|
621 dst[0] = ' '; |
|
622 } |
|
623 |
|
624 return static_cast<int>(dst - isrc); |
|
625 } |
|
626 |
|
627 |
|
// This alternate form overwrites redundant words with '.' instead of removing
// them, so byte offsets are preserved and the backmap used to generate a
// vector of original-text ranges is not corrupted.
//
// A word is overwritten when more than half of its bytes (counting the space
// that precedes it) were predicted correctly by the cheap predictor. The
// space that ends the word is left in place.
//
// The caller passes in the current 12-bit hash value and an int[4096]
// prediction table, both initialized to 0 before the first call. *hash is
// updated on return. Only the low 12 bits of the incoming *hash are used, so
// the table index always stays inside the 4096 entries.
//
// Returns the number of bytes processed: src_len, or 0 if src_len <= 0.
// A multi-byte character cut off by src_len is handled using only the bytes
// actually present, so nothing at or beyond isrc + src_len is touched and the
// return value never exceeds src_len.
//
// Nothing is ever moved, so the write position always equals the read
// position; there is no byte copying and no tail padding to do.
int CheapRepWordsInplaceOverwrite(char* isrc, int src_len, int* hash, int* tbl) {
  const unsigned char* src = reinterpret_cast<const unsigned char*>(isrc);
  const unsigned char* srclimit = src + src_len;
  char* cur = isrc;               // Writable alias of src
  int local_hash = *hash & 0xfff;
  char* word_start = isrc;        // Start of current word
  int good_predict_bytes = 0;
  int word_length_bytes = 0;

  while (src < srclimit) {
    const unsigned int lead = src[0];

    if (lead == ' ') {
      if ((good_predict_bytes * 2) > word_length_bytes) {
        // Word [word_start..cur) is well-predicted: overwrite
        for (char* p = word_start; p < cur; ++p) {*p = '.';}
      }
      word_start = cur + 1;       // Start of next word
      good_predict_bytes = 0;
      word_length_bytes = 0;
    }

    // Character length implied by the lead byte. Bytes below 0xc0 (one-byte
    // chars and stray continuation bytes) are taken one at a time.
    int incr = 1;
    if (lead >= 0xc0) {
      if ((lead & 0xe0) == 0xc0) {
        incr = 2;                     // Two-byte
      } else if ((lead & 0xf0) == 0xe0) {
        incr = 3;                     // Three-byte
      } else {
        incr = 4;                     // Four-byte
      }
    }

    // Never reach past the end of the caller's range
    const int avail = static_cast<int>(srclimit - src);
    if (incr > avail) {incr = avail;}

    // Pack the character's bytes, lead byte in the most significant position
    unsigned int packed = lead;
    for (int i = 1; i < incr; ++i) {
      packed = (packed << 8) | src[i];
    }
    const int c = static_cast<int>(packed);

    src += incr;
    cur += incr;
    word_length_bytes += incr;

    int p = tbl[local_hash];            // Prediction
    tbl[local_hash] = c;                // Update prediction
    if (c == p) {
      good_predict_bytes += incr;       // Count good predictions
    }

    local_hash = ((local_hash << 4) ^ c) & 0xfff;
  }

  *hash = local_hash;

  return static_cast<int>(cur - isrc);
}
|
704 |
|
705 |
|
706 // Remove portions of text that have a high density of spaces, or that are |
|
707 // overly repetitive, squeezing the remaining text in-place to the front of the |
|
708 // input buffer. |
|
709 // |
|
710 // Squeezing looks at density of space/prediced chars in fixed-size chunks, |
|
711 // specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes. |
|
712 // |
|
713 // Return the new, possibly-shorter length |
|
714 // |
|
715 // Result Buffer ALWAYS has leading space and trailing space space space NUL, |
|
716 // if input does |
|
717 // |
|
718 int CheapSqueezeInplace(char* isrc, |
|
719 int src_len, |
|
720 int ichunksize) { |
|
721 char* src = isrc; |
|
722 char* dst = src; |
|
723 char* srclimit = src + src_len; |
|
724 bool skipping = false; |
|
725 |
|
726 int hash = 0; |
|
727 // Allocate local prediction table. |
|
728 int* predict_tbl = new int[kPredictionTableSize]; |
|
729 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); |
|
730 |
|
731 int chunksize = ichunksize; |
|
732 if (chunksize == 0) {chunksize = kChunksizeDefault;} |
|
733 int space_thresh = (chunksize * kSpacesThreshPercent) / 100; |
|
734 int predict_thresh = (chunksize * kPredictThreshPercent) / 100; |
|
735 |
|
736 while (src < srclimit) { |
|
737 int remaining_bytes = srclimit - src; |
|
738 int len = minint(chunksize, remaining_bytes); |
|
739 // Make len land us on a UTF-8 character boundary. |
|
740 // Ah. Also fixes mispredict because we could get out of phase |
|
741 // Loop always terminates at trailing space in buffer |
|
742 while ((src[len] & 0xc0) == 0x80) {++len;} // Move past continuation bytes |
|
743 |
|
744 int space_n = CountSpaces4(src, len); |
|
745 int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl); |
|
746 if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) { |
|
747 // Skip the text |
|
748 if (!skipping) { |
|
749 // Keeping-to-skipping transition; do it at a space |
|
750 int n = BackscanToSpace(dst, static_cast<int>(dst - isrc)); |
|
751 dst -= n; |
|
752 if (dst == isrc) { |
|
753 // Force a leading space if the first chunk is deleted |
|
754 *dst++ = ' '; |
|
755 } |
|
756 if (FLAGS_cld_showme) { |
|
757 // Mark the deletion point with black square U+25A0 |
|
758 *dst++ = static_cast<unsigned char>(0xe2); |
|
759 *dst++ = static_cast<unsigned char>(0x96); |
|
760 *dst++ = static_cast<unsigned char>(0xa0); |
|
761 *dst++ = ' '; |
|
762 } |
|
763 skipping = true; |
|
764 } |
|
765 } else { |
|
766 // Keep the text |
|
767 if (skipping) { |
|
768 // Skipping-to-keeping transition; do it at a space |
|
769 int n = ForwardscanToSpace(src, len); |
|
770 src += n; |
|
771 remaining_bytes -= n; // Shrink remaining length |
|
772 len -= n; |
|
773 skipping = false; |
|
774 } |
|
775 // "len" can be negative in some cases |
|
776 if (len > 0) { |
|
777 memmove(dst, src, len); |
|
778 dst += len; |
|
779 } |
|
780 } |
|
781 src += len; |
|
782 } |
|
783 |
|
784 if ((dst - isrc) < (src_len - 3)) { |
|
785 // Pad and make last char clean UTF-8 by putting following spaces |
|
786 dst[0] = ' '; |
|
787 dst[1] = ' '; |
|
788 dst[2] = ' '; |
|
789 dst[3] = '\0'; |
|
790 } else if ((dst - isrc) < src_len) { |
|
791 // Make last char clean UTF-8 by putting following space off the end |
|
792 dst[0] = ' '; |
|
793 } |
|
794 |
|
795 // Deallocate local prediction table |
|
796 delete[] predict_tbl; |
|
797 return static_cast<int>(dst - isrc); |
|
798 } |
|
799 |
|
800 // This alternate form overwrites redundant words, thus avoiding corrupting the |
|
801 // backmap for generate a vector of original-text ranges. |
|
802 int CheapSqueezeInplaceOverwrite(char* isrc, |
|
803 int src_len, |
|
804 int ichunksize) { |
|
805 char* src = isrc; |
|
806 char* dst = src; |
|
807 char* srclimit = src + src_len; |
|
808 bool skipping = false; |
|
809 |
|
810 int hash = 0; |
|
811 // Allocate local prediction table. |
|
812 int* predict_tbl = new int[kPredictionTableSize]; |
|
813 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); |
|
814 |
|
815 int chunksize = ichunksize; |
|
816 if (chunksize == 0) {chunksize = kChunksizeDefault;} |
|
817 int space_thresh = (chunksize * kSpacesThreshPercent) / 100; |
|
818 int predict_thresh = (chunksize * kPredictThreshPercent) / 100; |
|
819 |
|
820 // Always keep first byte (space) |
|
821 ++src; |
|
822 ++dst; |
|
823 while (src < srclimit) { |
|
824 int remaining_bytes = srclimit - src; |
|
825 int len = minint(chunksize, remaining_bytes); |
|
826 // Make len land us on a UTF-8 character boundary. |
|
827 // Ah. Also fixes mispredict because we could get out of phase |
|
828 // Loop always terminates at trailing space in buffer |
|
829 while ((src[len] & 0xc0) == 0x80) {++len;} // Move past continuation bytes |
|
830 |
|
831 int space_n = CountSpaces4(src, len); |
|
832 int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl); |
|
833 if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) { |
|
834 // Overwrite the text [dst-n..dst) |
|
835 if (!skipping) { |
|
836 // Keeping-to-skipping transition; do it at a space |
|
837 int n = BackscanToSpace(dst, static_cast<int>(dst - isrc)); |
|
838 // Text [word_dst..dst) is well-predicted: overwrite |
|
839 for (char* p = dst - n; p < dst; ++p) {*p = '.';} |
|
840 skipping = true; |
|
841 } |
|
842 // Overwrite the text [dst..dst+len) |
|
843 for (char* p = dst; p < dst + len; ++p) {*p = '.';} |
|
844 dst[len - 1] = ' '; // Space at end so we can see what is happening |
|
845 } else { |
|
846 // Keep the text |
|
847 if (skipping) { |
|
848 // Skipping-to-keeping transition; do it at a space |
|
849 int n = ForwardscanToSpace(src, len); |
|
850 // Text [dst..dst+n) is well-predicted: overwrite |
|
851 for (char* p = dst; p < dst + n - 1; ++p) {*p = '.';} |
|
852 skipping = false; |
|
853 } |
|
854 } |
|
855 dst += len; |
|
856 src += len; |
|
857 } |
|
858 |
|
859 if ((dst - isrc) < (src_len - 3)) { |
|
860 // Pad and make last char clean UTF-8 by putting following spaces |
|
861 dst[0] = ' '; |
|
862 dst[1] = ' '; |
|
863 dst[2] = ' '; |
|
864 dst[3] = '\0'; |
|
865 } else if ((dst - isrc) < src_len) { |
|
866 // Make last char clean UTF-8 by putting following space off the end |
|
867 dst[0] = ' '; |
|
868 } |
|
869 |
|
870 // Deallocate local prediction table |
|
871 delete[] predict_tbl; |
|
872 return static_cast<int>(dst - isrc); |
|
873 } |
|
874 |
|
875 // Timing 2.8GHz P4 (dsites 2008.03.20) with 170KB input |
|
876 // About 90 MB/sec, with or without memcpy, chunksize 48 or 4096 |
|
877 // Just CountSpaces is about 340 MB/sec |
|
878 // Byte-only CountPredictedBytes is about 150 MB/sec |
|
879 // Byte-only CountPredictedBytes, conditional tbl[] = is about 85! MB/sec |
|
880 // Byte-only CountPredictedBytes is about 180 MB/sec, byte tbl, byte/int c |
|
881 // Unjammed byte-only both = 170 MB/sec |
|
882 // Jammed byte-only both = 120 MB/sec |
|
883 // Back to original w/slight updates, 110 MB/sec |
|
884 // |
|
885 bool CheapSqueezeTriggerTest(const char* src, int src_len, int testsize) { |
|
886 // Don't trigger at all on short text |
|
887 if (src_len < testsize) {return false;} |
|
888 int space_thresh = (testsize * kSpacesTriggerPercent) / 100; |
|
889 int predict_thresh = (testsize * kPredictTriggerPercent) / 100; |
|
890 int hash = 0; |
|
891 // Allocate local prediction table. |
|
892 int* predict_tbl = new int[kPredictionTableSize]; |
|
893 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); |
|
894 |
|
895 bool retval = false; |
|
896 if ((CountSpaces4(src, testsize) >= space_thresh) || |
|
897 (CountPredictedBytes(src, testsize, &hash, predict_tbl) >= |
|
898 predict_thresh)) { |
|
899 retval = true; |
|
900 } |
|
901 // Deallocate local prediction table |
|
902 delete[] predict_tbl; |
|
903 return retval; |
|
904 } |
|
905 |
|
906 |
|
907 |
|
908 |
|
909 // Delete any extended languages from doc_tote |
|
910 void RemoveExtendedLanguages(DocTote* doc_tote) { |
|
911 // Now a nop |
|
912 } |
|
913 |
|
// A language whose reliability percent is below this value is removed
static const int kMinReliableKeepPercent = 41;

// For Tier3 languages, minimum number of bytes needed to be the first-place
// language; with fewer bytes than this there is no first language
static const int kGoodFirstT3MinBytes = 24;
|
918 |
|
919 // Move bytes for unreliable langs to another lang or UNKNOWN |
|
920 // doc_tote is sorted, so cannot Add |
|
921 // |
|
922 // If both CHINESE and CHINESET are present and unreliable, do not delete both; |
|
923 // merge both into CHINESE. |
|
924 // |
|
925 //dsites 2009.03.19 |
|
926 // we also want to remove Tier3 languages as the first lang if there is very |
|
927 // little text like ej1 ej2 ej3 ej4 |
|
928 // maybe fold this back in earlier |
|
929 // |
|
930 void RemoveUnreliableLanguages(DocTote* doc_tote, |
|
931 bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) { |
|
932 // Prepass to merge some low-reliablility languages |
|
933 // TODO: this shouldn't really reach in to the internal structure of doc_tote |
|
934 int total_bytes = 0; |
|
935 for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) { |
|
936 int plang = doc_tote->Key(sub); |
|
937 if (plang == DocTote::kUnusedKey) {continue;} // Empty slot |
|
938 |
|
939 Language lang = static_cast<Language>(plang); |
|
940 int bytes = doc_tote->Value(sub); |
|
941 int reli = doc_tote->Reliability(sub); |
|
942 if (bytes == 0) {continue;} // Zero bytes |
|
943 total_bytes += bytes; |
|
944 |
|
945 // Reliable percent = stored reliable score over stored bytecount |
|
946 int reliable_percent = reli / bytes; |
|
947 if (reliable_percent >= kMinReliableKeepPercent) {continue;} // Keeper |
|
948 |
|
949 // This language is too unreliable to keep, but we might merge it. |
|
950 Language altlang = UNKNOWN_LANGUAGE; |
|
951 if (lang <= HAWAIIAN) {altlang = kClosestAltLanguage[lang];} |
|
952 if (altlang == UNKNOWN_LANGUAGE) {continue;} // No alternative |
|
953 |
|
954 // Look for alternative in doc_tote |
|
955 int altsub = doc_tote->Find(altlang); |
|
956 if (altsub < 0) {continue;} // No alternative text |
|
957 |
|
958 int bytes2 = doc_tote->Value(altsub); |
|
959 int reli2 = doc_tote->Reliability(altsub); |
|
960 if (bytes2 == 0) {continue;} // Zero bytes |
|
961 |
|
962 // Reliable percent is stored reliable score over stored bytecount |
|
963 int reliable_percent2 = reli2 / bytes2; |
|
964 |
|
965 // Merge one language into the other. Break ties toward lower lang # |
|
966 int tosub = altsub; |
|
967 int fromsub = sub; |
|
968 bool into_lang = false; |
|
969 if ((reliable_percent2 < reliable_percent) || |
|
970 ((reliable_percent2 == reliable_percent) && (lang < altlang))) { |
|
971 tosub = sub; |
|
972 fromsub = altsub; |
|
973 into_lang = true; |
|
974 } |
|
975 |
|
976 // Make sure merged reliability doesn't drop and is enough to avoid delete |
|
977 int newpercent = maxint(reliable_percent, reliable_percent2); |
|
978 newpercent = maxint(newpercent, kMinReliableKeepPercent); |
|
979 int newbytes = bytes + bytes2; |
|
980 int newreli = newpercent * newbytes; |
|
981 |
|
982 doc_tote->SetKey(fromsub, DocTote::kUnusedKey); |
|
983 doc_tote->SetScore(fromsub, 0); |
|
984 doc_tote->SetReliability(fromsub, 0); |
|
985 doc_tote->SetScore(tosub, newbytes); |
|
986 doc_tote->SetReliability(tosub, newreli); |
|
987 |
|
988 // Show fate of unreliable languages if at least 10 bytes |
|
989 if (FLAGS_cld2_html && (newbytes >= 10) && |
|
990 !FLAGS_cld2_quiet) { |
|
991 if (into_lang) { |
|
992 fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ", |
|
993 LanguageCode(altlang), reliable_percent2, bytes2, |
|
994 LanguageCode(lang)); |
|
995 } else { |
|
996 fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ", |
|
997 LanguageCode(lang), reliable_percent, bytes, |
|
998 LanguageCode(altlang)); |
|
999 } |
|
1000 } |
|
1001 } |
|
1002 |
|
1003 |
|
1004 // Pass to delete any remaining unreliable languages |
|
1005 for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) { |
|
1006 int plang = doc_tote->Key(sub); |
|
1007 if (plang == DocTote::kUnusedKey) {continue;} // Empty slot |
|
1008 |
|
1009 Language lang = static_cast<Language>(plang); |
|
1010 int bytes = doc_tote->Value(sub); |
|
1011 int reli = doc_tote->Reliability(sub); |
|
1012 if (bytes == 0) {continue;} // Zero bytes |
|
1013 |
|
1014 // Reliable percent is stored as reliable score over stored bytecount |
|
1015 int reliable_percent = reli / bytes; |
|
1016 if (reliable_percent >= kMinReliableKeepPercent) { // Keeper? |
|
1017 continue; // yes |
|
1018 } |
|
1019 |
|
1020 // Delete unreliable entry |
|
1021 doc_tote->SetKey(sub, DocTote::kUnusedKey); |
|
1022 doc_tote->SetScore(sub, 0); |
|
1023 doc_tote->SetReliability(sub, 0); |
|
1024 |
|
1025 // Show fate of unreliable languages if at least 10 bytes |
|
1026 if (FLAGS_cld2_html && (bytes >= 10) && |
|
1027 !FLAGS_cld2_quiet) { |
|
1028 fprintf(stderr, "{Unreli %s.%dR,%dB} ", |
|
1029 LanguageCode(lang), reliable_percent, bytes); |
|
1030 } |
|
1031 } |
|
1032 |
|
1033 ////if (FLAGS_cld2_html) {fprintf(stderr, "<br>\n");} |
|
1034 } |
|
1035 |
|
1036 |
|
1037 // Move all the text bytes from lower byte-count to higher one |
|
1038 void MoveLang1ToLang2(Language lang1, Language lang2, |
|
1039 int lang1_sub, int lang2_sub, |
|
1040 DocTote* doc_tote, |
|
1041 ResultChunkVector* resultchunkvector) { |
|
1042 // In doc_tote, move all the bytes lang1 => lang2 |
|
1043 int sum = doc_tote->Value(lang2_sub) + doc_tote->Value(lang1_sub); |
|
1044 doc_tote->SetValue(lang2_sub, sum); |
|
1045 sum = doc_tote->Score(lang2_sub) + doc_tote->Score(lang1_sub); |
|
1046 doc_tote->SetScore(lang2_sub, sum); |
|
1047 sum = doc_tote->Reliability(lang2_sub) + doc_tote->Reliability(lang1_sub); |
|
1048 doc_tote->SetReliability(lang2_sub, sum); |
|
1049 |
|
1050 // Delete old entry |
|
1051 doc_tote->SetKey(lang1_sub, DocTote::kUnusedKey); |
|
1052 doc_tote->SetScore(lang1_sub, 0); |
|
1053 doc_tote->SetReliability(lang1_sub, 0); |
|
1054 |
|
1055 // In resultchunkvector, move all the bytes lang1 => lang2 |
|
1056 if (resultchunkvector == NULL) {return;} |
|
1057 |
|
1058 int k = 0; |
|
1059 uint16 prior_lang = UNKNOWN_LANGUAGE; |
|
1060 for (int i = 0; i < static_cast<int>(resultchunkvector->size()); ++i) { |
|
1061 ResultChunk* rc = &(*resultchunkvector)[i]; |
|
1062 if (rc->lang1 == lang1) { |
|
1063 // Update entry[i] lang1 => lang2 |
|
1064 rc->lang1 = lang2; |
|
1065 } |
|
1066 // One change may produce two merges -- entry before and entry after |
|
1067 if ((rc->lang1 == prior_lang) && (k > 0)) { |
|
1068 // Merge with previous, deleting entry[i] |
|
1069 ResultChunk* prior_rc = &(*resultchunkvector)[k - 1]; |
|
1070 prior_rc->bytes += rc->bytes; |
|
1071 // fprintf(stderr, "MoveLang1ToLang2 merged [%d] => [%d]<br>\n", i, k-1); |
|
1072 } else { |
|
1073 // Keep entry[i] |
|
1074 (*resultchunkvector)[k] = (*resultchunkvector)[i]; |
|
1075 // fprintf(stderr, "MoveLang1ToLang2 keep [%d] => [%d]<br>\n", i, k); |
|
1076 ++k; |
|
1077 } |
|
1078 prior_lang = rc->lang1; |
|
1079 } |
|
1080 resultchunkvector->resize(k); |
|
1081 } |
|
1082 |
|
1083 |
|
1084 |
|
1085 // Move less likely byte count to more likely for close pairs of languages |
|
1086 // If given, also update resultchunkvector |
|
1087 void RefineScoredClosePairs(DocTote* doc_tote, |
|
1088 ResultChunkVector* resultchunkvector, |
|
1089 bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) { |
|
1090 for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) { |
|
1091 int close_packedlang = doc_tote->Key(sub); |
|
1092 int subscr = LanguageCloseSet(static_cast<Language>(close_packedlang)); |
|
1093 if (subscr == 0) {continue;} |
|
1094 |
|
1095 // We have a close pair language -- if the other one is also scored and the |
|
1096 // longword score differs enough, put all our eggs into one basket |
|
1097 |
|
1098 // Nonzero longword score: Go look for the other of this pair |
|
1099 for (int sub2 = sub + 1; sub2 < doc_tote->MaxSize(); ++sub2) { |
|
1100 if (LanguageCloseSet(static_cast<Language>(doc_tote->Key(sub2))) == subscr) { |
|
1101 // We have a matching pair |
|
1102 int close_packedlang2 = doc_tote->Key(sub2); |
|
1103 |
|
1104 // Move all the text bytes from lower byte-count to higher one |
|
1105 int from_sub, to_sub; |
|
1106 Language from_lang, to_lang; |
|
1107 if (doc_tote->Value(sub) < doc_tote->Value(sub2)) { |
|
1108 from_sub = sub; |
|
1109 to_sub = sub2; |
|
1110 from_lang = static_cast<Language>(close_packedlang); |
|
1111 to_lang = static_cast<Language>(close_packedlang2); |
|
1112 } else { |
|
1113 from_sub = sub2; |
|
1114 to_sub = sub; |
|
1115 from_lang = static_cast<Language>(close_packedlang2); |
|
1116 to_lang = static_cast<Language>(close_packedlang); |
|
1117 } |
|
1118 |
|
1119 if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) { |
|
1120 // Show fate of closepair language |
|
1121 int val = doc_tote->Value(from_sub); // byte count |
|
1122 int reli = doc_tote->Reliability(from_sub); |
|
1123 int reliable_percent = reli / (val ? val : 1); // avoid zdiv |
|
1124 fprintf(stderr, "{CloseLangPair: %s.%dR,%dB => %s}<br>\n", |
|
1125 LanguageCode(from_lang), |
|
1126 reliable_percent, |
|
1127 doc_tote->Value(from_sub), |
|
1128 LanguageCode(to_lang)); |
|
1129 } |
|
1130 MoveLang1ToLang2(from_lang, to_lang, from_sub, to_sub, |
|
1131 doc_tote, resultchunkvector); |
|
1132 break; // Exit inner for sub2 loop |
|
1133 } |
|
1134 } // End for sub2 |
|
1135 } // End for sub |
|
1136 } |
|
1137 |
|
1138 |
|
1139 void ApplyAllLanguageHints(Tote* chunk_tote, int tote_grams, |
|
1140 uint8* lang_hint_boost) { |
|
1141 } |
|
1142 |
|
1143 |
|
1144 void PrintHtmlEscapedText(FILE* f, const char* txt, int len) { |
|
1145 string temp(txt, len); |
|
1146 fprintf(f, "%s", GetHtmlEscapedText(temp).c_str()); |
|
1147 } |
|
1148 |
|
1149 void PrintLang(FILE* f, Tote* chunk_tote, |
|
1150 Language cur_lang, bool cur_unreliable, |
|
1151 Language prior_lang, bool prior_unreliable) { |
|
1152 if (cur_lang == prior_lang) { |
|
1153 fprintf(f, "[]"); |
|
1154 } else { |
|
1155 fprintf(f, "[%s%s]", LanguageCode(cur_lang), cur_unreliable ? "*" : ""); |
|
1156 } |
|
1157 } |
|
1158 |
|
1159 |
|
1160 void PrintTopLang(Language top_lang) { |
|
1161 if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) { |
|
1162 fprintf(stderr, "[] "); |
|
1163 } else { |
|
1164 fprintf(stderr, "[%s] ", LanguageName(top_lang)); |
|
1165 prior_lang = top_lang; |
|
1166 } |
|
1167 } |
|
1168 |
|
1169 void PrintTopLangSpeculative(Language top_lang) { |
|
1170 fprintf(stderr, "<span style=\"color:#%06X;\">", 0xa0a0a0); |
|
1171 if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) { |
|
1172 fprintf(stderr, "[] "); |
|
1173 } else { |
|
1174 fprintf(stderr, "[%s] ", LanguageName(top_lang)); |
|
1175 prior_lang = top_lang; |
|
1176 } |
|
1177 fprintf(stderr, "</span>\n"); |
|
1178 } |
|
1179 |
|
1180 void PrintLangs(FILE* f, const Language* language3, const int* percent3, |
|
1181 const int* text_bytes, const bool* is_reliable) { |
|
1182 fprintf(f, "<br> Initial_Languages "); |
|
1183 if (language3[0] != UNKNOWN_LANGUAGE) { |
|
1184 fprintf(f, "%s%s(%d%%) ", |
|
1185 LanguageName(language3[0]), |
|
1186 *is_reliable ? "" : "*", |
|
1187 percent3[0]); |
|
1188 } |
|
1189 if (language3[1] != UNKNOWN_LANGUAGE) { |
|
1190 fprintf(f, "%s(%d%%) ", LanguageName(language3[1]), percent3[1]); |
|
1191 } |
|
1192 if (language3[2] != UNKNOWN_LANGUAGE) { |
|
1193 fprintf(f, "%s(%d%%) ", LanguageName(language3[2]), percent3[2]); |
|
1194 } |
|
1195 fprintf(f, "%d bytes \n", *text_bytes); |
|
1196 |
|
1197 fprintf(f, "<br>\n"); |
|
1198 } |
|
1199 |
|
1200 |
|
1201 // Return internal probability score (sum) per 1024 bytes |
|
1202 double GetNormalizedScore(Language lang, ULScript ulscript, |
|
1203 int bytecount, int score) { |
|
1204 if (bytecount <= 0) {return 0.0;} |
|
1205 return (score << 10) / bytecount; |
|
1206 } |
|
1207 |
|
1208 // Extract return values before fixups |
|
1209 void ExtractLangEtc(DocTote* doc_tote, int total_text_bytes, |
|
1210 int* reliable_percent3, Language* language3, int* percent3, |
|
1211 double* normalized_score3, |
|
1212 int* text_bytes, bool* is_reliable) { |
|
1213 reliable_percent3[0] = 0; |
|
1214 reliable_percent3[1] = 0; |
|
1215 reliable_percent3[2] = 0; |
|
1216 language3[0] = UNKNOWN_LANGUAGE; |
|
1217 language3[1] = UNKNOWN_LANGUAGE; |
|
1218 language3[2] = UNKNOWN_LANGUAGE; |
|
1219 percent3[0] = 0; |
|
1220 percent3[1] = 0; |
|
1221 percent3[2] = 0; |
|
1222 normalized_score3[0] = 0.0; |
|
1223 normalized_score3[1] = 0.0; |
|
1224 normalized_score3[2] = 0.0; |
|
1225 |
|
1226 *text_bytes = total_text_bytes; |
|
1227 *is_reliable = false; |
|
1228 |
|
1229 int bytecount1 = 0; |
|
1230 int bytecount2 = 0; |
|
1231 int bytecount3 = 0; |
|
1232 |
|
1233 int lang1 = doc_tote->Key(0); |
|
1234 if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) { |
|
1235 // We have a top language |
|
1236 language3[0] = static_cast<Language>(lang1); |
|
1237 bytecount1 = doc_tote->Value(0); |
|
1238 int reli1 = doc_tote->Reliability(0); |
|
1239 reliable_percent3[0] = reli1 / (bytecount1 ? bytecount1 : 1); // avoid zdiv |
|
1240 normalized_score3[0] = GetNormalizedScore(language3[0], |
|
1241 ULScript_Common, |
|
1242 bytecount1, |
|
1243 doc_tote->Score(0)); |
|
1244 } |
|
1245 |
|
1246 int lang2 = doc_tote->Key(1); |
|
1247 if ((lang2 != DocTote::kUnusedKey) && (lang2 != UNKNOWN_LANGUAGE)) { |
|
1248 language3[1] = static_cast<Language>(lang2); |
|
1249 bytecount2 = doc_tote->Value(1); |
|
1250 int reli2 = doc_tote->Reliability(1); |
|
1251 reliable_percent3[1] = reli2 / (bytecount2 ? bytecount2 : 1); // avoid zdiv |
|
1252 normalized_score3[1] = GetNormalizedScore(language3[1], |
|
1253 ULScript_Common, |
|
1254 bytecount2, |
|
1255 doc_tote->Score(1)); |
|
1256 } |
|
1257 |
|
1258 int lang3 = doc_tote->Key(2); |
|
1259 if ((lang3 != DocTote::kUnusedKey) && (lang3 != UNKNOWN_LANGUAGE)) { |
|
1260 language3[2] = static_cast<Language>(lang3); |
|
1261 bytecount3 = doc_tote->Value(2); |
|
1262 int reli3 = doc_tote->Reliability(2); |
|
1263 reliable_percent3[2] = reli3 / (bytecount3 ? bytecount3 : 1); // avoid zdiv |
|
1264 normalized_score3[2] = GetNormalizedScore(language3[2], |
|
1265 ULScript_Common, |
|
1266 bytecount3, |
|
1267 doc_tote->Score(2)); |
|
1268 } |
|
1269 |
|
1270 // Increase total bytes to sum (top 3) if low for some reason |
|
1271 int total_bytecount12 = bytecount1 + bytecount2; |
|
1272 int total_bytecount123 = total_bytecount12 + bytecount3; |
|
1273 if (total_text_bytes < total_bytecount123) { |
|
1274 total_text_bytes = total_bytecount123; |
|
1275 *text_bytes = total_text_bytes; |
|
1276 } |
|
1277 |
|
1278 // Sum minus previous % gives better roundoff behavior than bytecount/total |
|
1279 int total_text_bytes_div = maxint(1, total_text_bytes); // Avoid zdiv |
|
1280 percent3[0] = (bytecount1 * 100) / total_text_bytes_div; |
|
1281 percent3[1] = (total_bytecount12 * 100) / total_text_bytes_div; |
|
1282 percent3[2] = (total_bytecount123 * 100) / total_text_bytes_div; |
|
1283 percent3[2] -= percent3[1]; |
|
1284 percent3[1] -= percent3[0]; |
|
1285 |
|
1286 // Roundoff, say 96% 1.6% 1.4%, will produce non-obvious 96% 1% 2% |
|
1287 // Fix this explicitly |
|
1288 if (percent3[1] < percent3[2]) { |
|
1289 ++percent3[1]; |
|
1290 --percent3[2]; |
|
1291 } |
|
1292 if (percent3[0] < percent3[1]) { |
|
1293 ++percent3[0]; |
|
1294 --percent3[1]; |
|
1295 } |
|
1296 |
|
1297 *text_bytes = total_text_bytes; |
|
1298 |
|
1299 if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) { |
|
1300 // We have a top language |
|
1301 // Its reliability is overall result reliability |
|
1302 int bytecount = doc_tote->Value(0); |
|
1303 int reli = doc_tote->Reliability(0); |
|
1304 int reliable_percent = reli / (bytecount ? bytecount : 1); // avoid zdiv |
|
1305 *is_reliable = (reliable_percent >= kMinReliableKeepPercent); |
|
1306 } else { |
|
1307 // No top language at all. This can happen with zero text or 100% Klingon |
|
1308 // if extended=false. Just return all UNKNOWN_LANGUAGE, unreliable. |
|
1309 *is_reliable = false; |
|
1310 } |
|
1311 |
|
1312 // If ignore percent is too large, set unreliable. |
|
1313 int ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]); |
|
1314 if ((ignore_percent > kIgnoreMaxPercent)) { |
|
1315 *is_reliable = false; |
|
1316 } |
|
1317 } |
|
1318 |
|
1319 bool IsFIGS(Language lang) { |
|
1320 if (lang == FRENCH) {return true;} |
|
1321 if (lang == ITALIAN) {return true;} |
|
1322 if (lang == GERMAN) {return true;} |
|
1323 if (lang == SPANISH) {return true;} |
|
1324 return false; |
|
1325 } |
|
1326 |
|
1327 bool IsEFIGS(Language lang) { |
|
1328 if (lang == ENGLISH) {return true;} |
|
1329 if (lang == FRENCH) {return true;} |
|
1330 if (lang == ITALIAN) {return true;} |
|
1331 if (lang == GERMAN) {return true;} |
|
1332 if (lang == SPANISH) {return true;} |
|
1333 return false; |
|
1334 } |
|
1335 |
|
// Minimum byte counts a second-place language needs before it may override
// the first-place language; Tier3 languages require more text.
// With fewer bytes than the limit there is no second language.
static const int kGoodSecondT1T2MinBytes = 15;
static const int kGoodSecondT3MinBytes = 128;
|
1340 |
|
1341 // Calculate a single summary language for the document, and its reliability. |
|
1342 // Returns language3[0] or language3[1] or ENGLISH or UNKNOWN_LANGUAGE |
|
1343 // This is the heart of matching human-rater perception. |
|
1344 // reliable_percent3[] is currently unused |
|
1345 // |
|
1346 // Do not return Tier3 second language unless there are at least 128 bytes |
|
1347 void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes, |
|
1348 const int* reliable_percent3, |
|
1349 const Language* language3, |
|
1350 const int* percent3, |
|
1351 Language* summary_lang, bool* is_reliable, |
|
1352 bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) { |
|
1353 // Vector of active languages; changes if we delete some |
|
1354 int slot_count = 3; |
|
1355 int active_slot[3] = {0, 1, 2}; |
|
1356 |
|
1357 int ignore_percent = 0; |
|
1358 int return_percent = percent3[0]; // Default to top lang |
|
1359 *summary_lang = language3[0]; |
|
1360 *is_reliable = true; |
|
1361 if (percent3[0] < kKeepMinPercent) {*is_reliable = false;} |
|
1362 |
|
1363 // If any of top 3 is IGNORE, remove it and increment ignore_percent |
|
1364 for (int i = 0; i < 3; ++i) { |
|
1365 if (language3[i] == TG_UNKNOWN_LANGUAGE) { |
|
1366 ignore_percent += percent3[i]; |
|
1367 // Move the rest up, levaing input vectors unchanged |
|
1368 for (int j=i+1; j < 3; ++j) { |
|
1369 active_slot[j - 1] = active_slot[j]; |
|
1370 } |
|
1371 -- slot_count; |
|
1372 // Logically remove Ignore from percentage-text calculation |
|
1373 // (extra 1 in 101 avoids zdiv, biases slightly small) |
|
1374 return_percent = (percent3[0] * 100) / (101 - ignore_percent); |
|
1375 *summary_lang = language3[active_slot[0]]; |
|
1376 if (percent3[active_slot[0]] < kKeepMinPercent) {*is_reliable = false;} |
|
1377 } |
|
1378 } |
|
1379 |
|
1380 |
|
1381 // If English and X, where X (not UNK) is big enough, |
|
1382 // assume the English is boilerplate and return X. |
|
1383 // Logically remove English from percentage-text calculation |
|
1384 int second_bytes = (total_text_bytes * percent3[active_slot[1]]) / 100; |
|
1385 // Require more bytes of text for Tier3 languages |
|
1386 int minbytesneeded = kGoodSecondT1T2MinBytes; |
|
1387 int plang_second = PerScriptNumber(ULScript_Latin, language3[active_slot[1]]); |
|
1388 |
|
1389 if ((language3[active_slot[0]] == ENGLISH) && |
|
1390 (language3[active_slot[1]] != ENGLISH) && |
|
1391 (language3[active_slot[1]] != UNKNOWN_LANGUAGE) && |
|
1392 (percent3[active_slot[1]] >= kNonEnBoilerplateMinPercent) && |
|
1393 (second_bytes >= minbytesneeded)) { |
|
1394 ignore_percent += percent3[active_slot[0]]; |
|
1395 return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent); |
|
1396 *summary_lang = language3[active_slot[1]]; |
|
1397 if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;} |
|
1398 |
|
1399 // Else If FIGS and X, where X (not UNK, EFIGS) is big enough, |
|
1400 // assume the FIGS is boilerplate and return X. |
|
1401 // Logically remove FIGS from percentage-text calculation |
|
1402 } else if (IsFIGS(language3[active_slot[0]]) && |
|
1403 !IsEFIGS(language3[active_slot[1]]) && |
|
1404 (language3[active_slot[1]] != UNKNOWN_LANGUAGE) && |
|
1405 (percent3[active_slot[1]] >= kNonFIGSBoilerplateMinPercent) && |
|
1406 (second_bytes >= minbytesneeded)) { |
|
1407 ignore_percent += percent3[active_slot[0]]; |
|
1408 return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent); |
|
1409 *summary_lang = language3[active_slot[1]]; |
|
1410 if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;} |
|
1411 |
|
1412 // Else we are returning the first language, but want to improve its |
|
1413 // return_percent if the second language should be ignored |
|
1414 } else if ((language3[active_slot[1]] == ENGLISH) && |
|
1415 (language3[active_slot[0]] != ENGLISH)) { |
|
1416 ignore_percent += percent3[active_slot[1]]; |
|
1417 return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent); |
|
1418 } else if (IsFIGS(language3[active_slot[1]]) && |
|
1419 !IsEFIGS(language3[active_slot[0]])) { |
|
1420 ignore_percent += percent3[active_slot[1]]; |
|
1421 return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent); |
|
1422 } |
|
1423 |
|
1424 // If return percent is too small (too many languages), return UNKNOWN |
|
1425 if ((return_percent < kGoodFirstMinPercent)) { |
|
1426 if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { |
|
1427 fprintf(stderr, "{Unreli %s %d%% percent too small} ", |
|
1428 LanguageCode(*summary_lang), return_percent); |
|
1429 } |
|
1430 *summary_lang = UNKNOWN_LANGUAGE; |
|
1431 *is_reliable = false; |
|
1432 } |
|
1433 |
|
1434 // If return percent is small, return language but set unreliable. |
|
1435 if ((return_percent < kGoodFirstReliableMinPercent)) { |
|
1436 *is_reliable = false; |
|
1437 } |
|
1438 |
|
1439 // If ignore percent is too large, set unreliable. |
|
1440 ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]); |
|
1441 if ((ignore_percent > kIgnoreMaxPercent)) { |
|
1442 *is_reliable = false; |
|
1443 } |
|
1444 |
|
1445 // If we removed all the active languages, return UNKNOWN |
|
1446 if (slot_count == 0) { |
|
1447 if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { |
|
1448 fprintf(stderr, "{Unreli %s no languages left} ", |
|
1449 LanguageCode(*summary_lang)); |
|
1450 } |
|
1451 *summary_lang = UNKNOWN_LANGUAGE; |
|
1452 *is_reliable = false; |
|
1453 } |
|
1454 } |
|
1455 |
|
1456 void AddLangPriorBoost(Language lang, uint32 langprob, |
|
1457 ScoringContext* scoringcontext) { |
|
1458 // This is called 0..n times with language hints |
|
1459 // but we don't know the script -- so boost either or both Latn, Othr. |
|
1460 |
|
1461 if (IsLatnLanguage(lang)) { |
|
1462 LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn; |
|
1463 int n = langprior_boost->n; |
|
1464 langprior_boost->langprob[n] = langprob; |
|
1465 langprior_boost->n = langprior_boost->wrap(n + 1); |
|
1466 } |
|
1467 |
|
1468 if (IsOthrLanguage(lang)) { |
|
1469 LangBoosts* langprior_boost = &scoringcontext->langprior_boost.othr; |
|
1470 int n = langprior_boost->n; |
|
1471 langprior_boost->langprob[n] = langprob; |
|
1472 langprior_boost->n = langprior_boost->wrap(n + 1); |
|
1473 } |
|
1474 |
|
1475 } |
|
1476 |
|
1477 void AddOneWhack(Language whacker_lang, Language whackee_lang, |
|
1478 ScoringContext* scoringcontext) { |
|
1479 uint32 langprob = MakeLangProb(whackee_lang, 1); |
|
1480 // This logic avoids hr-Latn whacking sr-Cyrl, but still whacks sr-Latn |
|
1481 if (IsLatnLanguage(whacker_lang) && IsLatnLanguage(whackee_lang)) { |
|
1482 LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn; |
|
1483 int n = langprior_whack->n; |
|
1484 langprior_whack->langprob[n] = langprob; |
|
1485 langprior_whack->n = langprior_whack->wrap(n + 1); |
|
1486 } |
|
1487 if (IsOthrLanguage(whacker_lang) && IsOthrLanguage(whackee_lang)) { |
|
1488 LangBoosts* langprior_whack = &scoringcontext->langprior_whack.othr; |
|
1489 int n = langprior_whack->n; |
|
1490 langprior_whack->langprob[n] = langprob; |
|
1491 langprior_whack->n = langprior_whack->wrap(n + 1); |
|
1492 } |
|
1493 } |
|
1494 |
|
1495 void AddCloseLangWhack(Language lang, ScoringContext* scoringcontext) { |
|
1496 // We do not in general want zh-Hans and zh-Hant to be close pairs, |
|
1497 // but we do here. |
|
1498 if (lang == CLD2::CHINESE) { |
|
1499 AddOneWhack(lang, CLD2::CHINESE_T, scoringcontext); |
|
1500 return; |
|
1501 } |
|
1502 if (lang == CLD2::CHINESE_T) { |
|
1503 AddOneWhack(lang, CLD2::CHINESE, scoringcontext); |
|
1504 return; |
|
1505 } |
|
1506 |
|
1507 int base_lang_set = LanguageCloseSet(lang); |
|
1508 if (base_lang_set == 0) {return;} |
|
1509 // TODO: add an explicit list of each set to avoid this 512-times loop |
|
1510 for (int i = 0; i < kLanguageToPLangSize; ++i) { |
|
1511 Language lang2 = static_cast<Language>(i); |
|
1512 if ((base_lang_set == LanguageCloseSet(lang2)) && (lang != lang2)) { |
|
1513 AddOneWhack(lang, lang2, scoringcontext); |
|
1514 } |
|
1515 } |
|
1516 } |
|
1517 |
|
1518 |
|
1519 void ApplyHints(const char* buffer, |
|
1520 int buffer_length, |
|
1521 bool is_plain_text, |
|
1522 const CLDHints* cld_hints, |
|
1523 ScoringContext* scoringcontext) { |
|
1524 CLDLangPriors lang_priors; |
|
1525 InitCLDLangPriors(&lang_priors); |
|
1526 |
|
1527 // We now use lang= tags. |
|
1528 // Last look, circa 2008 found only 15% of web pages with lang= tags and |
|
1529 // many of those were wrong. Now (July 2011), we find 44% of web pages have |
|
1530 // lang= tags, and most of them are correct. So we now give them substantial |
|
1531 // weight in each chunk scored. |
|
1532 if (!is_plain_text) { |
|
1533 // Get any contained language tags in first n KB |
|
1534 int32 max_scan_bytes = FLAGS_cld_max_lang_tag_scan_kb << 10; |
|
1535 string lang_tags = GetLangTagsFromHtml(buffer, buffer_length, |
|
1536 max_scan_bytes); |
|
1537 SetCLDLangTagsHint(lang_tags, &lang_priors); |
|
1538 if (scoringcontext->flags_cld2_html) { |
|
1539 if (!lang_tags.empty()) { |
|
1540 fprintf(scoringcontext->debug_file, "<br>lang_tags '%s'<br>\n", |
|
1541 lang_tags.c_str()); |
|
1542 } |
|
1543 } |
|
1544 } |
|
1545 |
|
1546 if (cld_hints != NULL) { |
|
1547 if ((cld_hints->content_language_hint != NULL) && |
|
1548 (cld_hints->content_language_hint[0] != '\0')) { |
|
1549 SetCLDContentLangHint(cld_hints->content_language_hint, &lang_priors); |
|
1550 } |
|
1551 |
|
1552 // Input is from GetTLD(), already lowercased |
|
1553 if ((cld_hints->tld_hint != NULL) && (cld_hints->tld_hint[0] != '\0')) { |
|
1554 SetCLDTLDHint(cld_hints->tld_hint, &lang_priors); |
|
1555 } |
|
1556 |
|
1557 if (cld_hints->encoding_hint != UNKNOWN_ENCODING) { |
|
1558 Encoding enc = static_cast<Encoding>(cld_hints->encoding_hint); |
|
1559 SetCLDEncodingHint(enc, &lang_priors); |
|
1560 } |
|
1561 |
|
1562 if (cld_hints->language_hint != UNKNOWN_LANGUAGE) { |
|
1563 SetCLDLanguageHint(cld_hints->language_hint, &lang_priors); |
|
1564 } |
|
1565 } |
|
1566 |
|
1567 // Keep no more than four different languages with hints |
|
1568 TrimCLDLangPriors(4, &lang_priors); |
|
1569 |
|
1570 if (scoringcontext->flags_cld2_html) { |
|
1571 string print_temp = DumpCLDLangPriors(&lang_priors); |
|
1572 if (!print_temp.empty()) { |
|
1573 fprintf(scoringcontext->debug_file, "DumpCLDLangPriors %s<br>\n", |
|
1574 print_temp.c_str()); |
|
1575 } |
|
1576 } |
|
1577 |
|
1578 // Put boosts into ScoringContext |
|
1579 for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) { |
|
1580 Language lang = GetCLDPriorLang(lang_priors.prior[i]); |
|
1581 int qprob = GetCLDPriorWeight(lang_priors.prior[i]); |
|
1582 if (qprob > 0) { |
|
1583 uint32 langprob = MakeLangProb(lang, qprob); |
|
1584 AddLangPriorBoost(lang, langprob, scoringcontext); |
|
1585 } |
|
1586 } |
|
1587 |
|
1588 // Put whacks into scoring context |
|
1589 // We do not in general want zh-Hans and zh-Hant to be close pairs, |
|
1590 // but we do here. Use close_set_count[kCloseSetSize] to count zh, zh-Hant |
|
1591 std::vector<int> close_set_count(kCloseSetSize + 1, 0); |
|
1592 |
|
1593 for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) { |
|
1594 Language lang = GetCLDPriorLang(lang_priors.prior[i]); |
|
1595 ++close_set_count[LanguageCloseSet(lang)]; |
|
1596 if (lang == CLD2::CHINESE) {++close_set_count[kCloseSetSize];} |
|
1597 if (lang == CLD2::CHINESE_T) {++close_set_count[kCloseSetSize];} |
|
1598 } |
|
1599 |
|
1600 // If a boost language is in a close set, force suppressing the others in |
|
1601 // that set, if exactly one of the set is present |
|
1602 for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) { |
|
1603 Language lang = GetCLDPriorLang(lang_priors.prior[i]); |
|
1604 int qprob = GetCLDPriorWeight(lang_priors.prior[i]); |
|
1605 if (qprob > 0) { |
|
1606 int close_set = LanguageCloseSet(lang); |
|
1607 if ((close_set > 0) && (close_set_count[close_set] == 1)) { |
|
1608 AddCloseLangWhack(lang, scoringcontext); |
|
1609 } |
|
1610 if (((lang == CLD2::CHINESE) || (lang == CLD2::CHINESE_T)) && |
|
1611 (close_set_count[kCloseSetSize] == 1)) { |
|
1612 AddCloseLangWhack(lang, scoringcontext); |
|
1613 } |
|
1614 } |
|
1615 } |
|
1616 |
|
1617 |
|
1618 |
|
1619 |
|
1620 |
|
1621 |
|
1622 } |
|
1623 |
|
1624 |
|
1625 |
|
1626 // Results language3/percent3/text_bytes must be exactly three items |
|
1627 Language DetectLanguageSummaryV2( |
|
1628 const char* buffer, |
|
1629 int buffer_length, |
|
1630 bool is_plain_text, |
|
1631 const CLDHints* cld_hints, |
|
1632 bool allow_extended_lang, |
|
1633 int flags, |
|
1634 Language plus_one, |
|
1635 Language* language3, |
|
1636 int* percent3, |
|
1637 double* normalized_score3, |
|
1638 ResultChunkVector* resultchunkvector, |
|
1639 int* text_bytes, |
|
1640 bool* is_reliable) { |
|
1641 language3[0] = UNKNOWN_LANGUAGE; |
|
1642 language3[1] = UNKNOWN_LANGUAGE; |
|
1643 language3[2] = UNKNOWN_LANGUAGE; |
|
1644 percent3[0] = 0; |
|
1645 percent3[1] = 0; |
|
1646 percent3[2] = 0; |
|
1647 normalized_score3[0] = 0.0; |
|
1648 normalized_score3[1] = 0.0; |
|
1649 normalized_score3[2] = 0.0; |
|
1650 if (resultchunkvector != NULL) { |
|
1651 resultchunkvector->clear(); |
|
1652 } |
|
1653 *text_bytes = 0; |
|
1654 *is_reliable = false; |
|
1655 |
|
1656 if ((flags & kCLDFlagEcho) != 0) { |
|
1657 string temp(buffer, buffer_length); |
|
1658 if ((flags & kCLDFlagHtml) != 0) { |
|
1659 fprintf(stderr, "CLD2[%d] '%s'<br>\n", |
|
1660 buffer_length, GetHtmlEscapedText(temp).c_str()); |
|
1661 } else { |
|
1662 fprintf(stderr, "CLD2[%d] '%s'\n", |
|
1663 buffer_length, GetPlainEscapedText(temp).c_str()); |
|
1664 } |
|
1665 } |
|
1666 |
|
1667 #ifdef CLD2_DYNAMIC_MODE |
|
1668 // In dynamic mode, we immediately return UNKNOWN_LANGUAGE if the data file |
|
1669 // hasn't been loaded yet. This is the only sane thing we can do, as there |
|
1670 // are no scoring tables to consult. |
|
1671 bool dataLoaded = isDataLoaded(); |
|
1672 if ((flags & kCLDFlagVerbose) != 0) { |
|
1673 fprintf(stderr, "Data loaded: %s\n", (dataLoaded ? "true" : "false")); |
|
1674 } |
|
1675 if (!dataLoaded) { |
|
1676 return UNKNOWN_LANGUAGE; |
|
1677 } |
|
1678 #endif |
|
1679 |
|
1680 // Exit now if no text |
|
1681 if (buffer_length == 0) {return UNKNOWN_LANGUAGE;} |
|
1682 if (kScoringtables.quadgram_obj == NULL) {return UNKNOWN_LANGUAGE;} |
|
1683 |
|
1684 // Document totals |
|
1685 DocTote doc_tote; // Reliability = 0..100 |
|
1686 |
|
1687 // ScoringContext carries state across scriptspans |
|
1688 ScoringContext scoringcontext; |
|
1689 scoringcontext.debug_file = stderr; |
|
1690 scoringcontext.flags_cld2_score_as_quads = |
|
1691 ((flags & kCLDFlagScoreAsQuads) != 0); |
|
1692 scoringcontext.flags_cld2_html = ((flags & kCLDFlagHtml) != 0); |
|
1693 scoringcontext.flags_cld2_cr = ((flags & kCLDFlagCr) != 0); |
|
1694 scoringcontext.flags_cld2_verbose = ((flags & kCLDFlagVerbose) != 0); |
|
1695 scoringcontext.prior_chunk_lang = UNKNOWN_LANGUAGE; |
|
1696 scoringcontext.ulscript = ULScript_Common; |
|
1697 scoringcontext.scoringtables = &kScoringtables; |
|
1698 scoringcontext.scanner = NULL; |
|
1699 scoringcontext.init(); // Clear the internal memory arrays |
|
1700 |
|
1701 // Now thread safe. |
|
1702 bool FLAGS_cld2_html = ((flags & kCLDFlagHtml) != 0); |
|
1703 bool FLAGS_cld2_quiet = ((flags & kCLDFlagQuiet) != 0); |
|
1704 |
|
1705 ApplyHints(buffer, buffer_length, is_plain_text, cld_hints, &scoringcontext); |
|
1706 |
|
1707 // Four individual script totals, Latin, Han, other2, other3 |
|
1708 int next_other_tote = 2; |
|
1709 int tote_num = 0; |
|
1710 |
|
1711 // Four totes for up to four different scripts pending at once |
|
1712 Tote totes[4]; // [0] Latn [1] Hani [2] other [3] other |
|
1713 bool tote_seen[4] = {false, false, false, false}; |
|
1714 int tote_grams[4] = {0, 0, 0, 0}; // Number in partial chunk |
|
1715 ULScript tote_script[4] = |
|
1716 {ULScript_Latin, ULScript_Hani, ULScript_Common, ULScript_Common}; |
|
1717 |
|
1718 // Loop through text spans in a single script |
|
1719 ScriptScanner ss(buffer, buffer_length, is_plain_text); |
|
1720 LangSpan scriptspan; |
|
1721 |
|
1722 scoringcontext.scanner = &ss; |
|
1723 |
|
1724 scriptspan.text = NULL; |
|
1725 scriptspan.text_bytes = 0; |
|
1726 scriptspan.offset = 0; |
|
1727 scriptspan.ulscript = ULScript_Common; |
|
1728 scriptspan.lang = UNKNOWN_LANGUAGE; |
|
1729 |
|
1730 int total_text_bytes = 0; |
|
1731 int textlimit = FLAGS_cld_textlimit << 10; // in KB |
|
1732 if (textlimit == 0) {textlimit = 0x7fffffff;} |
|
1733 |
|
1734 int advance_by = 2; // Advance 2 bytes |
|
1735 int advance_limit = textlimit >> 3; // For first 1/8 of max document |
|
1736 |
|
1737 int initial_word_span = kDefaultWordSpan; |
|
1738 if (FLAGS_cld_forcewords) { |
|
1739 initial_word_span = kReallyBigWordSpan; |
|
1740 } |
|
1741 |
|
1742 // Pick up chunk sizes |
|
1743 // Smoothwidth is units of quadgrams, about 2.5 chars (unigrams) each |
|
1744 // Sanity check -- force into a reasonable range |
|
1745 int chunksizequads = FLAGS_cld_smoothwidth; |
|
1746 chunksizequads = minint(maxint(chunksizequads, kMinChunkSizeQuads), |
|
1747 kMaxChunkSizeQuads); |
|
1748 int chunksizeunis = (chunksizequads * 5) >> 1; |
|
1749 |
|
1750 // Varying short-span limit doesn't work well -- skips too much beyond 20KB |
|
1751 // int spantooshortlimit = advance_by * FLAGS_cld_smoothwidth; |
|
1752 int spantooshortlimit = kShortSpanThresh; |
|
1753 |
|
1754 // For debugging only. Not thread-safe |
|
1755 prior_lang = UNKNOWN_LANGUAGE; |
|
1756 prior_unreliable = false; |
|
1757 |
|
1758 // Allocate full-document prediction table for finding repeating words |
|
1759 int hash = 0; |
|
1760 int* predict_tbl = new int[kPredictionTableSize]; |
|
1761 if (FlagRepeats(flags)) { |
|
1762 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); |
|
1763 } |
|
1764 |
|
1765 |
|
1766 |
|
1767 // Loop through scriptspans accumulating number of text bytes in each language |
|
1768 while (ss.GetOneScriptSpanLower(&scriptspan)) { |
|
1769 ULScript ulscript = scriptspan.ulscript; |
|
1770 |
|
1771 // Squeeze out big chunks of text span if asked to |
|
1772 if (FlagSqueeze(flags)) { |
|
1773 // Remove repetitive or mostly-spaces chunks |
|
1774 int newlen; |
|
1775 int chunksize = 0; // Use the default |
|
1776 if (resultchunkvector != NULL) { |
|
1777 newlen = CheapSqueezeInplaceOverwrite(scriptspan.text, |
|
1778 scriptspan.text_bytes, |
|
1779 chunksize); |
|
1780 } else { |
|
1781 newlen = CheapSqueezeInplace(scriptspan.text, scriptspan.text_bytes, |
|
1782 chunksize); |
|
1783 } |
|
1784 scriptspan.text_bytes = newlen; |
|
1785 } else { |
|
1786 // Check now and then to see if we should be squeezing |
|
1787 if (((kCheapSqueezeTestThresh >> 1) < scriptspan.text_bytes) && |
|
1788 !FlagFinish(flags)) { |
|
1789 // fprintf(stderr, "CheapSqueezeTriggerTest, " |
|
1790 // "first %d bytes of %d (>%d/2)<br>\n", |
|
1791 // kCheapSqueezeTestLen, |
|
1792 // scriptspan.text_bytes, |
|
1793 // kCheapSqueezeTestThresh); |
|
1794 |
|
1795 if (CheapSqueezeTriggerTest(scriptspan.text, |
|
1796 scriptspan.text_bytes, |
|
1797 kCheapSqueezeTestLen)) { |
|
1798 // Recursive call with big-chunk squeezing set |
|
1799 if (FLAGS_cld2_html || FLAGS_dbgscore) { |
|
1800 fprintf(stderr, |
|
1801 "<br>---text_bytes[%d] Recursive(Squeeze)---<br><br>\n", |
|
1802 total_text_bytes); |
|
1803 } |
|
1804 // Deallocate full-document prediction table |
|
1805 delete[] predict_tbl; |
|
1806 |
|
1807 return DetectLanguageSummaryV2( |
|
1808 buffer, |
|
1809 buffer_length, |
|
1810 is_plain_text, |
|
1811 cld_hints, |
|
1812 allow_extended_lang, |
|
1813 flags | kCLDFlagSqueeze, |
|
1814 plus_one, |
|
1815 language3, |
|
1816 percent3, |
|
1817 normalized_score3, |
|
1818 resultchunkvector, |
|
1819 text_bytes, |
|
1820 is_reliable); |
|
1821 } |
|
1822 } |
|
1823 } |
|
1824 |
|
1825 // Remove repetitive words if asked to |
|
1826 if (FlagRepeats(flags)) { |
|
1827 // Remove repetitive words |
|
1828 int newlen; |
|
1829 if (resultchunkvector != NULL) { |
|
1830 newlen = CheapRepWordsInplaceOverwrite(scriptspan.text, |
|
1831 scriptspan.text_bytes, |
|
1832 &hash, predict_tbl); |
|
1833 } else { |
|
1834 newlen = CheapRepWordsInplace(scriptspan.text, scriptspan.text_bytes, |
|
1835 &hash, predict_tbl); |
|
1836 } |
|
1837 scriptspan.text_bytes = newlen; |
|
1838 } |
|
1839 |
|
1840 // Scoring depends on scriptspan buffer ALWAYS having |
|
1841 // leading space and off-the-end space space space NUL, |
|
1842 // DCHECK(scriptspan.text[0] == ' '); |
|
1843 // DCHECK(scriptspan.text[scriptspan.text_bytes + 0] == ' '); |
|
1844 // DCHECK(scriptspan.text[scriptspan.text_bytes + 1] == ' '); |
|
1845 // DCHECK(scriptspan.text[scriptspan.text_bytes + 2] == ' '); |
|
1846 // DCHECK(scriptspan.text[scriptspan.text_bytes + 3] == '\0'); |
|
1847 |
|
1848 // The real scoring |
|
1849 // Accumulate directly into the document total, or accmulate in one of four |
|
1850 // chunk totals. The purpose of the multiple chunk totals is to piece |
|
1851 // together short choppy pieces of text in alternating scripts. One total is |
|
1852 // dedicated to Latin text, one to Han text, and the other two are dynamicly |
|
1853 // assigned. |
|
1854 |
|
1855 scoringcontext.ulscript = scriptspan.ulscript; |
|
1856 // FLAGS_cld2_html = scoringcontext.flags_cld2_html; |
|
1857 |
|
1858 ScoreOneScriptSpan(scriptspan, |
|
1859 &scoringcontext, |
|
1860 &doc_tote, |
|
1861 resultchunkvector); |
|
1862 |
|
1863 total_text_bytes += scriptspan.text_bytes; |
|
1864 } // End while (ss.GetOneScriptSpanLower()) |
|
1865 |
|
1866 // Deallocate full-document prediction table |
|
1867 delete[] predict_tbl; |
|
1868 |
|
1869 if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { |
|
1870 // If no forced <cr>, put one in front of dump |
|
1871 if (!scoringcontext.flags_cld2_cr) {fprintf(stderr, "<br>\n");} |
|
1872 doc_tote.Dump(stderr); |
|
1873 } |
|
1874 |
|
1875 |
|
1876 // If extended langauges are disallowed, remove them here |
|
1877 if (!allow_extended_lang) { |
|
1878 RemoveExtendedLanguages(&doc_tote); |
|
1879 } |
|
1880 |
|
1881 // Force close pairs to one or the other |
|
1882 // If given, also update resultchunkvector |
|
1883 RefineScoredClosePairs(&doc_tote, resultchunkvector, |
|
1884 FLAGS_cld2_html, FLAGS_cld2_quiet); |
|
1885 |
|
1886 |
|
1887 // Calculate return results |
|
1888 // Find top three byte counts in tote heap |
|
1889 int reliable_percent3[3]; |
|
1890 |
|
1891 // Cannot use Add, etc. after sorting |
|
1892 doc_tote.Sort(3); |
|
1893 |
|
1894 ExtractLangEtc(&doc_tote, total_text_bytes, |
|
1895 reliable_percent3, language3, percent3, normalized_score3, |
|
1896 text_bytes, is_reliable); |
|
1897 |
|
1898 bool have_good_answer = false; |
|
1899 if (FlagFinish(flags)) { |
|
1900 // Force a result |
|
1901 have_good_answer = true; |
|
1902 } else if (total_text_bytes <= kShortTextThresh) { |
|
1903 // Don't recurse on short text -- we already did word scores |
|
1904 have_good_answer = true; |
|
1905 } else if (*is_reliable && |
|
1906 (percent3[0] >= kGoodLang1Percent)) { |
|
1907 have_good_answer = true; |
|
1908 } else if (*is_reliable && |
|
1909 ((percent3[0] + percent3[1]) >= kGoodLang1and2Percent)) { |
|
1910 have_good_answer = true; |
|
1911 } |
|
1912 |
|
1913 |
|
1914 if (have_good_answer) { |
|
1915 // This is the real, non-recursive return |
|
1916 |
|
1917 // Move bytes for unreliable langs to another lang or UNKNOWN |
|
1918 RemoveUnreliableLanguages(&doc_tote, FLAGS_cld2_html, FLAGS_cld2_quiet); |
|
1919 |
|
1920 // Redo the result extraction after the removal above |
|
1921 doc_tote.Sort(3); |
|
1922 ExtractLangEtc(&doc_tote, total_text_bytes, |
|
1923 reliable_percent3, language3, percent3, normalized_score3, |
|
1924 text_bytes, is_reliable); |
|
1925 |
|
1926 |
|
1927 |
|
1928 Language summary_lang; |
|
1929 CalcSummaryLang(&doc_tote, total_text_bytes, |
|
1930 reliable_percent3, language3, percent3, |
|
1931 &summary_lang, is_reliable, |
|
1932 FLAGS_cld2_html, FLAGS_cld2_quiet); |
|
1933 |
|
1934 if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { |
|
1935 for (int i = 0; i < 3; ++i) { |
|
1936 if (language3[i] != UNKNOWN_LANGUAGE) { |
|
1937 fprintf(stderr, "%s.%dR(%d%%) ", |
|
1938 LanguageCode(language3[i]), |
|
1939 reliable_percent3[i], |
|
1940 percent3[i]); |
|
1941 } |
|
1942 } |
|
1943 |
|
1944 fprintf(stderr, "%d bytes ", total_text_bytes); |
|
1945 fprintf(stderr, "= %s%c ", |
|
1946 LanguageName(summary_lang), *is_reliable ? ' ' : '*'); |
|
1947 fprintf(stderr, "<br><br>\n"); |
|
1948 } |
|
1949 |
|
1950 // Slightly condensed if quiet |
|
1951 if (FLAGS_cld2_html && FLAGS_cld2_quiet) { |
|
1952 fprintf(stderr, " "); |
|
1953 for (int i = 0; i < 3; ++i) { |
|
1954 if (language3[i] != UNKNOWN_LANGUAGE) { |
|
1955 fprintf(stderr, " %s %d%% ", |
|
1956 LanguageCode(language3[i]), |
|
1957 percent3[i]); |
|
1958 } |
|
1959 } |
|
1960 fprintf(stderr, "= %s%c ", |
|
1961 LanguageName(summary_lang), *is_reliable ? ' ' : '*'); |
|
1962 fprintf(stderr, "<br>\n"); |
|
1963 } |
|
1964 |
|
1965 return summary_lang; |
|
1966 } |
|
1967 |
|
1968 // Not a good answer -- do recursive call to refine |
|
1969 if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) { |
|
1970 // This is what we hope to improve on in the recursive call, if any |
|
1971 PrintLangs(stderr, language3, percent3, text_bytes, is_reliable); |
|
1972 } |
|
1973 |
|
1974 // For restriction to Top40 + one, the one is 1st/2nd lang that is not Top40 |
|
1975 // For this purpose, we treate "Ignore" as top40 |
|
1976 Language new_plus_one = UNKNOWN_LANGUAGE; |
|
1977 |
|
1978 if (total_text_bytes < kShortTextThresh) { |
|
1979 // Short text: Recursive call with top40 and short set |
|
1980 if (FLAGS_cld2_html || FLAGS_dbgscore) { |
|
1981 fprintf(stderr, " ---text_bytes[%d] " |
|
1982 "Recursive(Top40/Rep/Short/Words)---<br><br>\n", |
|
1983 total_text_bytes); |
|
1984 } |
|
1985 return DetectLanguageSummaryV2( |
|
1986 buffer, |
|
1987 buffer_length, |
|
1988 is_plain_text, |
|
1989 cld_hints, |
|
1990 allow_extended_lang, |
|
1991 flags | kCLDFlagTop40 | kCLDFlagRepeats | |
|
1992 kCLDFlagShort | kCLDFlagUseWords | kCLDFlagFinish, |
|
1993 new_plus_one, |
|
1994 language3, |
|
1995 percent3, |
|
1996 normalized_score3, |
|
1997 resultchunkvector, |
|
1998 text_bytes, |
|
1999 is_reliable); |
|
2000 } |
|
2001 |
|
2002 // Longer text: Recursive call with top40 set |
|
2003 if (FLAGS_cld2_html || FLAGS_dbgscore) { |
|
2004 fprintf(stderr, |
|
2005 " ---text_bytes[%d] Recursive(Top40/Rep)---<br><br>\n", |
|
2006 total_text_bytes); |
|
2007 } |
|
2008 return DetectLanguageSummaryV2( |
|
2009 buffer, |
|
2010 buffer_length, |
|
2011 is_plain_text, |
|
2012 cld_hints, |
|
2013 allow_extended_lang, |
|
2014 flags | kCLDFlagTop40 | kCLDFlagRepeats | |
|
2015 kCLDFlagFinish, |
|
2016 new_plus_one, |
|
2017 language3, |
|
2018 percent3, |
|
2019 normalized_score3, |
|
2020 resultchunkvector, |
|
2021 text_bytes, |
|
2022 is_reliable); |
|
2023 } |
|
2024 |
|
2025 |
|
2026 // For debugging and wrappers. Not thread safe. |
|
2027 static char temp_detectlanguageversion[32]; |
|
2028 |
|
2029 // Return version text string |
|
2030 // String is "code_version - data_build_date" |
|
2031 const char* DetectLanguageVersion() { |
|
2032 if (kScoringtables.quadgram_obj == NULL) {return "";} |
|
2033 sprintf(temp_detectlanguageversion, |
|
2034 "V2.0 - %u", kScoringtables.quadgram_obj->kCLDTableBuildDate); |
|
2035 return temp_detectlanguageversion; |
|
2036 } |
|
2037 |
|
2038 |
|
2039 } // End namespace CLD2 |