1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/browser/components/translation/cld2/internal/compact_lang_det_impl.cc Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,2039 @@ 1.4 +// Copyright 2013 Google Inc. All Rights Reserved. 1.5 +// 1.6 +// Licensed under the Apache License, Version 2.0 (the "License"); 1.7 +// you may not use this file except in compliance with the License. 1.8 +// You may obtain a copy of the License at 1.9 +// 1.10 +// http://www.apache.org/licenses/LICENSE-2.0 1.11 +// 1.12 +// Unless required by applicable law or agreed to in writing, software 1.13 +// distributed under the License is distributed on an "AS IS" BASIS, 1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1.15 +// See the License for the specific language governing permissions and 1.16 +// limitations under the License. 1.17 + 1.18 +// 1.19 +// Author: dsites@google.com (Dick Sites) 1.20 +// Updated 2014.01 for dual table lookup 1.21 +// 1.22 + 1.23 +#include <stdio.h> 1.24 +#include <string.h> 1.25 +#include <string> 1.26 +#include <vector> 1.27 + 1.28 +#include "cldutil.h" 1.29 +#include "debug.h" 1.30 +#include "integral_types.h" 1.31 +#include "lang_script.h" 1.32 +#include "utf8statetable.h" 1.33 + 1.34 +#ifdef CLD2_DYNAMIC_MODE 1.35 +#include "cld2_dynamic_data.h" 1.36 +#include "cld2_dynamic_data_loader.h" 1.37 +#endif 1.38 +#include "cld2tablesummary.h" 1.39 +#include "compact_lang_det_impl.h" 1.40 +#include "compact_lang_det_hint_code.h" 1.41 +#include "getonescriptspan.h" 1.42 +#include "tote.h" 1.43 + 1.44 + 1.45 +namespace CLD2 { 1.46 + 1.47 +using namespace std; 1.48 + 1.49 +// Linker supplies the right tables, From files 1.50 +// cld_generated_cjk_uni_prop_80.cc cld2_generated_cjk_compatible.cc 1.51 +// cld_generated_cjk_delta_bi_32.cc generated_distinct_bi_0.cc 1.52 +// cld2_generated_quad*.cc cld2_generated_deltaocta*.cc 1.53 +// cld2_generated_distinctocta*.cc 1.54 +// cld_generated_score_quad_octa_1024_256.cc 1.55 + 
1.56 +// 2014.01 Now implementing quadgram dual lookup tables, to allow main table 1.57 +// sizes that are 1/3/5 times a power of two, instead of just powers of two. 1.58 +// Gives more flexibility of total footprint for CLD2. 1.59 + 1.60 +extern const int kLanguageToPLangSize; 1.61 +extern const int kCloseSetSize; 1.62 + 1.63 +extern const UTF8PropObj cld_generated_CjkUni_obj; 1.64 +extern const CLD2TableSummary kCjkCompat_obj; 1.65 +extern const CLD2TableSummary kCjkDeltaBi_obj; 1.66 +extern const CLD2TableSummary kDistinctBiTable_obj; 1.67 +extern const CLD2TableSummary kQuad_obj; 1.68 +extern const CLD2TableSummary kQuad_obj2; // Dual lookup tables 1.69 +extern const CLD2TableSummary kDeltaOcta_obj; 1.70 +extern const CLD2TableSummary kDistinctOcta_obj; 1.71 +extern const short kAvgDeltaOctaScore[]; 1.72 + 1.73 +#ifdef CLD2_DYNAMIC_MODE 1.74 + // CLD2_DYNAMIC_MODE is defined: 1.75 + // Data will be read from an mmap opened at runtime. 1.76 + static ScoringTables kScoringtables = { 1.77 + NULL, //&cld_generated_CjkUni_obj, 1.78 + NULL, //&kCjkCompat_obj, 1.79 + NULL, //&kCjkDeltaBi_obj, 1.80 + NULL, //&kDistinctBiTable_obj, 1.81 + NULL, //&kQuad_obj, 1.82 + NULL, //&kQuad_obj2, 1.83 + NULL, //&kDeltaOcta_obj, 1.84 + NULL, //&kDistinctOcta_obj, 1.85 + NULL, //kAvgDeltaOctaScore, 1.86 + }; 1.87 + static bool dynamicDataLoaded = false; 1.88 + static ScoringTables* dynamicTables = NULL; 1.89 + static void* mmapAddress = NULL; 1.90 + static int mmapLength = 0; 1.91 + 1.92 + bool isDataLoaded() { return dynamicDataLoaded; } 1.93 + 1.94 + void loadData(const char* fileName) { 1.95 + if (isDataLoaded()) { 1.96 + unloadData(); 1.97 + } 1.98 + dynamicTables = CLD2DynamicDataLoader::loadDataFile(fileName, &mmapAddress, &mmapLength); 1.99 + kScoringtables = *dynamicTables; 1.100 + dynamicDataLoaded = true; 1.101 + }; 1.102 + 1.103 + void unloadData() { 1.104 + if (!dynamicDataLoaded) return; 1.105 + dynamicDataLoaded = false; 1.106 + // unloading will null all the pointers 
out. 1.107 + CLD2DynamicDataLoader::unloadData(&dynamicTables, &mmapAddress, &mmapLength); 1.108 + } 1.109 +#else 1.110 + // This initializes kScoringtables.quadgram_obj etc. 1.111 + static const ScoringTables kScoringtables = { 1.112 + &cld_generated_CjkUni_obj, 1.113 + &kCjkCompat_obj, 1.114 + &kCjkDeltaBi_obj, 1.115 + &kDistinctBiTable_obj, 1.116 + 1.117 + &kQuad_obj, 1.118 + &kQuad_obj2, // Dual lookup tables 1.119 + &kDeltaOcta_obj, 1.120 + &kDistinctOcta_obj, 1.121 + 1.122 + kAvgDeltaOctaScore, 1.123 + }; 1.124 +#endif // #ifdef CLD2_DYNAMIC_MODE 1.125 + 1.126 + 1.127 +static const bool FLAGS_cld_no_minimum_bytes = false; 1.128 +static const bool FLAGS_cld_forcewords = true; 1.129 +static const bool FLAGS_cld_showme = false; 1.130 +static const bool FLAGS_cld_echotext = true; 1.131 +static const int32 FLAGS_cld_textlimit = 160; 1.132 +static const int32 FLAGS_cld_smoothwidth = 20; 1.133 +static const bool FLAGS_cld_2011_hints = true; 1.134 +static const int32 FLAGS_cld_max_lang_tag_scan_kb = 8; 1.135 + 1.136 +static const bool FLAGS_dbgscore = false; 1.137 + 1.138 + 1.139 +static const int kLangHintInitial = 12; // Boost language by N initially 1.140 +static const int kLangHintBoost = 12; // Boost language by N/16 per quadgram 1.141 + 1.142 +static const int kShortSpanThresh = 32; // Bytes 1.143 +static const int kMaxSecondChanceLen = 1024; // Look at first 1K of short spans 1.144 + 1.145 +static const int kCheapSqueezeTestThresh = 4096; // Only look for squeezing 1.146 + // after this many text bytes 1.147 +static const int kCheapSqueezeTestLen = 256; // Bytes to test to trigger sqz 1.148 +static const int kSpacesTriggerPercent = 25; // Trigger sqz if >=25% spaces 1.149 +static const int kPredictTriggerPercent = 67; // Trigger sqz if >=67% predicted 1.150 + 1.151 +static const int kChunksizeDefault = 48; // Squeeze 48-byte chunks 1.152 +static const int kSpacesThreshPercent = 25; // Squeeze if >=25% spaces 1.153 +static const int kPredictThreshPercent = 40; 
// Squeeze if >=40% predicted 1.154 + 1.155 +static const int kMaxSpaceScan = 32; // Bytes 1.156 + 1.157 +static const int kGoodLang1Percent = 70; 1.158 +static const int kGoodLang1and2Percent = 93; 1.159 +static const int kShortTextThresh = 256; // Bytes 1.160 + 1.161 +static const int kMinChunkSizeQuads = 4; // Chunk is at least four quads 1.162 +static const int kMaxChunkSizeQuads = 1024; // Chunk is at most 1K quads 1.163 + 1.164 +static const int kDefaultWordSpan = 256; // Scan at least this many initial 1.165 + // bytes with word scoring 1.166 +static const int kReallyBigWordSpan = 9999999; // Forces word scoring all text 1.167 + 1.168 +static const int kMinReliableSeq = 50; // Record in seq if >= 50% reliable 1.169 + 1.170 +static const int kPredictionTableSize = 4096; // Must be exactly 4096 for 1.171 + // cheap compressor 1.172 + 1.173 +static const int kNonEnBoilerplateMinPercent = 17; // <this => no second 1.174 +static const int kNonFIGSBoilerplateMinPercent = 20; // <this => no second 1.175 +static const int kGoodFirstMinPercent = 26; // <this => UNK 1.176 +static const int kGoodFirstReliableMinPercent = 51; // <this => unreli 1.177 +static const int kIgnoreMaxPercent = 20; // >this => unreli 1.178 +static const int kKeepMinPercent = 2; // <this => unreli 1.179 + 1.180 + 1.181 + 1.182 +// Statistically closest language, based on quadgram table 1.183 +// Those that are far from other languges map to UNKNOWN_LANGUAGE 1.184 +// Subscripted by Language 1.185 +// 1.186 +// From lang_correlation.txt and hand-edits 1.187 +// sed 's/^\([^ ]*\) \([^ ]*\) coef=0\.\(..\).*$/ 1.188 +// (\3 >= kMinCorrPercent) ? 
//   \2 : UNKNOWN_LANGUAGE,
//     \/\/ \1/' lang_correlation.txt >/tmp/closest_lang_decl.txt
//
static const int kMinCorrPercent = 24;        // Pick off how close you want
                                              // 24 catches PERSIAN <== ARABIC
                                              // but not SPANISH <== PORTUGUESE
static Language Unknown = UNKNOWN_LANGUAGE;

// Suspect idea
// Subscripted by Language
//
// Each entry has the form (coef >= kMinCorrPercent) ? closest : UNKNOWN_LANGUAGE
// where coef is the two-digit correlation percentage for the language named
// in the trailing comment. Entries whose coef is below kMinCorrPercent
// therefore evaluate to UNKNOWN_LANGUAGE. Entry order must follow the
// Language enum (see the disabled COMPILE_ASSERT after the table).
static const Language kClosestAltLanguage[] = {
  (28 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE,  // ENGLISH
  (36 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // DANISH
  (31 >= kMinCorrPercent) ? AFRIKAANS : UNKNOWN_LANGUAGE,  // DUTCH
  (15 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE,  // FINNISH
  (11 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // FRENCH
  (17 >= kMinCorrPercent) ? LUXEMBOURGISH : UNKNOWN_LANGUAGE,  // GERMAN
  (27 >= kMinCorrPercent) ? YIDDISH : UNKNOWN_LANGUAGE,  // HEBREW
  (16 >= kMinCorrPercent) ? CORSICAN : UNKNOWN_LANGUAGE,  // ITALIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Japanese
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Korean
  (41 >= kMinCorrPercent) ? NORWEGIAN_N : UNKNOWN_LANGUAGE,  // NORWEGIAN
  ( 5 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // POLISH
  (23 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // PORTUGUESE
  (33 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE,  // RUSSIAN
  (28 >= kMinCorrPercent) ? GALICIAN : UNKNOWN_LANGUAGE,  // SPANISH
  (17 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // SWEDISH
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Chinese
  (42 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // CZECH
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GREEK
  (35 >= kMinCorrPercent) ? FAROESE : UNKNOWN_LANGUAGE,  // ICELANDIC
  ( 7 >= kMinCorrPercent) ? LITHUANIAN : UNKNOWN_LANGUAGE,  // LATVIAN
  ( 7 >= kMinCorrPercent) ? LATVIAN : UNKNOWN_LANGUAGE,  // LITHUANIAN
  ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // ROMANIAN
  ( 4 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // HUNGARIAN
  (15 >= kMinCorrPercent) ? FINNISH : UNKNOWN_LANGUAGE,  // ESTONIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Ignore
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Unknown
  (33 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // BULGARIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CROATIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SERBIAN
  (24 >= kMinCorrPercent) ? SCOTS_GAELIC : UNKNOWN_LANGUAGE,  // IRISH
  (28 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // GALICIAN
  ( 8 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE,  // TAGALOG
  (29 >= kMinCorrPercent) ? AZERBAIJANI : UNKNOWN_LANGUAGE,  // TURKISH
  (28 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // UKRAINIAN
  (37 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE,  // HINDI
  (29 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE,  // MACEDONIAN
  (14 >= kMinCorrPercent) ? ASSAMESE : UNKNOWN_LANGUAGE,  // BENGALI
  (46 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE,  // INDONESIAN
  ( 9 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE,  // LATIN
  (46 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE,  // MALAY
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MALAYALAM
  ( 4 >= kMinCorrPercent) ? BRETON : UNKNOWN_LANGUAGE,  // WELSH
  ( 8 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // NEPALI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // TELUGU
  ( 3 >= kMinCorrPercent) ? ESPERANTO : UNKNOWN_LANGUAGE,  // ALBANIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // TAMIL
  (22 >= kMinCorrPercent) ? UKRAINIAN : UNKNOWN_LANGUAGE,  // BELARUSIAN
  (15 >= kMinCorrPercent) ? SUNDANESE : UNKNOWN_LANGUAGE,  // JAVANESE
  (19 >= kMinCorrPercent) ? CATALAN : UNKNOWN_LANGUAGE,  // OCCITAN
  (27 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // URDU
  (36 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // BIHARI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GUJARATI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // THAI
  (24 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // ARABIC
  (19 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // CATALAN
  ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // ESPERANTO
  ( 3 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // BASQUE
  ( 9 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // INTERLINGUA
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // KANNADA
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PUNJABI
  (24 >= kMinCorrPercent) ? IRISH : UNKNOWN_LANGUAGE,  // SCOTS_GAELIC
  ( 7 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // SWAHILI
  (28 >= kMinCorrPercent) ? SERBIAN : UNKNOWN_LANGUAGE,  // SLOVENIAN
  (37 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // MARATHI
  ( 3 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // MALTESE
  ( 1 >= kMinCorrPercent) ? YORUBA : UNKNOWN_LANGUAGE,  // VIETNAMESE
  (15 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // FRISIAN
  (42 >= kMinCorrPercent) ? CZECH : UNKNOWN_LANGUAGE,  // SLOVAK
  // Original ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ChineseT
  (24 >= kMinCorrPercent) ? CHINESE : UNKNOWN_LANGUAGE,  // ChineseT
  (35 >= kMinCorrPercent) ? ICELANDIC : UNKNOWN_LANGUAGE,  // FAROESE
  (15 >= kMinCorrPercent) ? JAVANESE : UNKNOWN_LANGUAGE,  // SUNDANESE
  (17 >= kMinCorrPercent) ? TAJIK : UNKNOWN_LANGUAGE,  // UZBEK
  ( 7 >= kMinCorrPercent) ? TIGRINYA : UNKNOWN_LANGUAGE,  // AMHARIC
  (29 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE,  // AZERBAIJANI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GEORGIAN
  ( 7 >= kMinCorrPercent) ? AMHARIC : UNKNOWN_LANGUAGE,  // TIGRINYA
  (27 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE,  // PERSIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // BOSNIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SINHALESE
  (41 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // NORWEGIAN_N
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PORTUGUESE_P
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PORTUGUESE_B
  (37 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE,  // XHOSA
  (37 >= kMinCorrPercent) ? XHOSA : UNKNOWN_LANGUAGE,  // ZULU
  ( 2 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // GUARANI
  (29 >= kMinCorrPercent) ? TSWANA : UNKNOWN_LANGUAGE,  // SESOTHO
  ( 7 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE,  // TURKMEN
  ( 8 >= kMinCorrPercent) ? KAZAKH : UNKNOWN_LANGUAGE,  // KYRGYZ
  ( 5 >= kMinCorrPercent) ? FRENCH : UNKNOWN_LANGUAGE,  // BRETON
  ( 3 >= kMinCorrPercent) ? GANDA : UNKNOWN_LANGUAGE,  // TWI
  (27 >= kMinCorrPercent) ? HEBREW : UNKNOWN_LANGUAGE,  // YIDDISH
  (28 >= kMinCorrPercent) ? SLOVENIAN : UNKNOWN_LANGUAGE,  // SERBO_CROATIAN
  (12 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE,  // SOMALI
  ( 9 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE,  // UIGHUR
  (15 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // KURDISH
  ( 6 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE,  // MONGOLIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ARMENIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // LAOTHIAN
  ( 8 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE,  // SINDHI
  (10 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // RHAETO_ROMANCE
  (31 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // AFRIKAANS
  (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // LUXEMBOURGISH
  ( 2 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE,  // BURMESE
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // KHMER
  (45 >= kMinCorrPercent) ? DZONGKHA : UNKNOWN_LANGUAGE,  // TIBETAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // DHIVEHI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CHEROKEE
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SYRIAC
  ( 8 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // LIMBU
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ORIYA
  (14 >= kMinCorrPercent) ? BENGALI : UNKNOWN_LANGUAGE,  // ASSAMESE
  (16 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // CORSICAN
  ( 5 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE,  // INTERLINGUE
  ( 8 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE,  // KAZAKH
  ( 4 >= kMinCorrPercent) ? SWAHILI : UNKNOWN_LANGUAGE,  // LINGALA
  (11 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // MOLDAVIAN
  (19 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // PASHTO
  ( 5 >= kMinCorrPercent) ? AYMARA : UNKNOWN_LANGUAGE,  // QUECHUA
  ( 5 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // SHONA
  (17 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE,  // TAJIK
  (13 >= kMinCorrPercent) ? BASHKIR : UNKNOWN_LANGUAGE,  // TATAR
  (11 >= kMinCorrPercent) ? SAMOAN : UNKNOWN_LANGUAGE,  // TONGA
  ( 2 >= kMinCorrPercent) ? TWI : UNKNOWN_LANGUAGE,  // YORUBA
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_ENGLISH_BASED
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_FRENCH_BASED
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_OTHER
  ( 6 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // MAORI
  ( 3 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE,  // WOLOF
  ( 1 >= kMinCorrPercent) ? MONGOLIAN : UNKNOWN_LANGUAGE,  // ABKHAZIAN
  ( 8 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE,  // AFAR
  ( 5 >= kMinCorrPercent) ? QUECHUA : UNKNOWN_LANGUAGE,  // AYMARA
  (13 >= kMinCorrPercent) ? TATAR : UNKNOWN_LANGUAGE,  // BASHKIR
  ( 3 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // BISLAMA
  (45 >= kMinCorrPercent) ? TIBETAN : UNKNOWN_LANGUAGE,  // DZONGKHA
  ( 4 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // FIJIAN
  ( 7 >= kMinCorrPercent) ? INUPIAK : UNKNOWN_LANGUAGE,  // GREENLANDIC
  ( 3 >= kMinCorrPercent) ? AFAR : UNKNOWN_LANGUAGE,  // HAUSA
  ( 3 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // HAITIAN_CREOLE
  ( 7 >= kMinCorrPercent) ? GREENLANDIC : UNKNOWN_LANGUAGE,  // INUPIAK
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // INUKTITUT
  ( 4 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // KASHMIRI
  (30 >= kMinCorrPercent) ? RUNDI : UNKNOWN_LANGUAGE,  // KINYARWANDA
  ( 2 >= kMinCorrPercent) ? TAGALOG : UNKNOWN_LANGUAGE,  // MALAGASY
  (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // NAURU
  (12 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE,  // OROMO
  (30 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // RUNDI
  (11 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // SAMOAN
  ( 1 >= kMinCorrPercent) ? LINGALA : UNKNOWN_LANGUAGE,  // SANGO
  (32 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE,  // SANSKRIT
  (16 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE,  // SISWANT
  ( 5 >= kMinCorrPercent) ? SISWANT : UNKNOWN_LANGUAGE,  // TSONGA
  (29 >= kMinCorrPercent) ? SESOTHO : UNKNOWN_LANGUAGE,  // TSWANA
  ( 2 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE,  // VOLAPUK
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ZHUANG
  ( 1 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE,  // KHASI
  (28 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // SCOTS
  (15 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // GANDA
  ( 7 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // MANX
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MONTENEGRIN

  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // AKAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // IGBO
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MAURITIAN_CREOLE
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // HAWAIIAN
};

// COMPILE_ASSERT(arraysize(kClosestAltLanguage) == NUM_LANGUAGES,
//                kClosestAltLanguage_has_incorrect_size);


// Each predicate below reports whether the correspondingly named kCLDFlag*
// bit is set in flags.
inline bool FlagFinish(int flags) {return (flags & kCLDFlagFinish) != 0;}
inline bool FlagSqueeze(int flags) {return (flags & kCLDFlagSqueeze) != 0;}
inline bool FlagRepeats(int flags) {return (flags & kCLDFlagRepeats) != 0;}
inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;}
inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;}
inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;}
inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;}


  // Defines Top40 packed languages

  // Google top 40 languages
  //
  // Tier 0/1 Language enum list (16)
  //   ENGLISH, /*no en_GB,*/ FRENCH, ITALIAN, GERMAN, SPANISH,    // E - FIGS
  //   DUTCH, CHINESE, CHINESE_T, JAPANESE, KOREAN,
  //   PORTUGUESE, RUSSIAN, POLISH, TURKISH, THAI,
1.389 + // ARABIC, 1.390 + // 1.391 + // Tier 2 Language enum list (22) 1.392 + // SWEDISH, FINNISH, DANISH, /*no pt-PT,*/ ROMANIAN, HUNGARIAN, 1.393 + // HEBREW, INDONESIAN, CZECH, GREEK, NORWEGIAN, 1.394 + // VIETNAMESE, BULGARIAN, CROATIAN, LITHUANIAN, SLOVAK, 1.395 + // TAGALOG, SLOVENIAN, SERBIAN, CATALAN, LATVIAN, 1.396 + // UKRAINIAN, HINDI, 1.397 + // 1.398 + // use SERBO_CROATIAN instead of BOSNIAN, SERBIAN, CROATIAN, MONTENEGRIN(21) 1.399 + // 1.400 + // Include IgnoreMe (TG_UNKNOWN_LANGUAGE, 25+1) as a top 40 1.401 + 1.402 + 1.403 +void DemoteNotTop40(Tote* chunk_tote, uint16 psplus_one) { 1.404 + // REVISIT 1.405 +} 1.406 + 1.407 +void PrintText(FILE* f, Language cur_lang, const string& temp) { 1.408 + if (temp.size() == 0) {return;} 1.409 + fprintf(f, "PrintText[%s]%s<br>\n", LanguageName(cur_lang), temp.c_str()); 1.410 +} 1.411 + 1.412 + 1.413 +//------------------------------------------------------------------------------ 1.414 +// For --cld_html debugging output. Not thread safe 1.415 +//------------------------------------------------------------------------------ 1.416 +static Language prior_lang = UNKNOWN_LANGUAGE; 1.417 +static bool prior_unreliable = false; 1.418 + 1.419 +//------------------------------------------------------------------------------ 1.420 +// End For --cld_html debugging output 1.421 +//------------------------------------------------------------------------------ 1.422 + 1.423 + 1.424 +// Backscan to word boundary, returning how many bytes n to go back 1.425 +// so that src - n is non-space ans src - n - 1 is space. 
1.426 +// If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary 1.427 +int BackscanToSpace(const char* src, int limit) { 1.428 + int n = 0; 1.429 + limit = minint(limit, kMaxSpaceScan); 1.430 + while (n < limit) { 1.431 + if (src[-n - 1] == ' ') {return n;} // We are at _X 1.432 + ++n; 1.433 + } 1.434 + n = 0; 1.435 + while (n < limit) { 1.436 + if ((src[-n] & 0xc0) != 0x80) {return n;} // We are at char begin 1.437 + ++n; 1.438 + } 1.439 + return 0; 1.440 +} 1.441 + 1.442 +// Forwardscan to word boundary, returning how many bytes n to go forward 1.443 +// so that src + n is non-space ans src + n - 1 is space. 1.444 +// If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary 1.445 +int ForwardscanToSpace(const char* src, int limit) { 1.446 + int n = 0; 1.447 + limit = minint(limit, kMaxSpaceScan); 1.448 + while (n < limit) { 1.449 + if (src[n] == ' ') {return n + 1;} // We are at _X 1.450 + ++n; 1.451 + } 1.452 + n = 0; 1.453 + while (n < limit) { 1.454 + if ((src[n] & 0xc0) != 0x80) {return n;} // We are at char begin 1.455 + ++n; 1.456 + } 1.457 + return 0; 1.458 +} 1.459 + 1.460 + 1.461 +// This uses a cheap predictor to get a measure of compression, and 1.462 +// hence a measure of repetitiveness. It works on complete UTF-8 characters 1.463 +// instead of bytes, because three-byte UTF-8 Indic, etc. text compress highly 1.464 +// all the time when done with a byte-based count. Sigh. 1.465 +// 1.466 +// To allow running prediction across multiple chunks, caller passes in current 1.467 +// 12-bit hash value and int[4096] prediction table. Caller inits these to 0. 1.468 +// 1.469 +// Returns the number of *bytes* correctly predicted, increments by 1..4 for 1.470 +// each correctly-predicted character. 1.471 +// 1.472 +// NOTE: Overruns by up to three bytes. 
// That overrun was harmless only for valid UTF-8 text. The implementation
// below no longer overruns: a multi-byte character whose lead byte promises
// more bytes than remain before isrc + src_len is treated as ending at
// isrc + src_len, so bytes outside [isrc, isrc + src_len) are never read.
// Results for well-formed text are unchanged.
//

// TODO(dsites) make this use just one byte per UTF-8 char and incr by charlen

// isrc, src_len  text to scan
// hash           in/out: running 12-bit hash, carried from chunk to chunk
// tbl            in/out: 4096-entry prediction table indexed by that hash
int CountPredictedBytes(const char* isrc, int src_len, int* hash, int* tbl) {
  int p_count = 0;
  const unsigned char* src = reinterpret_cast<const unsigned char*>(isrc);
  const unsigned char* const srclimit = src + src_len;
  int local_hash = *hash;

  while (src < srclimit) {
    const unsigned int lead = src[0];

    // Character length implied by the lead byte. One-byte and stray
    // continuation bytes (00xxxxxx 01xxxxxx 10xxxxxx) stand alone.
    int incr = 1;
    if (lead >= 0xc0) {
      if ((lead & 0xe0) == 0xc0) {
        incr = 2;
      } else if ((lead & 0xf0) == 0xe0) {
        incr = 3;
      } else {
        incr = 4;
      }
    }

    // Never look past the end of the caller's text
    const int avail = static_cast<int>(srclimit - src);
    if (incr > avail) {incr = avail;}

    // Pack the character's bytes big-endian. Done in unsigned arithmetic so a
    // four-byte lead (>= 0xf0) shifted into the top byte cannot overflow a
    // signed int.
    unsigned int packed = lead;
    for (int i = 1; i < incr; ++i) {
      packed = (packed << 8) | src[i];
    }
    const int c = static_cast<int>(packed);
    src += incr;

    const int p = tbl[local_hash];  // Prediction
    tbl[local_hash] = c;            // Update prediction
    if (c == p) {
      p_count += incr;              // Count bytes of good predictions
    }

    local_hash = ((local_hash << 4) ^ c) & 0xfff;
  }
  *hash = local_hash;
  return p_count;
}



// Counts number of spaces; a little faster than one-at-a-time
// Doesn't count odd bytes at end: only the first (src_len & ~3) bytes are
// examined.
int CountSpaces4(const char* src, int src_len) {
  int s_count = 0;
  const int whole_quads_len = src_len & ~3;
  for (int i = 0; i < whole_quads_len; i += 4) {
    s_count += (src[i] == ' ');
    s_count += (src[i+1] == ' ');
    s_count += (src[i+2] == ' ');
    s_count += (src[i+3] == ' ');
  }
  return s_count;
}


// Remove words of text that have more than half their letters predicted
// correctly by our cheap predictor, moving the
remaining words in-place 1.536 +// to the front of the input buffer. 1.537 +// 1.538 +// To allow running prediction across multiple chunks, caller passes in current 1.539 +// 12-bit hash value and int[4096] prediction table. Caller inits these to 0. 1.540 +// 1.541 +// Return the new, possibly-shorter length 1.542 +// 1.543 +// Result Buffer ALWAYS has leading space and trailing space space space NUL, 1.544 +// if input does 1.545 +// 1.546 +int CheapRepWordsInplace(char* isrc, int src_len, int* hash, int* tbl) { 1.547 + const uint8* src = reinterpret_cast<const uint8*>(isrc); 1.548 + const uint8* srclimit = src + src_len; 1.549 + char* dst = isrc; 1.550 + int local_hash = *hash; 1.551 + char* word_dst = dst; // Start of next word 1.552 + int good_predict_bytes = 0; 1.553 + int word_length_bytes = 0; 1.554 + 1.555 + while (src < srclimit) { 1.556 + int c = src[0]; 1.557 + int incr = 1; 1.558 + *dst++ = c; 1.559 + 1.560 + if (c == ' ') { 1.561 + if ((good_predict_bytes * 2) > word_length_bytes) { 1.562 + // Word is well-predicted: backup to start of this word 1.563 + dst = word_dst; 1.564 + if (FLAGS_cld_showme) { 1.565 + // Mark the deletion point with period 1.566 + // Don't repeat multiple periods 1.567 + // Cannot mark with more bytes or may overwrite unseen input 1.568 + if ((isrc < (dst - 2)) && (dst[-2] != '.')) { 1.569 + *dst++ = '.'; 1.570 + *dst++ = ' '; 1.571 + } 1.572 + } 1.573 + } 1.574 + word_dst = dst; // Start of next word 1.575 + good_predict_bytes = 0; 1.576 + word_length_bytes = 0; 1.577 + } 1.578 + 1.579 + // Pick up one char and length 1.580 + if (c < 0xc0) { 1.581 + // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx 1.582 + // Do nothing more 1.583 + } else if ((c & 0xe0) == 0xc0) { 1.584 + // Two-byte 1.585 + *dst++ = src[1]; 1.586 + c = (c << 8) | src[1]; 1.587 + incr = 2; 1.588 + } else if ((c & 0xf0) == 0xe0) { 1.589 + // Three-byte 1.590 + *dst++ = src[1]; 1.591 + *dst++ = src[2]; 1.592 + c = (c << 16) | (src[1] << 8) | src[2]; 
1.593 + incr = 3; 1.594 + } else { 1.595 + // Four-byte 1.596 + *dst++ = src[1]; 1.597 + *dst++ = src[2]; 1.598 + *dst++ = src[3]; 1.599 + c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3]; 1.600 + incr = 4; 1.601 + } 1.602 + src += incr; 1.603 + word_length_bytes += incr; 1.604 + 1.605 + int p = tbl[local_hash]; // Prediction 1.606 + tbl[local_hash] = c; // Update prediction 1.607 + if (c == p) { 1.608 + good_predict_bytes += incr; // Count good predictions 1.609 + } 1.610 + 1.611 + local_hash = ((local_hash << 4) ^ c) & 0xfff; 1.612 + } 1.613 + 1.614 + *hash = local_hash; 1.615 + 1.616 + if ((dst - isrc) < (src_len - 3)) { 1.617 + // Pad and make last char clean UTF-8 by putting following spaces 1.618 + dst[0] = ' '; 1.619 + dst[1] = ' '; 1.620 + dst[2] = ' '; 1.621 + dst[3] = '\0'; 1.622 + } else if ((dst - isrc) < src_len) { 1.623 + // Make last char clean UTF-8 by putting following space off the end 1.624 + dst[0] = ' '; 1.625 + } 1.626 + 1.627 + return static_cast<int>(dst - isrc); 1.628 +} 1.629 + 1.630 + 1.631 +// This alternate form overwrites redundant words, thus avoiding corrupting the 1.632 +// backmap for generate a vector of original-text ranges. 
1.633 +int CheapRepWordsInplaceOverwrite(char* isrc, int src_len, int* hash, int* tbl) { 1.634 + const uint8* src = reinterpret_cast<const uint8*>(isrc); 1.635 + const uint8* srclimit = src + src_len; 1.636 + char* dst = isrc; 1.637 + int local_hash = *hash; 1.638 + char* word_dst = dst; // Start of next word 1.639 + int good_predict_bytes = 0; 1.640 + int word_length_bytes = 0; 1.641 + 1.642 + while (src < srclimit) { 1.643 + int c = src[0]; 1.644 + int incr = 1; 1.645 + *dst++ = c; 1.646 + 1.647 + if (c == ' ') { 1.648 + if ((good_predict_bytes * 2) > word_length_bytes) { 1.649 + // Word [word_dst..dst-1) is well-predicted: overwrite 1.650 + for (char* p = word_dst; p < dst - 1; ++p) {*p = '.';} 1.651 + } 1.652 + word_dst = dst; // Start of next word 1.653 + good_predict_bytes = 0; 1.654 + word_length_bytes = 0; 1.655 + } 1.656 + 1.657 + // Pick up one char and length 1.658 + if (c < 0xc0) { 1.659 + // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx 1.660 + // Do nothing more 1.661 + } else if ((c & 0xe0) == 0xc0) { 1.662 + // Two-byte 1.663 + *dst++ = src[1]; 1.664 + c = (c << 8) | src[1]; 1.665 + incr = 2; 1.666 + } else if ((c & 0xf0) == 0xe0) { 1.667 + // Three-byte 1.668 + *dst++ = src[1]; 1.669 + *dst++ = src[2]; 1.670 + c = (c << 16) | (src[1] << 8) | src[2]; 1.671 + incr = 3; 1.672 + } else { 1.673 + // Four-byte 1.674 + *dst++ = src[1]; 1.675 + *dst++ = src[2]; 1.676 + *dst++ = src[3]; 1.677 + c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3]; 1.678 + incr = 4; 1.679 + } 1.680 + src += incr; 1.681 + word_length_bytes += incr; 1.682 + 1.683 + int p = tbl[local_hash]; // Prediction 1.684 + tbl[local_hash] = c; // Update prediction 1.685 + if (c == p) { 1.686 + good_predict_bytes += incr; // Count good predictions 1.687 + } 1.688 + 1.689 + local_hash = ((local_hash << 4) ^ c) & 0xfff; 1.690 + } 1.691 + 1.692 + *hash = local_hash; 1.693 + 1.694 + if ((dst - isrc) < (src_len - 3)) { 1.695 + // Pad and make last char clean UTF-8 by putting 
following spaces 1.696 + dst[0] = ' '; 1.697 + dst[1] = ' '; 1.698 + dst[2] = ' '; 1.699 + dst[3] = '\0'; 1.700 + } else if ((dst - isrc) < src_len) { 1.701 + // Make last char clean UTF-8 by putting following space off the end 1.702 + dst[0] = ' '; 1.703 + } 1.704 + 1.705 + return static_cast<int>(dst - isrc); 1.706 +} 1.707 + 1.708 + 1.709 +// Remove portions of text that have a high density of spaces, or that are 1.710 +// overly repetitive, squeezing the remaining text in-place to the front of the 1.711 +// input buffer. 1.712 +// 1.713 +// Squeezing looks at density of space/prediced chars in fixed-size chunks, 1.714 +// specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes. 1.715 +// 1.716 +// Return the new, possibly-shorter length 1.717 +// 1.718 +// Result Buffer ALWAYS has leading space and trailing space space space NUL, 1.719 +// if input does 1.720 +// 1.721 +int CheapSqueezeInplace(char* isrc, 1.722 + int src_len, 1.723 + int ichunksize) { 1.724 + char* src = isrc; 1.725 + char* dst = src; 1.726 + char* srclimit = src + src_len; 1.727 + bool skipping = false; 1.728 + 1.729 + int hash = 0; 1.730 + // Allocate local prediction table. 1.731 + int* predict_tbl = new int[kPredictionTableSize]; 1.732 + memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); 1.733 + 1.734 + int chunksize = ichunksize; 1.735 + if (chunksize == 0) {chunksize = kChunksizeDefault;} 1.736 + int space_thresh = (chunksize * kSpacesThreshPercent) / 100; 1.737 + int predict_thresh = (chunksize * kPredictThreshPercent) / 100; 1.738 + 1.739 + while (src < srclimit) { 1.740 + int remaining_bytes = srclimit - src; 1.741 + int len = minint(chunksize, remaining_bytes); 1.742 + // Make len land us on a UTF-8 character boundary. 1.743 + // Ah. 
Also fixes mispredict because we could get out of phase 1.744 + // Loop always terminates at trailing space in buffer 1.745 + while ((src[len] & 0xc0) == 0x80) {++len;} // Move past continuation bytes 1.746 + 1.747 + int space_n = CountSpaces4(src, len); 1.748 + int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl); 1.749 + if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) { 1.750 + // Skip the text 1.751 + if (!skipping) { 1.752 + // Keeping-to-skipping transition; do it at a space 1.753 + int n = BackscanToSpace(dst, static_cast<int>(dst - isrc)); 1.754 + dst -= n; 1.755 + if (dst == isrc) { 1.756 + // Force a leading space if the first chunk is deleted 1.757 + *dst++ = ' '; 1.758 + } 1.759 + if (FLAGS_cld_showme) { 1.760 + // Mark the deletion point with black square U+25A0 1.761 + *dst++ = static_cast<unsigned char>(0xe2); 1.762 + *dst++ = static_cast<unsigned char>(0x96); 1.763 + *dst++ = static_cast<unsigned char>(0xa0); 1.764 + *dst++ = ' '; 1.765 + } 1.766 + skipping = true; 1.767 + } 1.768 + } else { 1.769 + // Keep the text 1.770 + if (skipping) { 1.771 + // Skipping-to-keeping transition; do it at a space 1.772 + int n = ForwardscanToSpace(src, len); 1.773 + src += n; 1.774 + remaining_bytes -= n; // Shrink remaining length 1.775 + len -= n; 1.776 + skipping = false; 1.777 + } 1.778 + // "len" can be negative in some cases 1.779 + if (len > 0) { 1.780 + memmove(dst, src, len); 1.781 + dst += len; 1.782 + } 1.783 + } 1.784 + src += len; 1.785 + } 1.786 + 1.787 + if ((dst - isrc) < (src_len - 3)) { 1.788 + // Pad and make last char clean UTF-8 by putting following spaces 1.789 + dst[0] = ' '; 1.790 + dst[1] = ' '; 1.791 + dst[2] = ' '; 1.792 + dst[3] = '\0'; 1.793 + } else if ((dst - isrc) < src_len) { 1.794 + // Make last char clean UTF-8 by putting following space off the end 1.795 + dst[0] = ' '; 1.796 + } 1.797 + 1.798 + // Deallocate local prediction table 1.799 + delete[] predict_tbl; 1.800 + return static_cast<int>(dst - isrc); 
1.801 +} 1.802 + 1.803 +// This alternate form overwrites redundant words, thus avoiding corrupting the 1.804 +// backmap for generate a vector of original-text ranges. 1.805 +int CheapSqueezeInplaceOverwrite(char* isrc, 1.806 + int src_len, 1.807 + int ichunksize) { 1.808 + char* src = isrc; 1.809 + char* dst = src; 1.810 + char* srclimit = src + src_len; 1.811 + bool skipping = false; 1.812 + 1.813 + int hash = 0; 1.814 + // Allocate local prediction table. 1.815 + int* predict_tbl = new int[kPredictionTableSize]; 1.816 + memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); 1.817 + 1.818 + int chunksize = ichunksize; 1.819 + if (chunksize == 0) {chunksize = kChunksizeDefault;} 1.820 + int space_thresh = (chunksize * kSpacesThreshPercent) / 100; 1.821 + int predict_thresh = (chunksize * kPredictThreshPercent) / 100; 1.822 + 1.823 + // Always keep first byte (space) 1.824 + ++src; 1.825 + ++dst; 1.826 + while (src < srclimit) { 1.827 + int remaining_bytes = srclimit - src; 1.828 + int len = minint(chunksize, remaining_bytes); 1.829 + // Make len land us on a UTF-8 character boundary. 1.830 + // Ah. 
Also fixes mispredict because we could get out of phase 1.831 + // Loop always terminates at trailing space in buffer 1.832 + while ((src[len] & 0xc0) == 0x80) {++len;} // Move past continuation bytes 1.833 + 1.834 + int space_n = CountSpaces4(src, len); 1.835 + int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl); 1.836 + if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) { 1.837 + // Overwrite the text [dst-n..dst) 1.838 + if (!skipping) { 1.839 + // Keeping-to-skipping transition; do it at a space 1.840 + int n = BackscanToSpace(dst, static_cast<int>(dst - isrc)); 1.841 + // Text [word_dst..dst) is well-predicted: overwrite 1.842 + for (char* p = dst - n; p < dst; ++p) {*p = '.';} 1.843 + skipping = true; 1.844 + } 1.845 + // Overwrite the text [dst..dst+len) 1.846 + for (char* p = dst; p < dst + len; ++p) {*p = '.';} 1.847 + dst[len - 1] = ' '; // Space at end so we can see what is happening 1.848 + } else { 1.849 + // Keep the text 1.850 + if (skipping) { 1.851 + // Skipping-to-keeping transition; do it at a space 1.852 + int n = ForwardscanToSpace(src, len); 1.853 + // Text [dst..dst+n) is well-predicted: overwrite 1.854 + for (char* p = dst; p < dst + n - 1; ++p) {*p = '.';} 1.855 + skipping = false; 1.856 + } 1.857 + } 1.858 + dst += len; 1.859 + src += len; 1.860 + } 1.861 + 1.862 + if ((dst - isrc) < (src_len - 3)) { 1.863 + // Pad and make last char clean UTF-8 by putting following spaces 1.864 + dst[0] = ' '; 1.865 + dst[1] = ' '; 1.866 + dst[2] = ' '; 1.867 + dst[3] = '\0'; 1.868 + } else if ((dst - isrc) < src_len) { 1.869 + // Make last char clean UTF-8 by putting following space off the end 1.870 + dst[0] = ' '; 1.871 + } 1.872 + 1.873 + // Deallocate local prediction table 1.874 + delete[] predict_tbl; 1.875 + return static_cast<int>(dst - isrc); 1.876 +} 1.877 + 1.878 +// Timing 2.8GHz P4 (dsites 2008.03.20) with 170KB input 1.879 +// About 90 MB/sec, with or without memcpy, chunksize 48 or 4096 1.880 +// Just CountSpaces is 
about 340 MB/sec 1.881 +// Byte-only CountPredictedBytes is about 150 MB/sec 1.882 +// Byte-only CountPredictedBytes, conditional tbl[] = is about 85! MB/sec 1.883 +// Byte-only CountPredictedBytes is about 180 MB/sec, byte tbl, byte/int c 1.884 +// Unjammed byte-only both = 170 MB/sec 1.885 +// Jammed byte-only both = 120 MB/sec 1.886 +// Back to original w/slight updates, 110 MB/sec 1.887 +// 1.888 +bool CheapSqueezeTriggerTest(const char* src, int src_len, int testsize) { 1.889 + // Don't trigger at all on short text 1.890 + if (src_len < testsize) {return false;} 1.891 + int space_thresh = (testsize * kSpacesTriggerPercent) / 100; 1.892 + int predict_thresh = (testsize * kPredictTriggerPercent) / 100; 1.893 + int hash = 0; 1.894 + // Allocate local prediction table. 1.895 + int* predict_tbl = new int[kPredictionTableSize]; 1.896 + memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); 1.897 + 1.898 + bool retval = false; 1.899 + if ((CountSpaces4(src, testsize) >= space_thresh) || 1.900 + (CountPredictedBytes(src, testsize, &hash, predict_tbl) >= 1.901 + predict_thresh)) { 1.902 + retval = true; 1.903 + } 1.904 + // Deallocate local prediction table 1.905 + delete[] predict_tbl; 1.906 + return retval; 1.907 +} 1.908 + 1.909 + 1.910 + 1.911 + 1.912 +// Delete any extended languages from doc_tote 1.913 +void RemoveExtendedLanguages(DocTote* doc_tote) { 1.914 + // Now a nop 1.915 +} 1.916 + 1.917 +static const int kMinReliableKeepPercent = 41; // Remove lang if reli < this 1.918 + 1.919 +// For Tier3 languages, require a minimum number of bytes to be first-place lang 1.920 +static const int kGoodFirstT3MinBytes = 24; // <this => no first 1.921 + 1.922 +// Move bytes for unreliable langs to another lang or UNKNOWN 1.923 +// doc_tote is sorted, so cannot Add 1.924 +// 1.925 +// If both CHINESE and CHINESET are present and unreliable, do not delete both; 1.926 +// merge both into CHINESE. 
1.927 +// 1.928 +//dsites 2009.03.19 1.929 +// we also want to remove Tier3 languages as the first lang if there is very 1.930 +// little text like ej1 ej2 ej3 ej4 1.931 +// maybe fold this back in earlier 1.932 +// 1.933 +void RemoveUnreliableLanguages(DocTote* doc_tote, 1.934 + bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) { 1.935 + // Prepass to merge some low-reliablility languages 1.936 + // TODO: this shouldn't really reach in to the internal structure of doc_tote 1.937 + int total_bytes = 0; 1.938 + for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) { 1.939 + int plang = doc_tote->Key(sub); 1.940 + if (plang == DocTote::kUnusedKey) {continue;} // Empty slot 1.941 + 1.942 + Language lang = static_cast<Language>(plang); 1.943 + int bytes = doc_tote->Value(sub); 1.944 + int reli = doc_tote->Reliability(sub); 1.945 + if (bytes == 0) {continue;} // Zero bytes 1.946 + total_bytes += bytes; 1.947 + 1.948 + // Reliable percent = stored reliable score over stored bytecount 1.949 + int reliable_percent = reli / bytes; 1.950 + if (reliable_percent >= kMinReliableKeepPercent) {continue;} // Keeper 1.951 + 1.952 + // This language is too unreliable to keep, but we might merge it. 1.953 + Language altlang = UNKNOWN_LANGUAGE; 1.954 + if (lang <= HAWAIIAN) {altlang = kClosestAltLanguage[lang];} 1.955 + if (altlang == UNKNOWN_LANGUAGE) {continue;} // No alternative 1.956 + 1.957 + // Look for alternative in doc_tote 1.958 + int altsub = doc_tote->Find(altlang); 1.959 + if (altsub < 0) {continue;} // No alternative text 1.960 + 1.961 + int bytes2 = doc_tote->Value(altsub); 1.962 + int reli2 = doc_tote->Reliability(altsub); 1.963 + if (bytes2 == 0) {continue;} // Zero bytes 1.964 + 1.965 + // Reliable percent is stored reliable score over stored bytecount 1.966 + int reliable_percent2 = reli2 / bytes2; 1.967 + 1.968 + // Merge one language into the other. 
Break ties toward lower lang # 1.969 + int tosub = altsub; 1.970 + int fromsub = sub; 1.971 + bool into_lang = false; 1.972 + if ((reliable_percent2 < reliable_percent) || 1.973 + ((reliable_percent2 == reliable_percent) && (lang < altlang))) { 1.974 + tosub = sub; 1.975 + fromsub = altsub; 1.976 + into_lang = true; 1.977 + } 1.978 + 1.979 + // Make sure merged reliability doesn't drop and is enough to avoid delete 1.980 + int newpercent = maxint(reliable_percent, reliable_percent2); 1.981 + newpercent = maxint(newpercent, kMinReliableKeepPercent); 1.982 + int newbytes = bytes + bytes2; 1.983 + int newreli = newpercent * newbytes; 1.984 + 1.985 + doc_tote->SetKey(fromsub, DocTote::kUnusedKey); 1.986 + doc_tote->SetScore(fromsub, 0); 1.987 + doc_tote->SetReliability(fromsub, 0); 1.988 + doc_tote->SetScore(tosub, newbytes); 1.989 + doc_tote->SetReliability(tosub, newreli); 1.990 + 1.991 + // Show fate of unreliable languages if at least 10 bytes 1.992 + if (FLAGS_cld2_html && (newbytes >= 10) && 1.993 + !FLAGS_cld2_quiet) { 1.994 + if (into_lang) { 1.995 + fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ", 1.996 + LanguageCode(altlang), reliable_percent2, bytes2, 1.997 + LanguageCode(lang)); 1.998 + } else { 1.999 + fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ", 1.1000 + LanguageCode(lang), reliable_percent, bytes, 1.1001 + LanguageCode(altlang)); 1.1002 + } 1.1003 + } 1.1004 + } 1.1005 + 1.1006 + 1.1007 + // Pass to delete any remaining unreliable languages 1.1008 + for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) { 1.1009 + int plang = doc_tote->Key(sub); 1.1010 + if (plang == DocTote::kUnusedKey) {continue;} // Empty slot 1.1011 + 1.1012 + Language lang = static_cast<Language>(plang); 1.1013 + int bytes = doc_tote->Value(sub); 1.1014 + int reli = doc_tote->Reliability(sub); 1.1015 + if (bytes == 0) {continue;} // Zero bytes 1.1016 + 1.1017 + // Reliable percent is stored as reliable score over stored bytecount 1.1018 + int reliable_percent = reli / bytes; 1.1019 + if 
(reliable_percent >= kMinReliableKeepPercent) { // Keeper? 1.1020 + continue; // yes 1.1021 + } 1.1022 + 1.1023 + // Delete unreliable entry 1.1024 + doc_tote->SetKey(sub, DocTote::kUnusedKey); 1.1025 + doc_tote->SetScore(sub, 0); 1.1026 + doc_tote->SetReliability(sub, 0); 1.1027 + 1.1028 + // Show fate of unreliable languages if at least 10 bytes 1.1029 + if (FLAGS_cld2_html && (bytes >= 10) && 1.1030 + !FLAGS_cld2_quiet) { 1.1031 + fprintf(stderr, "{Unreli %s.%dR,%dB} ", 1.1032 + LanguageCode(lang), reliable_percent, bytes); 1.1033 + } 1.1034 + } 1.1035 + 1.1036 + ////if (FLAGS_cld2_html) {fprintf(stderr, "<br>\n");} 1.1037 +} 1.1038 + 1.1039 + 1.1040 +// Move all the text bytes from lower byte-count to higher one 1.1041 +void MoveLang1ToLang2(Language lang1, Language lang2, 1.1042 + int lang1_sub, int lang2_sub, 1.1043 + DocTote* doc_tote, 1.1044 + ResultChunkVector* resultchunkvector) { 1.1045 + // In doc_tote, move all the bytes lang1 => lang2 1.1046 + int sum = doc_tote->Value(lang2_sub) + doc_tote->Value(lang1_sub); 1.1047 + doc_tote->SetValue(lang2_sub, sum); 1.1048 + sum = doc_tote->Score(lang2_sub) + doc_tote->Score(lang1_sub); 1.1049 + doc_tote->SetScore(lang2_sub, sum); 1.1050 + sum = doc_tote->Reliability(lang2_sub) + doc_tote->Reliability(lang1_sub); 1.1051 + doc_tote->SetReliability(lang2_sub, sum); 1.1052 + 1.1053 + // Delete old entry 1.1054 + doc_tote->SetKey(lang1_sub, DocTote::kUnusedKey); 1.1055 + doc_tote->SetScore(lang1_sub, 0); 1.1056 + doc_tote->SetReliability(lang1_sub, 0); 1.1057 + 1.1058 + // In resultchunkvector, move all the bytes lang1 => lang2 1.1059 + if (resultchunkvector == NULL) {return;} 1.1060 + 1.1061 + int k = 0; 1.1062 + uint16 prior_lang = UNKNOWN_LANGUAGE; 1.1063 + for (int i = 0; i < static_cast<int>(resultchunkvector->size()); ++i) { 1.1064 + ResultChunk* rc = &(*resultchunkvector)[i]; 1.1065 + if (rc->lang1 == lang1) { 1.1066 + // Update entry[i] lang1 => lang2 1.1067 + rc->lang1 = lang2; 1.1068 + } 1.1069 + // One 
change may produce two merges -- entry before and entry after 1.1070 + if ((rc->lang1 == prior_lang) && (k > 0)) { 1.1071 + // Merge with previous, deleting entry[i] 1.1072 + ResultChunk* prior_rc = &(*resultchunkvector)[k - 1]; 1.1073 + prior_rc->bytes += rc->bytes; 1.1074 + // fprintf(stderr, "MoveLang1ToLang2 merged [%d] => [%d]<br>\n", i, k-1); 1.1075 + } else { 1.1076 + // Keep entry[i] 1.1077 + (*resultchunkvector)[k] = (*resultchunkvector)[i]; 1.1078 + // fprintf(stderr, "MoveLang1ToLang2 keep [%d] => [%d]<br>\n", i, k); 1.1079 + ++k; 1.1080 + } 1.1081 + prior_lang = rc->lang1; 1.1082 + } 1.1083 + resultchunkvector->resize(k); 1.1084 +} 1.1085 + 1.1086 + 1.1087 + 1.1088 +// Move less likely byte count to more likely for close pairs of languages 1.1089 +// If given, also update resultchunkvector 1.1090 +void RefineScoredClosePairs(DocTote* doc_tote, 1.1091 + ResultChunkVector* resultchunkvector, 1.1092 + bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) { 1.1093 + for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) { 1.1094 + int close_packedlang = doc_tote->Key(sub); 1.1095 + int subscr = LanguageCloseSet(static_cast<Language>(close_packedlang)); 1.1096 + if (subscr == 0) {continue;} 1.1097 + 1.1098 + // We have a close pair language -- if the other one is also scored and the 1.1099 + // longword score differs enough, put all our eggs into one basket 1.1100 + 1.1101 + // Nonzero longword score: Go look for the other of this pair 1.1102 + for (int sub2 = sub + 1; sub2 < doc_tote->MaxSize(); ++sub2) { 1.1103 + if (LanguageCloseSet(static_cast<Language>(doc_tote->Key(sub2))) == subscr) { 1.1104 + // We have a matching pair 1.1105 + int close_packedlang2 = doc_tote->Key(sub2); 1.1106 + 1.1107 + // Move all the text bytes from lower byte-count to higher one 1.1108 + int from_sub, to_sub; 1.1109 + Language from_lang, to_lang; 1.1110 + if (doc_tote->Value(sub) < doc_tote->Value(sub2)) { 1.1111 + from_sub = sub; 1.1112 + to_sub = sub2; 1.1113 + from_lang = 
static_cast<Language>(close_packedlang); 1.1114 + to_lang = static_cast<Language>(close_packedlang2); 1.1115 + } else { 1.1116 + from_sub = sub2; 1.1117 + to_sub = sub; 1.1118 + from_lang = static_cast<Language>(close_packedlang2); 1.1119 + to_lang = static_cast<Language>(close_packedlang); 1.1120 + } 1.1121 + 1.1122 + if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) { 1.1123 + // Show fate of closepair language 1.1124 + int val = doc_tote->Value(from_sub); // byte count 1.1125 + int reli = doc_tote->Reliability(from_sub); 1.1126 + int reliable_percent = reli / (val ? val : 1); // avoid zdiv 1.1127 + fprintf(stderr, "{CloseLangPair: %s.%dR,%dB => %s}<br>\n", 1.1128 + LanguageCode(from_lang), 1.1129 + reliable_percent, 1.1130 + doc_tote->Value(from_sub), 1.1131 + LanguageCode(to_lang)); 1.1132 + } 1.1133 + MoveLang1ToLang2(from_lang, to_lang, from_sub, to_sub, 1.1134 + doc_tote, resultchunkvector); 1.1135 + break; // Exit inner for sub2 loop 1.1136 + } 1.1137 + } // End for sub2 1.1138 + } // End for sub 1.1139 +} 1.1140 + 1.1141 + 1.1142 +void ApplyAllLanguageHints(Tote* chunk_tote, int tote_grams, 1.1143 + uint8* lang_hint_boost) { 1.1144 +} 1.1145 + 1.1146 + 1.1147 +void PrintHtmlEscapedText(FILE* f, const char* txt, int len) { 1.1148 + string temp(txt, len); 1.1149 + fprintf(f, "%s", GetHtmlEscapedText(temp).c_str()); 1.1150 +} 1.1151 + 1.1152 +void PrintLang(FILE* f, Tote* chunk_tote, 1.1153 + Language cur_lang, bool cur_unreliable, 1.1154 + Language prior_lang, bool prior_unreliable) { 1.1155 + if (cur_lang == prior_lang) { 1.1156 + fprintf(f, "[]"); 1.1157 + } else { 1.1158 + fprintf(f, "[%s%s]", LanguageCode(cur_lang), cur_unreliable ? 
"*" : ""); 1.1159 + } 1.1160 +} 1.1161 + 1.1162 + 1.1163 +void PrintTopLang(Language top_lang) { 1.1164 + if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) { 1.1165 + fprintf(stderr, "[] "); 1.1166 + } else { 1.1167 + fprintf(stderr, "[%s] ", LanguageName(top_lang)); 1.1168 + prior_lang = top_lang; 1.1169 + } 1.1170 +} 1.1171 + 1.1172 +void PrintTopLangSpeculative(Language top_lang) { 1.1173 + fprintf(stderr, "<span style=\"color:#%06X;\">", 0xa0a0a0); 1.1174 + if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) { 1.1175 + fprintf(stderr, "[] "); 1.1176 + } else { 1.1177 + fprintf(stderr, "[%s] ", LanguageName(top_lang)); 1.1178 + prior_lang = top_lang; 1.1179 + } 1.1180 + fprintf(stderr, "</span>\n"); 1.1181 +} 1.1182 + 1.1183 +void PrintLangs(FILE* f, const Language* language3, const int* percent3, 1.1184 + const int* text_bytes, const bool* is_reliable) { 1.1185 + fprintf(f, "<br> Initial_Languages "); 1.1186 + if (language3[0] != UNKNOWN_LANGUAGE) { 1.1187 + fprintf(f, "%s%s(%d%%) ", 1.1188 + LanguageName(language3[0]), 1.1189 + *is_reliable ? 
"" : "*", 1.1190 + percent3[0]); 1.1191 + } 1.1192 + if (language3[1] != UNKNOWN_LANGUAGE) { 1.1193 + fprintf(f, "%s(%d%%) ", LanguageName(language3[1]), percent3[1]); 1.1194 + } 1.1195 + if (language3[2] != UNKNOWN_LANGUAGE) { 1.1196 + fprintf(f, "%s(%d%%) ", LanguageName(language3[2]), percent3[2]); 1.1197 + } 1.1198 + fprintf(f, "%d bytes \n", *text_bytes); 1.1199 + 1.1200 + fprintf(f, "<br>\n"); 1.1201 +} 1.1202 + 1.1203 + 1.1204 +// Return internal probability score (sum) per 1024 bytes 1.1205 +double GetNormalizedScore(Language lang, ULScript ulscript, 1.1206 + int bytecount, int score) { 1.1207 + if (bytecount <= 0) {return 0.0;} 1.1208 + return (score << 10) / bytecount; 1.1209 +} 1.1210 + 1.1211 +// Extract return values before fixups 1.1212 +void ExtractLangEtc(DocTote* doc_tote, int total_text_bytes, 1.1213 + int* reliable_percent3, Language* language3, int* percent3, 1.1214 + double* normalized_score3, 1.1215 + int* text_bytes, bool* is_reliable) { 1.1216 + reliable_percent3[0] = 0; 1.1217 + reliable_percent3[1] = 0; 1.1218 + reliable_percent3[2] = 0; 1.1219 + language3[0] = UNKNOWN_LANGUAGE; 1.1220 + language3[1] = UNKNOWN_LANGUAGE; 1.1221 + language3[2] = UNKNOWN_LANGUAGE; 1.1222 + percent3[0] = 0; 1.1223 + percent3[1] = 0; 1.1224 + percent3[2] = 0; 1.1225 + normalized_score3[0] = 0.0; 1.1226 + normalized_score3[1] = 0.0; 1.1227 + normalized_score3[2] = 0.0; 1.1228 + 1.1229 + *text_bytes = total_text_bytes; 1.1230 + *is_reliable = false; 1.1231 + 1.1232 + int bytecount1 = 0; 1.1233 + int bytecount2 = 0; 1.1234 + int bytecount3 = 0; 1.1235 + 1.1236 + int lang1 = doc_tote->Key(0); 1.1237 + if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) { 1.1238 + // We have a top language 1.1239 + language3[0] = static_cast<Language>(lang1); 1.1240 + bytecount1 = doc_tote->Value(0); 1.1241 + int reli1 = doc_tote->Reliability(0); 1.1242 + reliable_percent3[0] = reli1 / (bytecount1 ? 
bytecount1 : 1); // avoid zdiv 1.1243 + normalized_score3[0] = GetNormalizedScore(language3[0], 1.1244 + ULScript_Common, 1.1245 + bytecount1, 1.1246 + doc_tote->Score(0)); 1.1247 + } 1.1248 + 1.1249 + int lang2 = doc_tote->Key(1); 1.1250 + if ((lang2 != DocTote::kUnusedKey) && (lang2 != UNKNOWN_LANGUAGE)) { 1.1251 + language3[1] = static_cast<Language>(lang2); 1.1252 + bytecount2 = doc_tote->Value(1); 1.1253 + int reli2 = doc_tote->Reliability(1); 1.1254 + reliable_percent3[1] = reli2 / (bytecount2 ? bytecount2 : 1); // avoid zdiv 1.1255 + normalized_score3[1] = GetNormalizedScore(language3[1], 1.1256 + ULScript_Common, 1.1257 + bytecount2, 1.1258 + doc_tote->Score(1)); 1.1259 + } 1.1260 + 1.1261 + int lang3 = doc_tote->Key(2); 1.1262 + if ((lang3 != DocTote::kUnusedKey) && (lang3 != UNKNOWN_LANGUAGE)) { 1.1263 + language3[2] = static_cast<Language>(lang3); 1.1264 + bytecount3 = doc_tote->Value(2); 1.1265 + int reli3 = doc_tote->Reliability(2); 1.1266 + reliable_percent3[2] = reli3 / (bytecount3 ? 
bytecount3 : 1); // avoid zdiv 1.1267 + normalized_score3[2] = GetNormalizedScore(language3[2], 1.1268 + ULScript_Common, 1.1269 + bytecount3, 1.1270 + doc_tote->Score(2)); 1.1271 + } 1.1272 + 1.1273 + // Increase total bytes to sum (top 3) if low for some reason 1.1274 + int total_bytecount12 = bytecount1 + bytecount2; 1.1275 + int total_bytecount123 = total_bytecount12 + bytecount3; 1.1276 + if (total_text_bytes < total_bytecount123) { 1.1277 + total_text_bytes = total_bytecount123; 1.1278 + *text_bytes = total_text_bytes; 1.1279 + } 1.1280 + 1.1281 + // Sum minus previous % gives better roundoff behavior than bytecount/total 1.1282 + int total_text_bytes_div = maxint(1, total_text_bytes); // Avoid zdiv 1.1283 + percent3[0] = (bytecount1 * 100) / total_text_bytes_div; 1.1284 + percent3[1] = (total_bytecount12 * 100) / total_text_bytes_div; 1.1285 + percent3[2] = (total_bytecount123 * 100) / total_text_bytes_div; 1.1286 + percent3[2] -= percent3[1]; 1.1287 + percent3[1] -= percent3[0]; 1.1288 + 1.1289 + // Roundoff, say 96% 1.6% 1.4%, will produce non-obvious 96% 1% 2% 1.1290 + // Fix this explicitly 1.1291 + if (percent3[1] < percent3[2]) { 1.1292 + ++percent3[1]; 1.1293 + --percent3[2]; 1.1294 + } 1.1295 + if (percent3[0] < percent3[1]) { 1.1296 + ++percent3[0]; 1.1297 + --percent3[1]; 1.1298 + } 1.1299 + 1.1300 + *text_bytes = total_text_bytes; 1.1301 + 1.1302 + if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) { 1.1303 + // We have a top language 1.1304 + // Its reliability is overall result reliability 1.1305 + int bytecount = doc_tote->Value(0); 1.1306 + int reli = doc_tote->Reliability(0); 1.1307 + int reliable_percent = reli / (bytecount ? bytecount : 1); // avoid zdiv 1.1308 + *is_reliable = (reliable_percent >= kMinReliableKeepPercent); 1.1309 + } else { 1.1310 + // No top language at all. This can happen with zero text or 100% Klingon 1.1311 + // if extended=false. Just return all UNKNOWN_LANGUAGE, unreliable. 
1.1312 + *is_reliable = false; 1.1313 + } 1.1314 + 1.1315 + // If ignore percent is too large, set unreliable. 1.1316 + int ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]); 1.1317 + if ((ignore_percent > kIgnoreMaxPercent)) { 1.1318 + *is_reliable = false; 1.1319 + } 1.1320 +} 1.1321 + 1.1322 +bool IsFIGS(Language lang) { 1.1323 + if (lang == FRENCH) {return true;} 1.1324 + if (lang == ITALIAN) {return true;} 1.1325 + if (lang == GERMAN) {return true;} 1.1326 + if (lang == SPANISH) {return true;} 1.1327 + return false; 1.1328 +} 1.1329 + 1.1330 +bool IsEFIGS(Language lang) { 1.1331 + if (lang == ENGLISH) {return true;} 1.1332 + if (lang == FRENCH) {return true;} 1.1333 + if (lang == ITALIAN) {return true;} 1.1334 + if (lang == GERMAN) {return true;} 1.1335 + if (lang == SPANISH) {return true;} 1.1336 + return false; 1.1337 +} 1.1338 + 1.1339 +// For Tier3 languages, require more bytes of text to override 1.1340 +// the first-place language 1.1341 +static const int kGoodSecondT1T2MinBytes = 15; // <this => no second 1.1342 +static const int kGoodSecondT3MinBytes = 128; // <this => no second 1.1343 + 1.1344 +// Calculate a single summary language for the document, and its reliability. 1.1345 +// Returns language3[0] or language3[1] or ENGLISH or UNKNOWN_LANGUAGE 1.1346 +// This is the heart of matching human-rater perception. 
1.1347 +// reliable_percent3[] is currently unused 1.1348 +// 1.1349 +// Do not return Tier3 second language unless there are at least 128 bytes 1.1350 +void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes, 1.1351 + const int* reliable_percent3, 1.1352 + const Language* language3, 1.1353 + const int* percent3, 1.1354 + Language* summary_lang, bool* is_reliable, 1.1355 + bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) { 1.1356 + // Vector of active languages; changes if we delete some 1.1357 + int slot_count = 3; 1.1358 + int active_slot[3] = {0, 1, 2}; 1.1359 + 1.1360 + int ignore_percent = 0; 1.1361 + int return_percent = percent3[0]; // Default to top lang 1.1362 + *summary_lang = language3[0]; 1.1363 + *is_reliable = true; 1.1364 + if (percent3[0] < kKeepMinPercent) {*is_reliable = false;} 1.1365 + 1.1366 + // If any of top 3 is IGNORE, remove it and increment ignore_percent 1.1367 + for (int i = 0; i < 3; ++i) { 1.1368 + if (language3[i] == TG_UNKNOWN_LANGUAGE) { 1.1369 + ignore_percent += percent3[i]; 1.1370 + // Move the rest up, levaing input vectors unchanged 1.1371 + for (int j=i+1; j < 3; ++j) { 1.1372 + active_slot[j - 1] = active_slot[j]; 1.1373 + } 1.1374 + -- slot_count; 1.1375 + // Logically remove Ignore from percentage-text calculation 1.1376 + // (extra 1 in 101 avoids zdiv, biases slightly small) 1.1377 + return_percent = (percent3[0] * 100) / (101 - ignore_percent); 1.1378 + *summary_lang = language3[active_slot[0]]; 1.1379 + if (percent3[active_slot[0]] < kKeepMinPercent) {*is_reliable = false;} 1.1380 + } 1.1381 + } 1.1382 + 1.1383 + 1.1384 + // If English and X, where X (not UNK) is big enough, 1.1385 + // assume the English is boilerplate and return X. 
1.1386 + // Logically remove English from percentage-text calculation 1.1387 + int second_bytes = (total_text_bytes * percent3[active_slot[1]]) / 100; 1.1388 + // Require more bytes of text for Tier3 languages 1.1389 + int minbytesneeded = kGoodSecondT1T2MinBytes; 1.1390 + int plang_second = PerScriptNumber(ULScript_Latin, language3[active_slot[1]]); 1.1391 + 1.1392 + if ((language3[active_slot[0]] == ENGLISH) && 1.1393 + (language3[active_slot[1]] != ENGLISH) && 1.1394 + (language3[active_slot[1]] != UNKNOWN_LANGUAGE) && 1.1395 + (percent3[active_slot[1]] >= kNonEnBoilerplateMinPercent) && 1.1396 + (second_bytes >= minbytesneeded)) { 1.1397 + ignore_percent += percent3[active_slot[0]]; 1.1398 + return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent); 1.1399 + *summary_lang = language3[active_slot[1]]; 1.1400 + if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;} 1.1401 + 1.1402 + // Else If FIGS and X, where X (not UNK, EFIGS) is big enough, 1.1403 + // assume the FIGS is boilerplate and return X. 
1.1404 + // Logically remove FIGS from percentage-text calculation 1.1405 + } else if (IsFIGS(language3[active_slot[0]]) && 1.1406 + !IsEFIGS(language3[active_slot[1]]) && 1.1407 + (language3[active_slot[1]] != UNKNOWN_LANGUAGE) && 1.1408 + (percent3[active_slot[1]] >= kNonFIGSBoilerplateMinPercent) && 1.1409 + (second_bytes >= minbytesneeded)) { 1.1410 + ignore_percent += percent3[active_slot[0]]; 1.1411 + return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent); 1.1412 + *summary_lang = language3[active_slot[1]]; 1.1413 + if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;} 1.1414 + 1.1415 + // Else we are returning the first language, but want to improve its 1.1416 + // return_percent if the second language should be ignored 1.1417 + } else if ((language3[active_slot[1]] == ENGLISH) && 1.1418 + (language3[active_slot[0]] != ENGLISH)) { 1.1419 + ignore_percent += percent3[active_slot[1]]; 1.1420 + return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent); 1.1421 + } else if (IsFIGS(language3[active_slot[1]]) && 1.1422 + !IsEFIGS(language3[active_slot[0]])) { 1.1423 + ignore_percent += percent3[active_slot[1]]; 1.1424 + return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent); 1.1425 + } 1.1426 + 1.1427 + // If return percent is too small (too many languages), return UNKNOWN 1.1428 + if ((return_percent < kGoodFirstMinPercent)) { 1.1429 + if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { 1.1430 + fprintf(stderr, "{Unreli %s %d%% percent too small} ", 1.1431 + LanguageCode(*summary_lang), return_percent); 1.1432 + } 1.1433 + *summary_lang = UNKNOWN_LANGUAGE; 1.1434 + *is_reliable = false; 1.1435 + } 1.1436 + 1.1437 + // If return percent is small, return language but set unreliable. 1.1438 + if ((return_percent < kGoodFirstReliableMinPercent)) { 1.1439 + *is_reliable = false; 1.1440 + } 1.1441 + 1.1442 + // If ignore percent is too large, set unreliable. 
1.1443 + ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]); 1.1444 + if ((ignore_percent > kIgnoreMaxPercent)) { 1.1445 + *is_reliable = false; 1.1446 + } 1.1447 + 1.1448 + // If we removed all the active languages, return UNKNOWN 1.1449 + if (slot_count == 0) { 1.1450 + if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { 1.1451 + fprintf(stderr, "{Unreli %s no languages left} ", 1.1452 + LanguageCode(*summary_lang)); 1.1453 + } 1.1454 + *summary_lang = UNKNOWN_LANGUAGE; 1.1455 + *is_reliable = false; 1.1456 + } 1.1457 +} 1.1458 + 1.1459 +void AddLangPriorBoost(Language lang, uint32 langprob, 1.1460 + ScoringContext* scoringcontext) { 1.1461 + // This is called 0..n times with language hints 1.1462 + // but we don't know the script -- so boost either or both Latn, Othr. 1.1463 + 1.1464 + if (IsLatnLanguage(lang)) { 1.1465 + LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn; 1.1466 + int n = langprior_boost->n; 1.1467 + langprior_boost->langprob[n] = langprob; 1.1468 + langprior_boost->n = langprior_boost->wrap(n + 1); 1.1469 + } 1.1470 + 1.1471 + if (IsOthrLanguage(lang)) { 1.1472 + LangBoosts* langprior_boost = &scoringcontext->langprior_boost.othr; 1.1473 + int n = langprior_boost->n; 1.1474 + langprior_boost->langprob[n] = langprob; 1.1475 + langprior_boost->n = langprior_boost->wrap(n + 1); 1.1476 + } 1.1477 + 1.1478 +} 1.1479 + 1.1480 +void AddOneWhack(Language whacker_lang, Language whackee_lang, 1.1481 + ScoringContext* scoringcontext) { 1.1482 + uint32 langprob = MakeLangProb(whackee_lang, 1); 1.1483 + // This logic avoids hr-Latn whacking sr-Cyrl, but still whacks sr-Latn 1.1484 + if (IsLatnLanguage(whacker_lang) && IsLatnLanguage(whackee_lang)) { 1.1485 + LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn; 1.1486 + int n = langprior_whack->n; 1.1487 + langprior_whack->langprob[n] = langprob; 1.1488 + langprior_whack->n = langprior_whack->wrap(n + 1); 1.1489 + } 1.1490 + if (IsOthrLanguage(whacker_lang) && 
IsOthrLanguage(whackee_lang)) { 1.1491 + LangBoosts* langprior_whack = &scoringcontext->langprior_whack.othr; 1.1492 + int n = langprior_whack->n; 1.1493 + langprior_whack->langprob[n] = langprob; 1.1494 + langprior_whack->n = langprior_whack->wrap(n + 1); 1.1495 + } 1.1496 +} 1.1497 + 1.1498 +void AddCloseLangWhack(Language lang, ScoringContext* scoringcontext) { 1.1499 + // We do not in general want zh-Hans and zh-Hant to be close pairs, 1.1500 + // but we do here. 1.1501 + if (lang == CLD2::CHINESE) { 1.1502 + AddOneWhack(lang, CLD2::CHINESE_T, scoringcontext); 1.1503 + return; 1.1504 + } 1.1505 + if (lang == CLD2::CHINESE_T) { 1.1506 + AddOneWhack(lang, CLD2::CHINESE, scoringcontext); 1.1507 + return; 1.1508 + } 1.1509 + 1.1510 + int base_lang_set = LanguageCloseSet(lang); 1.1511 + if (base_lang_set == 0) {return;} 1.1512 + // TODO: add an explicit list of each set to avoid this 512-times loop 1.1513 + for (int i = 0; i < kLanguageToPLangSize; ++i) { 1.1514 + Language lang2 = static_cast<Language>(i); 1.1515 + if ((base_lang_set == LanguageCloseSet(lang2)) && (lang != lang2)) { 1.1516 + AddOneWhack(lang, lang2, scoringcontext); 1.1517 + } 1.1518 + } 1.1519 +} 1.1520 + 1.1521 + 1.1522 +void ApplyHints(const char* buffer, 1.1523 + int buffer_length, 1.1524 + bool is_plain_text, 1.1525 + const CLDHints* cld_hints, 1.1526 + ScoringContext* scoringcontext) { 1.1527 + CLDLangPriors lang_priors; 1.1528 + InitCLDLangPriors(&lang_priors); 1.1529 + 1.1530 + // We now use lang= tags. 1.1531 + // Last look, circa 2008 found only 15% of web pages with lang= tags and 1.1532 + // many of those were wrong. Now (July 2011), we find 44% of web pages have 1.1533 + // lang= tags, and most of them are correct. So we now give them substantial 1.1534 + // weight in each chunk scored. 
1.1535 + if (!is_plain_text) { 1.1536 + // Get any contained language tags in first n KB 1.1537 + int32 max_scan_bytes = FLAGS_cld_max_lang_tag_scan_kb << 10; 1.1538 + string lang_tags = GetLangTagsFromHtml(buffer, buffer_length, 1.1539 + max_scan_bytes); 1.1540 + SetCLDLangTagsHint(lang_tags, &lang_priors); 1.1541 + if (scoringcontext->flags_cld2_html) { 1.1542 + if (!lang_tags.empty()) { 1.1543 + fprintf(scoringcontext->debug_file, "<br>lang_tags '%s'<br>\n", 1.1544 + lang_tags.c_str()); 1.1545 + } 1.1546 + } 1.1547 + } 1.1548 + 1.1549 + if (cld_hints != NULL) { 1.1550 + if ((cld_hints->content_language_hint != NULL) && 1.1551 + (cld_hints->content_language_hint[0] != '\0')) { 1.1552 + SetCLDContentLangHint(cld_hints->content_language_hint, &lang_priors); 1.1553 + } 1.1554 + 1.1555 + // Input is from GetTLD(), already lowercased 1.1556 + if ((cld_hints->tld_hint != NULL) && (cld_hints->tld_hint[0] != '\0')) { 1.1557 + SetCLDTLDHint(cld_hints->tld_hint, &lang_priors); 1.1558 + } 1.1559 + 1.1560 + if (cld_hints->encoding_hint != UNKNOWN_ENCODING) { 1.1561 + Encoding enc = static_cast<Encoding>(cld_hints->encoding_hint); 1.1562 + SetCLDEncodingHint(enc, &lang_priors); 1.1563 + } 1.1564 + 1.1565 + if (cld_hints->language_hint != UNKNOWN_LANGUAGE) { 1.1566 + SetCLDLanguageHint(cld_hints->language_hint, &lang_priors); 1.1567 + } 1.1568 + } 1.1569 + 1.1570 + // Keep no more than four different languages with hints 1.1571 + TrimCLDLangPriors(4, &lang_priors); 1.1572 + 1.1573 + if (scoringcontext->flags_cld2_html) { 1.1574 + string print_temp = DumpCLDLangPriors(&lang_priors); 1.1575 + if (!print_temp.empty()) { 1.1576 + fprintf(scoringcontext->debug_file, "DumpCLDLangPriors %s<br>\n", 1.1577 + print_temp.c_str()); 1.1578 + } 1.1579 + } 1.1580 + 1.1581 + // Put boosts into ScoringContext 1.1582 + for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) { 1.1583 + Language lang = GetCLDPriorLang(lang_priors.prior[i]); 1.1584 + int qprob = 
GetCLDPriorWeight(lang_priors.prior[i]); 1.1585 + if (qprob > 0) { 1.1586 + uint32 langprob = MakeLangProb(lang, qprob); 1.1587 + AddLangPriorBoost(lang, langprob, scoringcontext); 1.1588 + } 1.1589 + } 1.1590 + 1.1591 + // Put whacks into scoring context 1.1592 + // We do not in general want zh-Hans and zh-Hant to be close pairs, 1.1593 + // but we do here. Use close_set_count[kCloseSetSize] to count zh, zh-Hant 1.1594 + std::vector<int> close_set_count(kCloseSetSize + 1, 0); 1.1595 + 1.1596 + for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) { 1.1597 + Language lang = GetCLDPriorLang(lang_priors.prior[i]); 1.1598 + ++close_set_count[LanguageCloseSet(lang)]; 1.1599 + if (lang == CLD2::CHINESE) {++close_set_count[kCloseSetSize];} 1.1600 + if (lang == CLD2::CHINESE_T) {++close_set_count[kCloseSetSize];} 1.1601 + } 1.1602 + 1.1603 + // If a boost language is in a close set, force suppressing the others in 1.1604 + // that set, if exactly one of the set is present 1.1605 + for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) { 1.1606 + Language lang = GetCLDPriorLang(lang_priors.prior[i]); 1.1607 + int qprob = GetCLDPriorWeight(lang_priors.prior[i]); 1.1608 + if (qprob > 0) { 1.1609 + int close_set = LanguageCloseSet(lang); 1.1610 + if ((close_set > 0) && (close_set_count[close_set] == 1)) { 1.1611 + AddCloseLangWhack(lang, scoringcontext); 1.1612 + } 1.1613 + if (((lang == CLD2::CHINESE) || (lang == CLD2::CHINESE_T)) && 1.1614 + (close_set_count[kCloseSetSize] == 1)) { 1.1615 + AddCloseLangWhack(lang, scoringcontext); 1.1616 + } 1.1617 + } 1.1618 + } 1.1619 + 1.1620 + 1.1621 + 1.1622 + 1.1623 + 1.1624 + 1.1625 +} 1.1626 + 1.1627 + 1.1628 + 1.1629 +// Results language3/percent3/text_bytes must be exactly three items 1.1630 +Language DetectLanguageSummaryV2( 1.1631 + const char* buffer, 1.1632 + int buffer_length, 1.1633 + bool is_plain_text, 1.1634 + const CLDHints* cld_hints, 1.1635 + bool allow_extended_lang, 1.1636 + int flags, 1.1637 + Language 
plus_one, 1.1638 + Language* language3, 1.1639 + int* percent3, 1.1640 + double* normalized_score3, 1.1641 + ResultChunkVector* resultchunkvector, 1.1642 + int* text_bytes, 1.1643 + bool* is_reliable) { 1.1644 + language3[0] = UNKNOWN_LANGUAGE; 1.1645 + language3[1] = UNKNOWN_LANGUAGE; 1.1646 + language3[2] = UNKNOWN_LANGUAGE; 1.1647 + percent3[0] = 0; 1.1648 + percent3[1] = 0; 1.1649 + percent3[2] = 0; 1.1650 + normalized_score3[0] = 0.0; 1.1651 + normalized_score3[1] = 0.0; 1.1652 + normalized_score3[2] = 0.0; 1.1653 + if (resultchunkvector != NULL) { 1.1654 + resultchunkvector->clear(); 1.1655 + } 1.1656 + *text_bytes = 0; 1.1657 + *is_reliable = false; 1.1658 + 1.1659 + if ((flags & kCLDFlagEcho) != 0) { 1.1660 + string temp(buffer, buffer_length); 1.1661 + if ((flags & kCLDFlagHtml) != 0) { 1.1662 + fprintf(stderr, "CLD2[%d] '%s'<br>\n", 1.1663 + buffer_length, GetHtmlEscapedText(temp).c_str()); 1.1664 + } else { 1.1665 + fprintf(stderr, "CLD2[%d] '%s'\n", 1.1666 + buffer_length, GetPlainEscapedText(temp).c_str()); 1.1667 + } 1.1668 + } 1.1669 + 1.1670 +#ifdef CLD2_DYNAMIC_MODE 1.1671 + // In dynamic mode, we immediately return UNKNOWN_LANGUAGE if the data file 1.1672 + // hasn't been loaded yet. This is the only sane thing we can do, as there 1.1673 + // are no scoring tables to consult. 1.1674 + bool dataLoaded = isDataLoaded(); 1.1675 + if ((flags & kCLDFlagVerbose) != 0) { 1.1676 + fprintf(stderr, "Data loaded: %s\n", (dataLoaded ? 
"true" : "false")); 1.1677 + } 1.1678 + if (!dataLoaded) { 1.1679 + return UNKNOWN_LANGUAGE; 1.1680 + } 1.1681 +#endif 1.1682 + 1.1683 + // Exit now if no text 1.1684 + if (buffer_length == 0) {return UNKNOWN_LANGUAGE;} 1.1685 + if (kScoringtables.quadgram_obj == NULL) {return UNKNOWN_LANGUAGE;} 1.1686 + 1.1687 + // Document totals 1.1688 + DocTote doc_tote; // Reliability = 0..100 1.1689 + 1.1690 + // ScoringContext carries state across scriptspans 1.1691 + ScoringContext scoringcontext; 1.1692 + scoringcontext.debug_file = stderr; 1.1693 + scoringcontext.flags_cld2_score_as_quads = 1.1694 + ((flags & kCLDFlagScoreAsQuads) != 0); 1.1695 + scoringcontext.flags_cld2_html = ((flags & kCLDFlagHtml) != 0); 1.1696 + scoringcontext.flags_cld2_cr = ((flags & kCLDFlagCr) != 0); 1.1697 + scoringcontext.flags_cld2_verbose = ((flags & kCLDFlagVerbose) != 0); 1.1698 + scoringcontext.prior_chunk_lang = UNKNOWN_LANGUAGE; 1.1699 + scoringcontext.ulscript = ULScript_Common; 1.1700 + scoringcontext.scoringtables = &kScoringtables; 1.1701 + scoringcontext.scanner = NULL; 1.1702 + scoringcontext.init(); // Clear the internal memory arrays 1.1703 + 1.1704 + // Now thread safe. 
1.1705 + bool FLAGS_cld2_html = ((flags & kCLDFlagHtml) != 0); 1.1706 + bool FLAGS_cld2_quiet = ((flags & kCLDFlagQuiet) != 0); 1.1707 + 1.1708 + ApplyHints(buffer, buffer_length, is_plain_text, cld_hints, &scoringcontext); 1.1709 + 1.1710 + // Four individual script totals, Latin, Han, other2, other3 1.1711 + int next_other_tote = 2; 1.1712 + int tote_num = 0; 1.1713 + 1.1714 + // Four totes for up to four different scripts pending at once 1.1715 + Tote totes[4]; // [0] Latn [1] Hani [2] other [3] other 1.1716 + bool tote_seen[4] = {false, false, false, false}; 1.1717 + int tote_grams[4] = {0, 0, 0, 0}; // Number in partial chunk 1.1718 + ULScript tote_script[4] = 1.1719 + {ULScript_Latin, ULScript_Hani, ULScript_Common, ULScript_Common}; 1.1720 + 1.1721 + // Loop through text spans in a single script 1.1722 + ScriptScanner ss(buffer, buffer_length, is_plain_text); 1.1723 + LangSpan scriptspan; 1.1724 + 1.1725 + scoringcontext.scanner = &ss; 1.1726 + 1.1727 + scriptspan.text = NULL; 1.1728 + scriptspan.text_bytes = 0; 1.1729 + scriptspan.offset = 0; 1.1730 + scriptspan.ulscript = ULScript_Common; 1.1731 + scriptspan.lang = UNKNOWN_LANGUAGE; 1.1732 + 1.1733 + int total_text_bytes = 0; 1.1734 + int textlimit = FLAGS_cld_textlimit << 10; // in KB 1.1735 + if (textlimit == 0) {textlimit = 0x7fffffff;} 1.1736 + 1.1737 + int advance_by = 2; // Advance 2 bytes 1.1738 + int advance_limit = textlimit >> 3; // For first 1/8 of max document 1.1739 + 1.1740 + int initial_word_span = kDefaultWordSpan; 1.1741 + if (FLAGS_cld_forcewords) { 1.1742 + initial_word_span = kReallyBigWordSpan; 1.1743 + } 1.1744 + 1.1745 + // Pick up chunk sizes 1.1746 + // Smoothwidth is units of quadgrams, about 2.5 chars (unigrams) each 1.1747 + // Sanity check -- force into a reasonable range 1.1748 + int chunksizequads = FLAGS_cld_smoothwidth; 1.1749 + chunksizequads = minint(maxint(chunksizequads, kMinChunkSizeQuads), 1.1750 + kMaxChunkSizeQuads); 1.1751 + int chunksizeunis = (chunksizequads * 5) 
>> 1; 1.1752 + 1.1753 + // Varying short-span limit doesn't work well -- skips too much beyond 20KB 1.1754 + // int spantooshortlimit = advance_by * FLAGS_cld_smoothwidth; 1.1755 + int spantooshortlimit = kShortSpanThresh; 1.1756 + 1.1757 + // For debugging only. Not thread-safe 1.1758 + prior_lang = UNKNOWN_LANGUAGE; 1.1759 + prior_unreliable = false; 1.1760 + 1.1761 + // Allocate full-document prediction table for finding repeating words 1.1762 + int hash = 0; 1.1763 + int* predict_tbl = new int[kPredictionTableSize]; 1.1764 + if (FlagRepeats(flags)) { 1.1765 + memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); 1.1766 + } 1.1767 + 1.1768 + 1.1769 + 1.1770 + // Loop through scriptspans accumulating number of text bytes in each language 1.1771 + while (ss.GetOneScriptSpanLower(&scriptspan)) { 1.1772 + ULScript ulscript = scriptspan.ulscript; 1.1773 + 1.1774 + // Squeeze out big chunks of text span if asked to 1.1775 + if (FlagSqueeze(flags)) { 1.1776 + // Remove repetitive or mostly-spaces chunks 1.1777 + int newlen; 1.1778 + int chunksize = 0; // Use the default 1.1779 + if (resultchunkvector != NULL) { 1.1780 + newlen = CheapSqueezeInplaceOverwrite(scriptspan.text, 1.1781 + scriptspan.text_bytes, 1.1782 + chunksize); 1.1783 + } else { 1.1784 + newlen = CheapSqueezeInplace(scriptspan.text, scriptspan.text_bytes, 1.1785 + chunksize); 1.1786 + } 1.1787 + scriptspan.text_bytes = newlen; 1.1788 + } else { 1.1789 + // Check now and then to see if we should be squeezing 1.1790 + if (((kCheapSqueezeTestThresh >> 1) < scriptspan.text_bytes) && 1.1791 + !FlagFinish(flags)) { 1.1792 + // fprintf(stderr, "CheapSqueezeTriggerTest, " 1.1793 + // "first %d bytes of %d (>%d/2)<br>\n", 1.1794 + // kCheapSqueezeTestLen, 1.1795 + // scriptspan.text_bytes, 1.1796 + // kCheapSqueezeTestThresh); 1.1797 + 1.1798 + if (CheapSqueezeTriggerTest(scriptspan.text, 1.1799 + scriptspan.text_bytes, 1.1800 + kCheapSqueezeTestLen)) { 1.1801 + // Recursive call with big-chunk 
squeezing set 1.1802 + if (FLAGS_cld2_html || FLAGS_dbgscore) { 1.1803 + fprintf(stderr, 1.1804 + "<br>---text_bytes[%d] Recursive(Squeeze)---<br><br>\n", 1.1805 + total_text_bytes); 1.1806 + } 1.1807 + // Deallocate full-document prediction table 1.1808 + delete[] predict_tbl; 1.1809 + 1.1810 + return DetectLanguageSummaryV2( 1.1811 + buffer, 1.1812 + buffer_length, 1.1813 + is_plain_text, 1.1814 + cld_hints, 1.1815 + allow_extended_lang, 1.1816 + flags | kCLDFlagSqueeze, 1.1817 + plus_one, 1.1818 + language3, 1.1819 + percent3, 1.1820 + normalized_score3, 1.1821 + resultchunkvector, 1.1822 + text_bytes, 1.1823 + is_reliable); 1.1824 + } 1.1825 + } 1.1826 + } 1.1827 + 1.1828 + // Remove repetitive words if asked to 1.1829 + if (FlagRepeats(flags)) { 1.1830 + // Remove repetitive words 1.1831 + int newlen; 1.1832 + if (resultchunkvector != NULL) { 1.1833 + newlen = CheapRepWordsInplaceOverwrite(scriptspan.text, 1.1834 + scriptspan.text_bytes, 1.1835 + &hash, predict_tbl); 1.1836 + } else { 1.1837 + newlen = CheapRepWordsInplace(scriptspan.text, scriptspan.text_bytes, 1.1838 + &hash, predict_tbl); 1.1839 + } 1.1840 + scriptspan.text_bytes = newlen; 1.1841 + } 1.1842 + 1.1843 + // Scoring depends on scriptspan buffer ALWAYS having 1.1844 + // leading space and off-the-end space space space NUL, 1.1845 + // DCHECK(scriptspan.text[0] == ' '); 1.1846 + // DCHECK(scriptspan.text[scriptspan.text_bytes + 0] == ' '); 1.1847 + // DCHECK(scriptspan.text[scriptspan.text_bytes + 1] == ' '); 1.1848 + // DCHECK(scriptspan.text[scriptspan.text_bytes + 2] == ' '); 1.1849 + // DCHECK(scriptspan.text[scriptspan.text_bytes + 3] == '\0'); 1.1850 + 1.1851 + // The real scoring 1.1852 + // Accumulate directly into the document total, or accmulate in one of four 1.1853 + // chunk totals. The purpose of the multiple chunk totals is to piece 1.1854 + // together short choppy pieces of text in alternating scripts. 
One total is 1.1855 + // dedicated to Latin text, one to Han text, and the other two are dynamicly 1.1856 + // assigned. 1.1857 + 1.1858 + scoringcontext.ulscript = scriptspan.ulscript; 1.1859 + // FLAGS_cld2_html = scoringcontext.flags_cld2_html; 1.1860 + 1.1861 + ScoreOneScriptSpan(scriptspan, 1.1862 + &scoringcontext, 1.1863 + &doc_tote, 1.1864 + resultchunkvector); 1.1865 + 1.1866 + total_text_bytes += scriptspan.text_bytes; 1.1867 + } // End while (ss.GetOneScriptSpanLower()) 1.1868 + 1.1869 + // Deallocate full-document prediction table 1.1870 + delete[] predict_tbl; 1.1871 + 1.1872 + if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { 1.1873 + // If no forced <cr>, put one in front of dump 1.1874 + if (!scoringcontext.flags_cld2_cr) {fprintf(stderr, "<br>\n");} 1.1875 + doc_tote.Dump(stderr); 1.1876 + } 1.1877 + 1.1878 + 1.1879 + // If extended langauges are disallowed, remove them here 1.1880 + if (!allow_extended_lang) { 1.1881 + RemoveExtendedLanguages(&doc_tote); 1.1882 + } 1.1883 + 1.1884 + // Force close pairs to one or the other 1.1885 + // If given, also update resultchunkvector 1.1886 + RefineScoredClosePairs(&doc_tote, resultchunkvector, 1.1887 + FLAGS_cld2_html, FLAGS_cld2_quiet); 1.1888 + 1.1889 + 1.1890 + // Calculate return results 1.1891 + // Find top three byte counts in tote heap 1.1892 + int reliable_percent3[3]; 1.1893 + 1.1894 + // Cannot use Add, etc. 
after sorting 1.1895 + doc_tote.Sort(3); 1.1896 + 1.1897 + ExtractLangEtc(&doc_tote, total_text_bytes, 1.1898 + reliable_percent3, language3, percent3, normalized_score3, 1.1899 + text_bytes, is_reliable); 1.1900 + 1.1901 + bool have_good_answer = false; 1.1902 + if (FlagFinish(flags)) { 1.1903 + // Force a result 1.1904 + have_good_answer = true; 1.1905 + } else if (total_text_bytes <= kShortTextThresh) { 1.1906 + // Don't recurse on short text -- we already did word scores 1.1907 + have_good_answer = true; 1.1908 + } else if (*is_reliable && 1.1909 + (percent3[0] >= kGoodLang1Percent)) { 1.1910 + have_good_answer = true; 1.1911 + } else if (*is_reliable && 1.1912 + ((percent3[0] + percent3[1]) >= kGoodLang1and2Percent)) { 1.1913 + have_good_answer = true; 1.1914 + } 1.1915 + 1.1916 + 1.1917 + if (have_good_answer) { 1.1918 + // This is the real, non-recursive return 1.1919 + 1.1920 + // Move bytes for unreliable langs to another lang or UNKNOWN 1.1921 + RemoveUnreliableLanguages(&doc_tote, FLAGS_cld2_html, FLAGS_cld2_quiet); 1.1922 + 1.1923 + // Redo the result extraction after the removal above 1.1924 + doc_tote.Sort(3); 1.1925 + ExtractLangEtc(&doc_tote, total_text_bytes, 1.1926 + reliable_percent3, language3, percent3, normalized_score3, 1.1927 + text_bytes, is_reliable); 1.1928 + 1.1929 + 1.1930 + 1.1931 + Language summary_lang; 1.1932 + CalcSummaryLang(&doc_tote, total_text_bytes, 1.1933 + reliable_percent3, language3, percent3, 1.1934 + &summary_lang, is_reliable, 1.1935 + FLAGS_cld2_html, FLAGS_cld2_quiet); 1.1936 + 1.1937 + if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { 1.1938 + for (int i = 0; i < 3; ++i) { 1.1939 + if (language3[i] != UNKNOWN_LANGUAGE) { 1.1940 + fprintf(stderr, "%s.%dR(%d%%) ", 1.1941 + LanguageCode(language3[i]), 1.1942 + reliable_percent3[i], 1.1943 + percent3[i]); 1.1944 + } 1.1945 + } 1.1946 + 1.1947 + fprintf(stderr, "%d bytes ", total_text_bytes); 1.1948 + fprintf(stderr, "= %s%c ", 1.1949 + LanguageName(summary_lang), *is_reliable 
? ' ' : '*'); 1.1950 + fprintf(stderr, "<br><br>\n"); 1.1951 + } 1.1952 + 1.1953 + // Slightly condensed if quiet 1.1954 + if (FLAGS_cld2_html && FLAGS_cld2_quiet) { 1.1955 + fprintf(stderr, " "); 1.1956 + for (int i = 0; i < 3; ++i) { 1.1957 + if (language3[i] != UNKNOWN_LANGUAGE) { 1.1958 + fprintf(stderr, " %s %d%% ", 1.1959 + LanguageCode(language3[i]), 1.1960 + percent3[i]); 1.1961 + } 1.1962 + } 1.1963 + fprintf(stderr, "= %s%c ", 1.1964 + LanguageName(summary_lang), *is_reliable ? ' ' : '*'); 1.1965 + fprintf(stderr, "<br>\n"); 1.1966 + } 1.1967 + 1.1968 + return summary_lang; 1.1969 + } 1.1970 + 1.1971 + // Not a good answer -- do recursive call to refine 1.1972 + if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) { 1.1973 + // This is what we hope to improve on in the recursive call, if any 1.1974 + PrintLangs(stderr, language3, percent3, text_bytes, is_reliable); 1.1975 + } 1.1976 + 1.1977 + // For restriction to Top40 + one, the one is 1st/2nd lang that is not Top40 1.1978 + // For this purpose, we treate "Ignore" as top40 1.1979 + Language new_plus_one = UNKNOWN_LANGUAGE; 1.1980 + 1.1981 + if (total_text_bytes < kShortTextThresh) { 1.1982 + // Short text: Recursive call with top40 and short set 1.1983 + if (FLAGS_cld2_html || FLAGS_dbgscore) { 1.1984 + fprintf(stderr, " ---text_bytes[%d] " 1.1985 + "Recursive(Top40/Rep/Short/Words)---<br><br>\n", 1.1986 + total_text_bytes); 1.1987 + } 1.1988 + return DetectLanguageSummaryV2( 1.1989 + buffer, 1.1990 + buffer_length, 1.1991 + is_plain_text, 1.1992 + cld_hints, 1.1993 + allow_extended_lang, 1.1994 + flags | kCLDFlagTop40 | kCLDFlagRepeats | 1.1995 + kCLDFlagShort | kCLDFlagUseWords | kCLDFlagFinish, 1.1996 + new_plus_one, 1.1997 + language3, 1.1998 + percent3, 1.1999 + normalized_score3, 1.2000 + resultchunkvector, 1.2001 + text_bytes, 1.2002 + is_reliable); 1.2003 + } 1.2004 + 1.2005 + // Longer text: Recursive call with top40 set 1.2006 + if (FLAGS_cld2_html || FLAGS_dbgscore) { 1.2007 + 
fprintf(stderr, 1.2008 + " ---text_bytes[%d] Recursive(Top40/Rep)---<br><br>\n", 1.2009 + total_text_bytes); 1.2010 + } 1.2011 + return DetectLanguageSummaryV2( 1.2012 + buffer, 1.2013 + buffer_length, 1.2014 + is_plain_text, 1.2015 + cld_hints, 1.2016 + allow_extended_lang, 1.2017 + flags | kCLDFlagTop40 | kCLDFlagRepeats | 1.2018 + kCLDFlagFinish, 1.2019 + new_plus_one, 1.2020 + language3, 1.2021 + percent3, 1.2022 + normalized_score3, 1.2023 + resultchunkvector, 1.2024 + text_bytes, 1.2025 + is_reliable); 1.2026 +} 1.2027 + 1.2028 + 1.2029 +// For debugging and wrappers. Not thread safe. 1.2030 +static char temp_detectlanguageversion[32]; 1.2031 + 1.2032 +// Return version text string 1.2033 +// String is "code_version - data_build_date" 1.2034 +const char* DetectLanguageVersion() { 1.2035 + if (kScoringtables.quadgram_obj == NULL) {return "";} 1.2036 + sprintf(temp_detectlanguageversion, 1.2037 + "V2.0 - %u", kScoringtables.quadgram_obj->kCLDTableBuildDate); 1.2038 + return temp_detectlanguageversion; 1.2039 +} 1.2040 + 1.2041 + 1.2042 +} // End namespace CLD2