Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
michael@0 | 1 | // Copyright 2013 Google Inc. All Rights Reserved. |
michael@0 | 2 | // |
michael@0 | 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
michael@0 | 4 | // you may not use this file except in compliance with the License. |
michael@0 | 5 | // You may obtain a copy of the License at |
michael@0 | 6 | // |
michael@0 | 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
michael@0 | 8 | // |
michael@0 | 9 | // Unless required by applicable law or agreed to in writing, software |
michael@0 | 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
michael@0 | 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
michael@0 | 12 | // See the License for the specific language governing permissions and |
michael@0 | 13 | // limitations under the License. |
michael@0 | 14 | |
michael@0 | 15 | // |
michael@0 | 16 | // Author: dsites@google.com (Dick Sites) |
michael@0 | 17 | // Updated 2014.01 for dual table lookup |
michael@0 | 18 | // |
michael@0 | 19 | |
michael@0 | 20 | #include <stdio.h> |
michael@0 | 21 | #include <string.h> |
michael@0 | 22 | #include <string> |
michael@0 | 23 | #include <vector> |
michael@0 | 24 | |
michael@0 | 25 | #include "cldutil.h" |
michael@0 | 26 | #include "debug.h" |
michael@0 | 27 | #include "integral_types.h" |
michael@0 | 28 | #include "lang_script.h" |
michael@0 | 29 | #include "utf8statetable.h" |
michael@0 | 30 | |
michael@0 | 31 | #ifdef CLD2_DYNAMIC_MODE |
michael@0 | 32 | #include "cld2_dynamic_data.h" |
michael@0 | 33 | #include "cld2_dynamic_data_loader.h" |
michael@0 | 34 | #endif |
michael@0 | 35 | #include "cld2tablesummary.h" |
michael@0 | 36 | #include "compact_lang_det_impl.h" |
michael@0 | 37 | #include "compact_lang_det_hint_code.h" |
michael@0 | 38 | #include "getonescriptspan.h" |
michael@0 | 39 | #include "tote.h" |
michael@0 | 40 | |
michael@0 | 41 | |
michael@0 | 42 | namespace CLD2 { |
michael@0 | 43 | |
michael@0 | 44 | using namespace std; |
michael@0 | 45 | |
michael@0 | 46 | // Linker supplies the right tables, From files |
michael@0 | 47 | // cld_generated_cjk_uni_prop_80.cc cld2_generated_cjk_compatible.cc |
michael@0 | 48 | // cld_generated_cjk_delta_bi_32.cc generated_distinct_bi_0.cc |
michael@0 | 49 | // cld2_generated_quad*.cc cld2_generated_deltaocta*.cc |
michael@0 | 50 | // cld2_generated_distinctocta*.cc |
michael@0 | 51 | // cld_generated_score_quad_octa_1024_256.cc |
michael@0 | 52 | |
michael@0 | 53 | // 2014.01 Now implementing quadgram dual lookup tables, to allow main table |
michael@0 | 54 | // sizes that are 1/3/5 times a power of two, instead of just powers of two. |
michael@0 | 55 | // Gives more flexibility of total footprint for CLD2. |
michael@0 | 56 | |
michael@0 | 57 | extern const int kLanguageToPLangSize; |
michael@0 | 58 | extern const int kCloseSetSize; |
michael@0 | 59 | |
michael@0 | 60 | extern const UTF8PropObj cld_generated_CjkUni_obj; |
michael@0 | 61 | extern const CLD2TableSummary kCjkCompat_obj; |
michael@0 | 62 | extern const CLD2TableSummary kCjkDeltaBi_obj; |
michael@0 | 63 | extern const CLD2TableSummary kDistinctBiTable_obj; |
michael@0 | 64 | extern const CLD2TableSummary kQuad_obj; |
michael@0 | 65 | extern const CLD2TableSummary kQuad_obj2; // Dual lookup tables |
michael@0 | 66 | extern const CLD2TableSummary kDeltaOcta_obj; |
michael@0 | 67 | extern const CLD2TableSummary kDistinctOcta_obj; |
michael@0 | 68 | extern const short kAvgDeltaOctaScore[]; |
michael@0 | 69 | |
michael@0 | 70 | #ifdef CLD2_DYNAMIC_MODE |
michael@0 | 71 | // CLD2_DYNAMIC_MODE is defined: |
michael@0 | 72 | // Data will be read from an mmap opened at runtime. |
// Scoring tables used when CLD2_DYNAMIC_MODE is defined. Every slot starts
// out NULL; loadData() overwrites the whole struct with a copy of the tables
// returned by the data loader. The trailing comment on each slot names the
// statically linked object that fills the same position in the non-dynamic
// build.
static ScoringTables kScoringtables = {
  NULL, //&cld_generated_CjkUni_obj,
  NULL, //&kCjkCompat_obj,
  NULL, //&kCjkDeltaBi_obj,
  NULL, //&kDistinctBiTable_obj,
  NULL, //&kQuad_obj,
  NULL, //&kQuad_obj2,
  NULL, //&kDeltaOcta_obj,
  NULL, //&kDistinctOcta_obj,
  NULL, //kAvgDeltaOctaScore,
};
// Set by loadData(), cleared by unloadData().
static bool dynamicDataLoaded = false;
// Tables handed back by CLD2DynamicDataLoader::loadDataFile().
static ScoringTables* dynamicTables = NULL;
// Filled in by the loader in loadData() and passed back to it in unloadData().
static void* mmapAddress = NULL;
static int mmapLength = 0;

// Reports the current value of the "dynamic data is loaded" flag.
bool isDataLoaded() { return dynamicDataLoaded; }
michael@0 | 90 | |
michael@0 | 91 | void loadData(const char* fileName) { |
michael@0 | 92 | if (isDataLoaded()) { |
michael@0 | 93 | unloadData(); |
michael@0 | 94 | } |
michael@0 | 95 | dynamicTables = CLD2DynamicDataLoader::loadDataFile(fileName, &mmapAddress, &mmapLength); |
michael@0 | 96 | kScoringtables = *dynamicTables; |
michael@0 | 97 | dynamicDataLoaded = true; |
michael@0 | 98 | }; |
michael@0 | 99 | |
michael@0 | 100 | void unloadData() { |
michael@0 | 101 | if (!dynamicDataLoaded) return; |
michael@0 | 102 | dynamicDataLoaded = false; |
michael@0 | 103 | // unloading will null all the pointers out. |
michael@0 | 104 | CLD2DynamicDataLoader::unloadData(&dynamicTables, &mmapAddress, &mmapLength); |
michael@0 | 105 | } |
michael@0 | 106 | #else |
// This initializes kScoringtables.quadgram_obj etc.
// Static (non-dynamic) build: every slot points at a table object declared
// extern above and supplied at link time. The initializer is positional, so
// the order here must match the field order of ScoringTables.
static const ScoringTables kScoringtables = {
  &cld_generated_CjkUni_obj,
  &kCjkCompat_obj,
  &kCjkDeltaBi_obj,
  &kDistinctBiTable_obj,

  &kQuad_obj,
  &kQuad_obj2,                              // Dual lookup tables
  &kDeltaOcta_obj,
  &kDistinctOcta_obj,

  kAvgDeltaOctaScore,
};
michael@0 | 121 | #endif // #ifdef CLD2_DYNAMIC_MODE |
michael@0 | 122 | |
michael@0 | 123 | |
// Fixed, compile-time settings.
// NOTE(review): the FLAGS_ prefix suggests these once were command-line
// flags; confirm before relying on that.
static const bool FLAGS_cld_no_minimum_bytes = false;
static const bool FLAGS_cld_forcewords = true;
static const bool FLAGS_cld_showme = false;
static const bool FLAGS_cld_echotext = true;
static const int32 FLAGS_cld_textlimit = 160;
static const int32 FLAGS_cld_smoothwidth = 20;
static const bool FLAGS_cld_2011_hints = true;
static const int32 FLAGS_cld_max_lang_tag_scan_kb = 8;

static const bool FLAGS_dbgscore = false;


static const int kLangHintInitial = 12;  // Boost language by N initially
static const int kLangHintBoost = 12;    // Boost language by N/16 per quadgram

static const int kShortSpanThresh = 32;       // Bytes
static const int kMaxSecondChanceLen = 1024;  // Look at first 1K of short spans

static const int kCheapSqueezeTestThresh = 4096;  // Only look for squeezing
                                                  // after this many text bytes
static const int kCheapSqueezeTestLen = 256;  // Bytes to test to trigger sqz
static const int kSpacesTriggerPercent = 25;  // Trigger sqz if >=25% spaces
static const int kPredictTriggerPercent = 67; // Trigger sqz if >=67% predicted

static const int kChunksizeDefault = 48;      // Squeeze 48-byte chunks
static const int kSpacesThreshPercent = 25;   // Squeeze if >=25% spaces
static const int kPredictThreshPercent = 40;  // Squeeze if >=40% predicted

static const int kMaxSpaceScan = 32;          // Bytes

static const int kGoodLang1Percent = 70;
static const int kGoodLang1and2Percent = 93;
static const int kShortTextThresh = 256;      // Bytes

static const int kMinChunkSizeQuads = 4;      // Chunk is at least four quads
static const int kMaxChunkSizeQuads = 1024;   // Chunk is at most 1K quads

static const int kDefaultWordSpan = 256;      // Scan at least this many initial
                                              // bytes with word scoring
static const int kReallyBigWordSpan = 9999999;  // Forces word scoring all text

static const int kMinReliableSeq = 50;      // Record in seq if >= 50% reliable

static const int kPredictionTableSize = 4096;   // Must be exactly 4096 for
                                                // cheap compressor

// Percentage cut-offs; the comment on each gives the outcome when the
// measured percentage falls on the stated side of the constant.
static const int kNonEnBoilerplateMinPercent = 17;    // <this => no second
static const int kNonFIGSBoilerplateMinPercent = 20;  // <this => no second
static const int kGoodFirstMinPercent = 26;           // <this => UNK
static const int kGoodFirstReliableMinPercent = 51;   // <this => unreliable
static const int kIgnoreMaxPercent = 20;              // >this => unreliable
static const int kKeepMinPercent = 2;                 // <this => unreliable
michael@0 | 176 | |
michael@0 | 177 | |
michael@0 | 178 | |
michael@0 | 179 | // Statistically closest language, based on quadgram table |
michael@0 | 180 | // Those that are far from other languages map to UNKNOWN_LANGUAGE |
michael@0 | 181 | // Subscripted by Language |
michael@0 | 182 | // |
michael@0 | 183 | // From lang_correlation.txt and hand-edits |
michael@0 | 184 | // sed 's/^\([^ ]*\) \([^ ]*\) coef=0\.\(..\).*$/ |
michael@0 | 185 | // (\3 >= kMinCorrPercent) ? \2 : UNKNOWN_LANGUAGE, |
michael@0 | 186 | // \/\/ \1/' lang_correlation.txt >/tmp/closest_lang_decl.txt |
michael@0 | 187 | // |
michael@0 | 188 | static const int kMinCorrPercent = 24; // Pick off how close you want |
michael@0 | 189 | // 24 catches PERSIAN <== ARABIC |
michael@0 | 190 | // but not SPANISH <== PORTUGUESE |
michael@0 | 191 | static Language Unknown = UNKNOWN_LANGUAGE; |
michael@0 | 192 | |
// Suspect idea
// Subscripted by Language
//
// Each row has the form (P >= kMinCorrPercent) ? ALT : UNKNOWN_LANGUAGE, so
// the entry is ALT only when the literal P on that row reaches
// kMinCorrPercent, and UNKNOWN_LANGUAGE otherwise. The trailing comment on
// each row names the language that the row is for.
// NOTE(review): row order presumably follows the Language enum order; the
// size check after the table is commented out, so confirm when adding rows.
static const Language kClosestAltLanguage[] = {
  (28 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE,  // ENGLISH
  (36 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // DANISH
  (31 >= kMinCorrPercent) ? AFRIKAANS : UNKNOWN_LANGUAGE,  // DUTCH
  (15 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE,  // FINNISH
  (11 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // FRENCH
  (17 >= kMinCorrPercent) ? LUXEMBOURGISH : UNKNOWN_LANGUAGE,  // GERMAN
  (27 >= kMinCorrPercent) ? YIDDISH : UNKNOWN_LANGUAGE,  // HEBREW
  (16 >= kMinCorrPercent) ? CORSICAN : UNKNOWN_LANGUAGE,  // ITALIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Japanese
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Korean
  (41 >= kMinCorrPercent) ? NORWEGIAN_N : UNKNOWN_LANGUAGE,  // NORWEGIAN
  ( 5 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // POLISH
  (23 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // PORTUGUESE
  (33 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE,  // RUSSIAN
  (28 >= kMinCorrPercent) ? GALICIAN : UNKNOWN_LANGUAGE,  // SPANISH
  (17 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // SWEDISH
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Chinese
  (42 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // CZECH
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GREEK
  (35 >= kMinCorrPercent) ? FAROESE : UNKNOWN_LANGUAGE,  // ICELANDIC
  ( 7 >= kMinCorrPercent) ? LITHUANIAN : UNKNOWN_LANGUAGE,  // LATVIAN
  ( 7 >= kMinCorrPercent) ? LATVIAN : UNKNOWN_LANGUAGE,  // LITHUANIAN
  ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // ROMANIAN
  ( 4 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // HUNGARIAN
  (15 >= kMinCorrPercent) ? FINNISH : UNKNOWN_LANGUAGE,  // ESTONIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Ignore
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Unknown
  (33 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // BULGARIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CROATIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SERBIAN
  (24 >= kMinCorrPercent) ? SCOTS_GAELIC : UNKNOWN_LANGUAGE,  // IRISH
  (28 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // GALICIAN
  ( 8 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE,  // TAGALOG
  (29 >= kMinCorrPercent) ? AZERBAIJANI : UNKNOWN_LANGUAGE,  // TURKISH
  (28 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // UKRAINIAN
  (37 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE,  // HINDI
  (29 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE,  // MACEDONIAN
  (14 >= kMinCorrPercent) ? ASSAMESE : UNKNOWN_LANGUAGE,  // BENGALI
  (46 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE,  // INDONESIAN
  ( 9 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE,  // LATIN
  (46 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE,  // MALAY
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MALAYALAM
  ( 4 >= kMinCorrPercent) ? BRETON : UNKNOWN_LANGUAGE,  // WELSH
  ( 8 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // NEPALI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // TELUGU
  ( 3 >= kMinCorrPercent) ? ESPERANTO : UNKNOWN_LANGUAGE,  // ALBANIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // TAMIL
  (22 >= kMinCorrPercent) ? UKRAINIAN : UNKNOWN_LANGUAGE,  // BELARUSIAN
  (15 >= kMinCorrPercent) ? SUNDANESE : UNKNOWN_LANGUAGE,  // JAVANESE
  (19 >= kMinCorrPercent) ? CATALAN : UNKNOWN_LANGUAGE,  // OCCITAN
  (27 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // URDU
  (36 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // BIHARI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GUJARATI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // THAI
  (24 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // ARABIC
  (19 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // CATALAN
  ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // ESPERANTO
  ( 3 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // BASQUE
  ( 9 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // INTERLINGUA
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // KANNADA
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PUNJABI
  (24 >= kMinCorrPercent) ? IRISH : UNKNOWN_LANGUAGE,  // SCOTS_GAELIC
  ( 7 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // SWAHILI
  (28 >= kMinCorrPercent) ? SERBIAN : UNKNOWN_LANGUAGE,  // SLOVENIAN
  (37 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // MARATHI
  ( 3 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // MALTESE
  ( 1 >= kMinCorrPercent) ? YORUBA : UNKNOWN_LANGUAGE,  // VIETNAMESE
  (15 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // FRISIAN
  (42 >= kMinCorrPercent) ? CZECH : UNKNOWN_LANGUAGE,  // SLOVAK
  // Original ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ChineseT
  (24 >= kMinCorrPercent) ? CHINESE : UNKNOWN_LANGUAGE,  // ChineseT
  (35 >= kMinCorrPercent) ? ICELANDIC : UNKNOWN_LANGUAGE,  // FAROESE
  (15 >= kMinCorrPercent) ? JAVANESE : UNKNOWN_LANGUAGE,  // SUNDANESE
  (17 >= kMinCorrPercent) ? TAJIK : UNKNOWN_LANGUAGE,  // UZBEK
  ( 7 >= kMinCorrPercent) ? TIGRINYA : UNKNOWN_LANGUAGE,  // AMHARIC
  (29 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE,  // AZERBAIJANI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GEORGIAN
  ( 7 >= kMinCorrPercent) ? AMHARIC : UNKNOWN_LANGUAGE,  // TIGRINYA
  (27 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE,  // PERSIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // BOSNIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SINHALESE
  (41 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // NORWEGIAN_N
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PORTUGUESE_P
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PORTUGUESE_B
  (37 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE,  // XHOSA
  (37 >= kMinCorrPercent) ? XHOSA : UNKNOWN_LANGUAGE,  // ZULU
  ( 2 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // GUARANI
  (29 >= kMinCorrPercent) ? TSWANA : UNKNOWN_LANGUAGE,  // SESOTHO
  ( 7 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE,  // TURKMEN
  ( 8 >= kMinCorrPercent) ? KAZAKH : UNKNOWN_LANGUAGE,  // KYRGYZ
  ( 5 >= kMinCorrPercent) ? FRENCH : UNKNOWN_LANGUAGE,  // BRETON
  ( 3 >= kMinCorrPercent) ? GANDA : UNKNOWN_LANGUAGE,  // TWI
  (27 >= kMinCorrPercent) ? HEBREW : UNKNOWN_LANGUAGE,  // YIDDISH
  (28 >= kMinCorrPercent) ? SLOVENIAN : UNKNOWN_LANGUAGE,  // SERBO_CROATIAN
  (12 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE,  // SOMALI
  ( 9 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE,  // UIGHUR
  (15 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // KURDISH
  ( 6 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE,  // MONGOLIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ARMENIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // LAOTHIAN
  ( 8 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE,  // SINDHI
  (10 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // RHAETO_ROMANCE
  (31 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // AFRIKAANS
  (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // LUXEMBOURGISH
  ( 2 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE,  // BURMESE
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // KHMER
  (45 >= kMinCorrPercent) ? DZONGKHA : UNKNOWN_LANGUAGE,  // TIBETAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // DHIVEHI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CHEROKEE
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SYRIAC
  ( 8 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // LIMBU
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ORIYA
  (14 >= kMinCorrPercent) ? BENGALI : UNKNOWN_LANGUAGE,  // ASSAMESE
  (16 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // CORSICAN
  ( 5 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE,  // INTERLINGUE
  ( 8 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE,  // KAZAKH
  ( 4 >= kMinCorrPercent) ? SWAHILI : UNKNOWN_LANGUAGE,  // LINGALA
  (11 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // MOLDAVIAN
  (19 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // PASHTO
  ( 5 >= kMinCorrPercent) ? AYMARA : UNKNOWN_LANGUAGE,  // QUECHUA
  ( 5 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // SHONA
  (17 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE,  // TAJIK
  (13 >= kMinCorrPercent) ? BASHKIR : UNKNOWN_LANGUAGE,  // TATAR
  (11 >= kMinCorrPercent) ? SAMOAN : UNKNOWN_LANGUAGE,  // TONGA
  ( 2 >= kMinCorrPercent) ? TWI : UNKNOWN_LANGUAGE,  // YORUBA
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_ENGLISH_BASED
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_FRENCH_BASED
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_OTHER
  ( 6 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // MAORI
  ( 3 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE,  // WOLOF
  ( 1 >= kMinCorrPercent) ? MONGOLIAN : UNKNOWN_LANGUAGE,  // ABKHAZIAN
  ( 8 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE,  // AFAR
  ( 5 >= kMinCorrPercent) ? QUECHUA : UNKNOWN_LANGUAGE,  // AYMARA
  (13 >= kMinCorrPercent) ? TATAR : UNKNOWN_LANGUAGE,  // BASHKIR
  ( 3 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // BISLAMA
  (45 >= kMinCorrPercent) ? TIBETAN : UNKNOWN_LANGUAGE,  // DZONGKHA
  ( 4 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // FIJIAN
  ( 7 >= kMinCorrPercent) ? INUPIAK : UNKNOWN_LANGUAGE,  // GREENLANDIC
  ( 3 >= kMinCorrPercent) ? AFAR : UNKNOWN_LANGUAGE,  // HAUSA
  ( 3 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // HAITIAN_CREOLE
  ( 7 >= kMinCorrPercent) ? GREENLANDIC : UNKNOWN_LANGUAGE,  // INUPIAK
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // INUKTITUT
  ( 4 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // KASHMIRI
  (30 >= kMinCorrPercent) ? RUNDI : UNKNOWN_LANGUAGE,  // KINYARWANDA
  ( 2 >= kMinCorrPercent) ? TAGALOG : UNKNOWN_LANGUAGE,  // MALAGASY
  (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // NAURU
  (12 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE,  // OROMO
  (30 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // RUNDI
  (11 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // SAMOAN
  ( 1 >= kMinCorrPercent) ? LINGALA : UNKNOWN_LANGUAGE,  // SANGO
  (32 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE,  // SANSKRIT
  (16 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE,  // SISWANT
  ( 5 >= kMinCorrPercent) ? SISWANT : UNKNOWN_LANGUAGE,  // TSONGA
  (29 >= kMinCorrPercent) ? SESOTHO : UNKNOWN_LANGUAGE,  // TSWANA
  ( 2 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE,  // VOLAPUK
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ZHUANG
  ( 1 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE,  // KHASI
  (28 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // SCOTS
  (15 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // GANDA
  ( 7 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // MANX
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MONTENEGRIN

  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // AKAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // IGBO
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MAURITIAN_CREOLE
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // HAWAIIAN
};
michael@0 | 364 | |
michael@0 | 365 | // COMPILE_ASSERT(arraysize(kClosestAltLanguage) == NUM_LANGUAGES, |
michael@0 | 366 | // kClosestAltLanguage_has_incorrect_size); |
michael@0 | 367 | |
michael@0 | 368 | |
michael@0 | 369 | inline bool FlagFinish(int flags) {return (flags & kCLDFlagFinish) != 0;} |
michael@0 | 370 | inline bool FlagSqueeze(int flags) {return (flags & kCLDFlagSqueeze) != 0;} |
michael@0 | 371 | inline bool FlagRepeats(int flags) {return (flags & kCLDFlagRepeats) != 0;} |
michael@0 | 372 | inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;} |
michael@0 | 373 | inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;} |
michael@0 | 374 | inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;} |
michael@0 | 375 | inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;} |
michael@0 | 376 | |
michael@0 | 377 | |
michael@0 | 378 | // Defines Top40 packed languages |
michael@0 | 379 | |
michael@0 | 380 | // Google top 40 languages |
michael@0 | 381 | // |
michael@0 | 382 | // Tier 0/1 Language enum list (16) |
michael@0 | 383 | // ENGLISH, /*no en_GB,*/ FRENCH, ITALIAN, GERMAN, SPANISH, // E - FIGS |
michael@0 | 384 | // DUTCH, CHINESE, CHINESE_T, JAPANESE, KOREAN, |
michael@0 | 385 | // PORTUGUESE, RUSSIAN, POLISH, TURKISH, THAI, |
michael@0 | 386 | // ARABIC, |
michael@0 | 387 | // |
michael@0 | 388 | // Tier 2 Language enum list (22) |
michael@0 | 389 | // SWEDISH, FINNISH, DANISH, /*no pt-PT,*/ ROMANIAN, HUNGARIAN, |
michael@0 | 390 | // HEBREW, INDONESIAN, CZECH, GREEK, NORWEGIAN, |
michael@0 | 391 | // VIETNAMESE, BULGARIAN, CROATIAN, LITHUANIAN, SLOVAK, |
michael@0 | 392 | // TAGALOG, SLOVENIAN, SERBIAN, CATALAN, LATVIAN, |
michael@0 | 393 | // UKRAINIAN, HINDI, |
michael@0 | 394 | // |
michael@0 | 395 | // use SERBO_CROATIAN instead of BOSNIAN, SERBIAN, CROATIAN, MONTENEGRIN(21) |
michael@0 | 396 | // |
michael@0 | 397 | // Include IgnoreMe (TG_UNKNOWN_LANGUAGE, 25+1) as a top 40 |
michael@0 | 398 | |
michael@0 | 399 | |
michael@0 | 400 | void DemoteNotTop40(Tote* chunk_tote, uint16 psplus_one) { |
michael@0 | 401 | // REVISIT |
michael@0 | 402 | } |
michael@0 | 403 | |
michael@0 | 404 | void PrintText(FILE* f, Language cur_lang, const string& temp) { |
michael@0 | 405 | if (temp.size() == 0) {return;} |
michael@0 | 406 | fprintf(f, "PrintText[%s]%s<br>\n", LanguageName(cur_lang), temp.c_str()); |
michael@0 | 407 | } |
michael@0 | 408 | |
michael@0 | 409 | |
michael@0 | 410 | //------------------------------------------------------------------------------ |
michael@0 | 411 | // For --cld_html debugging output. Not thread safe |
michael@0 | 412 | //------------------------------------------------------------------------------ |
// File-scope state for the debugging output; shared and unsynchronized.
// NOTE(review): presumably the language and reliability of the previously
// printed span -- confirm against the code that reads and writes these.
static Language prior_lang = UNKNOWN_LANGUAGE;
static bool prior_unreliable = false;
michael@0 | 415 | |
michael@0 | 416 | //------------------------------------------------------------------------------ |
michael@0 | 417 | // End For --cld_html debugging output |
michael@0 | 418 | //------------------------------------------------------------------------------ |
michael@0 | 419 | |
michael@0 | 420 | |
michael@0 | 421 | // Backscan to word boundary, returning how many bytes n to go back |
michael@0 | 422 | // so that src - n is non-space ans src - n - 1 is space. |
michael@0 | 423 | // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary |
michael@0 | 424 | int BackscanToSpace(const char* src, int limit) { |
michael@0 | 425 | int n = 0; |
michael@0 | 426 | limit = minint(limit, kMaxSpaceScan); |
michael@0 | 427 | while (n < limit) { |
michael@0 | 428 | if (src[-n - 1] == ' ') {return n;} // We are at _X |
michael@0 | 429 | ++n; |
michael@0 | 430 | } |
michael@0 | 431 | n = 0; |
michael@0 | 432 | while (n < limit) { |
michael@0 | 433 | if ((src[-n] & 0xc0) != 0x80) {return n;} // We are at char begin |
michael@0 | 434 | ++n; |
michael@0 | 435 | } |
michael@0 | 436 | return 0; |
michael@0 | 437 | } |
michael@0 | 438 | |
michael@0 | 439 | // Forwardscan to word boundary, returning how many bytes n to go forward |
michael@0 | 440 | // so that src + n is non-space ans src + n - 1 is space. |
michael@0 | 441 | // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary |
michael@0 | 442 | int ForwardscanToSpace(const char* src, int limit) { |
michael@0 | 443 | int n = 0; |
michael@0 | 444 | limit = minint(limit, kMaxSpaceScan); |
michael@0 | 445 | while (n < limit) { |
michael@0 | 446 | if (src[n] == ' ') {return n + 1;} // We are at _X |
michael@0 | 447 | ++n; |
michael@0 | 448 | } |
michael@0 | 449 | n = 0; |
michael@0 | 450 | while (n < limit) { |
michael@0 | 451 | if ((src[n] & 0xc0) != 0x80) {return n;} // We are at char begin |
michael@0 | 452 | ++n; |
michael@0 | 453 | } |
michael@0 | 454 | return 0; |
michael@0 | 455 | } |
michael@0 | 456 | |
michael@0 | 457 | |
// This uses a cheap predictor to get a measure of compression, and
// hence a measure of repetitiveness. It works on complete UTF-8 characters
// instead of bytes, because three-byte UTF-8 Indic, etc. text compress highly
// all the time when done with a byte-based count. Sigh.
//
// To allow running prediction across multiple chunks, caller passes in current
// 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
//
// Returns the number of *bytes* correctly predicted, increments by 1..4 for
// each correctly-predicted character. On return *hash holds the updated
// 12-bit hash.
//
// NOTE: Never reads at or beyond isrc + src_len. If the buffer ends in the
// middle of a multi-byte character, the missing trailing bytes are treated as
// zero and only the bytes actually present are counted. Results for text made
// of complete characters are unchanged.
//

// TODO(dsites) make this use just one byte per UTF-8 char and incr by charlen

int CountPredictedBytes(const char* isrc, int src_len, int* hash, int* tbl) {
  int p_count = 0;
  const unsigned char* src = reinterpret_cast<const unsigned char*>(isrc);
  const unsigned char* const srclimit = src + src_len;
  int local_hash = *hash;

  while (src < srclimit) {
    const unsigned int lead = src[0];

    // Nominal character length from the lead byte
    int charlen;
    if (lead < 0xc0) {
      // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
      charlen = 1;
    } else if ((lead & 0xe0) == 0xc0) {
      charlen = 2;      // Two-byte
    } else if ((lead & 0xf0) == 0xe0) {
      charlen = 3;      // Three-byte
    } else {
      charlen = 4;      // Four-byte
    }

    // Bytes of this character that are really inside the buffer (>= 1)
    const int remaining = static_cast<int>(srclimit - src);
    const int incr = (charlen < remaining) ? charlen : remaining;

    // Pack the character big-endian into one word. Done unsigned so that a
    // lead byte >= 0x80 shifted into the top byte never overflows a signed int.
    unsigned int packed = 0;
    for (int i = 0; i < charlen; ++i) {
      const unsigned int next_byte = (i < incr) ? src[i] : 0u;
      packed = (packed << 8) | next_byte;
    }
    const int c = static_cast<int>(packed);
    src += incr;

    const int p = tbl[local_hash];      // Prediction
    tbl[local_hash] = c;                // Update prediction
    if (c == p) {
      p_count += incr;                  // Count bytes of good predictions
    }

    local_hash = ((local_hash << 4) ^ c) & 0xfff;
  }
  *hash = local_hash;
  return p_count;
}
michael@0 | 514 | |
michael@0 | 515 | |
michael@0 | 516 | |
// Counts the space characters in src, examining four bytes per iteration,
// which is a little faster than testing one byte at a time.
// Only the largest multiple-of-four prefix of src_len is examined; the 1..3
// odd bytes at the end are not counted.
int CountSpaces4(const char* src, int src_len) {
  const int whole_len = src_len & ~3;     // Round down to a multiple of four
  int spaces = 0;
  for (int base = 0; base < whole_len; base += 4) {
    const char* quad = src + base;
    spaces += (quad[0] == ' ') + (quad[1] == ' ') +
              (quad[2] == ' ') + (quad[3] == ' ');
  }
  return spaces;
}
michael@0 | 529 | |
michael@0 | 530 | |
michael@0 | 531 | // Remove words of text that have more than half their letters predicted |
michael@0 | 532 | // correctly by our cheap predictor, moving the remaining words in-place |
michael@0 | 533 | // to the front of the input buffer. |
michael@0 | 534 | // |
michael@0 | 535 | // To allow running prediction across multiple chunks, caller passes in current |
michael@0 | 536 | // 12-bit hash value and int[4096] prediction table. Caller inits these to 0. |
michael@0 | 537 | // |
michael@0 | 538 | // Return the new, possibly-shorter length |
michael@0 | 539 | // |
michael@0 | 540 | // Result Buffer ALWAYS has leading space and trailing space space space NUL, |
michael@0 | 541 | // if input does |
michael@0 | 542 | // |
michael@0 | 543 | int CheapRepWordsInplace(char* isrc, int src_len, int* hash, int* tbl) { |
michael@0 | 544 | const uint8* src = reinterpret_cast<const uint8*>(isrc); |
michael@0 | 545 | const uint8* srclimit = src + src_len; |
michael@0 | 546 | char* dst = isrc; |
michael@0 | 547 | int local_hash = *hash; |
michael@0 | 548 | char* word_dst = dst; // Start of next word |
michael@0 | 549 | int good_predict_bytes = 0; |
michael@0 | 550 | int word_length_bytes = 0; |
michael@0 | 551 | |
michael@0 | 552 | while (src < srclimit) { |
michael@0 | 553 | int c = src[0]; |
michael@0 | 554 | int incr = 1; |
michael@0 | 555 | *dst++ = c; |
michael@0 | 556 | |
michael@0 | 557 | if (c == ' ') { |
michael@0 | 558 | if ((good_predict_bytes * 2) > word_length_bytes) { |
michael@0 | 559 | // Word is well-predicted: backup to start of this word |
michael@0 | 560 | dst = word_dst; |
michael@0 | 561 | if (FLAGS_cld_showme) { |
michael@0 | 562 | // Mark the deletion point with period |
michael@0 | 563 | // Don't repeat multiple periods |
michael@0 | 564 | // Cannot mark with more bytes or may overwrite unseen input |
michael@0 | 565 | if ((isrc < (dst - 2)) && (dst[-2] != '.')) { |
michael@0 | 566 | *dst++ = '.'; |
michael@0 | 567 | *dst++ = ' '; |
michael@0 | 568 | } |
michael@0 | 569 | } |
michael@0 | 570 | } |
michael@0 | 571 | word_dst = dst; // Start of next word |
michael@0 | 572 | good_predict_bytes = 0; |
michael@0 | 573 | word_length_bytes = 0; |
michael@0 | 574 | } |
michael@0 | 575 | |
michael@0 | 576 | // Pick up one char and length |
michael@0 | 577 | if (c < 0xc0) { |
michael@0 | 578 | // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx |
michael@0 | 579 | // Do nothing more |
michael@0 | 580 | } else if ((c & 0xe0) == 0xc0) { |
michael@0 | 581 | // Two-byte |
michael@0 | 582 | *dst++ = src[1]; |
michael@0 | 583 | c = (c << 8) | src[1]; |
michael@0 | 584 | incr = 2; |
michael@0 | 585 | } else if ((c & 0xf0) == 0xe0) { |
michael@0 | 586 | // Three-byte |
michael@0 | 587 | *dst++ = src[1]; |
michael@0 | 588 | *dst++ = src[2]; |
michael@0 | 589 | c = (c << 16) | (src[1] << 8) | src[2]; |
michael@0 | 590 | incr = 3; |
michael@0 | 591 | } else { |
michael@0 | 592 | // Four-byte |
michael@0 | 593 | *dst++ = src[1]; |
michael@0 | 594 | *dst++ = src[2]; |
michael@0 | 595 | *dst++ = src[3]; |
michael@0 | 596 | c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3]; |
michael@0 | 597 | incr = 4; |
michael@0 | 598 | } |
michael@0 | 599 | src += incr; |
michael@0 | 600 | word_length_bytes += incr; |
michael@0 | 601 | |
michael@0 | 602 | int p = tbl[local_hash]; // Prediction |
michael@0 | 603 | tbl[local_hash] = c; // Update prediction |
michael@0 | 604 | if (c == p) { |
michael@0 | 605 | good_predict_bytes += incr; // Count good predictions |
michael@0 | 606 | } |
michael@0 | 607 | |
michael@0 | 608 | local_hash = ((local_hash << 4) ^ c) & 0xfff; |
michael@0 | 609 | } |
michael@0 | 610 | |
michael@0 | 611 | *hash = local_hash; |
michael@0 | 612 | |
michael@0 | 613 | if ((dst - isrc) < (src_len - 3)) { |
michael@0 | 614 | // Pad and make last char clean UTF-8 by putting following spaces |
michael@0 | 615 | dst[0] = ' '; |
michael@0 | 616 | dst[1] = ' '; |
michael@0 | 617 | dst[2] = ' '; |
michael@0 | 618 | dst[3] = '\0'; |
michael@0 | 619 | } else if ((dst - isrc) < src_len) { |
michael@0 | 620 | // Make last char clean UTF-8 by putting following space off the end |
michael@0 | 621 | dst[0] = ' '; |
michael@0 | 622 | } |
michael@0 | 623 | |
michael@0 | 624 | return static_cast<int>(dst - isrc); |
michael@0 | 625 | } |
michael@0 | 626 | |
michael@0 | 627 | |
// This alternate form overwrites well-predicted words with periods instead of
// removing them, so byte offsets are unchanged and the backmap used to
// generate a vector of original-text ranges is not corrupted.
int CheapRepWordsInplaceOverwrite(char* isrc, int src_len, int* hash, int* tbl) {
  const unsigned char* in = reinterpret_cast<const unsigned char*>(isrc);
  const unsigned char* const in_end = in + src_len;
  char* out = isrc;
  char* word_start = out;         // Where the current word begins
  int rolling = *hash;
  int predicted_in_word = 0;
  int bytes_in_word = 0;

  while (in < in_end) {
    const int lead = in[0];
    *out++ = static_cast<char>(lead);

    if (lead == ' ') {
      // A space closes the current word
      if ((predicted_in_word * 2) > bytes_in_word) {
        // Word [word_start..out-1) is well-predicted: blank it with periods,
        // leaving the terminating space itself in place
        const long fill = (out - 1) - word_start;
        if (fill > 0) {
          memset(word_start, '.', static_cast<size_t>(fill));
        }
      }
      word_start = out;
      predicted_in_word = 0;
      bytes_in_word = 0;
    }

    // Character length from the lead byte; bytes below 0xc0 (ASCII and stray
    // continuation bytes) are handled as single bytes
    int char_len;
    if (lead < 0xc0) {
      char_len = 1;
    } else if ((lead & 0xe0) == 0xc0) {
      char_len = 2;
    } else if ((lead & 0xf0) == 0xe0) {
      char_len = 3;
    } else {
      char_len = 4;
    }

    // Copy the trailing bytes and pack the whole character into one int
    unsigned int packed = static_cast<unsigned int>(lead);
    for (int k = 1; k < char_len; ++k) {
      *out++ = static_cast<char>(in[k]);
      packed = (packed << 8) | in[k];
    }
    const int ch = static_cast<int>(packed);
    in += char_len;
    bytes_in_word += char_len;

    // Check the prediction for this hash, then record the actual character
    const int guess = tbl[rolling];
    tbl[rolling] = ch;
    if (guess == ch) {
      predicted_in_word += char_len;
    }

    rolling = ((rolling << 4) ^ ch) & 0xfff;
  }

  *hash = rolling;

  const int out_len = static_cast<int>(out - isrc);
  if (out_len < (src_len - 3)) {
    // Room for full padding: three spaces and a NUL keep the tail clean UTF-8
    out[0] = ' ';
    out[1] = ' ';
    out[2] = ' ';
    out[3] = '\0';
  } else if (out_len < src_len) {
    // Only slightly shorter: a single space just past the end
    out[0] = ' ';
  }

  return out_len;
}
michael@0 | 704 | |
michael@0 | 705 | |
michael@0 | 706 | // Remove portions of text that have a high density of spaces, or that are |
michael@0 | 707 | // overly repetitive, squeezing the remaining text in-place to the front of the |
michael@0 | 708 | // input buffer. |
michael@0 | 709 | // |
michael@0 | 710 | // Squeezing looks at density of space/prediced chars in fixed-size chunks, |
michael@0 | 711 | // specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes. |
michael@0 | 712 | // |
michael@0 | 713 | // Return the new, possibly-shorter length |
michael@0 | 714 | // |
michael@0 | 715 | // Result Buffer ALWAYS has leading space and trailing space space space NUL, |
michael@0 | 716 | // if input does |
michael@0 | 717 | // |
michael@0 | 718 | int CheapSqueezeInplace(char* isrc, |
michael@0 | 719 | int src_len, |
michael@0 | 720 | int ichunksize) { |
michael@0 | 721 | char* src = isrc; |
michael@0 | 722 | char* dst = src; |
michael@0 | 723 | char* srclimit = src + src_len; |
michael@0 | 724 | bool skipping = false; |
michael@0 | 725 | |
michael@0 | 726 | int hash = 0; |
michael@0 | 727 | // Allocate local prediction table. |
michael@0 | 728 | int* predict_tbl = new int[kPredictionTableSize]; |
michael@0 | 729 | memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); |
michael@0 | 730 | |
michael@0 | 731 | int chunksize = ichunksize; |
michael@0 | 732 | if (chunksize == 0) {chunksize = kChunksizeDefault;} |
michael@0 | 733 | int space_thresh = (chunksize * kSpacesThreshPercent) / 100; |
michael@0 | 734 | int predict_thresh = (chunksize * kPredictThreshPercent) / 100; |
michael@0 | 735 | |
michael@0 | 736 | while (src < srclimit) { |
michael@0 | 737 | int remaining_bytes = srclimit - src; |
michael@0 | 738 | int len = minint(chunksize, remaining_bytes); |
michael@0 | 739 | // Make len land us on a UTF-8 character boundary. |
michael@0 | 740 | // Ah. Also fixes mispredict because we could get out of phase |
michael@0 | 741 | // Loop always terminates at trailing space in buffer |
michael@0 | 742 | while ((src[len] & 0xc0) == 0x80) {++len;} // Move past continuation bytes |
michael@0 | 743 | |
michael@0 | 744 | int space_n = CountSpaces4(src, len); |
michael@0 | 745 | int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl); |
michael@0 | 746 | if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) { |
michael@0 | 747 | // Skip the text |
michael@0 | 748 | if (!skipping) { |
michael@0 | 749 | // Keeping-to-skipping transition; do it at a space |
michael@0 | 750 | int n = BackscanToSpace(dst, static_cast<int>(dst - isrc)); |
michael@0 | 751 | dst -= n; |
michael@0 | 752 | if (dst == isrc) { |
michael@0 | 753 | // Force a leading space if the first chunk is deleted |
michael@0 | 754 | *dst++ = ' '; |
michael@0 | 755 | } |
michael@0 | 756 | if (FLAGS_cld_showme) { |
michael@0 | 757 | // Mark the deletion point with black square U+25A0 |
michael@0 | 758 | *dst++ = static_cast<unsigned char>(0xe2); |
michael@0 | 759 | *dst++ = static_cast<unsigned char>(0x96); |
michael@0 | 760 | *dst++ = static_cast<unsigned char>(0xa0); |
michael@0 | 761 | *dst++ = ' '; |
michael@0 | 762 | } |
michael@0 | 763 | skipping = true; |
michael@0 | 764 | } |
michael@0 | 765 | } else { |
michael@0 | 766 | // Keep the text |
michael@0 | 767 | if (skipping) { |
michael@0 | 768 | // Skipping-to-keeping transition; do it at a space |
michael@0 | 769 | int n = ForwardscanToSpace(src, len); |
michael@0 | 770 | src += n; |
michael@0 | 771 | remaining_bytes -= n; // Shrink remaining length |
michael@0 | 772 | len -= n; |
michael@0 | 773 | skipping = false; |
michael@0 | 774 | } |
michael@0 | 775 | // "len" can be negative in some cases |
michael@0 | 776 | if (len > 0) { |
michael@0 | 777 | memmove(dst, src, len); |
michael@0 | 778 | dst += len; |
michael@0 | 779 | } |
michael@0 | 780 | } |
michael@0 | 781 | src += len; |
michael@0 | 782 | } |
michael@0 | 783 | |
michael@0 | 784 | if ((dst - isrc) < (src_len - 3)) { |
michael@0 | 785 | // Pad and make last char clean UTF-8 by putting following spaces |
michael@0 | 786 | dst[0] = ' '; |
michael@0 | 787 | dst[1] = ' '; |
michael@0 | 788 | dst[2] = ' '; |
michael@0 | 789 | dst[3] = '\0'; |
michael@0 | 790 | } else if ((dst - isrc) < src_len) { |
michael@0 | 791 | // Make last char clean UTF-8 by putting following space off the end |
michael@0 | 792 | dst[0] = ' '; |
michael@0 | 793 | } |
michael@0 | 794 | |
michael@0 | 795 | // Deallocate local prediction table |
michael@0 | 796 | delete[] predict_tbl; |
michael@0 | 797 | return static_cast<int>(dst - isrc); |
michael@0 | 798 | } |
michael@0 | 799 | |
michael@0 | 800 | // This alternate form overwrites redundant words, thus avoiding corrupting the |
michael@0 | 801 | // backmap for generate a vector of original-text ranges. |
michael@0 | 802 | int CheapSqueezeInplaceOverwrite(char* isrc, |
michael@0 | 803 | int src_len, |
michael@0 | 804 | int ichunksize) { |
michael@0 | 805 | char* src = isrc; |
michael@0 | 806 | char* dst = src; |
michael@0 | 807 | char* srclimit = src + src_len; |
michael@0 | 808 | bool skipping = false; |
michael@0 | 809 | |
michael@0 | 810 | int hash = 0; |
michael@0 | 811 | // Allocate local prediction table. |
michael@0 | 812 | int* predict_tbl = new int[kPredictionTableSize]; |
michael@0 | 813 | memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); |
michael@0 | 814 | |
michael@0 | 815 | int chunksize = ichunksize; |
michael@0 | 816 | if (chunksize == 0) {chunksize = kChunksizeDefault;} |
michael@0 | 817 | int space_thresh = (chunksize * kSpacesThreshPercent) / 100; |
michael@0 | 818 | int predict_thresh = (chunksize * kPredictThreshPercent) / 100; |
michael@0 | 819 | |
michael@0 | 820 | // Always keep first byte (space) |
michael@0 | 821 | ++src; |
michael@0 | 822 | ++dst; |
michael@0 | 823 | while (src < srclimit) { |
michael@0 | 824 | int remaining_bytes = srclimit - src; |
michael@0 | 825 | int len = minint(chunksize, remaining_bytes); |
michael@0 | 826 | // Make len land us on a UTF-8 character boundary. |
michael@0 | 827 | // Ah. Also fixes mispredict because we could get out of phase |
michael@0 | 828 | // Loop always terminates at trailing space in buffer |
michael@0 | 829 | while ((src[len] & 0xc0) == 0x80) {++len;} // Move past continuation bytes |
michael@0 | 830 | |
michael@0 | 831 | int space_n = CountSpaces4(src, len); |
michael@0 | 832 | int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl); |
michael@0 | 833 | if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) { |
michael@0 | 834 | // Overwrite the text [dst-n..dst) |
michael@0 | 835 | if (!skipping) { |
michael@0 | 836 | // Keeping-to-skipping transition; do it at a space |
michael@0 | 837 | int n = BackscanToSpace(dst, static_cast<int>(dst - isrc)); |
michael@0 | 838 | // Text [word_dst..dst) is well-predicted: overwrite |
michael@0 | 839 | for (char* p = dst - n; p < dst; ++p) {*p = '.';} |
michael@0 | 840 | skipping = true; |
michael@0 | 841 | } |
michael@0 | 842 | // Overwrite the text [dst..dst+len) |
michael@0 | 843 | for (char* p = dst; p < dst + len; ++p) {*p = '.';} |
michael@0 | 844 | dst[len - 1] = ' '; // Space at end so we can see what is happening |
michael@0 | 845 | } else { |
michael@0 | 846 | // Keep the text |
michael@0 | 847 | if (skipping) { |
michael@0 | 848 | // Skipping-to-keeping transition; do it at a space |
michael@0 | 849 | int n = ForwardscanToSpace(src, len); |
michael@0 | 850 | // Text [dst..dst+n) is well-predicted: overwrite |
michael@0 | 851 | for (char* p = dst; p < dst + n - 1; ++p) {*p = '.';} |
michael@0 | 852 | skipping = false; |
michael@0 | 853 | } |
michael@0 | 854 | } |
michael@0 | 855 | dst += len; |
michael@0 | 856 | src += len; |
michael@0 | 857 | } |
michael@0 | 858 | |
michael@0 | 859 | if ((dst - isrc) < (src_len - 3)) { |
michael@0 | 860 | // Pad and make last char clean UTF-8 by putting following spaces |
michael@0 | 861 | dst[0] = ' '; |
michael@0 | 862 | dst[1] = ' '; |
michael@0 | 863 | dst[2] = ' '; |
michael@0 | 864 | dst[3] = '\0'; |
michael@0 | 865 | } else if ((dst - isrc) < src_len) { |
michael@0 | 866 | // Make last char clean UTF-8 by putting following space off the end |
michael@0 | 867 | dst[0] = ' '; |
michael@0 | 868 | } |
michael@0 | 869 | |
michael@0 | 870 | // Deallocate local prediction table |
michael@0 | 871 | delete[] predict_tbl; |
michael@0 | 872 | return static_cast<int>(dst - isrc); |
michael@0 | 873 | } |
michael@0 | 874 | |
michael@0 | 875 | // Timing 2.8GHz P4 (dsites 2008.03.20) with 170KB input |
michael@0 | 876 | // About 90 MB/sec, with or without memcpy, chunksize 48 or 4096 |
michael@0 | 877 | // Just CountSpaces is about 340 MB/sec |
michael@0 | 878 | // Byte-only CountPredictedBytes is about 150 MB/sec |
michael@0 | 879 | // Byte-only CountPredictedBytes, conditional tbl[] = is about 85! MB/sec |
michael@0 | 880 | // Byte-only CountPredictedBytes is about 180 MB/sec, byte tbl, byte/int c |
michael@0 | 881 | // Unjammed byte-only both = 170 MB/sec |
michael@0 | 882 | // Jammed byte-only both = 120 MB/sec |
michael@0 | 883 | // Back to original w/slight updates, 110 MB/sec |
michael@0 | 884 | // |
michael@0 | 885 | bool CheapSqueezeTriggerTest(const char* src, int src_len, int testsize) { |
michael@0 | 886 | // Don't trigger at all on short text |
michael@0 | 887 | if (src_len < testsize) {return false;} |
michael@0 | 888 | int space_thresh = (testsize * kSpacesTriggerPercent) / 100; |
michael@0 | 889 | int predict_thresh = (testsize * kPredictTriggerPercent) / 100; |
michael@0 | 890 | int hash = 0; |
michael@0 | 891 | // Allocate local prediction table. |
michael@0 | 892 | int* predict_tbl = new int[kPredictionTableSize]; |
michael@0 | 893 | memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); |
michael@0 | 894 | |
michael@0 | 895 | bool retval = false; |
michael@0 | 896 | if ((CountSpaces4(src, testsize) >= space_thresh) || |
michael@0 | 897 | (CountPredictedBytes(src, testsize, &hash, predict_tbl) >= |
michael@0 | 898 | predict_thresh)) { |
michael@0 | 899 | retval = true; |
michael@0 | 900 | } |
michael@0 | 901 | // Deallocate local prediction table |
michael@0 | 902 | delete[] predict_tbl; |
michael@0 | 903 | return retval; |
michael@0 | 904 | } |
michael@0 | 905 | |
michael@0 | 906 | |
michael@0 | 907 | |
michael@0 | 908 | |
michael@0 | 909 | // Delete any extended languages from doc_tote |
michael@0 | 910 | void RemoveExtendedLanguages(DocTote* doc_tote) { |
michael@0 | 911 | // Now a nop |
michael@0 | 912 | } |
michael@0 | 913 | |
// A language whose reliable percent is below this is removed
static const int kMinReliableKeepPercent = 41;

// For Tier3 languages, require a minimum number of bytes to be first-place
// lang; fewer bytes than this means it cannot be first
static const int kGoodFirstT3MinBytes = 24;
michael@0 | 918 | |
michael@0 | 919 | // Move bytes for unreliable langs to another lang or UNKNOWN |
michael@0 | 920 | // doc_tote is sorted, so cannot Add |
michael@0 | 921 | // |
michael@0 | 922 | // If both CHINESE and CHINESET are present and unreliable, do not delete both; |
michael@0 | 923 | // merge both into CHINESE. |
michael@0 | 924 | // |
michael@0 | 925 | //dsites 2009.03.19 |
michael@0 | 926 | // we also want to remove Tier3 languages as the first lang if there is very |
michael@0 | 927 | // little text like ej1 ej2 ej3 ej4 |
michael@0 | 928 | // maybe fold this back in earlier |
michael@0 | 929 | // |
michael@0 | 930 | void RemoveUnreliableLanguages(DocTote* doc_tote, |
michael@0 | 931 | bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) { |
michael@0 | 932 | // Prepass to merge some low-reliablility languages |
michael@0 | 933 | // TODO: this shouldn't really reach in to the internal structure of doc_tote |
michael@0 | 934 | int total_bytes = 0; |
michael@0 | 935 | for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) { |
michael@0 | 936 | int plang = doc_tote->Key(sub); |
michael@0 | 937 | if (plang == DocTote::kUnusedKey) {continue;} // Empty slot |
michael@0 | 938 | |
michael@0 | 939 | Language lang = static_cast<Language>(plang); |
michael@0 | 940 | int bytes = doc_tote->Value(sub); |
michael@0 | 941 | int reli = doc_tote->Reliability(sub); |
michael@0 | 942 | if (bytes == 0) {continue;} // Zero bytes |
michael@0 | 943 | total_bytes += bytes; |
michael@0 | 944 | |
michael@0 | 945 | // Reliable percent = stored reliable score over stored bytecount |
michael@0 | 946 | int reliable_percent = reli / bytes; |
michael@0 | 947 | if (reliable_percent >= kMinReliableKeepPercent) {continue;} // Keeper |
michael@0 | 948 | |
michael@0 | 949 | // This language is too unreliable to keep, but we might merge it. |
michael@0 | 950 | Language altlang = UNKNOWN_LANGUAGE; |
michael@0 | 951 | if (lang <= HAWAIIAN) {altlang = kClosestAltLanguage[lang];} |
michael@0 | 952 | if (altlang == UNKNOWN_LANGUAGE) {continue;} // No alternative |
michael@0 | 953 | |
michael@0 | 954 | // Look for alternative in doc_tote |
michael@0 | 955 | int altsub = doc_tote->Find(altlang); |
michael@0 | 956 | if (altsub < 0) {continue;} // No alternative text |
michael@0 | 957 | |
michael@0 | 958 | int bytes2 = doc_tote->Value(altsub); |
michael@0 | 959 | int reli2 = doc_tote->Reliability(altsub); |
michael@0 | 960 | if (bytes2 == 0) {continue;} // Zero bytes |
michael@0 | 961 | |
michael@0 | 962 | // Reliable percent is stored reliable score over stored bytecount |
michael@0 | 963 | int reliable_percent2 = reli2 / bytes2; |
michael@0 | 964 | |
michael@0 | 965 | // Merge one language into the other. Break ties toward lower lang # |
michael@0 | 966 | int tosub = altsub; |
michael@0 | 967 | int fromsub = sub; |
michael@0 | 968 | bool into_lang = false; |
michael@0 | 969 | if ((reliable_percent2 < reliable_percent) || |
michael@0 | 970 | ((reliable_percent2 == reliable_percent) && (lang < altlang))) { |
michael@0 | 971 | tosub = sub; |
michael@0 | 972 | fromsub = altsub; |
michael@0 | 973 | into_lang = true; |
michael@0 | 974 | } |
michael@0 | 975 | |
michael@0 | 976 | // Make sure merged reliability doesn't drop and is enough to avoid delete |
michael@0 | 977 | int newpercent = maxint(reliable_percent, reliable_percent2); |
michael@0 | 978 | newpercent = maxint(newpercent, kMinReliableKeepPercent); |
michael@0 | 979 | int newbytes = bytes + bytes2; |
michael@0 | 980 | int newreli = newpercent * newbytes; |
michael@0 | 981 | |
michael@0 | 982 | doc_tote->SetKey(fromsub, DocTote::kUnusedKey); |
michael@0 | 983 | doc_tote->SetScore(fromsub, 0); |
michael@0 | 984 | doc_tote->SetReliability(fromsub, 0); |
michael@0 | 985 | doc_tote->SetScore(tosub, newbytes); |
michael@0 | 986 | doc_tote->SetReliability(tosub, newreli); |
michael@0 | 987 | |
michael@0 | 988 | // Show fate of unreliable languages if at least 10 bytes |
michael@0 | 989 | if (FLAGS_cld2_html && (newbytes >= 10) && |
michael@0 | 990 | !FLAGS_cld2_quiet) { |
michael@0 | 991 | if (into_lang) { |
michael@0 | 992 | fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ", |
michael@0 | 993 | LanguageCode(altlang), reliable_percent2, bytes2, |
michael@0 | 994 | LanguageCode(lang)); |
michael@0 | 995 | } else { |
michael@0 | 996 | fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ", |
michael@0 | 997 | LanguageCode(lang), reliable_percent, bytes, |
michael@0 | 998 | LanguageCode(altlang)); |
michael@0 | 999 | } |
michael@0 | 1000 | } |
michael@0 | 1001 | } |
michael@0 | 1002 | |
michael@0 | 1003 | |
michael@0 | 1004 | // Pass to delete any remaining unreliable languages |
michael@0 | 1005 | for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) { |
michael@0 | 1006 | int plang = doc_tote->Key(sub); |
michael@0 | 1007 | if (plang == DocTote::kUnusedKey) {continue;} // Empty slot |
michael@0 | 1008 | |
michael@0 | 1009 | Language lang = static_cast<Language>(plang); |
michael@0 | 1010 | int bytes = doc_tote->Value(sub); |
michael@0 | 1011 | int reli = doc_tote->Reliability(sub); |
michael@0 | 1012 | if (bytes == 0) {continue;} // Zero bytes |
michael@0 | 1013 | |
michael@0 | 1014 | // Reliable percent is stored as reliable score over stored bytecount |
michael@0 | 1015 | int reliable_percent = reli / bytes; |
michael@0 | 1016 | if (reliable_percent >= kMinReliableKeepPercent) { // Keeper? |
michael@0 | 1017 | continue; // yes |
michael@0 | 1018 | } |
michael@0 | 1019 | |
michael@0 | 1020 | // Delete unreliable entry |
michael@0 | 1021 | doc_tote->SetKey(sub, DocTote::kUnusedKey); |
michael@0 | 1022 | doc_tote->SetScore(sub, 0); |
michael@0 | 1023 | doc_tote->SetReliability(sub, 0); |
michael@0 | 1024 | |
michael@0 | 1025 | // Show fate of unreliable languages if at least 10 bytes |
michael@0 | 1026 | if (FLAGS_cld2_html && (bytes >= 10) && |
michael@0 | 1027 | !FLAGS_cld2_quiet) { |
michael@0 | 1028 | fprintf(stderr, "{Unreli %s.%dR,%dB} ", |
michael@0 | 1029 | LanguageCode(lang), reliable_percent, bytes); |
michael@0 | 1030 | } |
michael@0 | 1031 | } |
michael@0 | 1032 | |
michael@0 | 1033 | ////if (FLAGS_cld2_html) {fprintf(stderr, "<br>\n");} |
michael@0 | 1034 | } |
michael@0 | 1035 | |
michael@0 | 1036 | |
michael@0 | 1037 | // Move all the text bytes from lower byte-count to higher one |
michael@0 | 1038 | void MoveLang1ToLang2(Language lang1, Language lang2, |
michael@0 | 1039 | int lang1_sub, int lang2_sub, |
michael@0 | 1040 | DocTote* doc_tote, |
michael@0 | 1041 | ResultChunkVector* resultchunkvector) { |
michael@0 | 1042 | // In doc_tote, move all the bytes lang1 => lang2 |
michael@0 | 1043 | int sum = doc_tote->Value(lang2_sub) + doc_tote->Value(lang1_sub); |
michael@0 | 1044 | doc_tote->SetValue(lang2_sub, sum); |
michael@0 | 1045 | sum = doc_tote->Score(lang2_sub) + doc_tote->Score(lang1_sub); |
michael@0 | 1046 | doc_tote->SetScore(lang2_sub, sum); |
michael@0 | 1047 | sum = doc_tote->Reliability(lang2_sub) + doc_tote->Reliability(lang1_sub); |
michael@0 | 1048 | doc_tote->SetReliability(lang2_sub, sum); |
michael@0 | 1049 | |
michael@0 | 1050 | // Delete old entry |
michael@0 | 1051 | doc_tote->SetKey(lang1_sub, DocTote::kUnusedKey); |
michael@0 | 1052 | doc_tote->SetScore(lang1_sub, 0); |
michael@0 | 1053 | doc_tote->SetReliability(lang1_sub, 0); |
michael@0 | 1054 | |
michael@0 | 1055 | // In resultchunkvector, move all the bytes lang1 => lang2 |
michael@0 | 1056 | if (resultchunkvector == NULL) {return;} |
michael@0 | 1057 | |
michael@0 | 1058 | int k = 0; |
michael@0 | 1059 | uint16 prior_lang = UNKNOWN_LANGUAGE; |
michael@0 | 1060 | for (int i = 0; i < static_cast<int>(resultchunkvector->size()); ++i) { |
michael@0 | 1061 | ResultChunk* rc = &(*resultchunkvector)[i]; |
michael@0 | 1062 | if (rc->lang1 == lang1) { |
michael@0 | 1063 | // Update entry[i] lang1 => lang2 |
michael@0 | 1064 | rc->lang1 = lang2; |
michael@0 | 1065 | } |
michael@0 | 1066 | // One change may produce two merges -- entry before and entry after |
michael@0 | 1067 | if ((rc->lang1 == prior_lang) && (k > 0)) { |
michael@0 | 1068 | // Merge with previous, deleting entry[i] |
michael@0 | 1069 | ResultChunk* prior_rc = &(*resultchunkvector)[k - 1]; |
michael@0 | 1070 | prior_rc->bytes += rc->bytes; |
michael@0 | 1071 | // fprintf(stderr, "MoveLang1ToLang2 merged [%d] => [%d]<br>\n", i, k-1); |
michael@0 | 1072 | } else { |
michael@0 | 1073 | // Keep entry[i] |
michael@0 | 1074 | (*resultchunkvector)[k] = (*resultchunkvector)[i]; |
michael@0 | 1075 | // fprintf(stderr, "MoveLang1ToLang2 keep [%d] => [%d]<br>\n", i, k); |
michael@0 | 1076 | ++k; |
michael@0 | 1077 | } |
michael@0 | 1078 | prior_lang = rc->lang1; |
michael@0 | 1079 | } |
michael@0 | 1080 | resultchunkvector->resize(k); |
michael@0 | 1081 | } |
michael@0 | 1082 | |
michael@0 | 1083 | |
michael@0 | 1084 | |
michael@0 | 1085 | // Move less likely byte count to more likely for close pairs of languages |
michael@0 | 1086 | // If given, also update resultchunkvector |
michael@0 | 1087 | void RefineScoredClosePairs(DocTote* doc_tote, |
michael@0 | 1088 | ResultChunkVector* resultchunkvector, |
michael@0 | 1089 | bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) { |
michael@0 | 1090 | for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) { |
michael@0 | 1091 | int close_packedlang = doc_tote->Key(sub); |
michael@0 | 1092 | int subscr = LanguageCloseSet(static_cast<Language>(close_packedlang)); |
michael@0 | 1093 | if (subscr == 0) {continue;} |
michael@0 | 1094 | |
michael@0 | 1095 | // We have a close pair language -- if the other one is also scored and the |
michael@0 | 1096 | // longword score differs enough, put all our eggs into one basket |
michael@0 | 1097 | |
michael@0 | 1098 | // Nonzero longword score: Go look for the other of this pair |
michael@0 | 1099 | for (int sub2 = sub + 1; sub2 < doc_tote->MaxSize(); ++sub2) { |
michael@0 | 1100 | if (LanguageCloseSet(static_cast<Language>(doc_tote->Key(sub2))) == subscr) { |
michael@0 | 1101 | // We have a matching pair |
michael@0 | 1102 | int close_packedlang2 = doc_tote->Key(sub2); |
michael@0 | 1103 | |
michael@0 | 1104 | // Move all the text bytes from lower byte-count to higher one |
michael@0 | 1105 | int from_sub, to_sub; |
michael@0 | 1106 | Language from_lang, to_lang; |
michael@0 | 1107 | if (doc_tote->Value(sub) < doc_tote->Value(sub2)) { |
michael@0 | 1108 | from_sub = sub; |
michael@0 | 1109 | to_sub = sub2; |
michael@0 | 1110 | from_lang = static_cast<Language>(close_packedlang); |
michael@0 | 1111 | to_lang = static_cast<Language>(close_packedlang2); |
michael@0 | 1112 | } else { |
michael@0 | 1113 | from_sub = sub2; |
michael@0 | 1114 | to_sub = sub; |
michael@0 | 1115 | from_lang = static_cast<Language>(close_packedlang2); |
michael@0 | 1116 | to_lang = static_cast<Language>(close_packedlang); |
michael@0 | 1117 | } |
michael@0 | 1118 | |
michael@0 | 1119 | if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) { |
michael@0 | 1120 | // Show fate of closepair language |
michael@0 | 1121 | int val = doc_tote->Value(from_sub); // byte count |
michael@0 | 1122 | int reli = doc_tote->Reliability(from_sub); |
michael@0 | 1123 | int reliable_percent = reli / (val ? val : 1); // avoid zdiv |
michael@0 | 1124 | fprintf(stderr, "{CloseLangPair: %s.%dR,%dB => %s}<br>\n", |
michael@0 | 1125 | LanguageCode(from_lang), |
michael@0 | 1126 | reliable_percent, |
michael@0 | 1127 | doc_tote->Value(from_sub), |
michael@0 | 1128 | LanguageCode(to_lang)); |
michael@0 | 1129 | } |
michael@0 | 1130 | MoveLang1ToLang2(from_lang, to_lang, from_sub, to_sub, |
michael@0 | 1131 | doc_tote, resultchunkvector); |
michael@0 | 1132 | break; // Exit inner for sub2 loop |
michael@0 | 1133 | } |
michael@0 | 1134 | } // End for sub2 |
michael@0 | 1135 | } // End for sub |
michael@0 | 1136 | } |
michael@0 | 1137 | |
michael@0 | 1138 | |
michael@0 | 1139 | void ApplyAllLanguageHints(Tote* chunk_tote, int tote_grams, |
michael@0 | 1140 | uint8* lang_hint_boost) { |
michael@0 | 1141 | } |
michael@0 | 1142 | |
michael@0 | 1143 | |
michael@0 | 1144 | void PrintHtmlEscapedText(FILE* f, const char* txt, int len) { |
michael@0 | 1145 | string temp(txt, len); |
michael@0 | 1146 | fprintf(f, "%s", GetHtmlEscapedText(temp).c_str()); |
michael@0 | 1147 | } |
michael@0 | 1148 | |
michael@0 | 1149 | void PrintLang(FILE* f, Tote* chunk_tote, |
michael@0 | 1150 | Language cur_lang, bool cur_unreliable, |
michael@0 | 1151 | Language prior_lang, bool prior_unreliable) { |
michael@0 | 1152 | if (cur_lang == prior_lang) { |
michael@0 | 1153 | fprintf(f, "[]"); |
michael@0 | 1154 | } else { |
michael@0 | 1155 | fprintf(f, "[%s%s]", LanguageCode(cur_lang), cur_unreliable ? "*" : ""); |
michael@0 | 1156 | } |
michael@0 | 1157 | } |
michael@0 | 1158 | |
michael@0 | 1159 | |
michael@0 | 1160 | void PrintTopLang(Language top_lang) { |
michael@0 | 1161 | if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) { |
michael@0 | 1162 | fprintf(stderr, "[] "); |
michael@0 | 1163 | } else { |
michael@0 | 1164 | fprintf(stderr, "[%s] ", LanguageName(top_lang)); |
michael@0 | 1165 | prior_lang = top_lang; |
michael@0 | 1166 | } |
michael@0 | 1167 | } |
michael@0 | 1168 | |
michael@0 | 1169 | void PrintTopLangSpeculative(Language top_lang) { |
michael@0 | 1170 | fprintf(stderr, "<span style=\"color:#%06X;\">", 0xa0a0a0); |
michael@0 | 1171 | if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) { |
michael@0 | 1172 | fprintf(stderr, "[] "); |
michael@0 | 1173 | } else { |
michael@0 | 1174 | fprintf(stderr, "[%s] ", LanguageName(top_lang)); |
michael@0 | 1175 | prior_lang = top_lang; |
michael@0 | 1176 | } |
michael@0 | 1177 | fprintf(stderr, "</span>\n"); |
michael@0 | 1178 | } |
michael@0 | 1179 | |
michael@0 | 1180 | void PrintLangs(FILE* f, const Language* language3, const int* percent3, |
michael@0 | 1181 | const int* text_bytes, const bool* is_reliable) { |
michael@0 | 1182 | fprintf(f, "<br> Initial_Languages "); |
michael@0 | 1183 | if (language3[0] != UNKNOWN_LANGUAGE) { |
michael@0 | 1184 | fprintf(f, "%s%s(%d%%) ", |
michael@0 | 1185 | LanguageName(language3[0]), |
michael@0 | 1186 | *is_reliable ? "" : "*", |
michael@0 | 1187 | percent3[0]); |
michael@0 | 1188 | } |
michael@0 | 1189 | if (language3[1] != UNKNOWN_LANGUAGE) { |
michael@0 | 1190 | fprintf(f, "%s(%d%%) ", LanguageName(language3[1]), percent3[1]); |
michael@0 | 1191 | } |
michael@0 | 1192 | if (language3[2] != UNKNOWN_LANGUAGE) { |
michael@0 | 1193 | fprintf(f, "%s(%d%%) ", LanguageName(language3[2]), percent3[2]); |
michael@0 | 1194 | } |
michael@0 | 1195 | fprintf(f, "%d bytes \n", *text_bytes); |
michael@0 | 1196 | |
michael@0 | 1197 | fprintf(f, "<br>\n"); |
michael@0 | 1198 | } |
michael@0 | 1199 | |
michael@0 | 1200 | |
michael@0 | 1201 | // Return internal probability score (sum) per 1024 bytes |
michael@0 | 1202 | double GetNormalizedScore(Language lang, ULScript ulscript, |
michael@0 | 1203 | int bytecount, int score) { |
michael@0 | 1204 | if (bytecount <= 0) {return 0.0;} |
michael@0 | 1205 | return (score << 10) / bytecount; |
michael@0 | 1206 | } |
michael@0 | 1207 | |
michael@0 | 1208 | // Extract return values before fixups |
michael@0 | 1209 | void ExtractLangEtc(DocTote* doc_tote, int total_text_bytes, |
michael@0 | 1210 | int* reliable_percent3, Language* language3, int* percent3, |
michael@0 | 1211 | double* normalized_score3, |
michael@0 | 1212 | int* text_bytes, bool* is_reliable) { |
michael@0 | 1213 | reliable_percent3[0] = 0; |
michael@0 | 1214 | reliable_percent3[1] = 0; |
michael@0 | 1215 | reliable_percent3[2] = 0; |
michael@0 | 1216 | language3[0] = UNKNOWN_LANGUAGE; |
michael@0 | 1217 | language3[1] = UNKNOWN_LANGUAGE; |
michael@0 | 1218 | language3[2] = UNKNOWN_LANGUAGE; |
michael@0 | 1219 | percent3[0] = 0; |
michael@0 | 1220 | percent3[1] = 0; |
michael@0 | 1221 | percent3[2] = 0; |
michael@0 | 1222 | normalized_score3[0] = 0.0; |
michael@0 | 1223 | normalized_score3[1] = 0.0; |
michael@0 | 1224 | normalized_score3[2] = 0.0; |
michael@0 | 1225 | |
michael@0 | 1226 | *text_bytes = total_text_bytes; |
michael@0 | 1227 | *is_reliable = false; |
michael@0 | 1228 | |
michael@0 | 1229 | int bytecount1 = 0; |
michael@0 | 1230 | int bytecount2 = 0; |
michael@0 | 1231 | int bytecount3 = 0; |
michael@0 | 1232 | |
michael@0 | 1233 | int lang1 = doc_tote->Key(0); |
michael@0 | 1234 | if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) { |
michael@0 | 1235 | // We have a top language |
michael@0 | 1236 | language3[0] = static_cast<Language>(lang1); |
michael@0 | 1237 | bytecount1 = doc_tote->Value(0); |
michael@0 | 1238 | int reli1 = doc_tote->Reliability(0); |
michael@0 | 1239 | reliable_percent3[0] = reli1 / (bytecount1 ? bytecount1 : 1); // avoid zdiv |
michael@0 | 1240 | normalized_score3[0] = GetNormalizedScore(language3[0], |
michael@0 | 1241 | ULScript_Common, |
michael@0 | 1242 | bytecount1, |
michael@0 | 1243 | doc_tote->Score(0)); |
michael@0 | 1244 | } |
michael@0 | 1245 | |
michael@0 | 1246 | int lang2 = doc_tote->Key(1); |
michael@0 | 1247 | if ((lang2 != DocTote::kUnusedKey) && (lang2 != UNKNOWN_LANGUAGE)) { |
michael@0 | 1248 | language3[1] = static_cast<Language>(lang2); |
michael@0 | 1249 | bytecount2 = doc_tote->Value(1); |
michael@0 | 1250 | int reli2 = doc_tote->Reliability(1); |
michael@0 | 1251 | reliable_percent3[1] = reli2 / (bytecount2 ? bytecount2 : 1); // avoid zdiv |
michael@0 | 1252 | normalized_score3[1] = GetNormalizedScore(language3[1], |
michael@0 | 1253 | ULScript_Common, |
michael@0 | 1254 | bytecount2, |
michael@0 | 1255 | doc_tote->Score(1)); |
michael@0 | 1256 | } |
michael@0 | 1257 | |
michael@0 | 1258 | int lang3 = doc_tote->Key(2); |
michael@0 | 1259 | if ((lang3 != DocTote::kUnusedKey) && (lang3 != UNKNOWN_LANGUAGE)) { |
michael@0 | 1260 | language3[2] = static_cast<Language>(lang3); |
michael@0 | 1261 | bytecount3 = doc_tote->Value(2); |
michael@0 | 1262 | int reli3 = doc_tote->Reliability(2); |
michael@0 | 1263 | reliable_percent3[2] = reli3 / (bytecount3 ? bytecount3 : 1); // avoid zdiv |
michael@0 | 1264 | normalized_score3[2] = GetNormalizedScore(language3[2], |
michael@0 | 1265 | ULScript_Common, |
michael@0 | 1266 | bytecount3, |
michael@0 | 1267 | doc_tote->Score(2)); |
michael@0 | 1268 | } |
michael@0 | 1269 | |
michael@0 | 1270 | // Increase total bytes to sum (top 3) if low for some reason |
michael@0 | 1271 | int total_bytecount12 = bytecount1 + bytecount2; |
michael@0 | 1272 | int total_bytecount123 = total_bytecount12 + bytecount3; |
michael@0 | 1273 | if (total_text_bytes < total_bytecount123) { |
michael@0 | 1274 | total_text_bytes = total_bytecount123; |
michael@0 | 1275 | *text_bytes = total_text_bytes; |
michael@0 | 1276 | } |
michael@0 | 1277 | |
michael@0 | 1278 | // Sum minus previous % gives better roundoff behavior than bytecount/total |
michael@0 | 1279 | int total_text_bytes_div = maxint(1, total_text_bytes); // Avoid zdiv |
michael@0 | 1280 | percent3[0] = (bytecount1 * 100) / total_text_bytes_div; |
michael@0 | 1281 | percent3[1] = (total_bytecount12 * 100) / total_text_bytes_div; |
michael@0 | 1282 | percent3[2] = (total_bytecount123 * 100) / total_text_bytes_div; |
michael@0 | 1283 | percent3[2] -= percent3[1]; |
michael@0 | 1284 | percent3[1] -= percent3[0]; |
michael@0 | 1285 | |
michael@0 | 1286 | // Roundoff, say 96% 1.6% 1.4%, will produce non-obvious 96% 1% 2% |
michael@0 | 1287 | // Fix this explicitly |
michael@0 | 1288 | if (percent3[1] < percent3[2]) { |
michael@0 | 1289 | ++percent3[1]; |
michael@0 | 1290 | --percent3[2]; |
michael@0 | 1291 | } |
michael@0 | 1292 | if (percent3[0] < percent3[1]) { |
michael@0 | 1293 | ++percent3[0]; |
michael@0 | 1294 | --percent3[1]; |
michael@0 | 1295 | } |
michael@0 | 1296 | |
michael@0 | 1297 | *text_bytes = total_text_bytes; |
michael@0 | 1298 | |
michael@0 | 1299 | if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) { |
michael@0 | 1300 | // We have a top language |
michael@0 | 1301 | // Its reliability is overall result reliability |
michael@0 | 1302 | int bytecount = doc_tote->Value(0); |
michael@0 | 1303 | int reli = doc_tote->Reliability(0); |
michael@0 | 1304 | int reliable_percent = reli / (bytecount ? bytecount : 1); // avoid zdiv |
michael@0 | 1305 | *is_reliable = (reliable_percent >= kMinReliableKeepPercent); |
michael@0 | 1306 | } else { |
michael@0 | 1307 | // No top language at all. This can happen with zero text or 100% Klingon |
michael@0 | 1308 | // if extended=false. Just return all UNKNOWN_LANGUAGE, unreliable. |
michael@0 | 1309 | *is_reliable = false; |
michael@0 | 1310 | } |
michael@0 | 1311 | |
michael@0 | 1312 | // If ignore percent is too large, set unreliable. |
michael@0 | 1313 | int ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]); |
michael@0 | 1314 | if ((ignore_percent > kIgnoreMaxPercent)) { |
michael@0 | 1315 | *is_reliable = false; |
michael@0 | 1316 | } |
michael@0 | 1317 | } |
michael@0 | 1318 | |
michael@0 | 1319 | bool IsFIGS(Language lang) { |
michael@0 | 1320 | if (lang == FRENCH) {return true;} |
michael@0 | 1321 | if (lang == ITALIAN) {return true;} |
michael@0 | 1322 | if (lang == GERMAN) {return true;} |
michael@0 | 1323 | if (lang == SPANISH) {return true;} |
michael@0 | 1324 | return false; |
michael@0 | 1325 | } |
michael@0 | 1326 | |
michael@0 | 1327 | bool IsEFIGS(Language lang) { |
michael@0 | 1328 | if (lang == ENGLISH) {return true;} |
michael@0 | 1329 | if (lang == FRENCH) {return true;} |
michael@0 | 1330 | if (lang == ITALIAN) {return true;} |
michael@0 | 1331 | if (lang == GERMAN) {return true;} |
michael@0 | 1332 | if (lang == SPANISH) {return true;} |
michael@0 | 1333 | return false; |
michael@0 | 1334 | } |
michael@0 | 1335 | |
// Minimum estimated byte counts that a second-place language must reach
// before it may be reported in place of the first-place language.
// For Tier3 languages, require more bytes of text to override
// the first-place language, hence the larger second threshold.
static const int kGoodSecondT1T2MinBytes = 15;        // <this => no second
static const int kGoodSecondT3MinBytes = 128;         // <this => no second
michael@0 | 1340 | |
michael@0 | 1341 | // Calculate a single summary language for the document, and its reliability. |
michael@0 | 1342 | // Returns language3[0] or language3[1] or ENGLISH or UNKNOWN_LANGUAGE |
michael@0 | 1343 | // This is the heart of matching human-rater perception. |
michael@0 | 1344 | // reliable_percent3[] is currently unused |
michael@0 | 1345 | // |
michael@0 | 1346 | // Do not return Tier3 second language unless there are at least 128 bytes |
michael@0 | 1347 | void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes, |
michael@0 | 1348 | const int* reliable_percent3, |
michael@0 | 1349 | const Language* language3, |
michael@0 | 1350 | const int* percent3, |
michael@0 | 1351 | Language* summary_lang, bool* is_reliable, |
michael@0 | 1352 | bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) { |
michael@0 | 1353 | // Vector of active languages; changes if we delete some |
michael@0 | 1354 | int slot_count = 3; |
michael@0 | 1355 | int active_slot[3] = {0, 1, 2}; |
michael@0 | 1356 | |
michael@0 | 1357 | int ignore_percent = 0; |
michael@0 | 1358 | int return_percent = percent3[0]; // Default to top lang |
michael@0 | 1359 | *summary_lang = language3[0]; |
michael@0 | 1360 | *is_reliable = true; |
michael@0 | 1361 | if (percent3[0] < kKeepMinPercent) {*is_reliable = false;} |
michael@0 | 1362 | |
michael@0 | 1363 | // If any of top 3 is IGNORE, remove it and increment ignore_percent |
michael@0 | 1364 | for (int i = 0; i < 3; ++i) { |
michael@0 | 1365 | if (language3[i] == TG_UNKNOWN_LANGUAGE) { |
michael@0 | 1366 | ignore_percent += percent3[i]; |
michael@0 | 1367 | // Move the rest up, levaing input vectors unchanged |
michael@0 | 1368 | for (int j=i+1; j < 3; ++j) { |
michael@0 | 1369 | active_slot[j - 1] = active_slot[j]; |
michael@0 | 1370 | } |
michael@0 | 1371 | -- slot_count; |
michael@0 | 1372 | // Logically remove Ignore from percentage-text calculation |
michael@0 | 1373 | // (extra 1 in 101 avoids zdiv, biases slightly small) |
michael@0 | 1374 | return_percent = (percent3[0] * 100) / (101 - ignore_percent); |
michael@0 | 1375 | *summary_lang = language3[active_slot[0]]; |
michael@0 | 1376 | if (percent3[active_slot[0]] < kKeepMinPercent) {*is_reliable = false;} |
michael@0 | 1377 | } |
michael@0 | 1378 | } |
michael@0 | 1379 | |
michael@0 | 1380 | |
michael@0 | 1381 | // If English and X, where X (not UNK) is big enough, |
michael@0 | 1382 | // assume the English is boilerplate and return X. |
michael@0 | 1383 | // Logically remove English from percentage-text calculation |
michael@0 | 1384 | int second_bytes = (total_text_bytes * percent3[active_slot[1]]) / 100; |
michael@0 | 1385 | // Require more bytes of text for Tier3 languages |
michael@0 | 1386 | int minbytesneeded = kGoodSecondT1T2MinBytes; |
michael@0 | 1387 | int plang_second = PerScriptNumber(ULScript_Latin, language3[active_slot[1]]); |
michael@0 | 1388 | |
michael@0 | 1389 | if ((language3[active_slot[0]] == ENGLISH) && |
michael@0 | 1390 | (language3[active_slot[1]] != ENGLISH) && |
michael@0 | 1391 | (language3[active_slot[1]] != UNKNOWN_LANGUAGE) && |
michael@0 | 1392 | (percent3[active_slot[1]] >= kNonEnBoilerplateMinPercent) && |
michael@0 | 1393 | (second_bytes >= minbytesneeded)) { |
michael@0 | 1394 | ignore_percent += percent3[active_slot[0]]; |
michael@0 | 1395 | return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent); |
michael@0 | 1396 | *summary_lang = language3[active_slot[1]]; |
michael@0 | 1397 | if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;} |
michael@0 | 1398 | |
michael@0 | 1399 | // Else If FIGS and X, where X (not UNK, EFIGS) is big enough, |
michael@0 | 1400 | // assume the FIGS is boilerplate and return X. |
michael@0 | 1401 | // Logically remove FIGS from percentage-text calculation |
michael@0 | 1402 | } else if (IsFIGS(language3[active_slot[0]]) && |
michael@0 | 1403 | !IsEFIGS(language3[active_slot[1]]) && |
michael@0 | 1404 | (language3[active_slot[1]] != UNKNOWN_LANGUAGE) && |
michael@0 | 1405 | (percent3[active_slot[1]] >= kNonFIGSBoilerplateMinPercent) && |
michael@0 | 1406 | (second_bytes >= minbytesneeded)) { |
michael@0 | 1407 | ignore_percent += percent3[active_slot[0]]; |
michael@0 | 1408 | return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent); |
michael@0 | 1409 | *summary_lang = language3[active_slot[1]]; |
michael@0 | 1410 | if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;} |
michael@0 | 1411 | |
michael@0 | 1412 | // Else we are returning the first language, but want to improve its |
michael@0 | 1413 | // return_percent if the second language should be ignored |
michael@0 | 1414 | } else if ((language3[active_slot[1]] == ENGLISH) && |
michael@0 | 1415 | (language3[active_slot[0]] != ENGLISH)) { |
michael@0 | 1416 | ignore_percent += percent3[active_slot[1]]; |
michael@0 | 1417 | return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent); |
michael@0 | 1418 | } else if (IsFIGS(language3[active_slot[1]]) && |
michael@0 | 1419 | !IsEFIGS(language3[active_slot[0]])) { |
michael@0 | 1420 | ignore_percent += percent3[active_slot[1]]; |
michael@0 | 1421 | return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent); |
michael@0 | 1422 | } |
michael@0 | 1423 | |
michael@0 | 1424 | // If return percent is too small (too many languages), return UNKNOWN |
michael@0 | 1425 | if ((return_percent < kGoodFirstMinPercent)) { |
michael@0 | 1426 | if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { |
michael@0 | 1427 | fprintf(stderr, "{Unreli %s %d%% percent too small} ", |
michael@0 | 1428 | LanguageCode(*summary_lang), return_percent); |
michael@0 | 1429 | } |
michael@0 | 1430 | *summary_lang = UNKNOWN_LANGUAGE; |
michael@0 | 1431 | *is_reliable = false; |
michael@0 | 1432 | } |
michael@0 | 1433 | |
michael@0 | 1434 | // If return percent is small, return language but set unreliable. |
michael@0 | 1435 | if ((return_percent < kGoodFirstReliableMinPercent)) { |
michael@0 | 1436 | *is_reliable = false; |
michael@0 | 1437 | } |
michael@0 | 1438 | |
michael@0 | 1439 | // If ignore percent is too large, set unreliable. |
michael@0 | 1440 | ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]); |
michael@0 | 1441 | if ((ignore_percent > kIgnoreMaxPercent)) { |
michael@0 | 1442 | *is_reliable = false; |
michael@0 | 1443 | } |
michael@0 | 1444 | |
michael@0 | 1445 | // If we removed all the active languages, return UNKNOWN |
michael@0 | 1446 | if (slot_count == 0) { |
michael@0 | 1447 | if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { |
michael@0 | 1448 | fprintf(stderr, "{Unreli %s no languages left} ", |
michael@0 | 1449 | LanguageCode(*summary_lang)); |
michael@0 | 1450 | } |
michael@0 | 1451 | *summary_lang = UNKNOWN_LANGUAGE; |
michael@0 | 1452 | *is_reliable = false; |
michael@0 | 1453 | } |
michael@0 | 1454 | } |
michael@0 | 1455 | |
michael@0 | 1456 | void AddLangPriorBoost(Language lang, uint32 langprob, |
michael@0 | 1457 | ScoringContext* scoringcontext) { |
michael@0 | 1458 | // This is called 0..n times with language hints |
michael@0 | 1459 | // but we don't know the script -- so boost either or both Latn, Othr. |
michael@0 | 1460 | |
michael@0 | 1461 | if (IsLatnLanguage(lang)) { |
michael@0 | 1462 | LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn; |
michael@0 | 1463 | int n = langprior_boost->n; |
michael@0 | 1464 | langprior_boost->langprob[n] = langprob; |
michael@0 | 1465 | langprior_boost->n = langprior_boost->wrap(n + 1); |
michael@0 | 1466 | } |
michael@0 | 1467 | |
michael@0 | 1468 | if (IsOthrLanguage(lang)) { |
michael@0 | 1469 | LangBoosts* langprior_boost = &scoringcontext->langprior_boost.othr; |
michael@0 | 1470 | int n = langprior_boost->n; |
michael@0 | 1471 | langprior_boost->langprob[n] = langprob; |
michael@0 | 1472 | langprior_boost->n = langprior_boost->wrap(n + 1); |
michael@0 | 1473 | } |
michael@0 | 1474 | |
michael@0 | 1475 | } |
michael@0 | 1476 | |
michael@0 | 1477 | void AddOneWhack(Language whacker_lang, Language whackee_lang, |
michael@0 | 1478 | ScoringContext* scoringcontext) { |
michael@0 | 1479 | uint32 langprob = MakeLangProb(whackee_lang, 1); |
michael@0 | 1480 | // This logic avoids hr-Latn whacking sr-Cyrl, but still whacks sr-Latn |
michael@0 | 1481 | if (IsLatnLanguage(whacker_lang) && IsLatnLanguage(whackee_lang)) { |
michael@0 | 1482 | LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn; |
michael@0 | 1483 | int n = langprior_whack->n; |
michael@0 | 1484 | langprior_whack->langprob[n] = langprob; |
michael@0 | 1485 | langprior_whack->n = langprior_whack->wrap(n + 1); |
michael@0 | 1486 | } |
michael@0 | 1487 | if (IsOthrLanguage(whacker_lang) && IsOthrLanguage(whackee_lang)) { |
michael@0 | 1488 | LangBoosts* langprior_whack = &scoringcontext->langprior_whack.othr; |
michael@0 | 1489 | int n = langprior_whack->n; |
michael@0 | 1490 | langprior_whack->langprob[n] = langprob; |
michael@0 | 1491 | langprior_whack->n = langprior_whack->wrap(n + 1); |
michael@0 | 1492 | } |
michael@0 | 1493 | } |
michael@0 | 1494 | |
michael@0 | 1495 | void AddCloseLangWhack(Language lang, ScoringContext* scoringcontext) { |
michael@0 | 1496 | // We do not in general want zh-Hans and zh-Hant to be close pairs, |
michael@0 | 1497 | // but we do here. |
michael@0 | 1498 | if (lang == CLD2::CHINESE) { |
michael@0 | 1499 | AddOneWhack(lang, CLD2::CHINESE_T, scoringcontext); |
michael@0 | 1500 | return; |
michael@0 | 1501 | } |
michael@0 | 1502 | if (lang == CLD2::CHINESE_T) { |
michael@0 | 1503 | AddOneWhack(lang, CLD2::CHINESE, scoringcontext); |
michael@0 | 1504 | return; |
michael@0 | 1505 | } |
michael@0 | 1506 | |
michael@0 | 1507 | int base_lang_set = LanguageCloseSet(lang); |
michael@0 | 1508 | if (base_lang_set == 0) {return;} |
michael@0 | 1509 | // TODO: add an explicit list of each set to avoid this 512-times loop |
michael@0 | 1510 | for (int i = 0; i < kLanguageToPLangSize; ++i) { |
michael@0 | 1511 | Language lang2 = static_cast<Language>(i); |
michael@0 | 1512 | if ((base_lang_set == LanguageCloseSet(lang2)) && (lang != lang2)) { |
michael@0 | 1513 | AddOneWhack(lang, lang2, scoringcontext); |
michael@0 | 1514 | } |
michael@0 | 1515 | } |
michael@0 | 1516 | } |
michael@0 | 1517 | |
michael@0 | 1518 | |
michael@0 | 1519 | void ApplyHints(const char* buffer, |
michael@0 | 1520 | int buffer_length, |
michael@0 | 1521 | bool is_plain_text, |
michael@0 | 1522 | const CLDHints* cld_hints, |
michael@0 | 1523 | ScoringContext* scoringcontext) { |
michael@0 | 1524 | CLDLangPriors lang_priors; |
michael@0 | 1525 | InitCLDLangPriors(&lang_priors); |
michael@0 | 1526 | |
michael@0 | 1527 | // We now use lang= tags. |
michael@0 | 1528 | // Last look, circa 2008 found only 15% of web pages with lang= tags and |
michael@0 | 1529 | // many of those were wrong. Now (July 2011), we find 44% of web pages have |
michael@0 | 1530 | // lang= tags, and most of them are correct. So we now give them substantial |
michael@0 | 1531 | // weight in each chunk scored. |
michael@0 | 1532 | if (!is_plain_text) { |
michael@0 | 1533 | // Get any contained language tags in first n KB |
michael@0 | 1534 | int32 max_scan_bytes = FLAGS_cld_max_lang_tag_scan_kb << 10; |
michael@0 | 1535 | string lang_tags = GetLangTagsFromHtml(buffer, buffer_length, |
michael@0 | 1536 | max_scan_bytes); |
michael@0 | 1537 | SetCLDLangTagsHint(lang_tags, &lang_priors); |
michael@0 | 1538 | if (scoringcontext->flags_cld2_html) { |
michael@0 | 1539 | if (!lang_tags.empty()) { |
michael@0 | 1540 | fprintf(scoringcontext->debug_file, "<br>lang_tags '%s'<br>\n", |
michael@0 | 1541 | lang_tags.c_str()); |
michael@0 | 1542 | } |
michael@0 | 1543 | } |
michael@0 | 1544 | } |
michael@0 | 1545 | |
michael@0 | 1546 | if (cld_hints != NULL) { |
michael@0 | 1547 | if ((cld_hints->content_language_hint != NULL) && |
michael@0 | 1548 | (cld_hints->content_language_hint[0] != '\0')) { |
michael@0 | 1549 | SetCLDContentLangHint(cld_hints->content_language_hint, &lang_priors); |
michael@0 | 1550 | } |
michael@0 | 1551 | |
michael@0 | 1552 | // Input is from GetTLD(), already lowercased |
michael@0 | 1553 | if ((cld_hints->tld_hint != NULL) && (cld_hints->tld_hint[0] != '\0')) { |
michael@0 | 1554 | SetCLDTLDHint(cld_hints->tld_hint, &lang_priors); |
michael@0 | 1555 | } |
michael@0 | 1556 | |
michael@0 | 1557 | if (cld_hints->encoding_hint != UNKNOWN_ENCODING) { |
michael@0 | 1558 | Encoding enc = static_cast<Encoding>(cld_hints->encoding_hint); |
michael@0 | 1559 | SetCLDEncodingHint(enc, &lang_priors); |
michael@0 | 1560 | } |
michael@0 | 1561 | |
michael@0 | 1562 | if (cld_hints->language_hint != UNKNOWN_LANGUAGE) { |
michael@0 | 1563 | SetCLDLanguageHint(cld_hints->language_hint, &lang_priors); |
michael@0 | 1564 | } |
michael@0 | 1565 | } |
michael@0 | 1566 | |
michael@0 | 1567 | // Keep no more than four different languages with hints |
michael@0 | 1568 | TrimCLDLangPriors(4, &lang_priors); |
michael@0 | 1569 | |
michael@0 | 1570 | if (scoringcontext->flags_cld2_html) { |
michael@0 | 1571 | string print_temp = DumpCLDLangPriors(&lang_priors); |
michael@0 | 1572 | if (!print_temp.empty()) { |
michael@0 | 1573 | fprintf(scoringcontext->debug_file, "DumpCLDLangPriors %s<br>\n", |
michael@0 | 1574 | print_temp.c_str()); |
michael@0 | 1575 | } |
michael@0 | 1576 | } |
michael@0 | 1577 | |
michael@0 | 1578 | // Put boosts into ScoringContext |
michael@0 | 1579 | for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) { |
michael@0 | 1580 | Language lang = GetCLDPriorLang(lang_priors.prior[i]); |
michael@0 | 1581 | int qprob = GetCLDPriorWeight(lang_priors.prior[i]); |
michael@0 | 1582 | if (qprob > 0) { |
michael@0 | 1583 | uint32 langprob = MakeLangProb(lang, qprob); |
michael@0 | 1584 | AddLangPriorBoost(lang, langprob, scoringcontext); |
michael@0 | 1585 | } |
michael@0 | 1586 | } |
michael@0 | 1587 | |
michael@0 | 1588 | // Put whacks into scoring context |
michael@0 | 1589 | // We do not in general want zh-Hans and zh-Hant to be close pairs, |
michael@0 | 1590 | // but we do here. Use close_set_count[kCloseSetSize] to count zh, zh-Hant |
michael@0 | 1591 | std::vector<int> close_set_count(kCloseSetSize + 1, 0); |
michael@0 | 1592 | |
michael@0 | 1593 | for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) { |
michael@0 | 1594 | Language lang = GetCLDPriorLang(lang_priors.prior[i]); |
michael@0 | 1595 | ++close_set_count[LanguageCloseSet(lang)]; |
michael@0 | 1596 | if (lang == CLD2::CHINESE) {++close_set_count[kCloseSetSize];} |
michael@0 | 1597 | if (lang == CLD2::CHINESE_T) {++close_set_count[kCloseSetSize];} |
michael@0 | 1598 | } |
michael@0 | 1599 | |
michael@0 | 1600 | // If a boost language is in a close set, force suppressing the others in |
michael@0 | 1601 | // that set, if exactly one of the set is present |
michael@0 | 1602 | for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) { |
michael@0 | 1603 | Language lang = GetCLDPriorLang(lang_priors.prior[i]); |
michael@0 | 1604 | int qprob = GetCLDPriorWeight(lang_priors.prior[i]); |
michael@0 | 1605 | if (qprob > 0) { |
michael@0 | 1606 | int close_set = LanguageCloseSet(lang); |
michael@0 | 1607 | if ((close_set > 0) && (close_set_count[close_set] == 1)) { |
michael@0 | 1608 | AddCloseLangWhack(lang, scoringcontext); |
michael@0 | 1609 | } |
michael@0 | 1610 | if (((lang == CLD2::CHINESE) || (lang == CLD2::CHINESE_T)) && |
michael@0 | 1611 | (close_set_count[kCloseSetSize] == 1)) { |
michael@0 | 1612 | AddCloseLangWhack(lang, scoringcontext); |
michael@0 | 1613 | } |
michael@0 | 1614 | } |
michael@0 | 1615 | } |
michael@0 | 1616 | |
michael@0 | 1617 | |
michael@0 | 1618 | |
michael@0 | 1619 | |
michael@0 | 1620 | |
michael@0 | 1621 | |
michael@0 | 1622 | } |
michael@0 | 1623 | |
michael@0 | 1624 | |
michael@0 | 1625 | |
michael@0 | 1626 | // Results language3/percent3/text_bytes must be exactly three items |
michael@0 | 1627 | Language DetectLanguageSummaryV2( |
michael@0 | 1628 | const char* buffer, |
michael@0 | 1629 | int buffer_length, |
michael@0 | 1630 | bool is_plain_text, |
michael@0 | 1631 | const CLDHints* cld_hints, |
michael@0 | 1632 | bool allow_extended_lang, |
michael@0 | 1633 | int flags, |
michael@0 | 1634 | Language plus_one, |
michael@0 | 1635 | Language* language3, |
michael@0 | 1636 | int* percent3, |
michael@0 | 1637 | double* normalized_score3, |
michael@0 | 1638 | ResultChunkVector* resultchunkvector, |
michael@0 | 1639 | int* text_bytes, |
michael@0 | 1640 | bool* is_reliable) { |
michael@0 | 1641 | language3[0] = UNKNOWN_LANGUAGE; |
michael@0 | 1642 | language3[1] = UNKNOWN_LANGUAGE; |
michael@0 | 1643 | language3[2] = UNKNOWN_LANGUAGE; |
michael@0 | 1644 | percent3[0] = 0; |
michael@0 | 1645 | percent3[1] = 0; |
michael@0 | 1646 | percent3[2] = 0; |
michael@0 | 1647 | normalized_score3[0] = 0.0; |
michael@0 | 1648 | normalized_score3[1] = 0.0; |
michael@0 | 1649 | normalized_score3[2] = 0.0; |
michael@0 | 1650 | if (resultchunkvector != NULL) { |
michael@0 | 1651 | resultchunkvector->clear(); |
michael@0 | 1652 | } |
michael@0 | 1653 | *text_bytes = 0; |
michael@0 | 1654 | *is_reliable = false; |
michael@0 | 1655 | |
michael@0 | 1656 | if ((flags & kCLDFlagEcho) != 0) { |
michael@0 | 1657 | string temp(buffer, buffer_length); |
michael@0 | 1658 | if ((flags & kCLDFlagHtml) != 0) { |
michael@0 | 1659 | fprintf(stderr, "CLD2[%d] '%s'<br>\n", |
michael@0 | 1660 | buffer_length, GetHtmlEscapedText(temp).c_str()); |
michael@0 | 1661 | } else { |
michael@0 | 1662 | fprintf(stderr, "CLD2[%d] '%s'\n", |
michael@0 | 1663 | buffer_length, GetPlainEscapedText(temp).c_str()); |
michael@0 | 1664 | } |
michael@0 | 1665 | } |
michael@0 | 1666 | |
michael@0 | 1667 | #ifdef CLD2_DYNAMIC_MODE |
michael@0 | 1668 | // In dynamic mode, we immediately return UNKNOWN_LANGUAGE if the data file |
michael@0 | 1669 | // hasn't been loaded yet. This is the only sane thing we can do, as there |
michael@0 | 1670 | // are no scoring tables to consult. |
michael@0 | 1671 | bool dataLoaded = isDataLoaded(); |
michael@0 | 1672 | if ((flags & kCLDFlagVerbose) != 0) { |
michael@0 | 1673 | fprintf(stderr, "Data loaded: %s\n", (dataLoaded ? "true" : "false")); |
michael@0 | 1674 | } |
michael@0 | 1675 | if (!dataLoaded) { |
michael@0 | 1676 | return UNKNOWN_LANGUAGE; |
michael@0 | 1677 | } |
michael@0 | 1678 | #endif |
michael@0 | 1679 | |
michael@0 | 1680 | // Exit now if no text |
michael@0 | 1681 | if (buffer_length == 0) {return UNKNOWN_LANGUAGE;} |
michael@0 | 1682 | if (kScoringtables.quadgram_obj == NULL) {return UNKNOWN_LANGUAGE;} |
michael@0 | 1683 | |
michael@0 | 1684 | // Document totals |
michael@0 | 1685 | DocTote doc_tote; // Reliability = 0..100 |
michael@0 | 1686 | |
michael@0 | 1687 | // ScoringContext carries state across scriptspans |
michael@0 | 1688 | ScoringContext scoringcontext; |
michael@0 | 1689 | scoringcontext.debug_file = stderr; |
michael@0 | 1690 | scoringcontext.flags_cld2_score_as_quads = |
michael@0 | 1691 | ((flags & kCLDFlagScoreAsQuads) != 0); |
michael@0 | 1692 | scoringcontext.flags_cld2_html = ((flags & kCLDFlagHtml) != 0); |
michael@0 | 1693 | scoringcontext.flags_cld2_cr = ((flags & kCLDFlagCr) != 0); |
michael@0 | 1694 | scoringcontext.flags_cld2_verbose = ((flags & kCLDFlagVerbose) != 0); |
michael@0 | 1695 | scoringcontext.prior_chunk_lang = UNKNOWN_LANGUAGE; |
michael@0 | 1696 | scoringcontext.ulscript = ULScript_Common; |
michael@0 | 1697 | scoringcontext.scoringtables = &kScoringtables; |
michael@0 | 1698 | scoringcontext.scanner = NULL; |
michael@0 | 1699 | scoringcontext.init(); // Clear the internal memory arrays |
michael@0 | 1700 | |
michael@0 | 1701 | // Now thread safe. |
michael@0 | 1702 | bool FLAGS_cld2_html = ((flags & kCLDFlagHtml) != 0); |
michael@0 | 1703 | bool FLAGS_cld2_quiet = ((flags & kCLDFlagQuiet) != 0); |
michael@0 | 1704 | |
michael@0 | 1705 | ApplyHints(buffer, buffer_length, is_plain_text, cld_hints, &scoringcontext); |
michael@0 | 1706 | |
michael@0 | 1707 | // Four individual script totals, Latin, Han, other2, other3 |
michael@0 | 1708 | int next_other_tote = 2; |
michael@0 | 1709 | int tote_num = 0; |
michael@0 | 1710 | |
michael@0 | 1711 | // Four totes for up to four different scripts pending at once |
michael@0 | 1712 | Tote totes[4]; // [0] Latn [1] Hani [2] other [3] other |
michael@0 | 1713 | bool tote_seen[4] = {false, false, false, false}; |
michael@0 | 1714 | int tote_grams[4] = {0, 0, 0, 0}; // Number in partial chunk |
michael@0 | 1715 | ULScript tote_script[4] = |
michael@0 | 1716 | {ULScript_Latin, ULScript_Hani, ULScript_Common, ULScript_Common}; |
michael@0 | 1717 | |
michael@0 | 1718 | // Loop through text spans in a single script |
michael@0 | 1719 | ScriptScanner ss(buffer, buffer_length, is_plain_text); |
michael@0 | 1720 | LangSpan scriptspan; |
michael@0 | 1721 | |
michael@0 | 1722 | scoringcontext.scanner = &ss; |
michael@0 | 1723 | |
michael@0 | 1724 | scriptspan.text = NULL; |
michael@0 | 1725 | scriptspan.text_bytes = 0; |
michael@0 | 1726 | scriptspan.offset = 0; |
michael@0 | 1727 | scriptspan.ulscript = ULScript_Common; |
michael@0 | 1728 | scriptspan.lang = UNKNOWN_LANGUAGE; |
michael@0 | 1729 | |
michael@0 | 1730 | int total_text_bytes = 0; |
michael@0 | 1731 | int textlimit = FLAGS_cld_textlimit << 10; // in KB |
michael@0 | 1732 | if (textlimit == 0) {textlimit = 0x7fffffff;} |
michael@0 | 1733 | |
michael@0 | 1734 | int advance_by = 2; // Advance 2 bytes |
michael@0 | 1735 | int advance_limit = textlimit >> 3; // For first 1/8 of max document |
michael@0 | 1736 | |
michael@0 | 1737 | int initial_word_span = kDefaultWordSpan; |
michael@0 | 1738 | if (FLAGS_cld_forcewords) { |
michael@0 | 1739 | initial_word_span = kReallyBigWordSpan; |
michael@0 | 1740 | } |
michael@0 | 1741 | |
michael@0 | 1742 | // Pick up chunk sizes |
michael@0 | 1743 | // Smoothwidth is units of quadgrams, about 2.5 chars (unigrams) each |
michael@0 | 1744 | // Sanity check -- force into a reasonable range |
michael@0 | 1745 | int chunksizequads = FLAGS_cld_smoothwidth; |
michael@0 | 1746 | chunksizequads = minint(maxint(chunksizequads, kMinChunkSizeQuads), |
michael@0 | 1747 | kMaxChunkSizeQuads); |
michael@0 | 1748 | int chunksizeunis = (chunksizequads * 5) >> 1; |
michael@0 | 1749 | |
michael@0 | 1750 | // Varying short-span limit doesn't work well -- skips too much beyond 20KB |
michael@0 | 1751 | // int spantooshortlimit = advance_by * FLAGS_cld_smoothwidth; |
michael@0 | 1752 | int spantooshortlimit = kShortSpanThresh; |
michael@0 | 1753 | |
michael@0 | 1754 | // For debugging only. Not thread-safe |
michael@0 | 1755 | prior_lang = UNKNOWN_LANGUAGE; |
michael@0 | 1756 | prior_unreliable = false; |
michael@0 | 1757 | |
michael@0 | 1758 | // Allocate full-document prediction table for finding repeating words |
michael@0 | 1759 | int hash = 0; |
michael@0 | 1760 | int* predict_tbl = new int[kPredictionTableSize]; |
michael@0 | 1761 | if (FlagRepeats(flags)) { |
michael@0 | 1762 | memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); |
michael@0 | 1763 | } |
michael@0 | 1764 | |
michael@0 | 1765 | |
michael@0 | 1766 | |
michael@0 | 1767 | // Loop through scriptspans accumulating number of text bytes in each language |
michael@0 | 1768 | while (ss.GetOneScriptSpanLower(&scriptspan)) { |
michael@0 | 1769 | ULScript ulscript = scriptspan.ulscript; |
michael@0 | 1770 | |
michael@0 | 1771 | // Squeeze out big chunks of text span if asked to |
michael@0 | 1772 | if (FlagSqueeze(flags)) { |
michael@0 | 1773 | // Remove repetitive or mostly-spaces chunks |
michael@0 | 1774 | int newlen; |
michael@0 | 1775 | int chunksize = 0; // Use the default |
michael@0 | 1776 | if (resultchunkvector != NULL) { |
michael@0 | 1777 | newlen = CheapSqueezeInplaceOverwrite(scriptspan.text, |
michael@0 | 1778 | scriptspan.text_bytes, |
michael@0 | 1779 | chunksize); |
michael@0 | 1780 | } else { |
michael@0 | 1781 | newlen = CheapSqueezeInplace(scriptspan.text, scriptspan.text_bytes, |
michael@0 | 1782 | chunksize); |
michael@0 | 1783 | } |
michael@0 | 1784 | scriptspan.text_bytes = newlen; |
michael@0 | 1785 | } else { |
michael@0 | 1786 | // Check now and then to see if we should be squeezing |
michael@0 | 1787 | if (((kCheapSqueezeTestThresh >> 1) < scriptspan.text_bytes) && |
michael@0 | 1788 | !FlagFinish(flags)) { |
michael@0 | 1789 | // fprintf(stderr, "CheapSqueezeTriggerTest, " |
michael@0 | 1790 | // "first %d bytes of %d (>%d/2)<br>\n", |
michael@0 | 1791 | // kCheapSqueezeTestLen, |
michael@0 | 1792 | // scriptspan.text_bytes, |
michael@0 | 1793 | // kCheapSqueezeTestThresh); |
michael@0 | 1794 | |
michael@0 | 1795 | if (CheapSqueezeTriggerTest(scriptspan.text, |
michael@0 | 1796 | scriptspan.text_bytes, |
michael@0 | 1797 | kCheapSqueezeTestLen)) { |
michael@0 | 1798 | // Recursive call with big-chunk squeezing set |
michael@0 | 1799 | if (FLAGS_cld2_html || FLAGS_dbgscore) { |
michael@0 | 1800 | fprintf(stderr, |
michael@0 | 1801 | "<br>---text_bytes[%d] Recursive(Squeeze)---<br><br>\n", |
michael@0 | 1802 | total_text_bytes); |
michael@0 | 1803 | } |
michael@0 | 1804 | // Deallocate full-document prediction table |
michael@0 | 1805 | delete[] predict_tbl; |
michael@0 | 1806 | |
michael@0 | 1807 | return DetectLanguageSummaryV2( |
michael@0 | 1808 | buffer, |
michael@0 | 1809 | buffer_length, |
michael@0 | 1810 | is_plain_text, |
michael@0 | 1811 | cld_hints, |
michael@0 | 1812 | allow_extended_lang, |
michael@0 | 1813 | flags | kCLDFlagSqueeze, |
michael@0 | 1814 | plus_one, |
michael@0 | 1815 | language3, |
michael@0 | 1816 | percent3, |
michael@0 | 1817 | normalized_score3, |
michael@0 | 1818 | resultchunkvector, |
michael@0 | 1819 | text_bytes, |
michael@0 | 1820 | is_reliable); |
michael@0 | 1821 | } |
michael@0 | 1822 | } |
michael@0 | 1823 | } |
michael@0 | 1824 | |
michael@0 | 1825 | // Remove repetitive words if asked to |
michael@0 | 1826 | if (FlagRepeats(flags)) { |
michael@0 | 1827 | // Remove repetitive words |
michael@0 | 1828 | int newlen; |
michael@0 | 1829 | if (resultchunkvector != NULL) { |
michael@0 | 1830 | newlen = CheapRepWordsInplaceOverwrite(scriptspan.text, |
michael@0 | 1831 | scriptspan.text_bytes, |
michael@0 | 1832 | &hash, predict_tbl); |
michael@0 | 1833 | } else { |
michael@0 | 1834 | newlen = CheapRepWordsInplace(scriptspan.text, scriptspan.text_bytes, |
michael@0 | 1835 | &hash, predict_tbl); |
michael@0 | 1836 | } |
michael@0 | 1837 | scriptspan.text_bytes = newlen; |
michael@0 | 1838 | } |
michael@0 | 1839 | |
michael@0 | 1840 | // Scoring depends on scriptspan buffer ALWAYS having |
michael@0 | 1841 | // leading space and off-the-end space space space NUL, |
michael@0 | 1842 | // DCHECK(scriptspan.text[0] == ' '); |
michael@0 | 1843 | // DCHECK(scriptspan.text[scriptspan.text_bytes + 0] == ' '); |
michael@0 | 1844 | // DCHECK(scriptspan.text[scriptspan.text_bytes + 1] == ' '); |
michael@0 | 1845 | // DCHECK(scriptspan.text[scriptspan.text_bytes + 2] == ' '); |
michael@0 | 1846 | // DCHECK(scriptspan.text[scriptspan.text_bytes + 3] == '\0'); |
michael@0 | 1847 | |
michael@0 | 1848 | // The real scoring |
michael@0 | 1849 | // Accumulate directly into the document total, or accumulate in one of four |
michael@0 | 1850 | // chunk totals. The purpose of the multiple chunk totals is to piece |
michael@0 | 1851 | // together short choppy pieces of text in alternating scripts. One total is |
michael@0 | 1852 | // dedicated to Latin text, one to Han text, and the other two are dynamically |
michael@0 | 1853 | // assigned. |
michael@0 | 1854 | |
michael@0 | 1855 | scoringcontext.ulscript = scriptspan.ulscript; |
michael@0 | 1856 | // FLAGS_cld2_html = scoringcontext.flags_cld2_html; |
michael@0 | 1857 | |
michael@0 | 1858 | ScoreOneScriptSpan(scriptspan, |
michael@0 | 1859 | &scoringcontext, |
michael@0 | 1860 | &doc_tote, |
michael@0 | 1861 | resultchunkvector); |
michael@0 | 1862 | |
michael@0 | 1863 | total_text_bytes += scriptspan.text_bytes; |
michael@0 | 1864 | } // End while (ss.GetOneScriptSpanLower()) |
michael@0 | 1865 | |
michael@0 | 1866 | // Deallocate full-document prediction table |
michael@0 | 1867 | delete[] predict_tbl; |
michael@0 | 1868 | |
michael@0 | 1869 | if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { |
michael@0 | 1870 | // If no forced <cr>, put one in front of dump |
michael@0 | 1871 | if (!scoringcontext.flags_cld2_cr) {fprintf(stderr, "<br>\n");} |
michael@0 | 1872 | doc_tote.Dump(stderr); |
michael@0 | 1873 | } |
michael@0 | 1874 | |
michael@0 | 1875 | |
michael@0 | 1876 | // If extended languages are disallowed, remove them here |
michael@0 | 1877 | if (!allow_extended_lang) { |
michael@0 | 1878 | RemoveExtendedLanguages(&doc_tote); |
michael@0 | 1879 | } |
michael@0 | 1880 | |
michael@0 | 1881 | // Force close pairs to one or the other |
michael@0 | 1882 | // If given, also update resultchunkvector |
michael@0 | 1883 | RefineScoredClosePairs(&doc_tote, resultchunkvector, |
michael@0 | 1884 | FLAGS_cld2_html, FLAGS_cld2_quiet); |
michael@0 | 1885 | |
michael@0 | 1886 | |
michael@0 | 1887 | // Calculate return results |
michael@0 | 1888 | // Find top three byte counts in tote heap |
michael@0 | 1889 | int reliable_percent3[3]; |
michael@0 | 1890 | |
michael@0 | 1891 | // Cannot use Add, etc. after sorting |
michael@0 | 1892 | doc_tote.Sort(3); |
michael@0 | 1893 | |
michael@0 | 1894 | ExtractLangEtc(&doc_tote, total_text_bytes, |
michael@0 | 1895 | reliable_percent3, language3, percent3, normalized_score3, |
michael@0 | 1896 | text_bytes, is_reliable); |
michael@0 | 1897 | |
michael@0 | 1898 | bool have_good_answer = false; |
michael@0 | 1899 | if (FlagFinish(flags)) { |
michael@0 | 1900 | // Force a result |
michael@0 | 1901 | have_good_answer = true; |
michael@0 | 1902 | } else if (total_text_bytes <= kShortTextThresh) { |
michael@0 | 1903 | // Don't recurse on short text -- we already did word scores |
michael@0 | 1904 | have_good_answer = true; |
michael@0 | 1905 | } else if (*is_reliable && |
michael@0 | 1906 | (percent3[0] >= kGoodLang1Percent)) { |
michael@0 | 1907 | have_good_answer = true; |
michael@0 | 1908 | } else if (*is_reliable && |
michael@0 | 1909 | ((percent3[0] + percent3[1]) >= kGoodLang1and2Percent)) { |
michael@0 | 1910 | have_good_answer = true; |
michael@0 | 1911 | } |
michael@0 | 1912 | |
michael@0 | 1913 | |
michael@0 | 1914 | if (have_good_answer) { |
michael@0 | 1915 | // This is the real, non-recursive return |
michael@0 | 1916 | |
michael@0 | 1917 | // Move bytes for unreliable langs to another lang or UNKNOWN |
michael@0 | 1918 | RemoveUnreliableLanguages(&doc_tote, FLAGS_cld2_html, FLAGS_cld2_quiet); |
michael@0 | 1919 | |
michael@0 | 1920 | // Redo the result extraction after the removal above |
michael@0 | 1921 | doc_tote.Sort(3); |
michael@0 | 1922 | ExtractLangEtc(&doc_tote, total_text_bytes, |
michael@0 | 1923 | reliable_percent3, language3, percent3, normalized_score3, |
michael@0 | 1924 | text_bytes, is_reliable); |
michael@0 | 1925 | |
michael@0 | 1926 | |
michael@0 | 1927 | |
michael@0 | 1928 | Language summary_lang; |
michael@0 | 1929 | CalcSummaryLang(&doc_tote, total_text_bytes, |
michael@0 | 1930 | reliable_percent3, language3, percent3, |
michael@0 | 1931 | &summary_lang, is_reliable, |
michael@0 | 1932 | FLAGS_cld2_html, FLAGS_cld2_quiet); |
michael@0 | 1933 | |
michael@0 | 1934 | if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { |
michael@0 | 1935 | for (int i = 0; i < 3; ++i) { |
michael@0 | 1936 | if (language3[i] != UNKNOWN_LANGUAGE) { |
michael@0 | 1937 | fprintf(stderr, "%s.%dR(%d%%) ", |
michael@0 | 1938 | LanguageCode(language3[i]), |
michael@0 | 1939 | reliable_percent3[i], |
michael@0 | 1940 | percent3[i]); |
michael@0 | 1941 | } |
michael@0 | 1942 | } |
michael@0 | 1943 | |
michael@0 | 1944 | fprintf(stderr, "%d bytes ", total_text_bytes); |
michael@0 | 1945 | fprintf(stderr, "= %s%c ", |
michael@0 | 1946 | LanguageName(summary_lang), *is_reliable ? ' ' : '*'); |
michael@0 | 1947 | fprintf(stderr, "<br><br>\n"); |
michael@0 | 1948 | } |
michael@0 | 1949 | |
michael@0 | 1950 | // Slightly condensed if quiet |
michael@0 | 1951 | if (FLAGS_cld2_html && FLAGS_cld2_quiet) { |
michael@0 | 1952 | fprintf(stderr, " "); |
michael@0 | 1953 | for (int i = 0; i < 3; ++i) { |
michael@0 | 1954 | if (language3[i] != UNKNOWN_LANGUAGE) { |
michael@0 | 1955 | fprintf(stderr, " %s %d%% ", |
michael@0 | 1956 | LanguageCode(language3[i]), |
michael@0 | 1957 | percent3[i]); |
michael@0 | 1958 | } |
michael@0 | 1959 | } |
michael@0 | 1960 | fprintf(stderr, "= %s%c ", |
michael@0 | 1961 | LanguageName(summary_lang), *is_reliable ? ' ' : '*'); |
michael@0 | 1962 | fprintf(stderr, "<br>\n"); |
michael@0 | 1963 | } |
michael@0 | 1964 | |
michael@0 | 1965 | return summary_lang; |
michael@0 | 1966 | } |
michael@0 | 1967 | |
michael@0 | 1968 | // Not a good answer -- do recursive call to refine |
michael@0 | 1969 | if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) { |
michael@0 | 1970 | // This is what we hope to improve on in the recursive call, if any |
michael@0 | 1971 | PrintLangs(stderr, language3, percent3, text_bytes, is_reliable); |
michael@0 | 1972 | } |
michael@0 | 1973 | |
michael@0 | 1974 | // For restriction to Top40 + one, the one is 1st/2nd lang that is not Top40 |
michael@0 | 1975 | // For this purpose, we treat "Ignore" as top40 |
michael@0 | 1976 | Language new_plus_one = UNKNOWN_LANGUAGE; |
michael@0 | 1977 | |
michael@0 | 1978 | if (total_text_bytes < kShortTextThresh) { |
michael@0 | 1979 | // Short text: Recursive call with top40 and short set |
michael@0 | 1980 | if (FLAGS_cld2_html || FLAGS_dbgscore) { |
michael@0 | 1981 | fprintf(stderr, " ---text_bytes[%d] " |
michael@0 | 1982 | "Recursive(Top40/Rep/Short/Words)---<br><br>\n", |
michael@0 | 1983 | total_text_bytes); |
michael@0 | 1984 | } |
michael@0 | 1985 | return DetectLanguageSummaryV2( |
michael@0 | 1986 | buffer, |
michael@0 | 1987 | buffer_length, |
michael@0 | 1988 | is_plain_text, |
michael@0 | 1989 | cld_hints, |
michael@0 | 1990 | allow_extended_lang, |
michael@0 | 1991 | flags | kCLDFlagTop40 | kCLDFlagRepeats | |
michael@0 | 1992 | kCLDFlagShort | kCLDFlagUseWords | kCLDFlagFinish, |
michael@0 | 1993 | new_plus_one, |
michael@0 | 1994 | language3, |
michael@0 | 1995 | percent3, |
michael@0 | 1996 | normalized_score3, |
michael@0 | 1997 | resultchunkvector, |
michael@0 | 1998 | text_bytes, |
michael@0 | 1999 | is_reliable); |
michael@0 | 2000 | } |
michael@0 | 2001 | |
michael@0 | 2002 | // Longer text: Recursive call with top40 set |
michael@0 | 2003 | if (FLAGS_cld2_html || FLAGS_dbgscore) { |
michael@0 | 2004 | fprintf(stderr, |
michael@0 | 2005 | " ---text_bytes[%d] Recursive(Top40/Rep)---<br><br>\n", |
michael@0 | 2006 | total_text_bytes); |
michael@0 | 2007 | } |
michael@0 | 2008 | return DetectLanguageSummaryV2( |
michael@0 | 2009 | buffer, |
michael@0 | 2010 | buffer_length, |
michael@0 | 2011 | is_plain_text, |
michael@0 | 2012 | cld_hints, |
michael@0 | 2013 | allow_extended_lang, |
michael@0 | 2014 | flags | kCLDFlagTop40 | kCLDFlagRepeats | |
michael@0 | 2015 | kCLDFlagFinish, |
michael@0 | 2016 | new_plus_one, |
michael@0 | 2017 | language3, |
michael@0 | 2018 | percent3, |
michael@0 | 2019 | normalized_score3, |
michael@0 | 2020 | resultchunkvector, |
michael@0 | 2021 | text_bytes, |
michael@0 | 2022 | is_reliable); |
michael@0 | 2023 | } |
michael@0 | 2024 | |
michael@0 | 2025 | |
michael@0 | 2026 | // Shared result buffer, for debugging and wrappers. Not thread safe. |
michael@0 | 2027 | static char temp_detectlanguageversion[32]; |
michael@0 | 2028 | |
michael@0 | 2029 | // Returns the version text "code_version - data_build_date", or "" if |
michael@0 | 2030 | // kScoringtables.quadgram_obj is NULL. Result is the shared static buffer. |
michael@0 | 2031 | const char* DetectLanguageVersion() { |
michael@0 | 2032 | if (kScoringtables.quadgram_obj == NULL) {return "";} |
michael@0 | 2033 | sprintf(temp_detectlanguageversion, |
michael@0 | 2034 | "V2.0 - %u", kScoringtables.quadgram_obj->kCLDTableBuildDate); |
michael@0 | 2035 | return temp_detectlanguageversion; |
michael@0 | 2036 | } |
michael@0 | 2037 | |
michael@0 | 2038 | |
michael@0 | 2039 | } // End namespace CLD2 |