browser/components/translation/cld2/internal/compact_lang_det_impl.cc

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 // Copyright 2013 Google Inc. All Rights Reserved.
michael@0 2 //
michael@0 3 // Licensed under the Apache License, Version 2.0 (the "License");
michael@0 4 // you may not use this file except in compliance with the License.
michael@0 5 // You may obtain a copy of the License at
michael@0 6 //
michael@0 7 // http://www.apache.org/licenses/LICENSE-2.0
michael@0 8 //
michael@0 9 // Unless required by applicable law or agreed to in writing, software
michael@0 10 // distributed under the License is distributed on an "AS IS" BASIS,
michael@0 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
michael@0 12 // See the License for the specific language governing permissions and
michael@0 13 // limitations under the License.
michael@0 14
michael@0 15 //
michael@0 16 // Author: dsites@google.com (Dick Sites)
michael@0 17 // Updated 2014.01 for dual table lookup
michael@0 18 //
michael@0 19
michael@0 20 #include <stdio.h>
michael@0 21 #include <string.h>
michael@0 22 #include <string>
michael@0 23 #include <vector>
michael@0 24
michael@0 25 #include "cldutil.h"
michael@0 26 #include "debug.h"
michael@0 27 #include "integral_types.h"
michael@0 28 #include "lang_script.h"
michael@0 29 #include "utf8statetable.h"
michael@0 30
michael@0 31 #ifdef CLD2_DYNAMIC_MODE
michael@0 32 #include "cld2_dynamic_data.h"
michael@0 33 #include "cld2_dynamic_data_loader.h"
michael@0 34 #endif
michael@0 35 #include "cld2tablesummary.h"
michael@0 36 #include "compact_lang_det_impl.h"
michael@0 37 #include "compact_lang_det_hint_code.h"
michael@0 38 #include "getonescriptspan.h"
michael@0 39 #include "tote.h"
michael@0 40
michael@0 41
michael@0 42 namespace CLD2 {
michael@0 43
michael@0 44 using namespace std;
michael@0 45
michael@0 46 // Linker supplies the right tables, from files
michael@0 47 // cld_generated_cjk_uni_prop_80.cc cld2_generated_cjk_compatible.cc
michael@0 48 // cld_generated_cjk_delta_bi_32.cc generated_distinct_bi_0.cc
michael@0 49 // cld2_generated_quad*.cc cld2_generated_deltaocta*.cc
michael@0 50 // cld2_generated_distinctocta*.cc
michael@0 51 // cld_generated_score_quad_octa_1024_256.cc
michael@0 52
michael@0 53 // 2014.01 Now implementing quadgram dual lookup tables, to allow main table
michael@0 54 // sizes that are 1/3/5 times a power of two, instead of just powers of two.
michael@0 55 // Gives more flexibility of total footprint for CLD2.
michael@0 56
michael@0 57 extern const int kLanguageToPLangSize;
michael@0 58 extern const int kCloseSetSize;
michael@0 59
michael@0 60 extern const UTF8PropObj cld_generated_CjkUni_obj;
michael@0 61 extern const CLD2TableSummary kCjkCompat_obj;
michael@0 62 extern const CLD2TableSummary kCjkDeltaBi_obj;
michael@0 63 extern const CLD2TableSummary kDistinctBiTable_obj;
michael@0 64 extern const CLD2TableSummary kQuad_obj;
michael@0 65 extern const CLD2TableSummary kQuad_obj2; // Dual lookup tables
michael@0 66 extern const CLD2TableSummary kDeltaOcta_obj;
michael@0 67 extern const CLD2TableSummary kDistinctOcta_obj;
michael@0 68 extern const short kAvgDeltaOctaScore[];
michael@0 69
michael@0 70 #ifdef CLD2_DYNAMIC_MODE
michael@0 71 // CLD2_DYNAMIC_MODE is defined:
michael@0 72 // Data will be read from an mmap opened at runtime.
michael@0 73 static ScoringTables kScoringtables = {
michael@0 74 NULL, //&cld_generated_CjkUni_obj,
michael@0 75 NULL, //&kCjkCompat_obj,
michael@0 76 NULL, //&kCjkDeltaBi_obj,
michael@0 77 NULL, //&kDistinctBiTable_obj,
michael@0 78 NULL, //&kQuad_obj,
michael@0 79 NULL, //&kQuad_obj2,
michael@0 80 NULL, //&kDeltaOcta_obj,
michael@0 81 NULL, //&kDistinctOcta_obj,
michael@0 82 NULL, //kAvgDeltaOctaScore,
michael@0 83 };
michael@0 84 static bool dynamicDataLoaded = false;
michael@0 85 static ScoringTables* dynamicTables = NULL;
michael@0 86 static void* mmapAddress = NULL;
michael@0 87 static int mmapLength = 0;
michael@0 88
michael@0 89 bool isDataLoaded() { return dynamicDataLoaded; }
michael@0 90
michael@0 91 void loadData(const char* fileName) {
michael@0 92 if (isDataLoaded()) {
michael@0 93 unloadData();
michael@0 94 }
michael@0 95 dynamicTables = CLD2DynamicDataLoader::loadDataFile(fileName, &mmapAddress, &mmapLength);
michael@0 96 kScoringtables = *dynamicTables;
michael@0 97 dynamicDataLoaded = true;
michael@0 98 };
michael@0 99
michael@0 100 void unloadData() {
michael@0 101 if (!dynamicDataLoaded) return;
michael@0 102 dynamicDataLoaded = false;
michael@0 103 // unloading will null all the pointers out.
michael@0 104 CLD2DynamicDataLoader::unloadData(&dynamicTables, &mmapAddress, &mmapLength);
michael@0 105 }
michael@0 106 #else
michael@0 107 // This initializes kScoringtables.quadgram_obj etc.
michael@0 108 static const ScoringTables kScoringtables = {
michael@0 109 &cld_generated_CjkUni_obj,
michael@0 110 &kCjkCompat_obj,
michael@0 111 &kCjkDeltaBi_obj,
michael@0 112 &kDistinctBiTable_obj,
michael@0 113
michael@0 114 &kQuad_obj,
michael@0 115 &kQuad_obj2, // Dual lookup tables
michael@0 116 &kDeltaOcta_obj,
michael@0 117 &kDistinctOcta_obj,
michael@0 118
michael@0 119 kAvgDeltaOctaScore,
michael@0 120 };
michael@0 121 #endif // #ifdef CLD2_DYNAMIC_MODE
michael@0 122
michael@0 123
michael@0 124 static const bool FLAGS_cld_no_minimum_bytes = false;
michael@0 125 static const bool FLAGS_cld_forcewords = true;
michael@0 126 static const bool FLAGS_cld_showme = false;
michael@0 127 static const bool FLAGS_cld_echotext = true;
michael@0 128 static const int32 FLAGS_cld_textlimit = 160;
michael@0 129 static const int32 FLAGS_cld_smoothwidth = 20;
michael@0 130 static const bool FLAGS_cld_2011_hints = true;
michael@0 131 static const int32 FLAGS_cld_max_lang_tag_scan_kb = 8;
michael@0 132
michael@0 133 static const bool FLAGS_dbgscore = false;
michael@0 134
michael@0 135
michael@0 136 static const int kLangHintInitial = 12; // Boost language by N initially
michael@0 137 static const int kLangHintBoost = 12; // Boost language by N/16 per quadgram
michael@0 138
michael@0 139 static const int kShortSpanThresh = 32; // Bytes
michael@0 140 static const int kMaxSecondChanceLen = 1024; // Look at first 1K of short spans
michael@0 141
michael@0 142 static const int kCheapSqueezeTestThresh = 4096; // Only look for squeezing
michael@0 143 // after this many text bytes
michael@0 144 static const int kCheapSqueezeTestLen = 256; // Bytes to test to trigger sqz
michael@0 145 static const int kSpacesTriggerPercent = 25; // Trigger sqz if >=25% spaces
michael@0 146 static const int kPredictTriggerPercent = 67; // Trigger sqz if >=67% predicted
michael@0 147
michael@0 148 static const int kChunksizeDefault = 48; // Squeeze 48-byte chunks
michael@0 149 static const int kSpacesThreshPercent = 25; // Squeeze if >=25% spaces
michael@0 150 static const int kPredictThreshPercent = 40; // Squeeze if >=40% predicted
michael@0 151
michael@0 152 static const int kMaxSpaceScan = 32; // Bytes
michael@0 153
michael@0 154 static const int kGoodLang1Percent = 70;
michael@0 155 static const int kGoodLang1and2Percent = 93;
michael@0 156 static const int kShortTextThresh = 256; // Bytes
michael@0 157
michael@0 158 static const int kMinChunkSizeQuads = 4; // Chunk is at least four quads
michael@0 159 static const int kMaxChunkSizeQuads = 1024; // Chunk is at most 1K quads
michael@0 160
michael@0 161 static const int kDefaultWordSpan = 256; // Scan at least this many initial
michael@0 162 // bytes with word scoring
michael@0 163 static const int kReallyBigWordSpan = 9999999; // Forces word scoring all text
michael@0 164
michael@0 165 static const int kMinReliableSeq = 50; // Record in seq if >= 50% reliable
michael@0 166
michael@0 167 static const int kPredictionTableSize = 4096; // Must be exactly 4096 for
michael@0 168 // cheap compressor
michael@0 169
michael@0 170 static const int kNonEnBoilerplateMinPercent = 17; // <this => no second
michael@0 171 static const int kNonFIGSBoilerplateMinPercent = 20; // <this => no second
michael@0 172 static const int kGoodFirstMinPercent = 26; // <this => UNK
michael@0 173 static const int kGoodFirstReliableMinPercent = 51; // <this => unreli
michael@0 174 static const int kIgnoreMaxPercent = 20; // >this => unreli
michael@0 175 static const int kKeepMinPercent = 2; // <this => unreli
michael@0 176
michael@0 177
michael@0 178
michael@0 179 // Statistically closest language, based on quadgram table
michael@0 180 // Those that are far from other languges map to UNKNOWN_LANGUAGE
michael@0 181 // Subscripted by Language
michael@0 182 //
michael@0 183 // From lang_correlation.txt and hand-edits
michael@0 184 // sed 's/^\([^ ]*\) \([^ ]*\) coef=0\.\(..\).*$/
michael@0 185 // (\3 >= kMinCorrPercent) ? \2 : UNKNOWN_LANGUAGE,
michael@0 186 // \/\/ \1/' lang_correlation.txt >/tmp/closest_lang_decl.txt
michael@0 187 //
michael@0 188 static const int kMinCorrPercent = 24; // Pick off how close you want
michael@0 189 // 24 catches PERSIAN <== ARABIC
michael@0 190 // but not SPANISH <== PORTUGESE
michael@0 191 static Language Unknown = UNKNOWN_LANGUAGE;
michael@0 192
michael@0 193 // Suspect idea
michael@0 194 // Subscripted by Language
michael@0 195 static const Language kClosestAltLanguage[] = {
michael@0 196 (28 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE, // ENGLISH
michael@0 197 (36 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // DANISH
michael@0 198 (31 >= kMinCorrPercent) ? AFRIKAANS : UNKNOWN_LANGUAGE, // DUTCH
michael@0 199 (15 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // FINNISH
michael@0 200 (11 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // FRENCH
michael@0 201 (17 >= kMinCorrPercent) ? LUXEMBOURGISH : UNKNOWN_LANGUAGE, // GERMAN
michael@0 202 (27 >= kMinCorrPercent) ? YIDDISH : UNKNOWN_LANGUAGE, // HEBREW
michael@0 203 (16 >= kMinCorrPercent) ? CORSICAN : UNKNOWN_LANGUAGE, // ITALIAN
michael@0 204 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Japanese
michael@0 205 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Korean
michael@0 206 (41 >= kMinCorrPercent) ? NORWEGIAN_N : UNKNOWN_LANGUAGE, // NORWEGIAN
michael@0 207 ( 5 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // POLISH
michael@0 208 (23 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // PORTUGUESE
michael@0 209 (33 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // RUSSIAN
michael@0 210 (28 >= kMinCorrPercent) ? GALICIAN : UNKNOWN_LANGUAGE, // SPANISH
michael@0 211 (17 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // SWEDISH
michael@0 212 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Chinese
michael@0 213 (42 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // CZECH
michael@0 214 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GREEK
michael@0 215 (35 >= kMinCorrPercent) ? FAROESE : UNKNOWN_LANGUAGE, // ICELANDIC
michael@0 216 ( 7 >= kMinCorrPercent) ? LITHUANIAN : UNKNOWN_LANGUAGE, // LATVIAN
michael@0 217 ( 7 >= kMinCorrPercent) ? LATVIAN : UNKNOWN_LANGUAGE, // LITHUANIAN
michael@0 218 ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ROMANIAN
michael@0 219 ( 4 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // HUNGARIAN
michael@0 220 (15 >= kMinCorrPercent) ? FINNISH : UNKNOWN_LANGUAGE, // ESTONIAN
michael@0 221 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Ignore
michael@0 222 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Unknown
michael@0 223 (33 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // BULGARIAN
michael@0 224 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CROATIAN
michael@0 225 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SERBIAN
michael@0 226 (24 >= kMinCorrPercent) ? SCOTS_GAELIC : UNKNOWN_LANGUAGE, // IRISH
michael@0 227 (28 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GALICIAN
michael@0 228 ( 8 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // TAGALOG
michael@0 229 (29 >= kMinCorrPercent) ? AZERBAIJANI : UNKNOWN_LANGUAGE, // TURKISH
michael@0 230 (28 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // UKRAINIAN
michael@0 231 (37 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // HINDI
michael@0 232 (29 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // MACEDONIAN
michael@0 233 (14 >= kMinCorrPercent) ? ASSAMESE : UNKNOWN_LANGUAGE, // BENGALI
michael@0 234 (46 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE, // INDONESIAN
michael@0 235 ( 9 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // LATIN
michael@0 236 (46 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // MALAY
michael@0 237 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MALAYALAM
michael@0 238 ( 4 >= kMinCorrPercent) ? BRETON : UNKNOWN_LANGUAGE, // WELSH
michael@0 239 ( 8 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // NEPALI
michael@0 240 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // TELUGU
michael@0 241 ( 3 >= kMinCorrPercent) ? ESPERANTO : UNKNOWN_LANGUAGE, // ALBANIAN
michael@0 242 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // TAMIL
michael@0 243 (22 >= kMinCorrPercent) ? UKRAINIAN : UNKNOWN_LANGUAGE, // BELARUSIAN
michael@0 244 (15 >= kMinCorrPercent) ? SUNDANESE : UNKNOWN_LANGUAGE, // JAVANESE
michael@0 245 (19 >= kMinCorrPercent) ? CATALAN : UNKNOWN_LANGUAGE, // OCCITAN
michael@0 246 (27 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // URDU
michael@0 247 (36 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // BIHARI
michael@0 248 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GUJARATI
michael@0 249 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // THAI
michael@0 250 (24 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // ARABIC
michael@0 251 (19 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // CATALAN
michael@0 252 ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ESPERANTO
michael@0 253 ( 3 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // BASQUE
michael@0 254 ( 9 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // INTERLINGUA
michael@0 255 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KANNADA
michael@0 256 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PUNJABI
michael@0 257 (24 >= kMinCorrPercent) ? IRISH : UNKNOWN_LANGUAGE, // SCOTS_GAELIC
michael@0 258 ( 7 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SWAHILI
michael@0 259 (28 >= kMinCorrPercent) ? SERBIAN : UNKNOWN_LANGUAGE, // SLOVENIAN
michael@0 260 (37 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // MARATHI
michael@0 261 ( 3 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // MALTESE
michael@0 262 ( 1 >= kMinCorrPercent) ? YORUBA : UNKNOWN_LANGUAGE, // VIETNAMESE
michael@0 263 (15 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // FRISIAN
michael@0 264 (42 >= kMinCorrPercent) ? CZECH : UNKNOWN_LANGUAGE, // SLOVAK
michael@0 265 // Original ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ChineseT
michael@0 266 (24 >= kMinCorrPercent) ? CHINESE : UNKNOWN_LANGUAGE, // ChineseT
michael@0 267 (35 >= kMinCorrPercent) ? ICELANDIC : UNKNOWN_LANGUAGE, // FAROESE
michael@0 268 (15 >= kMinCorrPercent) ? JAVANESE : UNKNOWN_LANGUAGE, // SUNDANESE
michael@0 269 (17 >= kMinCorrPercent) ? TAJIK : UNKNOWN_LANGUAGE, // UZBEK
michael@0 270 ( 7 >= kMinCorrPercent) ? TIGRINYA : UNKNOWN_LANGUAGE, // AMHARIC
michael@0 271 (29 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // AZERBAIJANI
michael@0 272 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GEORGIAN
michael@0 273 ( 7 >= kMinCorrPercent) ? AMHARIC : UNKNOWN_LANGUAGE, // TIGRINYA
michael@0 274 (27 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // PERSIAN
michael@0 275 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // BOSNIAN
michael@0 276 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SINHALESE
michael@0 277 (41 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // NORWEGIAN_N
michael@0 278 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_P
michael@0 279 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_B
michael@0 280 (37 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // XHOSA
michael@0 281 (37 >= kMinCorrPercent) ? XHOSA : UNKNOWN_LANGUAGE, // ZULU
michael@0 282 ( 2 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GUARANI
michael@0 283 (29 >= kMinCorrPercent) ? TSWANA : UNKNOWN_LANGUAGE, // SESOTHO
michael@0 284 ( 7 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // TURKMEN
michael@0 285 ( 8 >= kMinCorrPercent) ? KAZAKH : UNKNOWN_LANGUAGE, // KYRGYZ
michael@0 286 ( 5 >= kMinCorrPercent) ? FRENCH : UNKNOWN_LANGUAGE, // BRETON
michael@0 287 ( 3 >= kMinCorrPercent) ? GANDA : UNKNOWN_LANGUAGE, // TWI
michael@0 288 (27 >= kMinCorrPercent) ? HEBREW : UNKNOWN_LANGUAGE, // YIDDISH
michael@0 289 (28 >= kMinCorrPercent) ? SLOVENIAN : UNKNOWN_LANGUAGE, // SERBO_CROATIAN
michael@0 290 (12 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE, // SOMALI
michael@0 291 ( 9 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // UIGHUR
michael@0 292 (15 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // KURDISH
michael@0 293 ( 6 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // MONGOLIAN
michael@0 294 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ARMENIAN
michael@0 295 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // LAOTHIAN
michael@0 296 ( 8 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // SINDHI
michael@0 297 (10 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // RHAETO_ROMANCE
michael@0 298 (31 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // AFRIKAANS
michael@0 299 (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // LUXEMBOURGISH
michael@0 300 ( 2 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE, // BURMESE
michael@0 301 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KHMER
michael@0 302 (45 >= kMinCorrPercent) ? DZONGKHA : UNKNOWN_LANGUAGE, // TIBETAN
michael@0 303 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // DHIVEHI
michael@0 304 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CHEROKEE
michael@0 305 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SYRIAC
michael@0 306 ( 8 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // LIMBU
michael@0 307 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ORIYA
michael@0 308 (14 >= kMinCorrPercent) ? BENGALI : UNKNOWN_LANGUAGE, // ASSAMESE
michael@0 309 (16 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // CORSICAN
michael@0 310 ( 5 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // INTERLINGUE
michael@0 311 ( 8 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // KAZAKH
michael@0 312 ( 4 >= kMinCorrPercent) ? SWAHILI : UNKNOWN_LANGUAGE, // LINGALA
michael@0 313 (11 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // MOLDAVIAN
michael@0 314 (19 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // PASHTO
michael@0 315 ( 5 >= kMinCorrPercent) ? AYMARA : UNKNOWN_LANGUAGE, // QUECHUA
michael@0 316 ( 5 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SHONA
michael@0 317 (17 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // TAJIK
michael@0 318 (13 >= kMinCorrPercent) ? BASHKIR : UNKNOWN_LANGUAGE, // TATAR
michael@0 319 (11 >= kMinCorrPercent) ? SAMOAN : UNKNOWN_LANGUAGE, // TONGA
michael@0 320 ( 2 >= kMinCorrPercent) ? TWI : UNKNOWN_LANGUAGE, // YORUBA
michael@0 321 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_ENGLISH_BASED
michael@0 322 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_FRENCH_BASED
michael@0 323 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
michael@0 324 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_OTHER
michael@0 325 ( 6 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // MAORI
michael@0 326 ( 3 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE, // WOLOF
michael@0 327 ( 1 >= kMinCorrPercent) ? MONGOLIAN : UNKNOWN_LANGUAGE, // ABKHAZIAN
michael@0 328 ( 8 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // AFAR
michael@0 329 ( 5 >= kMinCorrPercent) ? QUECHUA : UNKNOWN_LANGUAGE, // AYMARA
michael@0 330 (13 >= kMinCorrPercent) ? TATAR : UNKNOWN_LANGUAGE, // BASHKIR
michael@0 331 ( 3 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // BISLAMA
michael@0 332 (45 >= kMinCorrPercent) ? TIBETAN : UNKNOWN_LANGUAGE, // DZONGKHA
michael@0 333 ( 4 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // FIJIAN
michael@0 334 ( 7 >= kMinCorrPercent) ? INUPIAK : UNKNOWN_LANGUAGE, // GREENLANDIC
michael@0 335 ( 3 >= kMinCorrPercent) ? AFAR : UNKNOWN_LANGUAGE, // HAUSA
michael@0 336 ( 3 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // HAITIAN_CREOLE
michael@0 337 ( 7 >= kMinCorrPercent) ? GREENLANDIC : UNKNOWN_LANGUAGE, // INUPIAK
michael@0 338 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // INUKTITUT
michael@0 339 ( 4 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // KASHMIRI
michael@0 340 (30 >= kMinCorrPercent) ? RUNDI : UNKNOWN_LANGUAGE, // KINYARWANDA
michael@0 341 ( 2 >= kMinCorrPercent) ? TAGALOG : UNKNOWN_LANGUAGE, // MALAGASY
michael@0 342 (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // NAURU
michael@0 343 (12 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // OROMO
michael@0 344 (30 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // RUNDI
michael@0 345 (11 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // SAMOAN
michael@0 346 ( 1 >= kMinCorrPercent) ? LINGALA : UNKNOWN_LANGUAGE, // SANGO
michael@0 347 (32 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // SANSKRIT
michael@0 348 (16 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // SISWANT
michael@0 349 ( 5 >= kMinCorrPercent) ? SISWANT : UNKNOWN_LANGUAGE, // TSONGA
michael@0 350 (29 >= kMinCorrPercent) ? SESOTHO : UNKNOWN_LANGUAGE, // TSWANA
michael@0 351 ( 2 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // VOLAPUK
michael@0 352 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ZHUANG
michael@0 353 ( 1 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE, // KHASI
michael@0 354 (28 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // SCOTS
michael@0 355 (15 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // GANDA
michael@0 356 ( 7 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // MANX
michael@0 357 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MONTENEGRIN
michael@0 358
michael@0 359 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // AKAN
michael@0 360 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // IGBO
michael@0 361 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MAURITIAN_CREOLE
michael@0 362 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // HAWAIIAN
michael@0 363 };
michael@0 364
michael@0 365 // COMPILE_ASSERT(arraysize(kClosestAltLanguage) == NUM_LANGUAGES,
michael@0 366 // kClosestAltLanguage_has_incorrect_size);
michael@0 367
michael@0 368
michael@0 369 inline bool FlagFinish(int flags) {return (flags & kCLDFlagFinish) != 0;}
michael@0 370 inline bool FlagSqueeze(int flags) {return (flags & kCLDFlagSqueeze) != 0;}
michael@0 371 inline bool FlagRepeats(int flags) {return (flags & kCLDFlagRepeats) != 0;}
michael@0 372 inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;}
michael@0 373 inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;}
michael@0 374 inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;}
michael@0 375 inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;}
michael@0 376
michael@0 377
michael@0 378 // Defines Top40 packed languages
michael@0 379
michael@0 380 // Google top 40 languages
michael@0 381 //
michael@0 382 // Tier 0/1 Language enum list (16)
michael@0 383 // ENGLISH, /*no en_GB,*/ FRENCH, ITALIAN, GERMAN, SPANISH, // E - FIGS
michael@0 384 // DUTCH, CHINESE, CHINESE_T, JAPANESE, KOREAN,
michael@0 385 // PORTUGUESE, RUSSIAN, POLISH, TURKISH, THAI,
michael@0 386 // ARABIC,
michael@0 387 //
michael@0 388 // Tier 2 Language enum list (22)
michael@0 389 // SWEDISH, FINNISH, DANISH, /*no pt-PT,*/ ROMANIAN, HUNGARIAN,
michael@0 390 // HEBREW, INDONESIAN, CZECH, GREEK, NORWEGIAN,
michael@0 391 // VIETNAMESE, BULGARIAN, CROATIAN, LITHUANIAN, SLOVAK,
michael@0 392 // TAGALOG, SLOVENIAN, SERBIAN, CATALAN, LATVIAN,
michael@0 393 // UKRAINIAN, HINDI,
michael@0 394 //
michael@0 395 // use SERBO_CROATIAN instead of BOSNIAN, SERBIAN, CROATIAN, MONTENEGRIN(21)
michael@0 396 //
michael@0 397 // Include IgnoreMe (TG_UNKNOWN_LANGUAGE, 25+1) as a top 40
michael@0 398
michael@0 399
michael@0 400 void DemoteNotTop40(Tote* chunk_tote, uint16 psplus_one) {
michael@0 401 // REVISIT
michael@0 402 }
michael@0 403
michael@0 404 void PrintText(FILE* f, Language cur_lang, const string& temp) {
michael@0 405 if (temp.size() == 0) {return;}
michael@0 406 fprintf(f, "PrintText[%s]%s<br>\n", LanguageName(cur_lang), temp.c_str());
michael@0 407 }
michael@0 408
michael@0 409
michael@0 410 //------------------------------------------------------------------------------
michael@0 411 // For --cld_html debugging output. Not thread safe
michael@0 412 //------------------------------------------------------------------------------
michael@0 413 static Language prior_lang = UNKNOWN_LANGUAGE;
michael@0 414 static bool prior_unreliable = false;
michael@0 415
michael@0 416 //------------------------------------------------------------------------------
michael@0 417 // End For --cld_html debugging output
michael@0 418 //------------------------------------------------------------------------------
michael@0 419
michael@0 420
michael@0 421 // Backscan to word boundary, returning how many bytes n to go back
michael@0 422 // so that src - n is non-space ans src - n - 1 is space.
michael@0 423 // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary
michael@0 424 int BackscanToSpace(const char* src, int limit) {
michael@0 425 int n = 0;
michael@0 426 limit = minint(limit, kMaxSpaceScan);
michael@0 427 while (n < limit) {
michael@0 428 if (src[-n - 1] == ' ') {return n;} // We are at _X
michael@0 429 ++n;
michael@0 430 }
michael@0 431 n = 0;
michael@0 432 while (n < limit) {
michael@0 433 if ((src[-n] & 0xc0) != 0x80) {return n;} // We are at char begin
michael@0 434 ++n;
michael@0 435 }
michael@0 436 return 0;
michael@0 437 }
michael@0 438
michael@0 439 // Forwardscan to word boundary, returning how many bytes n to go forward
michael@0 440 // so that src + n is non-space ans src + n - 1 is space.
michael@0 441 // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary
michael@0 442 int ForwardscanToSpace(const char* src, int limit) {
michael@0 443 int n = 0;
michael@0 444 limit = minint(limit, kMaxSpaceScan);
michael@0 445 while (n < limit) {
michael@0 446 if (src[n] == ' ') {return n + 1;} // We are at _X
michael@0 447 ++n;
michael@0 448 }
michael@0 449 n = 0;
michael@0 450 while (n < limit) {
michael@0 451 if ((src[n] & 0xc0) != 0x80) {return n;} // We are at char begin
michael@0 452 ++n;
michael@0 453 }
michael@0 454 return 0;
michael@0 455 }
michael@0 456
michael@0 457
michael@0 458 // This uses a cheap predictor to get a measure of compression, and
michael@0 459 // hence a measure of repetitiveness. It works on complete UTF-8 characters
michael@0 460 // instead of bytes, because three-byte UTF-8 Indic, etc. text compress highly
michael@0 461 // all the time when done with a byte-based count. Sigh.
michael@0 462 //
michael@0 463 // To allow running prediction across multiple chunks, caller passes in current
michael@0 464 // 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
michael@0 465 //
michael@0 466 // Returns the number of *bytes* correctly predicted, increments by 1..4 for
michael@0 467 // each correctly-predicted character.
michael@0 468 //
michael@0 469 // NOTE: Overruns by up to three bytes. Not a problem with valid UTF-8 text
michael@0 470 //
michael@0 471
michael@0 472 // TODO(dsites) make this use just one byte per UTF-8 char and incr by charlen
michael@0 473
michael@0 474 int CountPredictedBytes(const char* isrc, int src_len, int* hash, int* tbl) {
michael@0 475 int p_count = 0;
michael@0 476 const uint8* src = reinterpret_cast<const uint8*>(isrc);
michael@0 477 const uint8* srclimit = src + src_len;
michael@0 478 int local_hash = *hash;
michael@0 479
michael@0 480 while (src < srclimit) {
michael@0 481 int c = src[0];
michael@0 482 int incr = 1;
michael@0 483
michael@0 484 // Pick up one char and length
michael@0 485 if (c < 0xc0) {
michael@0 486 // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
michael@0 487 // Do nothing more
michael@0 488 } else if ((c & 0xe0) == 0xc0) {
michael@0 489 // Two-byte
michael@0 490 c = (c << 8) | src[1];
michael@0 491 incr = 2;
michael@0 492 } else if ((c & 0xf0) == 0xe0) {
michael@0 493 // Three-byte
michael@0 494 c = (c << 16) | (src[1] << 8) | src[2];
michael@0 495 incr = 3;
michael@0 496 } else {
michael@0 497 // Four-byte
michael@0 498 c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
michael@0 499 incr = 4;
michael@0 500 }
michael@0 501 src += incr;
michael@0 502
michael@0 503 int p = tbl[local_hash]; // Prediction
michael@0 504 tbl[local_hash] = c; // Update prediction
michael@0 505 if (c == p) {
michael@0 506 p_count += incr; // Count bytes of good predictions
michael@0 507 }
michael@0 508
michael@0 509 local_hash = ((local_hash << 4) ^ c) & 0xfff;
michael@0 510 }
michael@0 511 *hash = local_hash;
michael@0 512 return p_count;
michael@0 513 }
michael@0 514
michael@0 515
michael@0 516
// Counts spaces four bytes at a time; a little faster than one-at-a-time.
// Only the first (src_len rounded down to a multiple of four) bytes are
// examined, so up to three odd bytes at the end are not counted.
int CountSpaces4(const char* src, int src_len) {
  const int whole_quads_len = src_len & ~3;
  int spaces = 0;
  int pos = 0;
  while (pos < whole_quads_len) {
    if (src[pos] == ' ') {++spaces;}
    if (src[pos + 1] == ' ') {++spaces;}
    if (src[pos + 2] == ' ') {++spaces;}
    if (src[pos + 3] == ' ') {++spaces;}
    pos += 4;
  }
  return spaces;
}
michael@0 529
michael@0 530
michael@0 531 // Remove words of text that have more than half their letters predicted
michael@0 532 // correctly by our cheap predictor, moving the remaining words in-place
michael@0 533 // to the front of the input buffer.
michael@0 534 //
michael@0 535 // To allow running prediction across multiple chunks, caller passes in current
michael@0 536 // 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
michael@0 537 //
michael@0 538 // Return the new, possibly-shorter length
michael@0 539 //
michael@0 540 // Result Buffer ALWAYS has leading space and trailing space space space NUL,
michael@0 541 // if input does
michael@0 542 //
michael@0 543 int CheapRepWordsInplace(char* isrc, int src_len, int* hash, int* tbl) {
michael@0 544 const uint8* src = reinterpret_cast<const uint8*>(isrc);
michael@0 545 const uint8* srclimit = src + src_len;
michael@0 546 char* dst = isrc;
michael@0 547 int local_hash = *hash;
michael@0 548 char* word_dst = dst; // Start of next word
michael@0 549 int good_predict_bytes = 0;
michael@0 550 int word_length_bytes = 0;
michael@0 551
michael@0 552 while (src < srclimit) {
michael@0 553 int c = src[0];
michael@0 554 int incr = 1;
michael@0 555 *dst++ = c;
michael@0 556
michael@0 557 if (c == ' ') {
michael@0 558 if ((good_predict_bytes * 2) > word_length_bytes) {
michael@0 559 // Word is well-predicted: backup to start of this word
michael@0 560 dst = word_dst;
michael@0 561 if (FLAGS_cld_showme) {
michael@0 562 // Mark the deletion point with period
michael@0 563 // Don't repeat multiple periods
michael@0 564 // Cannot mark with more bytes or may overwrite unseen input
michael@0 565 if ((isrc < (dst - 2)) && (dst[-2] != '.')) {
michael@0 566 *dst++ = '.';
michael@0 567 *dst++ = ' ';
michael@0 568 }
michael@0 569 }
michael@0 570 }
michael@0 571 word_dst = dst; // Start of next word
michael@0 572 good_predict_bytes = 0;
michael@0 573 word_length_bytes = 0;
michael@0 574 }
michael@0 575
michael@0 576 // Pick up one char and length
michael@0 577 if (c < 0xc0) {
michael@0 578 // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
michael@0 579 // Do nothing more
michael@0 580 } else if ((c & 0xe0) == 0xc0) {
michael@0 581 // Two-byte
michael@0 582 *dst++ = src[1];
michael@0 583 c = (c << 8) | src[1];
michael@0 584 incr = 2;
michael@0 585 } else if ((c & 0xf0) == 0xe0) {
michael@0 586 // Three-byte
michael@0 587 *dst++ = src[1];
michael@0 588 *dst++ = src[2];
michael@0 589 c = (c << 16) | (src[1] << 8) | src[2];
michael@0 590 incr = 3;
michael@0 591 } else {
michael@0 592 // Four-byte
michael@0 593 *dst++ = src[1];
michael@0 594 *dst++ = src[2];
michael@0 595 *dst++ = src[3];
michael@0 596 c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
michael@0 597 incr = 4;
michael@0 598 }
michael@0 599 src += incr;
michael@0 600 word_length_bytes += incr;
michael@0 601
michael@0 602 int p = tbl[local_hash]; // Prediction
michael@0 603 tbl[local_hash] = c; // Update prediction
michael@0 604 if (c == p) {
michael@0 605 good_predict_bytes += incr; // Count good predictions
michael@0 606 }
michael@0 607
michael@0 608 local_hash = ((local_hash << 4) ^ c) & 0xfff;
michael@0 609 }
michael@0 610
michael@0 611 *hash = local_hash;
michael@0 612
michael@0 613 if ((dst - isrc) < (src_len - 3)) {
michael@0 614 // Pad and make last char clean UTF-8 by putting following spaces
michael@0 615 dst[0] = ' ';
michael@0 616 dst[1] = ' ';
michael@0 617 dst[2] = ' ';
michael@0 618 dst[3] = '\0';
michael@0 619 } else if ((dst - isrc) < src_len) {
michael@0 620 // Make last char clean UTF-8 by putting following space off the end
michael@0 621 dst[0] = ' ';
michael@0 622 }
michael@0 623
michael@0 624 return static_cast<int>(dst - isrc);
michael@0 625 }
michael@0 626
michael@0 627
// This alternate form overwrites redundant words with periods instead of
// removing them, so no byte changes position. That avoids corrupting the
// backmap used to generate a vector of original-text ranges.
//
//   isrc     text buffer; well-predicted words are replaced by '.' bytes
//   src_len  number of bytes of text to scan
//   hash     in/out running predictor hash (masked to 12 bits on each update)
//   tbl      in/out prediction table, indexed by the hash
//
// The per-word byte counters are reset at each space and then count that
// space itself, so a word's totals include the space that precedes it. When
// more than half of those bytes were predicted, the bytes between the two
// spaces are overwritten; the spaces themselves are kept.
//
// Returns the number of bytes scanned. Nothing is moved, so this is src_len
// unless the last lead byte announces a character that runs past src_len.
int CheapRepWordsInplaceOverwrite(char* isrc, int src_len, int* hash, int* tbl) {
  char* cur = isrc;
  char* const limit = isrc + src_len;
  char* word_start = cur;         // First byte after the most recent space
  int local_hash = *hash;
  int good_predict_bytes = 0;
  int word_length_bytes = 0;

  while (cur < limit) {
    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(cur);
    const int lead = bytes[0];

    if (lead == ' ') {
      if ((good_predict_bytes * 2) > word_length_bytes) {
        // Word [word_start..cur) is well-predicted: overwrite
        for (char* p = word_start; p < cur; ++p) {*p = '.';}
      }
      word_start = cur + 1;       // Start of next word
      good_predict_bytes = 0;
      word_length_bytes = 0;
    }

    // Pick up one char and length
    int incr = 1;
    if (lead < 0xc0) {
      // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
    } else if ((lead & 0xe0) == 0xc0) {
      incr = 2;                   // Two-byte
    } else if ((lead & 0xf0) == 0xe0) {
      incr = 3;                   // Three-byte
    } else {
      incr = 4;                   // Four-byte
    }
    // Pack all bytes of the character into one value for the predictor.
    // Packed in unsigned so a four-byte lead never shifts into the sign bit.
    unsigned int packed = static_cast<unsigned int>(lead);
    for (int i = 1; i < incr; ++i) {
      packed = (packed << 8) | bytes[i];
    }
    const int c = static_cast<int>(packed);

    cur += incr;
    word_length_bytes += incr;

    const int p = tbl[local_hash];  // Prediction
    tbl[local_hash] = c;            // Update prediction
    if (c == p) {
      good_predict_bytes += incr;   // Count good predictions
    }

    local_hash = ((local_hash << 4) ^ c) & 0xfff;
  }

  *hash = local_hash;

  // No trailing pad is needed: the scan never shortens the text, so whatever
  // followed the input is still in place.
  return static_cast<int>(cur - isrc);
}
michael@0 704
michael@0 705
michael@0 706 // Remove portions of text that have a high density of spaces, or that are
michael@0 707 // overly repetitive, squeezing the remaining text in-place to the front of the
michael@0 708 // input buffer.
michael@0 709 //
michael@0 710 // Squeezing looks at density of space/prediced chars in fixed-size chunks,
michael@0 711 // specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes.
michael@0 712 //
michael@0 713 // Return the new, possibly-shorter length
michael@0 714 //
michael@0 715 // Result Buffer ALWAYS has leading space and trailing space space space NUL,
michael@0 716 // if input does
michael@0 717 //
michael@0 718 int CheapSqueezeInplace(char* isrc,
michael@0 719 int src_len,
michael@0 720 int ichunksize) {
michael@0 721 char* src = isrc;
michael@0 722 char* dst = src;
michael@0 723 char* srclimit = src + src_len;
michael@0 724 bool skipping = false;
michael@0 725
michael@0 726 int hash = 0;
michael@0 727 // Allocate local prediction table.
michael@0 728 int* predict_tbl = new int[kPredictionTableSize];
michael@0 729 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
michael@0 730
michael@0 731 int chunksize = ichunksize;
michael@0 732 if (chunksize == 0) {chunksize = kChunksizeDefault;}
michael@0 733 int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
michael@0 734 int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
michael@0 735
michael@0 736 while (src < srclimit) {
michael@0 737 int remaining_bytes = srclimit - src;
michael@0 738 int len = minint(chunksize, remaining_bytes);
michael@0 739 // Make len land us on a UTF-8 character boundary.
michael@0 740 // Ah. Also fixes mispredict because we could get out of phase
michael@0 741 // Loop always terminates at trailing space in buffer
michael@0 742 while ((src[len] & 0xc0) == 0x80) {++len;} // Move past continuation bytes
michael@0 743
michael@0 744 int space_n = CountSpaces4(src, len);
michael@0 745 int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
michael@0 746 if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
michael@0 747 // Skip the text
michael@0 748 if (!skipping) {
michael@0 749 // Keeping-to-skipping transition; do it at a space
michael@0 750 int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
michael@0 751 dst -= n;
michael@0 752 if (dst == isrc) {
michael@0 753 // Force a leading space if the first chunk is deleted
michael@0 754 *dst++ = ' ';
michael@0 755 }
michael@0 756 if (FLAGS_cld_showme) {
michael@0 757 // Mark the deletion point with black square U+25A0
michael@0 758 *dst++ = static_cast<unsigned char>(0xe2);
michael@0 759 *dst++ = static_cast<unsigned char>(0x96);
michael@0 760 *dst++ = static_cast<unsigned char>(0xa0);
michael@0 761 *dst++ = ' ';
michael@0 762 }
michael@0 763 skipping = true;
michael@0 764 }
michael@0 765 } else {
michael@0 766 // Keep the text
michael@0 767 if (skipping) {
michael@0 768 // Skipping-to-keeping transition; do it at a space
michael@0 769 int n = ForwardscanToSpace(src, len);
michael@0 770 src += n;
michael@0 771 remaining_bytes -= n; // Shrink remaining length
michael@0 772 len -= n;
michael@0 773 skipping = false;
michael@0 774 }
michael@0 775 // "len" can be negative in some cases
michael@0 776 if (len > 0) {
michael@0 777 memmove(dst, src, len);
michael@0 778 dst += len;
michael@0 779 }
michael@0 780 }
michael@0 781 src += len;
michael@0 782 }
michael@0 783
michael@0 784 if ((dst - isrc) < (src_len - 3)) {
michael@0 785 // Pad and make last char clean UTF-8 by putting following spaces
michael@0 786 dst[0] = ' ';
michael@0 787 dst[1] = ' ';
michael@0 788 dst[2] = ' ';
michael@0 789 dst[3] = '\0';
michael@0 790 } else if ((dst - isrc) < src_len) {
michael@0 791 // Make last char clean UTF-8 by putting following space off the end
michael@0 792 dst[0] = ' ';
michael@0 793 }
michael@0 794
michael@0 795 // Deallocate local prediction table
michael@0 796 delete[] predict_tbl;
michael@0 797 return static_cast<int>(dst - isrc);
michael@0 798 }
michael@0 799
michael@0 800 // This alternate form overwrites redundant words, thus avoiding corrupting the
michael@0 801 // backmap for generate a vector of original-text ranges.
michael@0 802 int CheapSqueezeInplaceOverwrite(char* isrc,
michael@0 803 int src_len,
michael@0 804 int ichunksize) {
michael@0 805 char* src = isrc;
michael@0 806 char* dst = src;
michael@0 807 char* srclimit = src + src_len;
michael@0 808 bool skipping = false;
michael@0 809
michael@0 810 int hash = 0;
michael@0 811 // Allocate local prediction table.
michael@0 812 int* predict_tbl = new int[kPredictionTableSize];
michael@0 813 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
michael@0 814
michael@0 815 int chunksize = ichunksize;
michael@0 816 if (chunksize == 0) {chunksize = kChunksizeDefault;}
michael@0 817 int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
michael@0 818 int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
michael@0 819
michael@0 820 // Always keep first byte (space)
michael@0 821 ++src;
michael@0 822 ++dst;
michael@0 823 while (src < srclimit) {
michael@0 824 int remaining_bytes = srclimit - src;
michael@0 825 int len = minint(chunksize, remaining_bytes);
michael@0 826 // Make len land us on a UTF-8 character boundary.
michael@0 827 // Ah. Also fixes mispredict because we could get out of phase
michael@0 828 // Loop always terminates at trailing space in buffer
michael@0 829 while ((src[len] & 0xc0) == 0x80) {++len;} // Move past continuation bytes
michael@0 830
michael@0 831 int space_n = CountSpaces4(src, len);
michael@0 832 int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
michael@0 833 if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
michael@0 834 // Overwrite the text [dst-n..dst)
michael@0 835 if (!skipping) {
michael@0 836 // Keeping-to-skipping transition; do it at a space
michael@0 837 int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
michael@0 838 // Text [word_dst..dst) is well-predicted: overwrite
michael@0 839 for (char* p = dst - n; p < dst; ++p) {*p = '.';}
michael@0 840 skipping = true;
michael@0 841 }
michael@0 842 // Overwrite the text [dst..dst+len)
michael@0 843 for (char* p = dst; p < dst + len; ++p) {*p = '.';}
michael@0 844 dst[len - 1] = ' '; // Space at end so we can see what is happening
michael@0 845 } else {
michael@0 846 // Keep the text
michael@0 847 if (skipping) {
michael@0 848 // Skipping-to-keeping transition; do it at a space
michael@0 849 int n = ForwardscanToSpace(src, len);
michael@0 850 // Text [dst..dst+n) is well-predicted: overwrite
michael@0 851 for (char* p = dst; p < dst + n - 1; ++p) {*p = '.';}
michael@0 852 skipping = false;
michael@0 853 }
michael@0 854 }
michael@0 855 dst += len;
michael@0 856 src += len;
michael@0 857 }
michael@0 858
michael@0 859 if ((dst - isrc) < (src_len - 3)) {
michael@0 860 // Pad and make last char clean UTF-8 by putting following spaces
michael@0 861 dst[0] = ' ';
michael@0 862 dst[1] = ' ';
michael@0 863 dst[2] = ' ';
michael@0 864 dst[3] = '\0';
michael@0 865 } else if ((dst - isrc) < src_len) {
michael@0 866 // Make last char clean UTF-8 by putting following space off the end
michael@0 867 dst[0] = ' ';
michael@0 868 }
michael@0 869
michael@0 870 // Deallocate local prediction table
michael@0 871 delete[] predict_tbl;
michael@0 872 return static_cast<int>(dst - isrc);
michael@0 873 }
michael@0 874
michael@0 875 // Timing 2.8GHz P4 (dsites 2008.03.20) with 170KB input
michael@0 876 // About 90 MB/sec, with or without memcpy, chunksize 48 or 4096
michael@0 877 // Just CountSpaces is about 340 MB/sec
michael@0 878 // Byte-only CountPredictedBytes is about 150 MB/sec
michael@0 879 // Byte-only CountPredictedBytes, conditional tbl[] = is about 85! MB/sec
michael@0 880 // Byte-only CountPredictedBytes is about 180 MB/sec, byte tbl, byte/int c
michael@0 881 // Unjammed byte-only both = 170 MB/sec
michael@0 882 // Jammed byte-only both = 120 MB/sec
michael@0 883 // Back to original w/slight updates, 110 MB/sec
michael@0 884 //
michael@0 885 bool CheapSqueezeTriggerTest(const char* src, int src_len, int testsize) {
michael@0 886 // Don't trigger at all on short text
michael@0 887 if (src_len < testsize) {return false;}
michael@0 888 int space_thresh = (testsize * kSpacesTriggerPercent) / 100;
michael@0 889 int predict_thresh = (testsize * kPredictTriggerPercent) / 100;
michael@0 890 int hash = 0;
michael@0 891 // Allocate local prediction table.
michael@0 892 int* predict_tbl = new int[kPredictionTableSize];
michael@0 893 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
michael@0 894
michael@0 895 bool retval = false;
michael@0 896 if ((CountSpaces4(src, testsize) >= space_thresh) ||
michael@0 897 (CountPredictedBytes(src, testsize, &hash, predict_tbl) >=
michael@0 898 predict_thresh)) {
michael@0 899 retval = true;
michael@0 900 }
michael@0 901 // Deallocate local prediction table
michael@0 902 delete[] predict_tbl;
michael@0 903 return retval;
michael@0 904 }
michael@0 905
michael@0 906
michael@0 907
michael@0 908
michael@0 909 // Delete any extended languages from doc_tote
michael@0 910 void RemoveExtendedLanguages(DocTote* doc_tote) {
michael@0 911 // Now a nop
michael@0 912 }
michael@0 913
// Reliability floor, in percent: remove a language if its reliability is
// below this.
static const int kMinReliableKeepPercent = 41;

// For Tier3 languages, require a minimum number of bytes to be first-place
// lang: fewer than this => no first.
static const int kGoodFirstT3MinBytes = 24;
michael@0 918
michael@0 919 // Move bytes for unreliable langs to another lang or UNKNOWN
michael@0 920 // doc_tote is sorted, so cannot Add
michael@0 921 //
michael@0 922 // If both CHINESE and CHINESET are present and unreliable, do not delete both;
michael@0 923 // merge both into CHINESE.
michael@0 924 //
michael@0 925 //dsites 2009.03.19
michael@0 926 // we also want to remove Tier3 languages as the first lang if there is very
michael@0 927 // little text like ej1 ej2 ej3 ej4
michael@0 928 // maybe fold this back in earlier
michael@0 929 //
michael@0 930 void RemoveUnreliableLanguages(DocTote* doc_tote,
michael@0 931 bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {
michael@0 932 // Prepass to merge some low-reliablility languages
michael@0 933 // TODO: this shouldn't really reach in to the internal structure of doc_tote
michael@0 934 int total_bytes = 0;
michael@0 935 for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
michael@0 936 int plang = doc_tote->Key(sub);
michael@0 937 if (plang == DocTote::kUnusedKey) {continue;} // Empty slot
michael@0 938
michael@0 939 Language lang = static_cast<Language>(plang);
michael@0 940 int bytes = doc_tote->Value(sub);
michael@0 941 int reli = doc_tote->Reliability(sub);
michael@0 942 if (bytes == 0) {continue;} // Zero bytes
michael@0 943 total_bytes += bytes;
michael@0 944
michael@0 945 // Reliable percent = stored reliable score over stored bytecount
michael@0 946 int reliable_percent = reli / bytes;
michael@0 947 if (reliable_percent >= kMinReliableKeepPercent) {continue;} // Keeper
michael@0 948
michael@0 949 // This language is too unreliable to keep, but we might merge it.
michael@0 950 Language altlang = UNKNOWN_LANGUAGE;
michael@0 951 if (lang <= HAWAIIAN) {altlang = kClosestAltLanguage[lang];}
michael@0 952 if (altlang == UNKNOWN_LANGUAGE) {continue;} // No alternative
michael@0 953
michael@0 954 // Look for alternative in doc_tote
michael@0 955 int altsub = doc_tote->Find(altlang);
michael@0 956 if (altsub < 0) {continue;} // No alternative text
michael@0 957
michael@0 958 int bytes2 = doc_tote->Value(altsub);
michael@0 959 int reli2 = doc_tote->Reliability(altsub);
michael@0 960 if (bytes2 == 0) {continue;} // Zero bytes
michael@0 961
michael@0 962 // Reliable percent is stored reliable score over stored bytecount
michael@0 963 int reliable_percent2 = reli2 / bytes2;
michael@0 964
michael@0 965 // Merge one language into the other. Break ties toward lower lang #
michael@0 966 int tosub = altsub;
michael@0 967 int fromsub = sub;
michael@0 968 bool into_lang = false;
michael@0 969 if ((reliable_percent2 < reliable_percent) ||
michael@0 970 ((reliable_percent2 == reliable_percent) && (lang < altlang))) {
michael@0 971 tosub = sub;
michael@0 972 fromsub = altsub;
michael@0 973 into_lang = true;
michael@0 974 }
michael@0 975
michael@0 976 // Make sure merged reliability doesn't drop and is enough to avoid delete
michael@0 977 int newpercent = maxint(reliable_percent, reliable_percent2);
michael@0 978 newpercent = maxint(newpercent, kMinReliableKeepPercent);
michael@0 979 int newbytes = bytes + bytes2;
michael@0 980 int newreli = newpercent * newbytes;
michael@0 981
michael@0 982 doc_tote->SetKey(fromsub, DocTote::kUnusedKey);
michael@0 983 doc_tote->SetScore(fromsub, 0);
michael@0 984 doc_tote->SetReliability(fromsub, 0);
michael@0 985 doc_tote->SetScore(tosub, newbytes);
michael@0 986 doc_tote->SetReliability(tosub, newreli);
michael@0 987
michael@0 988 // Show fate of unreliable languages if at least 10 bytes
michael@0 989 if (FLAGS_cld2_html && (newbytes >= 10) &&
michael@0 990 !FLAGS_cld2_quiet) {
michael@0 991 if (into_lang) {
michael@0 992 fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ",
michael@0 993 LanguageCode(altlang), reliable_percent2, bytes2,
michael@0 994 LanguageCode(lang));
michael@0 995 } else {
michael@0 996 fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ",
michael@0 997 LanguageCode(lang), reliable_percent, bytes,
michael@0 998 LanguageCode(altlang));
michael@0 999 }
michael@0 1000 }
michael@0 1001 }
michael@0 1002
michael@0 1003
michael@0 1004 // Pass to delete any remaining unreliable languages
michael@0 1005 for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
michael@0 1006 int plang = doc_tote->Key(sub);
michael@0 1007 if (plang == DocTote::kUnusedKey) {continue;} // Empty slot
michael@0 1008
michael@0 1009 Language lang = static_cast<Language>(plang);
michael@0 1010 int bytes = doc_tote->Value(sub);
michael@0 1011 int reli = doc_tote->Reliability(sub);
michael@0 1012 if (bytes == 0) {continue;} // Zero bytes
michael@0 1013
michael@0 1014 // Reliable percent is stored as reliable score over stored bytecount
michael@0 1015 int reliable_percent = reli / bytes;
michael@0 1016 if (reliable_percent >= kMinReliableKeepPercent) { // Keeper?
michael@0 1017 continue; // yes
michael@0 1018 }
michael@0 1019
michael@0 1020 // Delete unreliable entry
michael@0 1021 doc_tote->SetKey(sub, DocTote::kUnusedKey);
michael@0 1022 doc_tote->SetScore(sub, 0);
michael@0 1023 doc_tote->SetReliability(sub, 0);
michael@0 1024
michael@0 1025 // Show fate of unreliable languages if at least 10 bytes
michael@0 1026 if (FLAGS_cld2_html && (bytes >= 10) &&
michael@0 1027 !FLAGS_cld2_quiet) {
michael@0 1028 fprintf(stderr, "{Unreli %s.%dR,%dB} ",
michael@0 1029 LanguageCode(lang), reliable_percent, bytes);
michael@0 1030 }
michael@0 1031 }
michael@0 1032
michael@0 1033 ////if (FLAGS_cld2_html) {fprintf(stderr, "<br>\n");}
michael@0 1034 }
michael@0 1035
michael@0 1036
michael@0 1037 // Move all the text bytes from lower byte-count to higher one
michael@0 1038 void MoveLang1ToLang2(Language lang1, Language lang2,
michael@0 1039 int lang1_sub, int lang2_sub,
michael@0 1040 DocTote* doc_tote,
michael@0 1041 ResultChunkVector* resultchunkvector) {
michael@0 1042 // In doc_tote, move all the bytes lang1 => lang2
michael@0 1043 int sum = doc_tote->Value(lang2_sub) + doc_tote->Value(lang1_sub);
michael@0 1044 doc_tote->SetValue(lang2_sub, sum);
michael@0 1045 sum = doc_tote->Score(lang2_sub) + doc_tote->Score(lang1_sub);
michael@0 1046 doc_tote->SetScore(lang2_sub, sum);
michael@0 1047 sum = doc_tote->Reliability(lang2_sub) + doc_tote->Reliability(lang1_sub);
michael@0 1048 doc_tote->SetReliability(lang2_sub, sum);
michael@0 1049
michael@0 1050 // Delete old entry
michael@0 1051 doc_tote->SetKey(lang1_sub, DocTote::kUnusedKey);
michael@0 1052 doc_tote->SetScore(lang1_sub, 0);
michael@0 1053 doc_tote->SetReliability(lang1_sub, 0);
michael@0 1054
michael@0 1055 // In resultchunkvector, move all the bytes lang1 => lang2
michael@0 1056 if (resultchunkvector == NULL) {return;}
michael@0 1057
michael@0 1058 int k = 0;
michael@0 1059 uint16 prior_lang = UNKNOWN_LANGUAGE;
michael@0 1060 for (int i = 0; i < static_cast<int>(resultchunkvector->size()); ++i) {
michael@0 1061 ResultChunk* rc = &(*resultchunkvector)[i];
michael@0 1062 if (rc->lang1 == lang1) {
michael@0 1063 // Update entry[i] lang1 => lang2
michael@0 1064 rc->lang1 = lang2;
michael@0 1065 }
michael@0 1066 // One change may produce two merges -- entry before and entry after
michael@0 1067 if ((rc->lang1 == prior_lang) && (k > 0)) {
michael@0 1068 // Merge with previous, deleting entry[i]
michael@0 1069 ResultChunk* prior_rc = &(*resultchunkvector)[k - 1];
michael@0 1070 prior_rc->bytes += rc->bytes;
michael@0 1071 // fprintf(stderr, "MoveLang1ToLang2 merged [%d] => [%d]<br>\n", i, k-1);
michael@0 1072 } else {
michael@0 1073 // Keep entry[i]
michael@0 1074 (*resultchunkvector)[k] = (*resultchunkvector)[i];
michael@0 1075 // fprintf(stderr, "MoveLang1ToLang2 keep [%d] => [%d]<br>\n", i, k);
michael@0 1076 ++k;
michael@0 1077 }
michael@0 1078 prior_lang = rc->lang1;
michael@0 1079 }
michael@0 1080 resultchunkvector->resize(k);
michael@0 1081 }
michael@0 1082
michael@0 1083
michael@0 1084
michael@0 1085 // Move less likely byte count to more likely for close pairs of languages
michael@0 1086 // If given, also update resultchunkvector
michael@0 1087 void RefineScoredClosePairs(DocTote* doc_tote,
michael@0 1088 ResultChunkVector* resultchunkvector,
michael@0 1089 bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {
michael@0 1090 for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
michael@0 1091 int close_packedlang = doc_tote->Key(sub);
michael@0 1092 int subscr = LanguageCloseSet(static_cast<Language>(close_packedlang));
michael@0 1093 if (subscr == 0) {continue;}
michael@0 1094
michael@0 1095 // We have a close pair language -- if the other one is also scored and the
michael@0 1096 // longword score differs enough, put all our eggs into one basket
michael@0 1097
michael@0 1098 // Nonzero longword score: Go look for the other of this pair
michael@0 1099 for (int sub2 = sub + 1; sub2 < doc_tote->MaxSize(); ++sub2) {
michael@0 1100 if (LanguageCloseSet(static_cast<Language>(doc_tote->Key(sub2))) == subscr) {
michael@0 1101 // We have a matching pair
michael@0 1102 int close_packedlang2 = doc_tote->Key(sub2);
michael@0 1103
michael@0 1104 // Move all the text bytes from lower byte-count to higher one
michael@0 1105 int from_sub, to_sub;
michael@0 1106 Language from_lang, to_lang;
michael@0 1107 if (doc_tote->Value(sub) < doc_tote->Value(sub2)) {
michael@0 1108 from_sub = sub;
michael@0 1109 to_sub = sub2;
michael@0 1110 from_lang = static_cast<Language>(close_packedlang);
michael@0 1111 to_lang = static_cast<Language>(close_packedlang2);
michael@0 1112 } else {
michael@0 1113 from_sub = sub2;
michael@0 1114 to_sub = sub;
michael@0 1115 from_lang = static_cast<Language>(close_packedlang2);
michael@0 1116 to_lang = static_cast<Language>(close_packedlang);
michael@0 1117 }
michael@0 1118
michael@0 1119 if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) {
michael@0 1120 // Show fate of closepair language
michael@0 1121 int val = doc_tote->Value(from_sub); // byte count
michael@0 1122 int reli = doc_tote->Reliability(from_sub);
michael@0 1123 int reliable_percent = reli / (val ? val : 1); // avoid zdiv
michael@0 1124 fprintf(stderr, "{CloseLangPair: %s.%dR,%dB => %s}<br>\n",
michael@0 1125 LanguageCode(from_lang),
michael@0 1126 reliable_percent,
michael@0 1127 doc_tote->Value(from_sub),
michael@0 1128 LanguageCode(to_lang));
michael@0 1129 }
michael@0 1130 MoveLang1ToLang2(from_lang, to_lang, from_sub, to_sub,
michael@0 1131 doc_tote, resultchunkvector);
michael@0 1132 break; // Exit inner for sub2 loop
michael@0 1133 }
michael@0 1134 } // End for sub2
michael@0 1135 } // End for sub
michael@0 1136 }
michael@0 1137
michael@0 1138
michael@0 1139 void ApplyAllLanguageHints(Tote* chunk_tote, int tote_grams,
michael@0 1140 uint8* lang_hint_boost) {
michael@0 1141 }
michael@0 1142
michael@0 1143
michael@0 1144 void PrintHtmlEscapedText(FILE* f, const char* txt, int len) {
michael@0 1145 string temp(txt, len);
michael@0 1146 fprintf(f, "%s", GetHtmlEscapedText(temp).c_str());
michael@0 1147 }
michael@0 1148
michael@0 1149 void PrintLang(FILE* f, Tote* chunk_tote,
michael@0 1150 Language cur_lang, bool cur_unreliable,
michael@0 1151 Language prior_lang, bool prior_unreliable) {
michael@0 1152 if (cur_lang == prior_lang) {
michael@0 1153 fprintf(f, "[]");
michael@0 1154 } else {
michael@0 1155 fprintf(f, "[%s%s]", LanguageCode(cur_lang), cur_unreliable ? "*" : "");
michael@0 1156 }
michael@0 1157 }
michael@0 1158
michael@0 1159
michael@0 1160 void PrintTopLang(Language top_lang) {
michael@0 1161 if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
michael@0 1162 fprintf(stderr, "[] ");
michael@0 1163 } else {
michael@0 1164 fprintf(stderr, "[%s] ", LanguageName(top_lang));
michael@0 1165 prior_lang = top_lang;
michael@0 1166 }
michael@0 1167 }
michael@0 1168
michael@0 1169 void PrintTopLangSpeculative(Language top_lang) {
michael@0 1170 fprintf(stderr, "<span style=\"color:#%06X;\">", 0xa0a0a0);
michael@0 1171 if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
michael@0 1172 fprintf(stderr, "[] ");
michael@0 1173 } else {
michael@0 1174 fprintf(stderr, "[%s] ", LanguageName(top_lang));
michael@0 1175 prior_lang = top_lang;
michael@0 1176 }
michael@0 1177 fprintf(stderr, "</span>\n");
michael@0 1178 }
michael@0 1179
michael@0 1180 void PrintLangs(FILE* f, const Language* language3, const int* percent3,
michael@0 1181 const int* text_bytes, const bool* is_reliable) {
michael@0 1182 fprintf(f, "<br>&nbsp;&nbsp;Initial_Languages ");
michael@0 1183 if (language3[0] != UNKNOWN_LANGUAGE) {
michael@0 1184 fprintf(f, "%s%s(%d%%) ",
michael@0 1185 LanguageName(language3[0]),
michael@0 1186 *is_reliable ? "" : "*",
michael@0 1187 percent3[0]);
michael@0 1188 }
michael@0 1189 if (language3[1] != UNKNOWN_LANGUAGE) {
michael@0 1190 fprintf(f, "%s(%d%%) ", LanguageName(language3[1]), percent3[1]);
michael@0 1191 }
michael@0 1192 if (language3[2] != UNKNOWN_LANGUAGE) {
michael@0 1193 fprintf(f, "%s(%d%%) ", LanguageName(language3[2]), percent3[2]);
michael@0 1194 }
michael@0 1195 fprintf(f, "%d bytes \n", *text_bytes);
michael@0 1196
michael@0 1197 fprintf(f, "<br>\n");
michael@0 1198 }
michael@0 1199
michael@0 1200
michael@0 1201 // Return internal probability score (sum) per 1024 bytes
michael@0 1202 double GetNormalizedScore(Language lang, ULScript ulscript,
michael@0 1203 int bytecount, int score) {
michael@0 1204 if (bytecount <= 0) {return 0.0;}
michael@0 1205 return (score << 10) / bytecount;
michael@0 1206 }
michael@0 1207
michael@0 1208 // Extract return values before fixups
michael@0 1209 void ExtractLangEtc(DocTote* doc_tote, int total_text_bytes,
michael@0 1210 int* reliable_percent3, Language* language3, int* percent3,
michael@0 1211 double* normalized_score3,
michael@0 1212 int* text_bytes, bool* is_reliable) {
michael@0 1213 reliable_percent3[0] = 0;
michael@0 1214 reliable_percent3[1] = 0;
michael@0 1215 reliable_percent3[2] = 0;
michael@0 1216 language3[0] = UNKNOWN_LANGUAGE;
michael@0 1217 language3[1] = UNKNOWN_LANGUAGE;
michael@0 1218 language3[2] = UNKNOWN_LANGUAGE;
michael@0 1219 percent3[0] = 0;
michael@0 1220 percent3[1] = 0;
michael@0 1221 percent3[2] = 0;
michael@0 1222 normalized_score3[0] = 0.0;
michael@0 1223 normalized_score3[1] = 0.0;
michael@0 1224 normalized_score3[2] = 0.0;
michael@0 1225
michael@0 1226 *text_bytes = total_text_bytes;
michael@0 1227 *is_reliable = false;
michael@0 1228
michael@0 1229 int bytecount1 = 0;
michael@0 1230 int bytecount2 = 0;
michael@0 1231 int bytecount3 = 0;
michael@0 1232
michael@0 1233 int lang1 = doc_tote->Key(0);
michael@0 1234 if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) {
michael@0 1235 // We have a top language
michael@0 1236 language3[0] = static_cast<Language>(lang1);
michael@0 1237 bytecount1 = doc_tote->Value(0);
michael@0 1238 int reli1 = doc_tote->Reliability(0);
michael@0 1239 reliable_percent3[0] = reli1 / (bytecount1 ? bytecount1 : 1); // avoid zdiv
michael@0 1240 normalized_score3[0] = GetNormalizedScore(language3[0],
michael@0 1241 ULScript_Common,
michael@0 1242 bytecount1,
michael@0 1243 doc_tote->Score(0));
michael@0 1244 }
michael@0 1245
michael@0 1246 int lang2 = doc_tote->Key(1);
michael@0 1247 if ((lang2 != DocTote::kUnusedKey) && (lang2 != UNKNOWN_LANGUAGE)) {
michael@0 1248 language3[1] = static_cast<Language>(lang2);
michael@0 1249 bytecount2 = doc_tote->Value(1);
michael@0 1250 int reli2 = doc_tote->Reliability(1);
michael@0 1251 reliable_percent3[1] = reli2 / (bytecount2 ? bytecount2 : 1); // avoid zdiv
michael@0 1252 normalized_score3[1] = GetNormalizedScore(language3[1],
michael@0 1253 ULScript_Common,
michael@0 1254 bytecount2,
michael@0 1255 doc_tote->Score(1));
michael@0 1256 }
michael@0 1257
michael@0 1258 int lang3 = doc_tote->Key(2);
michael@0 1259 if ((lang3 != DocTote::kUnusedKey) && (lang3 != UNKNOWN_LANGUAGE)) {
michael@0 1260 language3[2] = static_cast<Language>(lang3);
michael@0 1261 bytecount3 = doc_tote->Value(2);
michael@0 1262 int reli3 = doc_tote->Reliability(2);
michael@0 1263 reliable_percent3[2] = reli3 / (bytecount3 ? bytecount3 : 1); // avoid zdiv
michael@0 1264 normalized_score3[2] = GetNormalizedScore(language3[2],
michael@0 1265 ULScript_Common,
michael@0 1266 bytecount3,
michael@0 1267 doc_tote->Score(2));
michael@0 1268 }
michael@0 1269
michael@0 1270 // Increase total bytes to sum (top 3) if low for some reason
michael@0 1271 int total_bytecount12 = bytecount1 + bytecount2;
michael@0 1272 int total_bytecount123 = total_bytecount12 + bytecount3;
michael@0 1273 if (total_text_bytes < total_bytecount123) {
michael@0 1274 total_text_bytes = total_bytecount123;
michael@0 1275 *text_bytes = total_text_bytes;
michael@0 1276 }
michael@0 1277
michael@0 1278 // Sum minus previous % gives better roundoff behavior than bytecount/total
michael@0 1279 int total_text_bytes_div = maxint(1, total_text_bytes); // Avoid zdiv
michael@0 1280 percent3[0] = (bytecount1 * 100) / total_text_bytes_div;
michael@0 1281 percent3[1] = (total_bytecount12 * 100) / total_text_bytes_div;
michael@0 1282 percent3[2] = (total_bytecount123 * 100) / total_text_bytes_div;
michael@0 1283 percent3[2] -= percent3[1];
michael@0 1284 percent3[1] -= percent3[0];
michael@0 1285
michael@0 1286 // Roundoff, say 96% 1.6% 1.4%, will produce non-obvious 96% 1% 2%
michael@0 1287 // Fix this explicitly
michael@0 1288 if (percent3[1] < percent3[2]) {
michael@0 1289 ++percent3[1];
michael@0 1290 --percent3[2];
michael@0 1291 }
michael@0 1292 if (percent3[0] < percent3[1]) {
michael@0 1293 ++percent3[0];
michael@0 1294 --percent3[1];
michael@0 1295 }
michael@0 1296
michael@0 1297 *text_bytes = total_text_bytes;
michael@0 1298
michael@0 1299 if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) {
michael@0 1300 // We have a top language
michael@0 1301 // Its reliability is overall result reliability
michael@0 1302 int bytecount = doc_tote->Value(0);
michael@0 1303 int reli = doc_tote->Reliability(0);
michael@0 1304 int reliable_percent = reli / (bytecount ? bytecount : 1); // avoid zdiv
michael@0 1305 *is_reliable = (reliable_percent >= kMinReliableKeepPercent);
michael@0 1306 } else {
michael@0 1307 // No top language at all. This can happen with zero text or 100% Klingon
michael@0 1308 // if extended=false. Just return all UNKNOWN_LANGUAGE, unreliable.
michael@0 1309 *is_reliable = false;
michael@0 1310 }
michael@0 1311
michael@0 1312 // If ignore percent is too large, set unreliable.
michael@0 1313 int ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]);
michael@0 1314 if ((ignore_percent > kIgnoreMaxPercent)) {
michael@0 1315 *is_reliable = false;
michael@0 1316 }
michael@0 1317 }
michael@0 1318
michael@0 1319 bool IsFIGS(Language lang) {
michael@0 1320 if (lang == FRENCH) {return true;}
michael@0 1321 if (lang == ITALIAN) {return true;}
michael@0 1322 if (lang == GERMAN) {return true;}
michael@0 1323 if (lang == SPANISH) {return true;}
michael@0 1324 return false;
michael@0 1325 }
michael@0 1326
michael@0 1327 bool IsEFIGS(Language lang) {
michael@0 1328 if (lang == ENGLISH) {return true;}
michael@0 1329 if (lang == FRENCH) {return true;}
michael@0 1330 if (lang == ITALIAN) {return true;}
michael@0 1331 if (lang == GERMAN) {return true;}
michael@0 1332 if (lang == SPANISH) {return true;}
michael@0 1333 return false;
michael@0 1334 }
michael@0 1335
michael@0 1336 // For Tier3 languages, require more bytes of text to override
michael@0 1337 // the first-place language
michael@0 1338 static const int kGoodSecondT1T2MinBytes = 15; // <this => no second
michael@0 1339 static const int kGoodSecondT3MinBytes = 128; // <this => no second
michael@0 1340
michael@0 1341 // Calculate a single summary language for the document, and its reliability.
michael@0 1342 // Returns language3[0] or language3[1] or ENGLISH or UNKNOWN_LANGUAGE
michael@0 1343 // This is the heart of matching human-rater perception.
michael@0 1344 // reliable_percent3[] is currently unused
michael@0 1345 //
michael@0 1346 // Do not return Tier3 second language unless there are at least 128 bytes
michael@0 1347 void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes,
michael@0 1348 const int* reliable_percent3,
michael@0 1349 const Language* language3,
michael@0 1350 const int* percent3,
michael@0 1351 Language* summary_lang, bool* is_reliable,
michael@0 1352 bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {
michael@0 1353 // Vector of active languages; changes if we delete some
michael@0 1354 int slot_count = 3;
michael@0 1355 int active_slot[3] = {0, 1, 2};
michael@0 1356
michael@0 1357 int ignore_percent = 0;
michael@0 1358 int return_percent = percent3[0]; // Default to top lang
michael@0 1359 *summary_lang = language3[0];
michael@0 1360 *is_reliable = true;
michael@0 1361 if (percent3[0] < kKeepMinPercent) {*is_reliable = false;}
michael@0 1362
michael@0 1363 // If any of top 3 is IGNORE, remove it and increment ignore_percent
michael@0 1364 for (int i = 0; i < 3; ++i) {
michael@0 1365 if (language3[i] == TG_UNKNOWN_LANGUAGE) {
michael@0 1366 ignore_percent += percent3[i];
michael@0 1367 // Move the rest up, levaing input vectors unchanged
michael@0 1368 for (int j=i+1; j < 3; ++j) {
michael@0 1369 active_slot[j - 1] = active_slot[j];
michael@0 1370 }
michael@0 1371 -- slot_count;
michael@0 1372 // Logically remove Ignore from percentage-text calculation
michael@0 1373 // (extra 1 in 101 avoids zdiv, biases slightly small)
michael@0 1374 return_percent = (percent3[0] * 100) / (101 - ignore_percent);
michael@0 1375 *summary_lang = language3[active_slot[0]];
michael@0 1376 if (percent3[active_slot[0]] < kKeepMinPercent) {*is_reliable = false;}
michael@0 1377 }
michael@0 1378 }
michael@0 1379
michael@0 1380
michael@0 1381 // If English and X, where X (not UNK) is big enough,
michael@0 1382 // assume the English is boilerplate and return X.
michael@0 1383 // Logically remove English from percentage-text calculation
michael@0 1384 int second_bytes = (total_text_bytes * percent3[active_slot[1]]) / 100;
michael@0 1385 // Require more bytes of text for Tier3 languages
michael@0 1386 int minbytesneeded = kGoodSecondT1T2MinBytes;
michael@0 1387 int plang_second = PerScriptNumber(ULScript_Latin, language3[active_slot[1]]);
michael@0 1388
michael@0 1389 if ((language3[active_slot[0]] == ENGLISH) &&
michael@0 1390 (language3[active_slot[1]] != ENGLISH) &&
michael@0 1391 (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
michael@0 1392 (percent3[active_slot[1]] >= kNonEnBoilerplateMinPercent) &&
michael@0 1393 (second_bytes >= minbytesneeded)) {
michael@0 1394 ignore_percent += percent3[active_slot[0]];
michael@0 1395 return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
michael@0 1396 *summary_lang = language3[active_slot[1]];
michael@0 1397 if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
michael@0 1398
michael@0 1399 // Else If FIGS and X, where X (not UNK, EFIGS) is big enough,
michael@0 1400 // assume the FIGS is boilerplate and return X.
michael@0 1401 // Logically remove FIGS from percentage-text calculation
michael@0 1402 } else if (IsFIGS(language3[active_slot[0]]) &&
michael@0 1403 !IsEFIGS(language3[active_slot[1]]) &&
michael@0 1404 (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
michael@0 1405 (percent3[active_slot[1]] >= kNonFIGSBoilerplateMinPercent) &&
michael@0 1406 (second_bytes >= minbytesneeded)) {
michael@0 1407 ignore_percent += percent3[active_slot[0]];
michael@0 1408 return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
michael@0 1409 *summary_lang = language3[active_slot[1]];
michael@0 1410 if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
michael@0 1411
michael@0 1412 // Else we are returning the first language, but want to improve its
michael@0 1413 // return_percent if the second language should be ignored
michael@0 1414 } else if ((language3[active_slot[1]] == ENGLISH) &&
michael@0 1415 (language3[active_slot[0]] != ENGLISH)) {
michael@0 1416 ignore_percent += percent3[active_slot[1]];
michael@0 1417 return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
michael@0 1418 } else if (IsFIGS(language3[active_slot[1]]) &&
michael@0 1419 !IsEFIGS(language3[active_slot[0]])) {
michael@0 1420 ignore_percent += percent3[active_slot[1]];
michael@0 1421 return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
michael@0 1422 }
michael@0 1423
michael@0 1424 // If return percent is too small (too many languages), return UNKNOWN
michael@0 1425 if ((return_percent < kGoodFirstMinPercent)) {
michael@0 1426 if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
michael@0 1427 fprintf(stderr, "{Unreli %s %d%% percent too small} ",
michael@0 1428 LanguageCode(*summary_lang), return_percent);
michael@0 1429 }
michael@0 1430 *summary_lang = UNKNOWN_LANGUAGE;
michael@0 1431 *is_reliable = false;
michael@0 1432 }
michael@0 1433
michael@0 1434 // If return percent is small, return language but set unreliable.
michael@0 1435 if ((return_percent < kGoodFirstReliableMinPercent)) {
michael@0 1436 *is_reliable = false;
michael@0 1437 }
michael@0 1438
michael@0 1439 // If ignore percent is too large, set unreliable.
michael@0 1440 ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]);
michael@0 1441 if ((ignore_percent > kIgnoreMaxPercent)) {
michael@0 1442 *is_reliable = false;
michael@0 1443 }
michael@0 1444
michael@0 1445 // If we removed all the active languages, return UNKNOWN
michael@0 1446 if (slot_count == 0) {
michael@0 1447 if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
michael@0 1448 fprintf(stderr, "{Unreli %s no languages left} ",
michael@0 1449 LanguageCode(*summary_lang));
michael@0 1450 }
michael@0 1451 *summary_lang = UNKNOWN_LANGUAGE;
michael@0 1452 *is_reliable = false;
michael@0 1453 }
michael@0 1454 }
michael@0 1455
michael@0 1456 void AddLangPriorBoost(Language lang, uint32 langprob,
michael@0 1457 ScoringContext* scoringcontext) {
michael@0 1458 // This is called 0..n times with language hints
michael@0 1459 // but we don't know the script -- so boost either or both Latn, Othr.
michael@0 1460
michael@0 1461 if (IsLatnLanguage(lang)) {
michael@0 1462 LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
michael@0 1463 int n = langprior_boost->n;
michael@0 1464 langprior_boost->langprob[n] = langprob;
michael@0 1465 langprior_boost->n = langprior_boost->wrap(n + 1);
michael@0 1466 }
michael@0 1467
michael@0 1468 if (IsOthrLanguage(lang)) {
michael@0 1469 LangBoosts* langprior_boost = &scoringcontext->langprior_boost.othr;
michael@0 1470 int n = langprior_boost->n;
michael@0 1471 langprior_boost->langprob[n] = langprob;
michael@0 1472 langprior_boost->n = langprior_boost->wrap(n + 1);
michael@0 1473 }
michael@0 1474
michael@0 1475 }
michael@0 1476
michael@0 1477 void AddOneWhack(Language whacker_lang, Language whackee_lang,
michael@0 1478 ScoringContext* scoringcontext) {
michael@0 1479 uint32 langprob = MakeLangProb(whackee_lang, 1);
michael@0 1480 // This logic avoids hr-Latn whacking sr-Cyrl, but still whacks sr-Latn
michael@0 1481 if (IsLatnLanguage(whacker_lang) && IsLatnLanguage(whackee_lang)) {
michael@0 1482 LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
michael@0 1483 int n = langprior_whack->n;
michael@0 1484 langprior_whack->langprob[n] = langprob;
michael@0 1485 langprior_whack->n = langprior_whack->wrap(n + 1);
michael@0 1486 }
michael@0 1487 if (IsOthrLanguage(whacker_lang) && IsOthrLanguage(whackee_lang)) {
michael@0 1488 LangBoosts* langprior_whack = &scoringcontext->langprior_whack.othr;
michael@0 1489 int n = langprior_whack->n;
michael@0 1490 langprior_whack->langprob[n] = langprob;
michael@0 1491 langprior_whack->n = langprior_whack->wrap(n + 1);
michael@0 1492 }
michael@0 1493 }
michael@0 1494
michael@0 1495 void AddCloseLangWhack(Language lang, ScoringContext* scoringcontext) {
michael@0 1496 // We do not in general want zh-Hans and zh-Hant to be close pairs,
michael@0 1497 // but we do here.
michael@0 1498 if (lang == CLD2::CHINESE) {
michael@0 1499 AddOneWhack(lang, CLD2::CHINESE_T, scoringcontext);
michael@0 1500 return;
michael@0 1501 }
michael@0 1502 if (lang == CLD2::CHINESE_T) {
michael@0 1503 AddOneWhack(lang, CLD2::CHINESE, scoringcontext);
michael@0 1504 return;
michael@0 1505 }
michael@0 1506
michael@0 1507 int base_lang_set = LanguageCloseSet(lang);
michael@0 1508 if (base_lang_set == 0) {return;}
michael@0 1509 // TODO: add an explicit list of each set to avoid this 512-times loop
michael@0 1510 for (int i = 0; i < kLanguageToPLangSize; ++i) {
michael@0 1511 Language lang2 = static_cast<Language>(i);
michael@0 1512 if ((base_lang_set == LanguageCloseSet(lang2)) && (lang != lang2)) {
michael@0 1513 AddOneWhack(lang, lang2, scoringcontext);
michael@0 1514 }
michael@0 1515 }
michael@0 1516 }
michael@0 1517
michael@0 1518
michael@0 1519 void ApplyHints(const char* buffer,
michael@0 1520 int buffer_length,
michael@0 1521 bool is_plain_text,
michael@0 1522 const CLDHints* cld_hints,
michael@0 1523 ScoringContext* scoringcontext) {
michael@0 1524 CLDLangPriors lang_priors;
michael@0 1525 InitCLDLangPriors(&lang_priors);
michael@0 1526
michael@0 1527 // We now use lang= tags.
michael@0 1528 // Last look, circa 2008 found only 15% of web pages with lang= tags and
michael@0 1529 // many of those were wrong. Now (July 2011), we find 44% of web pages have
michael@0 1530 // lang= tags, and most of them are correct. So we now give them substantial
michael@0 1531 // weight in each chunk scored.
michael@0 1532 if (!is_plain_text) {
michael@0 1533 // Get any contained language tags in first n KB
michael@0 1534 int32 max_scan_bytes = FLAGS_cld_max_lang_tag_scan_kb << 10;
michael@0 1535 string lang_tags = GetLangTagsFromHtml(buffer, buffer_length,
michael@0 1536 max_scan_bytes);
michael@0 1537 SetCLDLangTagsHint(lang_tags, &lang_priors);
michael@0 1538 if (scoringcontext->flags_cld2_html) {
michael@0 1539 if (!lang_tags.empty()) {
michael@0 1540 fprintf(scoringcontext->debug_file, "<br>lang_tags '%s'<br>\n",
michael@0 1541 lang_tags.c_str());
michael@0 1542 }
michael@0 1543 }
michael@0 1544 }
michael@0 1545
michael@0 1546 if (cld_hints != NULL) {
michael@0 1547 if ((cld_hints->content_language_hint != NULL) &&
michael@0 1548 (cld_hints->content_language_hint[0] != '\0')) {
michael@0 1549 SetCLDContentLangHint(cld_hints->content_language_hint, &lang_priors);
michael@0 1550 }
michael@0 1551
michael@0 1552 // Input is from GetTLD(), already lowercased
michael@0 1553 if ((cld_hints->tld_hint != NULL) && (cld_hints->tld_hint[0] != '\0')) {
michael@0 1554 SetCLDTLDHint(cld_hints->tld_hint, &lang_priors);
michael@0 1555 }
michael@0 1556
michael@0 1557 if (cld_hints->encoding_hint != UNKNOWN_ENCODING) {
michael@0 1558 Encoding enc = static_cast<Encoding>(cld_hints->encoding_hint);
michael@0 1559 SetCLDEncodingHint(enc, &lang_priors);
michael@0 1560 }
michael@0 1561
michael@0 1562 if (cld_hints->language_hint != UNKNOWN_LANGUAGE) {
michael@0 1563 SetCLDLanguageHint(cld_hints->language_hint, &lang_priors);
michael@0 1564 }
michael@0 1565 }
michael@0 1566
michael@0 1567 // Keep no more than four different languages with hints
michael@0 1568 TrimCLDLangPriors(4, &lang_priors);
michael@0 1569
michael@0 1570 if (scoringcontext->flags_cld2_html) {
michael@0 1571 string print_temp = DumpCLDLangPriors(&lang_priors);
michael@0 1572 if (!print_temp.empty()) {
michael@0 1573 fprintf(scoringcontext->debug_file, "DumpCLDLangPriors %s<br>\n",
michael@0 1574 print_temp.c_str());
michael@0 1575 }
michael@0 1576 }
michael@0 1577
michael@0 1578 // Put boosts into ScoringContext
michael@0 1579 for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
michael@0 1580 Language lang = GetCLDPriorLang(lang_priors.prior[i]);
michael@0 1581 int qprob = GetCLDPriorWeight(lang_priors.prior[i]);
michael@0 1582 if (qprob > 0) {
michael@0 1583 uint32 langprob = MakeLangProb(lang, qprob);
michael@0 1584 AddLangPriorBoost(lang, langprob, scoringcontext);
michael@0 1585 }
michael@0 1586 }
michael@0 1587
michael@0 1588 // Put whacks into scoring context
michael@0 1589 // We do not in general want zh-Hans and zh-Hant to be close pairs,
michael@0 1590 // but we do here. Use close_set_count[kCloseSetSize] to count zh, zh-Hant
michael@0 1591 std::vector<int> close_set_count(kCloseSetSize + 1, 0);
michael@0 1592
michael@0 1593 for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
michael@0 1594 Language lang = GetCLDPriorLang(lang_priors.prior[i]);
michael@0 1595 ++close_set_count[LanguageCloseSet(lang)];
michael@0 1596 if (lang == CLD2::CHINESE) {++close_set_count[kCloseSetSize];}
michael@0 1597 if (lang == CLD2::CHINESE_T) {++close_set_count[kCloseSetSize];}
michael@0 1598 }
michael@0 1599
michael@0 1600 // If a boost language is in a close set, force suppressing the others in
michael@0 1601 // that set, if exactly one of the set is present
michael@0 1602 for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
michael@0 1603 Language lang = GetCLDPriorLang(lang_priors.prior[i]);
michael@0 1604 int qprob = GetCLDPriorWeight(lang_priors.prior[i]);
michael@0 1605 if (qprob > 0) {
michael@0 1606 int close_set = LanguageCloseSet(lang);
michael@0 1607 if ((close_set > 0) && (close_set_count[close_set] == 1)) {
michael@0 1608 AddCloseLangWhack(lang, scoringcontext);
michael@0 1609 }
michael@0 1610 if (((lang == CLD2::CHINESE) || (lang == CLD2::CHINESE_T)) &&
michael@0 1611 (close_set_count[kCloseSetSize] == 1)) {
michael@0 1612 AddCloseLangWhack(lang, scoringcontext);
michael@0 1613 }
michael@0 1614 }
michael@0 1615 }
michael@0 1616
michael@0 1617
michael@0 1618
michael@0 1619
michael@0 1620
michael@0 1621
michael@0 1622 }
michael@0 1623
michael@0 1624
michael@0 1625
// Detect the language of buffer[0..buffer_length) and summarize the result.
//
// Outputs (all reset to unknown/zero/false on entry):
//   language3, percent3, normalized_score3 -- top three languages; each
//       array is written at indexes 0..2
//   resultchunkvector -- cleared if non-NULL, then passed to the scorers
//   text_bytes, is_reliable -- filled in by ExtractLangEtc / CalcSummaryLang
// Returns the summary language, or UNKNOWN_LANGUAGE when buffer_length is 0
// or no quadgram scoring table is available.
//
// If the first pass does not produce a good answer, the function calls itself
// again on the same buffer with extra flags: kCLDFlagSqueeze when a span
// triggers the squeeze test, otherwise kCLDFlagTop40 | kCLDFlagRepeats |
// kCLDFlagFinish. kCLDFlagFinish forces that pass to return a result.
michael@0 1626 // Results language3/percent3/normalized_score3 must be exactly three items
michael@0 1627 Language DetectLanguageSummaryV2(
michael@0 1628 const char* buffer,
michael@0 1629 int buffer_length,
michael@0 1630 bool is_plain_text,
michael@0 1631 const CLDHints* cld_hints,
michael@0 1632 bool allow_extended_lang,
michael@0 1633 int flags,
michael@0 1634 Language plus_one,
michael@0 1635 Language* language3,
michael@0 1636 int* percent3,
michael@0 1637 double* normalized_score3,
michael@0 1638 ResultChunkVector* resultchunkvector,
michael@0 1639 int* text_bytes,
michael@0 1640 bool* is_reliable) {
// Reset every output so that all early returns leave them well defined
michael@0 1641 language3[0] = UNKNOWN_LANGUAGE;
michael@0 1642 language3[1] = UNKNOWN_LANGUAGE;
michael@0 1643 language3[2] = UNKNOWN_LANGUAGE;
michael@0 1644 percent3[0] = 0;
michael@0 1645 percent3[1] = 0;
michael@0 1646 percent3[2] = 0;
michael@0 1647 normalized_score3[0] = 0.0;
michael@0 1648 normalized_score3[1] = 0.0;
michael@0 1649 normalized_score3[2] = 0.0;
michael@0 1650 if (resultchunkvector != NULL) {
michael@0 1651 resultchunkvector->clear();
michael@0 1652 }
michael@0 1653 *text_bytes = 0;
michael@0 1654 *is_reliable = false;
michael@0 1655
// Optional debug echo of the input text to stderr, escaped for HTML or plain
michael@0 1656 if ((flags & kCLDFlagEcho) != 0) {
michael@0 1657 string temp(buffer, buffer_length);
michael@0 1658 if ((flags & kCLDFlagHtml) != 0) {
michael@0 1659 fprintf(stderr, "CLD2[%d] '%s'<br>\n",
michael@0 1660 buffer_length, GetHtmlEscapedText(temp).c_str());
michael@0 1661 } else {
michael@0 1662 fprintf(stderr, "CLD2[%d] '%s'\n",
michael@0 1663 buffer_length, GetPlainEscapedText(temp).c_str());
michael@0 1664 }
michael@0 1665 }
michael@0 1666
michael@0 1667 #ifdef CLD2_DYNAMIC_MODE
michael@0 1668 // In dynamic mode, we immediately return UNKNOWN_LANGUAGE if the data file
michael@0 1669 // hasn't been loaded yet. This is the only sane thing we can do, as there
michael@0 1670 // are no scoring tables to consult.
michael@0 1671 bool dataLoaded = isDataLoaded();
michael@0 1672 if ((flags & kCLDFlagVerbose) != 0) {
michael@0 1673 fprintf(stderr, "Data loaded: %s\n", (dataLoaded ? "true" : "false"));
michael@0 1674 }
michael@0 1675 if (!dataLoaded) {
michael@0 1676 return UNKNOWN_LANGUAGE;
michael@0 1677 }
michael@0 1678 #endif
michael@0 1679
michael@0 1680 // Exit now if no text
michael@0 1681 if (buffer_length == 0) {return UNKNOWN_LANGUAGE;}
// Exit as well if there is no quadgram table to score against
michael@0 1682 if (kScoringtables.quadgram_obj == NULL) {return UNKNOWN_LANGUAGE;}
michael@0 1683
michael@0 1684 // Document totals
michael@0 1685 DocTote doc_tote; // Reliability = 0..100
michael@0 1686
michael@0 1687 // ScoringContext carries state across scriptspans
michael@0 1688 ScoringContext scoringcontext;
michael@0 1689 scoringcontext.debug_file = stderr;
michael@0 1690 scoringcontext.flags_cld2_score_as_quads =
michael@0 1691 ((flags & kCLDFlagScoreAsQuads) != 0);
michael@0 1692 scoringcontext.flags_cld2_html = ((flags & kCLDFlagHtml) != 0);
michael@0 1693 scoringcontext.flags_cld2_cr = ((flags & kCLDFlagCr) != 0);
michael@0 1694 scoringcontext.flags_cld2_verbose = ((flags & kCLDFlagVerbose) != 0);
michael@0 1695 scoringcontext.prior_chunk_lang = UNKNOWN_LANGUAGE;
michael@0 1696 scoringcontext.ulscript = ULScript_Common;
michael@0 1697 scoringcontext.scoringtables = &kScoringtables;
michael@0 1698 scoringcontext.scanner = NULL;
michael@0 1699 scoringcontext.init(); // Clear the internal memory arrays
michael@0 1700
// Debug flags are kept in locals derived from the flags argument
michael@0 1701 // Now thread safe.
michael@0 1702 bool FLAGS_cld2_html = ((flags & kCLDFlagHtml) != 0);
michael@0 1703 bool FLAGS_cld2_quiet = ((flags & kCLDFlagQuiet) != 0);
michael@0 1704
// Load language-hint boosts and whacks into scoringcontext before scoring
michael@0 1705 ApplyHints(buffer, buffer_length, is_plain_text, cld_hints, &scoringcontext);
michael@0 1706
// NOTE(review): next_other_tote, tote_num, totes, tote_seen, tote_grams and
// tote_script below are declared but not referenced again in this function.
michael@0 1707 // Four individual script totals, Latin, Han, other2, other3
michael@0 1708 int next_other_tote = 2;
michael@0 1709 int tote_num = 0;
michael@0 1710
michael@0 1711 // Four totes for up to four different scripts pending at once
michael@0 1712 Tote totes[4]; // [0] Latn [1] Hani [2] other [3] other
michael@0 1713 bool tote_seen[4] = {false, false, false, false};
michael@0 1714 int tote_grams[4] = {0, 0, 0, 0}; // Number in partial chunk
michael@0 1715 ULScript tote_script[4] =
michael@0 1716 {ULScript_Latin, ULScript_Hani, ULScript_Common, ULScript_Common};
michael@0 1717
michael@0 1718 // Loop through text spans in a single script
michael@0 1719 ScriptScanner ss(buffer, buffer_length, is_plain_text);
michael@0 1720 LangSpan scriptspan;
michael@0 1721
michael@0 1722 scoringcontext.scanner = &ss;
michael@0 1723
michael@0 1724 scriptspan.text = NULL;
michael@0 1725 scriptspan.text_bytes = 0;
michael@0 1726 scriptspan.offset = 0;
michael@0 1727 scriptspan.ulscript = ULScript_Common;
michael@0 1728 scriptspan.lang = UNKNOWN_LANGUAGE;
michael@0 1729
// NOTE(review): textlimit only feeds advance_limit; advance_by, advance_limit,
// initial_word_span, chunksizeunis and spantooshortlimit are computed below
// but not read again in this function.
michael@0 1730 int total_text_bytes = 0;
michael@0 1731 int textlimit = FLAGS_cld_textlimit << 10; // in KB
michael@0 1732 if (textlimit == 0) {textlimit = 0x7fffffff;}
michael@0 1733
michael@0 1734 int advance_by = 2; // Advance 2 bytes
michael@0 1735 int advance_limit = textlimit >> 3; // For first 1/8 of max document
michael@0 1736
michael@0 1737 int initial_word_span = kDefaultWordSpan;
michael@0 1738 if (FLAGS_cld_forcewords) {
michael@0 1739 initial_word_span = kReallyBigWordSpan;
michael@0 1740 }
michael@0 1741
michael@0 1742 // Pick up chunk sizes
michael@0 1743 // Smoothwidth is units of quadgrams, about 2.5 chars (unigrams) each
michael@0 1744 // Sanity check -- force into a reasonable range
michael@0 1745 int chunksizequads = FLAGS_cld_smoothwidth;
michael@0 1746 chunksizequads = minint(maxint(chunksizequads, kMinChunkSizeQuads),
michael@0 1747 kMaxChunkSizeQuads);
michael@0 1748 int chunksizeunis = (chunksizequads * 5) >> 1;
michael@0 1749
michael@0 1750 // Varying short-span limit doesn't work well -- skips too much beyond 20KB
michael@0 1751 // int spantooshortlimit = advance_by * FLAGS_cld_smoothwidth;
michael@0 1752 int spantooshortlimit = kShortSpanThresh;
michael@0 1753
michael@0 1754 // For debugging only. Not thread-safe
michael@0 1755 prior_lang = UNKNOWN_LANGUAGE;
michael@0 1756 prior_unreliable = false;
michael@0 1757
michael@0 1758 // Allocate full-document prediction table for finding repeating words
// The table is zeroed, and later passed to the repeat-word removers, only
// when FlagRepeats(flags) is set. Every return path after this allocation
// must delete[] it.
michael@0 1759 int hash = 0;
michael@0 1760 int* predict_tbl = new int[kPredictionTableSize];
michael@0 1761 if (FlagRepeats(flags)) {
michael@0 1762 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
michael@0 1763 }
michael@0 1764
michael@0 1765
michael@0 1766
michael@0 1767 // Loop through scriptspans accumulating number of text bytes in each language
michael@0 1768 while (ss.GetOneScriptSpanLower(&scriptspan)) {
// NOTE(review): ulscript is not read again inside this loop
michael@0 1769 ULScript ulscript = scriptspan.ulscript;
michael@0 1770
michael@0 1771 // Squeeze out big chunks of text span if asked to
michael@0 1772 if (FlagSqueeze(flags)) {
michael@0 1773 // Remove repetitive or mostly-spaces chunks
// The ...Overwrite variant is used when a result chunk vector was requested
michael@0 1774 int newlen;
michael@0 1775 int chunksize = 0; // Use the default
michael@0 1776 if (resultchunkvector != NULL) {
michael@0 1777 newlen = CheapSqueezeInplaceOverwrite(scriptspan.text,
michael@0 1778 scriptspan.text_bytes,
michael@0 1779 chunksize);
michael@0 1780 } else {
michael@0 1781 newlen = CheapSqueezeInplace(scriptspan.text, scriptspan.text_bytes,
michael@0 1782 chunksize);
michael@0 1783 }
michael@0 1784 scriptspan.text_bytes = newlen;
michael@0 1785 } else {
michael@0 1786 // Check now and then to see if we should be squeezing
michael@0 1787 if (((kCheapSqueezeTestThresh >> 1) < scriptspan.text_bytes) &&
michael@0 1788 !FlagFinish(flags)) {
michael@0 1789 // fprintf(stderr, "CheapSqueezeTriggerTest, "
michael@0 1790 // "first %d bytes of %d (>%d/2)<br>\n",
michael@0 1791 // kCheapSqueezeTestLen,
michael@0 1792 // scriptspan.text_bytes,
michael@0 1793 // kCheapSqueezeTestThresh);
michael@0 1794
michael@0 1795 if (CheapSqueezeTriggerTest(scriptspan.text,
michael@0 1796 scriptspan.text_bytes,
michael@0 1797 kCheapSqueezeTestLen)) {
michael@0 1798 // Recursive call with big-chunk squeezing set
michael@0 1799 if (FLAGS_cld2_html || FLAGS_dbgscore) {
michael@0 1800 fprintf(stderr,
michael@0 1801 "<br>---text_bytes[%d] Recursive(Squeeze)---<br><br>\n",
michael@0 1802 total_text_bytes);
michael@0 1803 }
michael@0 1804 // Deallocate full-document prediction table
michael@0 1805 delete[] predict_tbl;
michael@0 1806
// Restart on the whole buffer with kCLDFlagSqueeze added; plus_one is
// passed through unchanged
michael@0 1807 return DetectLanguageSummaryV2(
michael@0 1808 buffer,
michael@0 1809 buffer_length,
michael@0 1810 is_plain_text,
michael@0 1811 cld_hints,
michael@0 1812 allow_extended_lang,
michael@0 1813 flags | kCLDFlagSqueeze,
michael@0 1814 plus_one,
michael@0 1815 language3,
michael@0 1816 percent3,
michael@0 1817 normalized_score3,
michael@0 1818 resultchunkvector,
michael@0 1819 text_bytes,
michael@0 1820 is_reliable);
michael@0 1821 }
michael@0 1822 }
michael@0 1823 }
michael@0 1824
michael@0 1825 // Remove repetitive words if asked to
michael@0 1826 if (FlagRepeats(flags)) {
michael@0 1827 // Remove repetitive words
michael@0 1828 int newlen;
michael@0 1829 if (resultchunkvector != NULL) {
michael@0 1830 newlen = CheapRepWordsInplaceOverwrite(scriptspan.text,
michael@0 1831 scriptspan.text_bytes,
michael@0 1832 &hash, predict_tbl);
michael@0 1833 } else {
michael@0 1834 newlen = CheapRepWordsInplace(scriptspan.text, scriptspan.text_bytes,
michael@0 1835 &hash, predict_tbl);
michael@0 1836 }
michael@0 1837 scriptspan.text_bytes = newlen;
michael@0 1838 }
michael@0 1839
michael@0 1840 // Scoring depends on scriptspan buffer ALWAYS having
michael@0 1841 // leading space and off-the-end space space space NUL,
michael@0 1842 // DCHECK(scriptspan.text[0] == ' ');
michael@0 1843 // DCHECK(scriptspan.text[scriptspan.text_bytes + 0] == ' ');
michael@0 1844 // DCHECK(scriptspan.text[scriptspan.text_bytes + 1] == ' ');
michael@0 1845 // DCHECK(scriptspan.text[scriptspan.text_bytes + 2] == ' ');
michael@0 1846 // DCHECK(scriptspan.text[scriptspan.text_bytes + 3] == '\0');
michael@0 1847
michael@0 1848 // The real scoring
michael@0 1849 // Accumulate directly into the document total, or accumulate in one of four
michael@0 1850 // chunk totals. The purpose of the multiple chunk totals is to piece
michael@0 1851 // together short choppy pieces of text in alternating scripts. One total is
michael@0 1852 // dedicated to Latin text, one to Han text, and the other two are dynamically
michael@0 1853 // assigned.
michael@0 1854
michael@0 1855 scoringcontext.ulscript = scriptspan.ulscript;
michael@0 1856 // FLAGS_cld2_html = scoringcontext.flags_cld2_html;
michael@0 1857
michael@0 1858 ScoreOneScriptSpan(scriptspan,
michael@0 1859 &scoringcontext,
michael@0 1860 &doc_tote,
michael@0 1861 resultchunkvector);
michael@0 1862
// Counts the span length after any squeezing / repeat removal above
michael@0 1863 total_text_bytes += scriptspan.text_bytes;
michael@0 1864 } // End while (ss.GetOneScriptSpanLower())
michael@0 1865
michael@0 1866 // Deallocate full-document prediction table
michael@0 1867 delete[] predict_tbl;
michael@0 1868
michael@0 1869 if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
michael@0 1870 // If no forced <cr>, put one in front of dump
michael@0 1871 if (!scoringcontext.flags_cld2_cr) {fprintf(stderr, "<br>\n");}
michael@0 1872 doc_tote.Dump(stderr);
michael@0 1873 }
michael@0 1874
michael@0 1875
michael@0 1876 // If extended languages are disallowed, remove them here
michael@0 1877 if (!allow_extended_lang) {
michael@0 1878 RemoveExtendedLanguages(&doc_tote);
michael@0 1879 }
michael@0 1880
michael@0 1881 // Force close pairs to one or the other
michael@0 1882 // If given, also update resultchunkvector
michael@0 1883 RefineScoredClosePairs(&doc_tote, resultchunkvector,
michael@0 1884 FLAGS_cld2_html, FLAGS_cld2_quiet);
michael@0 1885
michael@0 1886
michael@0 1887 // Calculate return results
michael@0 1888 // Find top three byte counts in tote heap
michael@0 1889 int reliable_percent3[3];
michael@0 1890
michael@0 1891 // Cannot use Add, etc. after sorting
michael@0 1892 doc_tote.Sort(3);
michael@0 1893
michael@0 1894 ExtractLangEtc(&doc_tote, total_text_bytes,
michael@0 1895 reliable_percent3, language3, percent3, normalized_score3,
michael@0 1896 text_bytes, is_reliable);
michael@0 1897
// Decide whether this pass is final or a refining recursive pass is needed
michael@0 1898 bool have_good_answer = false;
michael@0 1899 if (FlagFinish(flags)) {
michael@0 1900 // Force a result
michael@0 1901 have_good_answer = true;
michael@0 1902 } else if (total_text_bytes <= kShortTextThresh) {
michael@0 1903 // Don't recurse on short text -- we already did word scores
michael@0 1904 have_good_answer = true;
michael@0 1905 } else if (*is_reliable &&
michael@0 1906 (percent3[0] >= kGoodLang1Percent)) {
michael@0 1907 have_good_answer = true;
michael@0 1908 } else if (*is_reliable &&
michael@0 1909 ((percent3[0] + percent3[1]) >= kGoodLang1and2Percent)) {
michael@0 1910 have_good_answer = true;
michael@0 1911 }
michael@0 1912
michael@0 1913
michael@0 1914 if (have_good_answer) {
michael@0 1915 // This is the real, non-recursive return
michael@0 1916
michael@0 1917 // Move bytes for unreliable langs to another lang or UNKNOWN
michael@0 1918 RemoveUnreliableLanguages(&doc_tote, FLAGS_cld2_html, FLAGS_cld2_quiet);
michael@0 1919
michael@0 1920 // Redo the result extraction after the removal above
michael@0 1921 doc_tote.Sort(3);
michael@0 1922 ExtractLangEtc(&doc_tote, total_text_bytes,
michael@0 1923 reliable_percent3, language3, percent3, normalized_score3,
michael@0 1924 text_bytes, is_reliable);
michael@0 1925
michael@0 1926
michael@0 1927
michael@0 1928 Language summary_lang;
michael@0 1929 CalcSummaryLang(&doc_tote, total_text_bytes,
michael@0 1930 reliable_percent3, language3, percent3,
michael@0 1931 &summary_lang, is_reliable,
michael@0 1932 FLAGS_cld2_html, FLAGS_cld2_quiet);
michael@0 1933
// Full debug summary: each known language with reliability and percent,
// then total bytes and the summary language ('*' marks unreliable)
michael@0 1934 if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
michael@0 1935 for (int i = 0; i < 3; ++i) {
michael@0 1936 if (language3[i] != UNKNOWN_LANGUAGE) {
michael@0 1937 fprintf(stderr, "%s.%dR(%d%%) ",
michael@0 1938 LanguageCode(language3[i]),
michael@0 1939 reliable_percent3[i],
michael@0 1940 percent3[i]);
michael@0 1941 }
michael@0 1942 }
michael@0 1943
michael@0 1944 fprintf(stderr, "%d bytes ", total_text_bytes);
michael@0 1945 fprintf(stderr, "= %s%c ",
michael@0 1946 LanguageName(summary_lang), *is_reliable ? ' ' : '*');
michael@0 1947 fprintf(stderr, "<br><br>\n");
michael@0 1948 }
michael@0 1949
michael@0 1950 // Slightly condensed if quiet
michael@0 1951 if (FLAGS_cld2_html && FLAGS_cld2_quiet) {
michael@0 1952 fprintf(stderr, "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; ");
michael@0 1953 for (int i = 0; i < 3; ++i) {
michael@0 1954 if (language3[i] != UNKNOWN_LANGUAGE) {
michael@0 1955 fprintf(stderr, "&nbsp;&nbsp;%s %d%% ",
michael@0 1956 LanguageCode(language3[i]),
michael@0 1957 percent3[i]);
michael@0 1958 }
michael@0 1959 }
michael@0 1960 fprintf(stderr, "= %s%c ",
michael@0 1961 LanguageName(summary_lang), *is_reliable ? ' ' : '*');
michael@0 1962 fprintf(stderr, "<br>\n");
michael@0 1963 }
michael@0 1964
michael@0 1965 return summary_lang;
michael@0 1966 }
michael@0 1967
michael@0 1968 // Not a good answer -- do recursive call to refine
michael@0 1969 if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) {
michael@0 1970 // This is what we hope to improve on in the recursive call, if any
michael@0 1971 PrintLangs(stderr, language3, percent3, text_bytes, is_reliable);
michael@0 1972 }
michael@0 1973
michael@0 1974 // For restriction to Top40 + one, the one is 1st/2nd lang that is not Top40
michael@0 1975 // For this purpose, we treat "Ignore" as top40
// new_plus_one is never changed from UNKNOWN_LANGUAGE before being passed on
michael@0 1976 Language new_plus_one = UNKNOWN_LANGUAGE;
michael@0 1977
// NOTE(review): this branch looks unreachable. Getting here means
// have_good_answer stayed false, which requires
// total_text_bytes > kShortTextThresh, so the test below cannot be true.
michael@0 1978 if (total_text_bytes < kShortTextThresh) {
michael@0 1979 // Short text: Recursive call with top40 and short set
michael@0 1980 if (FLAGS_cld2_html || FLAGS_dbgscore) {
michael@0 1981 fprintf(stderr, "&nbsp;&nbsp;---text_bytes[%d] "
michael@0 1982 "Recursive(Top40/Rep/Short/Words)---<br><br>\n",
michael@0 1983 total_text_bytes);
michael@0 1984 }
michael@0 1985 return DetectLanguageSummaryV2(
michael@0 1986 buffer,
michael@0 1987 buffer_length,
michael@0 1988 is_plain_text,
michael@0 1989 cld_hints,
michael@0 1990 allow_extended_lang,
michael@0 1991 flags | kCLDFlagTop40 | kCLDFlagRepeats |
michael@0 1992 kCLDFlagShort | kCLDFlagUseWords | kCLDFlagFinish,
michael@0 1993 new_plus_one,
michael@0 1994 language3,
michael@0 1995 percent3,
michael@0 1996 normalized_score3,
michael@0 1997 resultchunkvector,
michael@0 1998 text_bytes,
michael@0 1999 is_reliable);
michael@0 2000 }
michael@0 2001
michael@0 2002 // Longer text: Recursive call with top40 set
michael@0 2003 if (FLAGS_cld2_html || FLAGS_dbgscore) {
michael@0 2004 fprintf(stderr,
michael@0 2005 "&nbsp;&nbsp;---text_bytes[%d] Recursive(Top40/Rep)---<br><br>\n",
michael@0 2006 total_text_bytes);
michael@0 2007 }
// kCLDFlagFinish makes the recursive pass take the have_good_answer path,
// so the recursion ends there
michael@0 2008 return DetectLanguageSummaryV2(
michael@0 2009 buffer,
michael@0 2010 buffer_length,
michael@0 2011 is_plain_text,
michael@0 2012 cld_hints,
michael@0 2013 allow_extended_lang,
michael@0 2014 flags | kCLDFlagTop40 | kCLDFlagRepeats |
michael@0 2015 kCLDFlagFinish,
michael@0 2016 new_plus_one,
michael@0 2017 language3,
michael@0 2018 percent3,
michael@0 2019 normalized_score3,
michael@0 2020 resultchunkvector,
michael@0 2021 text_bytes,
michael@0 2022 is_reliable);
michael@0 2023 }
michael@0 2024
michael@0 2025
michael@0 2026 // For debugging and wrappers. Not thread safe.
michael@0 2027 static char temp_detectlanguageversion[32];
michael@0 2028
michael@0 2029 // Return version text string
michael@0 2030 // String is "code_version - data_build_date"
michael@0 2031 const char* DetectLanguageVersion() {
michael@0 2032 if (kScoringtables.quadgram_obj == NULL) {return "";}
michael@0 2033 sprintf(temp_detectlanguageversion,
michael@0 2034 "V2.0 - %u", kScoringtables.quadgram_obj->kCLDTableBuildDate);
michael@0 2035 return temp_detectlanguageversion;
michael@0 2036 }
michael@0 2037
michael@0 2038
michael@0 2039 } // End namespace CLD2

mercurial