browser/components/translation/cld2/internal/compact_lang_det_impl.cc

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/browser/components/translation/cld2/internal/compact_lang_det_impl.cc	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,2039 @@
     1.4 +// Copyright 2013 Google Inc. All Rights Reserved.
     1.5 +//
     1.6 +// Licensed under the Apache License, Version 2.0 (the "License");
     1.7 +// you may not use this file except in compliance with the License.
     1.8 +// You may obtain a copy of the License at
     1.9 +//
    1.10 +//     http://www.apache.org/licenses/LICENSE-2.0
    1.11 +//
    1.12 +// Unless required by applicable law or agreed to in writing, software
    1.13 +// distributed under the License is distributed on an "AS IS" BASIS,
    1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    1.15 +// See the License for the specific language governing permissions and
    1.16 +// limitations under the License.
    1.17 +
    1.18 +//
    1.19 +// Author: dsites@google.com (Dick Sites)
    1.20 +// Updated 2014.01 for dual table lookup
    1.21 +//
    1.22 +
    1.23 +#include <stdio.h>
    1.24 +#include <string.h>
    1.25 +#include <string>
    1.26 +#include <vector>
    1.27 +
    1.28 +#include "cldutil.h"
    1.29 +#include "debug.h"
    1.30 +#include "integral_types.h"
    1.31 +#include "lang_script.h"
    1.32 +#include "utf8statetable.h"
    1.33 +
    1.34 +#ifdef CLD2_DYNAMIC_MODE
    1.35 +#include "cld2_dynamic_data.h"
    1.36 +#include "cld2_dynamic_data_loader.h"
    1.37 +#endif
    1.38 +#include "cld2tablesummary.h"
    1.39 +#include "compact_lang_det_impl.h"
    1.40 +#include "compact_lang_det_hint_code.h"
    1.41 +#include "getonescriptspan.h"
    1.42 +#include "tote.h"
    1.43 +
    1.44 +
    1.45 +namespace CLD2 {
    1.46 +
    1.47 +using namespace std;
    1.48 +
    1.49 +// Linker supplies the right tables, From files
    1.50 +// cld_generated_cjk_uni_prop_80.cc  cld2_generated_cjk_compatible.cc
    1.51 +// cld_generated_cjk_delta_bi_32.cc  generated_distinct_bi_0.cc
    1.52 +// cld2_generated_quad*.cc  cld2_generated_deltaocta*.cc
    1.53 +// cld2_generated_distinctocta*.cc
    1.54 +// cld_generated_score_quad_octa_1024_256.cc
    1.55 +
    1.56 +// 2014.01 Now implementing quadgram dual lookup tables, to allow main table
    1.57 +//   sizes that are 1/3/5 times a power of two, instead of just powers of two.
    1.58 +//   Gives more flexibility of total footprint for CLD2.
    1.59 +
    1.60 +extern const int kLanguageToPLangSize;
    1.61 +extern const int kCloseSetSize;
    1.62 +
    1.63 +extern const UTF8PropObj cld_generated_CjkUni_obj;
    1.64 +extern const CLD2TableSummary kCjkCompat_obj;
    1.65 +extern const CLD2TableSummary kCjkDeltaBi_obj;
    1.66 +extern const CLD2TableSummary kDistinctBiTable_obj;
    1.67 +extern const CLD2TableSummary kQuad_obj;
    1.68 +extern const CLD2TableSummary kQuad_obj2;     // Dual lookup tables
    1.69 +extern const CLD2TableSummary kDeltaOcta_obj;
    1.70 +extern const CLD2TableSummary kDistinctOcta_obj;
    1.71 +extern const short kAvgDeltaOctaScore[];
    1.72 +
    1.73 +#ifdef CLD2_DYNAMIC_MODE
    1.74 +  // CLD2_DYNAMIC_MODE is defined:
    1.75 +  // Data will be read from an mmap opened at runtime.
    1.76 +  static ScoringTables kScoringtables = {  // Non-const in this mode: loadData() overwrites it
    1.77 +    NULL, //&cld_generated_CjkUni_obj,
    1.78 +    NULL, //&kCjkCompat_obj,
    1.79 +    NULL, //&kCjkDeltaBi_obj,
    1.80 +    NULL, //&kDistinctBiTable_obj,
    1.81 +    NULL, //&kQuad_obj,
    1.82 +    NULL, //&kQuad_obj2,
    1.83 +    NULL, //&kDeltaOcta_obj,
    1.84 +    NULL, //&kDistinctOcta_obj,
    1.85 +    NULL, //kAvgDeltaOctaScore,
    1.86 +  };
    1.87 +  static bool dynamicDataLoaded = false;       // Set by loadData(), cleared by unloadData()
    1.88 +  static ScoringTables* dynamicTables = NULL;  // Result of the most recent loadDataFile() call
    1.89 +  static void* mmapAddress = NULL;             // Passed by address to loadDataFile()/unloadData()
    1.90 +  static int mmapLength = 0;                   // Passed by address to loadDataFile()/unloadData()
    1.91 +
    1.92 +  bool isDataLoaded() { return dynamicDataLoaded; }  // True after loadData(), false after unloadData()
    1.93 +
    1.94 +  void loadData(const char* fileName) {  // Unloads any current data, then loads tables from fileName
    1.95 +    if (isDataLoaded()) {
   1.96 +      unloadData();
   1.97 +    }
   1.98 +    dynamicTables = CLD2DynamicDataLoader::loadDataFile(fileName, &mmapAddress, &mmapLength);
   1.99 +    kScoringtables = *dynamicTables;  // NOTE(review): dereferenced unchecked -- confirm loadDataFile() cannot return NULL
   1.100 +    dynamicDataLoaded = true;
   1.101 +  };
   1.102 +
   1.103 +  void unloadData() {  // No-op unless data is currently loaded
   1.104 +    if (!dynamicDataLoaded) return;
   1.105 +    dynamicDataLoaded = false;
   1.106 +    // unloading will null all the pointers out. NOTE(review): kScoringtables keeps the values copied in loadData(); it is not reset here.
   1.107 +    CLD2DynamicDataLoader::unloadData(&dynamicTables, &mmapAddress, &mmapLength);
   1.108 +  }
   1.109 +#else
   1.110 +  // This initializes kScoringtables.quadgram_obj etc.
   1.111 +  static const ScoringTables kScoringtables = {
   1.112 +    &cld_generated_CjkUni_obj,
   1.113 +    &kCjkCompat_obj,
   1.114 +    &kCjkDeltaBi_obj,
   1.115 +    &kDistinctBiTable_obj,
   1.116 +
   1.117 +    &kQuad_obj,
   1.118 +    &kQuad_obj2,                              // Dual lookup tables
   1.119 +    &kDeltaOcta_obj,
   1.120 +    &kDistinctOcta_obj,
   1.121 +
   1.122 +    kAvgDeltaOctaScore,
   1.123 +  };
   1.124 +#endif // #ifdef CLD2_DYNAMIC_MODE
   1.125 +
   1.126 +
   1.127 +static const bool FLAGS_cld_no_minimum_bytes = false;  // FLAGS_*: static const here, so fixed at compile time
   1.128 +static const bool FLAGS_cld_forcewords = true;
   1.129 +static const bool FLAGS_cld_showme = false;  // If true, CheapRepWordsInplace marks each deletion point with ". "
   1.130 +static const bool FLAGS_cld_echotext = true;
   1.131 +static const int32 FLAGS_cld_textlimit = 160;
   1.132 +static const int32 FLAGS_cld_smoothwidth = 20;
   1.133 +static const bool FLAGS_cld_2011_hints = true;
   1.134 +static const int32 FLAGS_cld_max_lang_tag_scan_kb = 8;
   1.135 +
   1.136 +static const bool FLAGS_dbgscore = false;
   1.137 +
   1.138 +
   1.139 +static const int kLangHintInitial = 12;  // Boost language by N initially
   1.140 +static const int kLangHintBoost = 12;    // Boost language by N/16 per quadgram
   1.141 +
   1.142 +static const int kShortSpanThresh = 32;       // Bytes
   1.143 +static const int kMaxSecondChanceLen = 1024;  // Look at first 1K of short spans
   1.144 +
   1.145 +static const int kCheapSqueezeTestThresh = 4096;  // Only look for squeezing
   1.146 +                                                  // after this many text bytes
   1.147 +static const int kCheapSqueezeTestLen = 256;  // Bytes to test to trigger sqz
   1.148 +static const int kSpacesTriggerPercent = 25;  // Trigger sqz if >=25% spaces
   1.149 +static const int kPredictTriggerPercent = 67; // Trigger sqz if >=67% predicted
   1.150 +
   1.151 +static const int kChunksizeDefault = 48;      // Squeeze 48-byte chunks
   1.152 +static const int kSpacesThreshPercent = 25;   // Squeeze if >=25% spaces
   1.153 +static const int kPredictThreshPercent = 40;  // Squeeze if >=40% predicted
   1.154 +
   1.155 +static const int kMaxSpaceScan = 32;          // Bytes; cap on how far BackscanToSpace/ForwardscanToSpace look
   1.156 +
   1.157 +static const int kGoodLang1Percent = 70;
   1.158 +static const int kGoodLang1and2Percent = 93;
   1.159 +static const int kShortTextThresh = 256;      // Bytes
   1.160 +
   1.161 +static const int kMinChunkSizeQuads = 4;      // Chunk is at least four quads
   1.162 +static const int kMaxChunkSizeQuads = 1024;   // Chunk is at most 1K quads
   1.163 +
   1.164 +static const int kDefaultWordSpan = 256;      // Scan at least this many initial
   1.165 +                                              // bytes with word scoring
   1.166 +static const int kReallyBigWordSpan = 9999999;  // Forces word scoring all text
   1.167 +
   1.168 +static const int kMinReliableSeq = 50;      // Record in seq if >= 50% reliable
   1.169 +
   1.170 +static const int kPredictionTableSize = 4096;   // Must be exactly 4096 for
   1.171 +                                                // cheap compressor: its hash is masked with 0xfff (12 bits)
   1.172 +
   1.173 +static const int kNonEnBoilerplateMinPercent = 17;    // <this => no second
   1.174 +static const int kNonFIGSBoilerplateMinPercent = 20;  // <this => no second
   1.175 +static const int kGoodFirstMinPercent = 26;           // <this => UNK
   1.176 +static const int kGoodFirstReliableMinPercent = 51;   // <this => unreli
   1.177 +static const int kIgnoreMaxPercent = 20;              // >this => unreli
   1.178 +static const int kKeepMinPercent = 2;                 // <this => unreli
   1.179 +
   1.180 +
   1.181 +
   1.182 +// Statistically closest language, based on quadgram table
   1.183 +// Those that are far from other languages map to UNKNOWN_LANGUAGE
   1.184 +// Subscripted by Language
   1.185 +//
   1.186 +// From lang_correlation.txt and hand-edits
   1.187 +// sed 's/^\([^ ]*\) \([^ ]*\) coef=0\.\(..\).*$/
   1.188 +//   (\3 >= kMinCorrPercent) ? \2 : UNKNOWN_LANGUAGE,
   1.189 +//   \/\/ \1/' lang_correlation.txt >/tmp/closest_lang_decl.txt
   1.190 +//
   1.191 +static const int kMinCorrPercent = 24;        // Pick off how close you want
   1.192 +                                              // 24 catches PERSIAN <== ARABIC
   1.193 +                                              // but not SPANISH <== PORTUGUESE
   1.194 +static Language Unknown = UNKNOWN_LANGUAGE;
   1.195 +
   1.196 +// Suspect idea
   1.197 +// Subscripted by Language
   1.198 +static const Language kClosestAltLanguage[] = {
   1.199 +  (28 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE,  // ENGLISH
   1.200 +  (36 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // DANISH
   1.201 +  (31 >= kMinCorrPercent) ? AFRIKAANS : UNKNOWN_LANGUAGE,  // DUTCH
   1.202 +  (15 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE,  // FINNISH
   1.203 +  (11 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // FRENCH
   1.204 +  (17 >= kMinCorrPercent) ? LUXEMBOURGISH : UNKNOWN_LANGUAGE,  // GERMAN
   1.205 +  (27 >= kMinCorrPercent) ? YIDDISH : UNKNOWN_LANGUAGE,  // HEBREW
   1.206 +  (16 >= kMinCorrPercent) ? CORSICAN : UNKNOWN_LANGUAGE,  // ITALIAN
   1.207 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Japanese
   1.208 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Korean
   1.209 +  (41 >= kMinCorrPercent) ? NORWEGIAN_N : UNKNOWN_LANGUAGE,  // NORWEGIAN
   1.210 +  ( 5 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // POLISH
   1.211 +  (23 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // PORTUGUESE
   1.212 +  (33 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE,  // RUSSIAN
   1.213 +  (28 >= kMinCorrPercent) ? GALICIAN : UNKNOWN_LANGUAGE,  // SPANISH
   1.214 +  (17 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // SWEDISH
   1.215 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Chinese
   1.216 +  (42 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // CZECH
   1.217 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GREEK
   1.218 +  (35 >= kMinCorrPercent) ? FAROESE : UNKNOWN_LANGUAGE,  // ICELANDIC
   1.219 +  ( 7 >= kMinCorrPercent) ? LITHUANIAN : UNKNOWN_LANGUAGE,  // LATVIAN
   1.220 +  ( 7 >= kMinCorrPercent) ? LATVIAN : UNKNOWN_LANGUAGE,  // LITHUANIAN
   1.221 +  ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // ROMANIAN
   1.222 +  ( 4 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // HUNGARIAN
   1.223 +  (15 >= kMinCorrPercent) ? FINNISH : UNKNOWN_LANGUAGE,  // ESTONIAN
   1.224 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Ignore
   1.225 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Unknown
   1.226 +  (33 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // BULGARIAN
   1.227 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CROATIAN
   1.228 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SERBIAN
   1.229 +  (24 >= kMinCorrPercent) ? SCOTS_GAELIC : UNKNOWN_LANGUAGE,  // IRISH
   1.230 +  (28 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // GALICIAN
   1.231 +  ( 8 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE,  // TAGALOG
   1.232 +  (29 >= kMinCorrPercent) ? AZERBAIJANI : UNKNOWN_LANGUAGE,  // TURKISH
   1.233 +  (28 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // UKRAINIAN
   1.234 +  (37 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE,  // HINDI
   1.235 +  (29 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE,  // MACEDONIAN
   1.236 +  (14 >= kMinCorrPercent) ? ASSAMESE : UNKNOWN_LANGUAGE,  // BENGALI
   1.237 +  (46 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE,  // INDONESIAN
   1.238 +  ( 9 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE,  // LATIN
   1.239 +  (46 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE,  // MALAY
   1.240 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MALAYALAM
   1.241 +  ( 4 >= kMinCorrPercent) ? BRETON : UNKNOWN_LANGUAGE,  // WELSH
   1.242 +  ( 8 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // NEPALI
   1.243 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // TELUGU
   1.244 +  ( 3 >= kMinCorrPercent) ? ESPERANTO : UNKNOWN_LANGUAGE,  // ALBANIAN
   1.245 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // TAMIL
   1.246 +  (22 >= kMinCorrPercent) ? UKRAINIAN : UNKNOWN_LANGUAGE,  // BELARUSIAN
   1.247 +  (15 >= kMinCorrPercent) ? SUNDANESE : UNKNOWN_LANGUAGE,  // JAVANESE
   1.248 +  (19 >= kMinCorrPercent) ? CATALAN : UNKNOWN_LANGUAGE,  // OCCITAN
   1.249 +  (27 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // URDU
   1.250 +  (36 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // BIHARI
   1.251 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GUJARATI
   1.252 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // THAI
   1.253 +  (24 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // ARABIC
   1.254 +  (19 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // CATALAN
   1.255 +  ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // ESPERANTO
   1.256 +  ( 3 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // BASQUE
   1.257 +  ( 9 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // INTERLINGUA
   1.258 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // KANNADA
   1.259 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PUNJABI
   1.260 +  (24 >= kMinCorrPercent) ? IRISH : UNKNOWN_LANGUAGE,  // SCOTS_GAELIC
   1.261 +  ( 7 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // SWAHILI
   1.262 +  (28 >= kMinCorrPercent) ? SERBIAN : UNKNOWN_LANGUAGE,  // SLOVENIAN
   1.263 +  (37 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // MARATHI
   1.264 +  ( 3 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // MALTESE
   1.265 +  ( 1 >= kMinCorrPercent) ? YORUBA : UNKNOWN_LANGUAGE,  // VIETNAMESE
   1.266 +  (15 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // FRISIAN
   1.267 +  (42 >= kMinCorrPercent) ? CZECH : UNKNOWN_LANGUAGE,  // SLOVAK
   1.268 +  // Original ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ChineseT
   1.269 +  (24 >= kMinCorrPercent) ? CHINESE : UNKNOWN_LANGUAGE,  // ChineseT
   1.270 +  (35 >= kMinCorrPercent) ? ICELANDIC : UNKNOWN_LANGUAGE,  // FAROESE
   1.271 +  (15 >= kMinCorrPercent) ? JAVANESE : UNKNOWN_LANGUAGE,  // SUNDANESE
   1.272 +  (17 >= kMinCorrPercent) ? TAJIK : UNKNOWN_LANGUAGE,  // UZBEK
   1.273 +  ( 7 >= kMinCorrPercent) ? TIGRINYA : UNKNOWN_LANGUAGE,  // AMHARIC
   1.274 +  (29 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE,  // AZERBAIJANI
   1.275 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GEORGIAN
   1.276 +  ( 7 >= kMinCorrPercent) ? AMHARIC : UNKNOWN_LANGUAGE,  // TIGRINYA
   1.277 +  (27 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE,  // PERSIAN
   1.278 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // BOSNIAN
   1.279 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SINHALESE
   1.280 +  (41 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // NORWEGIAN_N
   1.281 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PORTUGUESE_P
   1.282 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PORTUGUESE_B
   1.283 +  (37 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE,  // XHOSA
   1.284 +  (37 >= kMinCorrPercent) ? XHOSA : UNKNOWN_LANGUAGE,  // ZULU
   1.285 +  ( 2 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // GUARANI
   1.286 +  (29 >= kMinCorrPercent) ? TSWANA : UNKNOWN_LANGUAGE,  // SESOTHO
   1.287 +  ( 7 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE,  // TURKMEN
   1.288 +  ( 8 >= kMinCorrPercent) ? KAZAKH : UNKNOWN_LANGUAGE,  // KYRGYZ
   1.289 +  ( 5 >= kMinCorrPercent) ? FRENCH : UNKNOWN_LANGUAGE,  // BRETON
   1.290 +  ( 3 >= kMinCorrPercent) ? GANDA : UNKNOWN_LANGUAGE,  // TWI
   1.291 +  (27 >= kMinCorrPercent) ? HEBREW : UNKNOWN_LANGUAGE,  // YIDDISH
   1.292 +  (28 >= kMinCorrPercent) ? SLOVENIAN : UNKNOWN_LANGUAGE,  // SERBO_CROATIAN
   1.293 +  (12 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE,  // SOMALI
   1.294 +  ( 9 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE,  // UIGHUR
   1.295 +  (15 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // KURDISH
   1.296 +  ( 6 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE,  // MONGOLIAN
   1.297 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ARMENIAN
   1.298 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // LAOTHIAN
   1.299 +  ( 8 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE,  // SINDHI
   1.300 +  (10 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // RHAETO_ROMANCE
   1.301 +  (31 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // AFRIKAANS
   1.302 +  (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // LUXEMBOURGISH
   1.303 +  ( 2 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE,  // BURMESE
   1.304 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // KHMER
   1.305 +  (45 >= kMinCorrPercent) ? DZONGKHA : UNKNOWN_LANGUAGE,  // TIBETAN
   1.306 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // DHIVEHI
   1.307 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CHEROKEE
   1.308 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SYRIAC
   1.309 +  ( 8 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // LIMBU
   1.310 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ORIYA
   1.311 +  (14 >= kMinCorrPercent) ? BENGALI : UNKNOWN_LANGUAGE,  // ASSAMESE
   1.312 +  (16 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // CORSICAN
   1.313 +  ( 5 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE,  // INTERLINGUE
   1.314 +  ( 8 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE,  // KAZAKH
   1.315 +  ( 4 >= kMinCorrPercent) ? SWAHILI : UNKNOWN_LANGUAGE,  // LINGALA
   1.316 +  (11 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // MOLDAVIAN
   1.317 +  (19 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // PASHTO
   1.318 +  ( 5 >= kMinCorrPercent) ? AYMARA : UNKNOWN_LANGUAGE,  // QUECHUA
   1.319 +  ( 5 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // SHONA
   1.320 +  (17 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE,  // TAJIK
   1.321 +  (13 >= kMinCorrPercent) ? BASHKIR : UNKNOWN_LANGUAGE,  // TATAR
   1.322 +  (11 >= kMinCorrPercent) ? SAMOAN : UNKNOWN_LANGUAGE,  // TONGA
   1.323 +  ( 2 >= kMinCorrPercent) ? TWI : UNKNOWN_LANGUAGE,  // YORUBA
   1.324 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_ENGLISH_BASED
   1.325 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_FRENCH_BASED
   1.326 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
   1.327 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_OTHER
   1.328 +  ( 6 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // MAORI
   1.329 +  ( 3 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE,  // WOLOF
   1.330 +  ( 1 >= kMinCorrPercent) ? MONGOLIAN : UNKNOWN_LANGUAGE,  // ABKHAZIAN
   1.331 +  ( 8 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE,  // AFAR
   1.332 +  ( 5 >= kMinCorrPercent) ? QUECHUA : UNKNOWN_LANGUAGE,  // AYMARA
   1.333 +  (13 >= kMinCorrPercent) ? TATAR : UNKNOWN_LANGUAGE,  // BASHKIR
   1.334 +  ( 3 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // BISLAMA
   1.335 +  (45 >= kMinCorrPercent) ? TIBETAN : UNKNOWN_LANGUAGE,  // DZONGKHA
   1.336 +  ( 4 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // FIJIAN
   1.337 +  ( 7 >= kMinCorrPercent) ? INUPIAK : UNKNOWN_LANGUAGE,  // GREENLANDIC
   1.338 +  ( 3 >= kMinCorrPercent) ? AFAR : UNKNOWN_LANGUAGE,  // HAUSA
   1.339 +  ( 3 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // HAITIAN_CREOLE
   1.340 +  ( 7 >= kMinCorrPercent) ? GREENLANDIC : UNKNOWN_LANGUAGE,  // INUPIAK
   1.341 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // INUKTITUT
   1.342 +  ( 4 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // KASHMIRI
   1.343 +  (30 >= kMinCorrPercent) ? RUNDI : UNKNOWN_LANGUAGE,  // KINYARWANDA
   1.344 +  ( 2 >= kMinCorrPercent) ? TAGALOG : UNKNOWN_LANGUAGE,  // MALAGASY
   1.345 +  (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // NAURU
   1.346 +  (12 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE,  // OROMO
   1.347 +  (30 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // RUNDI
   1.348 +  (11 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // SAMOAN
   1.349 +  ( 1 >= kMinCorrPercent) ? LINGALA : UNKNOWN_LANGUAGE,  // SANGO
   1.350 +  (32 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE,  // SANSKRIT
   1.351 +  (16 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE,  // SISWANT
   1.352 +  ( 5 >= kMinCorrPercent) ? SISWANT : UNKNOWN_LANGUAGE,  // TSONGA
   1.353 +  (29 >= kMinCorrPercent) ? SESOTHO : UNKNOWN_LANGUAGE,  // TSWANA
   1.354 +  ( 2 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE,  // VOLAPUK
   1.355 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ZHUANG
   1.356 +  ( 1 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE,  // KHASI
   1.357 +  (28 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // SCOTS
   1.358 +  (15 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // GANDA
   1.359 +  ( 7 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // MANX
   1.360 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MONTENEGRIN
   1.361 +
   1.362 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // AKAN
   1.363 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // IGBO
   1.364 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MAURITIAN_CREOLE
   1.365 +  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // HAWAIIAN
   1.366 +};
   1.367 +
   1.368 +// COMPILE_ASSERT(arraysize(kClosestAltLanguage) == NUM_LANGUAGES,
   1.369 +//                kClosestAltLanguage_has_incorrect_size);
   1.370 +
   1.371 +
   1.372 +inline bool FlagFinish(int flags) {return (flags & kCLDFlagFinish) != 0;}  // Flag*(): true iff flags has any bit of the matching kCLDFlag* mask set
   1.373 +inline bool FlagSqueeze(int flags) {return (flags & kCLDFlagSqueeze) != 0;}
   1.374 +inline bool FlagRepeats(int flags) {return (flags & kCLDFlagRepeats) != 0;}
   1.375 +inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;}
   1.376 +inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;}
   1.377 +inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;}
   1.378 +inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;}
   1.379 +
   1.380 +
   1.381 +  // Defines Top40 packed languages
   1.382 +
   1.383 +  // Google top 40 languages
   1.384 +  //
   1.385 +  // Tier 0/1 Language enum list (16)
   1.386 +  //   ENGLISH, /*no en_GB,*/ FRENCH, ITALIAN, GERMAN, SPANISH,    // E - FIGS
   1.387 +  //   DUTCH, CHINESE, CHINESE_T, JAPANESE, KOREAN,
   1.388 +  //   PORTUGUESE, RUSSIAN, POLISH, TURKISH, THAI,
   1.389 +  //   ARABIC,
   1.390 +  //
   1.391 +  // Tier 2 Language enum list (22)
   1.392 +  //   SWEDISH, FINNISH, DANISH, /*no pt-PT,*/ ROMANIAN, HUNGARIAN,
   1.393 +  //   HEBREW, INDONESIAN, CZECH, GREEK, NORWEGIAN,
   1.394 +  //   VIETNAMESE, BULGARIAN, CROATIAN, LITHUANIAN, SLOVAK,
   1.395 +  //   TAGALOG, SLOVENIAN, SERBIAN, CATALAN, LATVIAN,
   1.396 +  //   UKRAINIAN, HINDI,
   1.397 +  //
   1.398 +  //   use SERBO_CROATIAN instead of BOSNIAN, SERBIAN, CROATIAN, MONTENEGRIN(21)
   1.399 +  //
   1.400 +  // Include IgnoreMe (TG_UNKNOWN_LANGUAGE, 25+1) as a top 40
   1.401 +
   1.402 +
   1.403 +void DemoteNotTop40(Tote* chunk_tote, uint16 psplus_one) {
   1.404 +  // REVISIT: body is currently empty; chunk_tote and psplus_one are unused
   1.405 +}
   1.406 +
   1.407 +void PrintText(FILE* f, Language cur_lang, const string& temp) {  // Debug helper: writes one line describing temp to f
   1.408 +  if (temp.size() == 0) {return;}  // Nothing is written for an empty string
   1.409 +  fprintf(f, "PrintText[%s]%s<br>\n", LanguageName(cur_lang), temp.c_str());  // Language name in brackets, then the text, then <br>
   1.410 +}
   1.411 +
   1.412 +
   1.413 +//------------------------------------------------------------------------------
   1.414 +// For --cld_html debugging output. Not thread safe
   1.415 +//------------------------------------------------------------------------------
   1.416 +static Language prior_lang = UNKNOWN_LANGUAGE;
   1.417 +static bool prior_unreliable = false;
   1.418 +
   1.419 +//------------------------------------------------------------------------------
   1.420 +// End For --cld_html debugging output
   1.421 +//------------------------------------------------------------------------------
   1.422 +
   1.423 +
   1.424 +// Backscan to word boundary, returning how many bytes n to go back
   1.425 +// so that src - n is non-space and src - n - 1 is space.
   1.426 +// If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary
   1.427 +int BackscanToSpace(const char* src, int limit) {  // limit: max bytes to look back; caller must make src[-limit] readable
   1.428 +  int n = 0;
   1.429 +  limit = minint(limit, kMaxSpaceScan);  // Never look back further than kMaxSpaceScan bytes
   1.430 +  while (n < limit) {  // Pass 1: examines src[-1] down to src[-limit] for a space
   1.431 +    if (src[-n - 1] == ' ') {return n;}    // We are at _X
   1.432 +    ++n;
   1.433 +  }
   1.434 +  n = 0;  // Pass 2: no space found, so restart and look for a non-continuation byte
   1.435 +  while (n < limit) {  // Examines src[0] down to src[-(limit - 1)]
   1.436 +    if ((src[-n] & 0xc0) != 0x80) {return n;}    // We are at char begin
   1.437 +    ++n;
   1.438 +  }
   1.439 +  return 0;  // limit <= 0, or every byte examined was a continuation byte (10xxxxxx)
   1.440 +}
   1.441 +
   1.442 +// Forwardscan to word boundary, returning how many bytes n to go forward
   1.443 +// so that src + n is non-space and src + n - 1 is space.
   1.444 +// If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary
   1.445 +int ForwardscanToSpace(const char* src, int limit) {  // limit: max bytes to look ahead; caller must make src[limit - 1] readable
   1.446 +  int n = 0;
   1.447 +  limit = minint(limit, kMaxSpaceScan);  // Never look ahead further than kMaxSpaceScan bytes
   1.448 +  while (n < limit) {  // Pass 1: examines src[0] up to src[limit - 1] for a space
   1.449 +    if (src[n] == ' ') {return n + 1;}    // We are at _X
   1.450 +    ++n;
   1.451 +  }
   1.452 +  n = 0;  // Pass 2: no space found, so restart and look for a non-continuation byte
   1.453 +  while (n < limit) {  // Examines the same bytes src[0] up to src[limit - 1]
   1.454 +    if ((src[n] & 0xc0) != 0x80) {return n;}    // We are at char begin
   1.455 +    ++n;
   1.456 +  }
   1.457 +  return 0;  // limit <= 0, or every byte examined was a continuation byte (10xxxxxx)
   1.458 +}
   1.459 +
   1.460 +
   1.461 +// This uses a cheap predictor to get a measure of compression, and
   1.462 +// hence a measure of repetitiveness. It works on complete UTF-8 characters
   1.463 +// instead of bytes, because three-byte UTF-8 Indic, etc. text compress highly
   1.464 +// all the time when done with a byte-based count. Sigh.
   1.465 +//
   1.466 +// To allow running prediction across multiple chunks, caller passes in current
   1.467 +// 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
   1.468 +// *hash indexes tbl before it is first masked, so it must be in 0..4095 on entry.
   1.469 +// Returns the number of *bytes* correctly predicted, increments by 1..4 for
   1.470 +// each correctly-predicted character. *hash is updated to the final hash value.
   1.471 +//
   1.472 +// NOTE: Overruns by up to three bytes (src[1..3] are read without checking srclimit). Not a problem with valid UTF-8 text
   1.473 +//
   1.474 +
   1.475 +// TODO(dsites) make this use just one byte per UTF-8 char and incr by charlen
   1.476 +
   1.477 +int CountPredictedBytes(const char* isrc, int src_len, int* hash, int* tbl) {
   1.478 +  int p_count = 0;
   1.479 +  const uint8* src = reinterpret_cast<const uint8*>(isrc);  // Unsigned view so byte values compare as 0..255
   1.480 +  const uint8* srclimit = src + src_len;
   1.481 +  int local_hash = *hash;
   1.482 +
   1.483 +  while (src < srclimit) {
   1.484 +    int c = src[0];
   1.485 +    int incr = 1;
   1.486 +
   1.487 +    // Pick up one char and length
   1.488 +    if (c < 0xc0) {
   1.489 +      // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
   1.490 +      // Do nothing more
   1.491 +    } else if ((c & 0xe0) == 0xc0) {
   1.492 +      // Two-byte
   1.493 +      c = (c << 8) | src[1];
   1.494 +      incr = 2;
   1.495 +    } else if ((c & 0xf0) == 0xe0) {
   1.496 +      // Three-byte
   1.497 +      c = (c << 16) | (src[1] << 8) | src[2];
   1.498 +      incr = 3;
   1.499 +    } else {
   1.500 +      // Four-byte (any lead byte >= 0xf0 lands here). NOTE(review): c << 24 then exceeds a 32-bit signed int -- confirm acceptable
   1.501 +      c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
   1.502 +      incr = 4;
   1.503 +    }
   1.504 +    src += incr;
   1.505 +
   1.506 +    int p = tbl[local_hash];            // Prediction
   1.507 +    tbl[local_hash] = c;                // Update prediction
   1.508 +    if (c == p) {
   1.509 +      p_count += incr;                  // Count bytes of good predictions
   1.510 +    }
   1.511 +
   1.512 +    local_hash = ((local_hash << 4) ^ c) & 0xfff;  // Fold the new char in; keep the low 12 bits as the next index
   1.513 +  }
   1.514 +  *hash = local_hash;
   1.515 +  return p_count;
   1.516 +}
   1.517 +
   1.518 +
   1.519 +
   1.520 +// Counts number of spaces; a little faster than one-at-a-time
   1.521 +// Doesn't count odd bytes at end: only the first (src_len & ~3) bytes are examined
   1.522 +int CountSpaces4(const char* src, int src_len) {
   1.523 +  int s_count = 0;
   1.524 +  for (int i = 0; i < (src_len & ~3); i += 4) {  // (src_len & ~3) clears the low two bits of src_len
   1.525 +    s_count += (src[i] == ' ');  // Each comparison adds 1 when true, 0 when false
   1.526 +    s_count += (src[i+1] == ' ');
   1.527 +    s_count += (src[i+2] == ' ');
   1.528 +    s_count += (src[i+3] == ' ');
   1.529 +  }
   1.530 +  return s_count;
   1.531 +}
   1.532 +
   1.533 +
   1.534 +// Remove words of text that have more than half their letters predicted
   1.535 +// correctly by our cheap predictor, moving the remaining words in-place
   1.536 +// to the front of the input buffer.
   1.537 +//
   1.538 +// To allow running prediction across multiple chunks, caller passes in current
   1.539 +// 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
   1.540 +//
   1.541 +// Return the new, possibly-shorter length (the padding written after it is not counted)
   1.542 +//
   1.543 +// Result Buffer ALWAYS has leading space and trailing space space space NUL,
   1.544 +// if input does
   1.545 +//
   1.546 +int CheapRepWordsInplace(char* isrc, int src_len, int* hash, int* tbl) {
   1.547 +  const uint8* src = reinterpret_cast<const uint8*>(isrc);  // Read cursor over the same buffer that dst writes into
   1.548 +  const uint8* srclimit = src + src_len;
   1.549 +  char* dst = isrc;               // Write cursor
   1.550 +  int local_hash = *hash;
   1.551 +  char* word_dst = dst;           // Start of next word
   1.552 +  int good_predict_bytes = 0;
   1.553 +  int word_length_bytes = 0;
   1.554 +
   1.555 +  while (src < srclimit) {
   1.556 +    int c = src[0];
   1.557 +    int incr = 1;
   1.558 +    *dst++ = c;                   // Copy the first byte now; a well-predicted word is undone below
   1.559 +
   1.560 +    if (c == ' ') {
   1.561 +      if ((good_predict_bytes * 2) > word_length_bytes) {
   1.562 +        // Word is well-predicted: backup to start of this word (this also discards the space just copied)
   1.563 +        dst = word_dst;
   1.564 +        if (FLAGS_cld_showme) {
   1.565 +          // Mark the deletion point with period
   1.566 +          // Don't repeat multiple periods
   1.567 +          // Cannot mark with more bytes or may overwrite unseen input
   1.568 +          if ((isrc < (dst - 2)) && (dst[-2] != '.')) {
   1.569 +            *dst++ = '.';
   1.570 +            *dst++ = ' ';
   1.571 +          }
   1.572 +        }
   1.573 +      }
   1.574 +      word_dst = dst;              // Start of next word
   1.575 +      good_predict_bytes = 0;
   1.576 +      word_length_bytes = 0;
   1.577 +    }
   1.578 +
   1.579 +    // Pick up one char and length
   1.580 +    if (c < 0xc0) {
   1.581 +      // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
   1.582 +      // Do nothing more
   1.583 +    } else if ((c & 0xe0) == 0xc0) {
   1.584 +      // Two-byte
   1.585 +      *dst++ = src[1];
   1.586 +      c = (c << 8) | src[1];
   1.587 +      incr = 2;
   1.588 +    } else if ((c & 0xf0) == 0xe0) {
   1.589 +      // Three-byte
   1.590 +      *dst++ = src[1];
   1.591 +      *dst++ = src[2];
   1.592 +      c = (c << 16) | (src[1] << 8) | src[2];
   1.593 +      incr = 3;
   1.594 +    } else {
   1.595 +      // Four-byte (any lead byte >= 0xf0 lands here). NOTE(review): c << 24 then exceeds a 32-bit signed int -- confirm acceptable
   1.596 +      *dst++ = src[1];
   1.597 +      *dst++ = src[2];
   1.598 +      *dst++ = src[3];
   1.599 +      c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
   1.600 +      incr = 4;
   1.601 +    }
   1.602 +    src += incr;
   1.603 +    word_length_bytes += incr;          // A space is counted too, toward the word that follows it
   1.604 +
   1.605 +    int p = tbl[local_hash];            // Prediction
   1.606 +    tbl[local_hash] = c;                // Update prediction
   1.607 +    if (c == p) {
   1.608 +      good_predict_bytes += incr;       // Count good predictions
   1.609 +    }
   1.610 +
   1.611 +    local_hash = ((local_hash << 4) ^ c) & 0xfff;  // Fold the new char in; keep the low 12 bits as the next index
   1.612 +  }
   1.613 +
   1.614 +  *hash = local_hash;
   1.615 +
   1.616 +  if ((dst - isrc) < (src_len - 3)) {
   1.617 +    // Pad and make last char clean UTF-8 by putting following spaces
   1.618 +    dst[0] = ' ';
   1.619 +    dst[1] = ' ';
   1.620 +    dst[2] = ' ';
   1.621 +    dst[3] = '\0';
   1.622 +  } else  if ((dst - isrc) < src_len) {
   1.623 +    // Make last char clean UTF-8 by putting following space off the end
   1.624 +    dst[0] = ' ';
   1.625 +  }
   1.626 +
   1.627 +  return static_cast<int>(dst - isrc);
   1.628 +}
   1.629 +
   1.630 +
   1.631 +// This alternate form overwrites redundant words, thus avoiding corrupting the
   1.632 +// backmap for generate a vector of original-text ranges.
   1.633 +int CheapRepWordsInplaceOverwrite(char* isrc, int src_len, int* hash, int* tbl) {
   1.634 +  const uint8* src = reinterpret_cast<const uint8*>(isrc);
   1.635 +  const uint8* srclimit = src + src_len;
   1.636 +  char* dst = isrc;
   1.637 +  int local_hash = *hash;
   1.638 +  char* word_dst = dst;           // Start of next word
   1.639 +  int good_predict_bytes = 0;
   1.640 +  int word_length_bytes = 0;
   1.641 +
   1.642 +  while (src < srclimit) {
   1.643 +    int c = src[0];
   1.644 +    int incr = 1;
   1.645 +    *dst++ = c;
   1.646 +
   1.647 +    if (c == ' ') {
   1.648 +      if ((good_predict_bytes * 2) > word_length_bytes) {
   1.649 +        // Word [word_dst..dst-1) is well-predicted: overwrite
   1.650 +        for (char* p = word_dst; p < dst - 1; ++p) {*p = '.';}
   1.651 +      }
   1.652 +      word_dst = dst;              // Start of next word
   1.653 +      good_predict_bytes = 0;
   1.654 +      word_length_bytes = 0;
   1.655 +    }
   1.656 +
   1.657 +    // Pick up one char and length
   1.658 +    if (c < 0xc0) {
   1.659 +      // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
   1.660 +      // Do nothing more
   1.661 +    } else if ((c & 0xe0) == 0xc0) {
   1.662 +      // Two-byte
   1.663 +      *dst++ = src[1];
   1.664 +      c = (c << 8) | src[1];
   1.665 +      incr = 2;
   1.666 +    } else if ((c & 0xf0) == 0xe0) {
   1.667 +      // Three-byte
   1.668 +      *dst++ = src[1];
   1.669 +      *dst++ = src[2];
   1.670 +      c = (c << 16) | (src[1] << 8) | src[2];
   1.671 +      incr = 3;
   1.672 +    } else {
   1.673 +      // Four-byte
   1.674 +      *dst++ = src[1];
   1.675 +      *dst++ = src[2];
   1.676 +      *dst++ = src[3];
   1.677 +      c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
   1.678 +      incr = 4;
   1.679 +    }
   1.680 +    src += incr;
   1.681 +    word_length_bytes += incr;
   1.682 +
   1.683 +    int p = tbl[local_hash];            // Prediction
   1.684 +    tbl[local_hash] = c;                // Update prediction
   1.685 +    if (c == p) {
   1.686 +      good_predict_bytes += incr;       // Count good predictions
   1.687 +    }
   1.688 +
   1.689 +    local_hash = ((local_hash << 4) ^ c) & 0xfff;
   1.690 +  }
   1.691 +
   1.692 +  *hash = local_hash;
   1.693 +
   1.694 +  if ((dst - isrc) < (src_len - 3)) {
   1.695 +    // Pad and make last char clean UTF-8 by putting following spaces
   1.696 +    dst[0] = ' ';
   1.697 +    dst[1] = ' ';
   1.698 +    dst[2] = ' ';
   1.699 +    dst[3] = '\0';
   1.700 +  } else  if ((dst - isrc) < src_len) {
   1.701 +    // Make last char clean UTF-8 by putting following space off the end
   1.702 +    dst[0] = ' ';
   1.703 +  }
   1.704 +
   1.705 +  return static_cast<int>(dst - isrc);
   1.706 +}
   1.707 +
   1.708 +
   1.709 +// Remove portions of text that have a high density of spaces, or that are
   1.710 +// overly repetitive, squeezing the remaining text in-place to the front of the
   1.711 +// input buffer.
   1.712 +//
   1.713 +// Squeezing looks at density of space/prediced chars in fixed-size chunks,
   1.714 +// specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes.
   1.715 +//
   1.716 +// Return the new, possibly-shorter length
   1.717 +//
   1.718 +// Result Buffer ALWAYS has leading space and trailing space space space NUL,
   1.719 +// if input does
   1.720 +//
   1.721 +int CheapSqueezeInplace(char* isrc,
   1.722 +                                            int src_len,
   1.723 +                                            int ichunksize) {
   1.724 +  char* src = isrc;
   1.725 +  char* dst = src;
   1.726 +  char* srclimit = src + src_len;
   1.727 +  bool skipping = false;
   1.728 +
   1.729 +  int hash = 0;
   1.730 +  // Allocate local prediction table.
   1.731 +  int* predict_tbl = new int[kPredictionTableSize];
   1.732 +  memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
   1.733 +
   1.734 +  int chunksize = ichunksize;
   1.735 +  if (chunksize == 0) {chunksize = kChunksizeDefault;}
   1.736 +  int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
   1.737 +  int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
   1.738 +
   1.739 +  while (src < srclimit) {
   1.740 +    int remaining_bytes = srclimit - src;
   1.741 +    int len = minint(chunksize, remaining_bytes);
   1.742 +    // Make len land us on a UTF-8 character boundary.
   1.743 +    // Ah. Also fixes mispredict because we could get out of phase
   1.744 +    // Loop always terminates at trailing space in buffer
   1.745 +    while ((src[len] & 0xc0) == 0x80) {++len;}  // Move past continuation bytes
   1.746 +
   1.747 +    int space_n = CountSpaces4(src, len);
   1.748 +    int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
   1.749 +    if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
   1.750 +      // Skip the text
   1.751 +      if (!skipping) {
   1.752 +        // Keeping-to-skipping transition; do it at a space
   1.753 +        int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
   1.754 +        dst -= n;
   1.755 +        if (dst == isrc) {
   1.756 +          // Force a leading space if the first chunk is deleted
   1.757 +          *dst++ = ' ';
   1.758 +        }
   1.759 +        if (FLAGS_cld_showme) {
   1.760 +          // Mark the deletion point with black square U+25A0
   1.761 +          *dst++ = static_cast<unsigned char>(0xe2);
   1.762 +          *dst++ = static_cast<unsigned char>(0x96);
   1.763 +          *dst++ = static_cast<unsigned char>(0xa0);
   1.764 +          *dst++ = ' ';
   1.765 +        }
   1.766 +        skipping = true;
   1.767 +      }
   1.768 +    } else {
   1.769 +      // Keep the text
   1.770 +      if (skipping) {
   1.771 +        // Skipping-to-keeping transition; do it at a space
   1.772 +        int n = ForwardscanToSpace(src, len);
   1.773 +        src += n;
   1.774 +        remaining_bytes -= n;   // Shrink remaining length
   1.775 +        len -= n;
   1.776 +        skipping = false;
   1.777 +      }
   1.778 +      // "len" can be negative in some cases
   1.779 +      if (len > 0) {
   1.780 +        memmove(dst, src, len);
   1.781 +        dst += len;
   1.782 +      }
   1.783 +    }
   1.784 +    src += len;
   1.785 +  }
   1.786 +
   1.787 +  if ((dst - isrc) < (src_len - 3)) {
   1.788 +    // Pad and make last char clean UTF-8 by putting following spaces
   1.789 +    dst[0] = ' ';
   1.790 +    dst[1] = ' ';
   1.791 +    dst[2] = ' ';
   1.792 +    dst[3] = '\0';
   1.793 +  } else   if ((dst - isrc) < src_len) {
   1.794 +    // Make last char clean UTF-8 by putting following space off the end
   1.795 +    dst[0] = ' ';
   1.796 +  }
   1.797 +
   1.798 +  // Deallocate local prediction table
   1.799 +  delete[] predict_tbl;
   1.800 +  return static_cast<int>(dst - isrc);
   1.801 +}
   1.802 +
   1.803 +// This alternate form overwrites redundant words, thus avoiding corrupting the
   1.804 +// backmap for generate a vector of original-text ranges.
   1.805 +int CheapSqueezeInplaceOverwrite(char* isrc,
   1.806 +                                            int src_len,
   1.807 +                                            int ichunksize) {
   1.808 +  char* src = isrc;
   1.809 +  char* dst = src;
   1.810 +  char* srclimit = src + src_len;
   1.811 +  bool skipping = false;
   1.812 +
   1.813 +  int hash = 0;
   1.814 +  // Allocate local prediction table.
   1.815 +  int* predict_tbl = new int[kPredictionTableSize];
   1.816 +  memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
   1.817 +
   1.818 +  int chunksize = ichunksize;
   1.819 +  if (chunksize == 0) {chunksize = kChunksizeDefault;}
   1.820 +  int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
   1.821 +  int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
   1.822 +
   1.823 +  // Always keep first byte (space)
   1.824 +  ++src;
   1.825 +  ++dst;
   1.826 +  while (src < srclimit) {
   1.827 +    int remaining_bytes = srclimit - src;
   1.828 +    int len = minint(chunksize, remaining_bytes);
   1.829 +    // Make len land us on a UTF-8 character boundary.
   1.830 +    // Ah. Also fixes mispredict because we could get out of phase
   1.831 +    // Loop always terminates at trailing space in buffer
   1.832 +    while ((src[len] & 0xc0) == 0x80) {++len;}  // Move past continuation bytes
   1.833 +
   1.834 +    int space_n = CountSpaces4(src, len);
   1.835 +    int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
   1.836 +    if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
   1.837 +      // Overwrite the text [dst-n..dst)
   1.838 +      if (!skipping) {
   1.839 +        // Keeping-to-skipping transition; do it at a space
   1.840 +        int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
   1.841 +        // Text [word_dst..dst) is well-predicted: overwrite
   1.842 +        for (char* p = dst - n; p < dst; ++p) {*p = '.';}
   1.843 +        skipping = true;
   1.844 +      }
   1.845 +      // Overwrite the text [dst..dst+len)
   1.846 +      for (char* p = dst; p < dst + len; ++p) {*p = '.';}
   1.847 +      dst[len - 1] = ' ';    // Space at end so we can see what is happening
   1.848 +    } else {
   1.849 +      // Keep the text
   1.850 +      if (skipping) {
   1.851 +        // Skipping-to-keeping transition; do it at a space
   1.852 +        int n = ForwardscanToSpace(src, len);
   1.853 +        // Text [dst..dst+n) is well-predicted: overwrite
   1.854 +        for (char* p = dst; p < dst + n - 1; ++p) {*p = '.';}
   1.855 +        skipping = false;
   1.856 +      }
   1.857 +    }
   1.858 +    dst += len;
   1.859 +    src += len;
   1.860 +  }
   1.861 +
   1.862 +  if ((dst - isrc) < (src_len - 3)) {
   1.863 +    // Pad and make last char clean UTF-8 by putting following spaces
   1.864 +    dst[0] = ' ';
   1.865 +    dst[1] = ' ';
   1.866 +    dst[2] = ' ';
   1.867 +    dst[3] = '\0';
   1.868 +  } else   if ((dst - isrc) < src_len) {
   1.869 +    // Make last char clean UTF-8 by putting following space off the end
   1.870 +    dst[0] = ' ';
   1.871 +  }
   1.872 +
   1.873 +  // Deallocate local prediction table
   1.874 +  delete[] predict_tbl;
   1.875 +  return static_cast<int>(dst - isrc);
   1.876 +}
   1.877 +
   1.878 +// Timing 2.8GHz P4 (dsites 2008.03.20) with 170KB input
   1.879 +//  About 90 MB/sec, with or without memcpy, chunksize 48 or 4096
   1.880 +//  Just CountSpaces is about 340 MB/sec
   1.881 +//  Byte-only CountPredictedBytes is about 150 MB/sec
   1.882 +//  Byte-only CountPredictedBytes, conditional tbl[] = is about 85! MB/sec
   1.883 +//  Byte-only CountPredictedBytes is about 180 MB/sec, byte tbl, byte/int c
   1.884 +//  Unjammed byte-only both = 170 MB/sec
   1.885 +//  Jammed byte-only both = 120 MB/sec
   1.886 +//  Back to original w/slight updates, 110 MB/sec
   1.887 +//
   1.888 +bool CheapSqueezeTriggerTest(const char* src, int src_len, int testsize) {
   1.889 +  // Don't trigger at all on short text
   1.890 +  if (src_len < testsize) {return false;}
   1.891 +  int space_thresh = (testsize * kSpacesTriggerPercent) / 100;
   1.892 +  int predict_thresh = (testsize * kPredictTriggerPercent) / 100;
   1.893 +  int hash = 0;
   1.894 +  // Allocate local prediction table.
   1.895 +  int* predict_tbl = new int[kPredictionTableSize];
   1.896 +  memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
   1.897 +
   1.898 +  bool retval = false;
   1.899 +  if ((CountSpaces4(src, testsize) >= space_thresh) ||
   1.900 +      (CountPredictedBytes(src, testsize, &hash, predict_tbl) >=
   1.901 +       predict_thresh)) {
   1.902 +    retval = true;
   1.903 +  }
   1.904 +  // Deallocate local prediction table
   1.905 +  delete[] predict_tbl;
   1.906 +  return retval;
   1.907 +}
   1.908 +
   1.909 +
   1.910 +
   1.911 +
   1.912 +// Delete any extended languages from doc_tote
   1.913 +void RemoveExtendedLanguages(DocTote* doc_tote) {
   1.914 +  // Now a nop
   1.915 +}
   1.916 +
// Minimum reliability percent (stored reliability / stored byte count) for a
// language entry to be kept and for a top language to be reported as reliable.
static const int kMinReliableKeepPercent = 41;  // Remove lang if reli < this

// For Tier3 languages, require a minimum number of bytes to be first-place lang
static const int kGoodFirstT3MinBytes = 24;         // <this => no first
   1.921 +
   1.922 +// Move bytes for unreliable langs to another lang or UNKNOWN
   1.923 +// doc_tote is sorted, so cannot Add
   1.924 +//
   1.925 +// If both CHINESE and CHINESET are present and unreliable, do not delete both;
   1.926 +// merge both into CHINESE.
   1.927 +//
   1.928 +//dsites 2009.03.19
   1.929 +// we also want to remove Tier3 languages as the first lang if there is very
   1.930 +// little text like ej1 ej2 ej3 ej4
   1.931 +// maybe fold this back in earlier
   1.932 +//
   1.933 +void RemoveUnreliableLanguages(DocTote* doc_tote,
   1.934 +                               bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {
   1.935 +  // Prepass to merge some low-reliablility languages
   1.936 +  // TODO: this shouldn't really reach in to the internal structure of doc_tote
   1.937 +  int total_bytes = 0;
   1.938 +  for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
   1.939 +    int plang = doc_tote->Key(sub);
   1.940 +    if (plang == DocTote::kUnusedKey) {continue;}               // Empty slot
   1.941 +
   1.942 +    Language lang = static_cast<Language>(plang);
   1.943 +    int bytes = doc_tote->Value(sub);
   1.944 +    int reli = doc_tote->Reliability(sub);
   1.945 +    if (bytes == 0) {continue;}                     // Zero bytes
   1.946 +    total_bytes += bytes;
   1.947 +
   1.948 +    // Reliable percent = stored reliable score over stored bytecount
   1.949 +    int reliable_percent = reli / bytes;
   1.950 +    if (reliable_percent >= kMinReliableKeepPercent) {continue;}   // Keeper
   1.951 +
   1.952 +    // This language is too unreliable to keep, but we might merge it.
   1.953 +    Language altlang = UNKNOWN_LANGUAGE;
   1.954 +    if (lang <= HAWAIIAN) {altlang = kClosestAltLanguage[lang];}
   1.955 +    if (altlang == UNKNOWN_LANGUAGE) {continue;}    // No alternative
   1.956 +
   1.957 +    // Look for alternative in doc_tote
   1.958 +    int altsub = doc_tote->Find(altlang);
   1.959 +    if (altsub < 0) {continue;}                     // No alternative text
   1.960 +
   1.961 +    int bytes2 = doc_tote->Value(altsub);
   1.962 +    int reli2 = doc_tote->Reliability(altsub);
   1.963 +    if (bytes2 == 0) {continue;}                    // Zero bytes
   1.964 +
   1.965 +    // Reliable percent is stored reliable score over stored bytecount
   1.966 +    int reliable_percent2 = reli2 / bytes2;
   1.967 +
   1.968 +    // Merge one language into the other. Break ties toward lower lang #
   1.969 +    int tosub = altsub;
   1.970 +    int fromsub = sub;
   1.971 +    bool into_lang = false;
   1.972 +    if ((reliable_percent2 < reliable_percent) ||
   1.973 +        ((reliable_percent2 == reliable_percent) && (lang < altlang))) {
   1.974 +      tosub = sub;
   1.975 +      fromsub = altsub;
   1.976 +      into_lang = true;
   1.977 +    }
   1.978 +
   1.979 +    // Make sure merged reliability doesn't drop and is enough to avoid delete
   1.980 +    int newpercent = maxint(reliable_percent, reliable_percent2);
   1.981 +    newpercent = maxint(newpercent, kMinReliableKeepPercent);
   1.982 +    int newbytes = bytes + bytes2;
   1.983 +    int newreli = newpercent * newbytes;
   1.984 +
   1.985 +    doc_tote->SetKey(fromsub, DocTote::kUnusedKey);
   1.986 +    doc_tote->SetScore(fromsub, 0);
   1.987 +    doc_tote->SetReliability(fromsub, 0);
   1.988 +    doc_tote->SetScore(tosub, newbytes);
   1.989 +    doc_tote->SetReliability(tosub, newreli);
   1.990 +
   1.991 +    // Show fate of unreliable languages if at least 10 bytes
   1.992 +    if (FLAGS_cld2_html && (newbytes >= 10) &&
   1.993 +        !FLAGS_cld2_quiet) {
   1.994 +      if (into_lang) {
   1.995 +        fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ",
   1.996 +                LanguageCode(altlang), reliable_percent2, bytes2,
   1.997 +                LanguageCode(lang));
   1.998 +      } else {
   1.999 +        fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ",
  1.1000 +                LanguageCode(lang), reliable_percent, bytes,
  1.1001 +                LanguageCode(altlang));
  1.1002 +      }
  1.1003 +    }
  1.1004 +  }
  1.1005 +
  1.1006 +
  1.1007 +  // Pass to delete any remaining unreliable languages
  1.1008 +  for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
  1.1009 +    int plang = doc_tote->Key(sub);
  1.1010 +    if (plang == DocTote::kUnusedKey) {continue;}               // Empty slot
  1.1011 +
  1.1012 +    Language lang = static_cast<Language>(plang);
  1.1013 +    int bytes = doc_tote->Value(sub);
  1.1014 +    int reli = doc_tote->Reliability(sub);
  1.1015 +    if (bytes == 0) {continue;}                     // Zero bytes
  1.1016 +
  1.1017 +    // Reliable percent is stored as reliable score over stored bytecount
  1.1018 +    int reliable_percent = reli / bytes;
  1.1019 +    if (reliable_percent >= kMinReliableKeepPercent) {  // Keeper?
  1.1020 +       continue;                                        // yes
  1.1021 +    }
  1.1022 +
  1.1023 +    // Delete unreliable entry
  1.1024 +    doc_tote->SetKey(sub, DocTote::kUnusedKey);
  1.1025 +    doc_tote->SetScore(sub, 0);
  1.1026 +    doc_tote->SetReliability(sub, 0);
  1.1027 +
  1.1028 +    // Show fate of unreliable languages if at least 10 bytes
  1.1029 +    if (FLAGS_cld2_html && (bytes >= 10) &&
  1.1030 +        !FLAGS_cld2_quiet) {
  1.1031 +      fprintf(stderr, "{Unreli %s.%dR,%dB} ",
  1.1032 +              LanguageCode(lang), reliable_percent, bytes);
  1.1033 +    }
  1.1034 +  }
  1.1035 +
  1.1036 +  ////if (FLAGS_cld2_html) {fprintf(stderr, "<br>\n");}
  1.1037 +}
  1.1038 +
  1.1039 +
  1.1040 +// Move all the text bytes from lower byte-count to higher one
  1.1041 +void MoveLang1ToLang2(Language lang1, Language lang2,
  1.1042 +                      int lang1_sub, int lang2_sub,
  1.1043 +                      DocTote* doc_tote,
  1.1044 +                      ResultChunkVector* resultchunkvector) {
  1.1045 +  // In doc_tote, move all the bytes lang1 => lang2
  1.1046 +  int sum = doc_tote->Value(lang2_sub) + doc_tote->Value(lang1_sub);
  1.1047 +  doc_tote->SetValue(lang2_sub, sum);
  1.1048 +  sum = doc_tote->Score(lang2_sub) + doc_tote->Score(lang1_sub);
  1.1049 +  doc_tote->SetScore(lang2_sub, sum);
  1.1050 +  sum = doc_tote->Reliability(lang2_sub) + doc_tote->Reliability(lang1_sub);
  1.1051 +  doc_tote->SetReliability(lang2_sub, sum);
  1.1052 +
  1.1053 +  // Delete old entry
  1.1054 +  doc_tote->SetKey(lang1_sub, DocTote::kUnusedKey);
  1.1055 +  doc_tote->SetScore(lang1_sub, 0);
  1.1056 +  doc_tote->SetReliability(lang1_sub, 0);
  1.1057 +
  1.1058 +  // In resultchunkvector, move all the bytes lang1 => lang2
  1.1059 +  if (resultchunkvector == NULL) {return;}
  1.1060 +
  1.1061 +  int k = 0;
  1.1062 +  uint16 prior_lang = UNKNOWN_LANGUAGE;
  1.1063 +  for (int i = 0; i < static_cast<int>(resultchunkvector->size()); ++i) {
  1.1064 +    ResultChunk* rc = &(*resultchunkvector)[i];
  1.1065 +    if (rc->lang1 == lang1) {
  1.1066 +      // Update entry[i] lang1 => lang2
  1.1067 +      rc->lang1 = lang2;
  1.1068 +    }
  1.1069 +    // One change may produce two merges -- entry before and entry after
  1.1070 +    if ((rc->lang1 == prior_lang) && (k > 0)) {
  1.1071 +      // Merge with previous, deleting entry[i]
  1.1072 +      ResultChunk* prior_rc = &(*resultchunkvector)[k - 1];
  1.1073 +      prior_rc->bytes += rc->bytes;
  1.1074 +      // fprintf(stderr, "MoveLang1ToLang2 merged [%d] => [%d]<br>\n", i, k-1);
  1.1075 +    } else {
  1.1076 +      // Keep entry[i]
  1.1077 +      (*resultchunkvector)[k] = (*resultchunkvector)[i];
  1.1078 +      // fprintf(stderr, "MoveLang1ToLang2 keep [%d] => [%d]<br>\n", i, k);
  1.1079 +      ++k;
  1.1080 +    }
  1.1081 +    prior_lang = rc->lang1;
  1.1082 +  }
  1.1083 +  resultchunkvector->resize(k);
  1.1084 +}
  1.1085 +
  1.1086 +
  1.1087 +
  1.1088 +// Move less likely byte count to more likely for close pairs of languages
  1.1089 +// If given, also update resultchunkvector
  1.1090 +void RefineScoredClosePairs(DocTote* doc_tote,
  1.1091 +                            ResultChunkVector* resultchunkvector,
  1.1092 +                            bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {
  1.1093 +  for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
  1.1094 +    int close_packedlang = doc_tote->Key(sub);
  1.1095 +    int subscr = LanguageCloseSet(static_cast<Language>(close_packedlang));
  1.1096 +    if (subscr == 0) {continue;}
  1.1097 +
  1.1098 +    // We have a close pair language -- if the other one is also scored and the
  1.1099 +    // longword score differs enough, put all our eggs into one basket
  1.1100 +
  1.1101 +    // Nonzero longword score: Go look for the other of this pair
  1.1102 +    for (int sub2 = sub + 1; sub2 < doc_tote->MaxSize(); ++sub2) {
  1.1103 +      if (LanguageCloseSet(static_cast<Language>(doc_tote->Key(sub2))) == subscr) {
  1.1104 +        // We have a matching pair
  1.1105 +        int close_packedlang2 = doc_tote->Key(sub2);
  1.1106 +
  1.1107 +        // Move all the text bytes from lower byte-count to higher one
  1.1108 +        int from_sub, to_sub;
  1.1109 +        Language from_lang, to_lang;
  1.1110 +        if (doc_tote->Value(sub) < doc_tote->Value(sub2)) {
  1.1111 +          from_sub = sub;
  1.1112 +          to_sub = sub2;
  1.1113 +          from_lang = static_cast<Language>(close_packedlang);
  1.1114 +          to_lang = static_cast<Language>(close_packedlang2);
  1.1115 +        } else {
  1.1116 +          from_sub = sub2;
  1.1117 +          to_sub = sub;
  1.1118 +          from_lang = static_cast<Language>(close_packedlang2);
  1.1119 +          to_lang = static_cast<Language>(close_packedlang);
  1.1120 +        }
  1.1121 +
  1.1122 +        if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) {
  1.1123 +          // Show fate of closepair language
  1.1124 +          int val = doc_tote->Value(from_sub);           // byte count
  1.1125 +          int reli = doc_tote->Reliability(from_sub);
  1.1126 +          int reliable_percent = reli / (val ? val : 1);  // avoid zdiv
  1.1127 +          fprintf(stderr, "{CloseLangPair: %s.%dR,%dB => %s}<br>\n",
  1.1128 +                  LanguageCode(from_lang),
  1.1129 +                  reliable_percent,
  1.1130 +                  doc_tote->Value(from_sub),
  1.1131 +                  LanguageCode(to_lang));
  1.1132 +        }
  1.1133 +        MoveLang1ToLang2(from_lang, to_lang, from_sub, to_sub,
  1.1134 +                         doc_tote, resultchunkvector);
  1.1135 +        break;    // Exit inner for sub2 loop
  1.1136 +      }
  1.1137 +    }     // End for sub2
  1.1138 +  }   // End for sub
  1.1139 +}
  1.1140 +
  1.1141 +
  1.1142 +void ApplyAllLanguageHints(Tote* chunk_tote, int tote_grams,
  1.1143 +                        uint8* lang_hint_boost) {
  1.1144 +}
  1.1145 +
  1.1146 +
  1.1147 +void PrintHtmlEscapedText(FILE* f, const char* txt, int len) {
  1.1148 +   string temp(txt, len);
  1.1149 +   fprintf(f, "%s", GetHtmlEscapedText(temp).c_str());
  1.1150 +}
  1.1151 +
  1.1152 +void PrintLang(FILE* f, Tote* chunk_tote,
  1.1153 +              Language cur_lang, bool cur_unreliable,
  1.1154 +              Language prior_lang, bool prior_unreliable) {
  1.1155 +  if (cur_lang == prior_lang) {
  1.1156 +    fprintf(f, "[]");
  1.1157 +  } else {
  1.1158 +    fprintf(f, "[%s%s]", LanguageCode(cur_lang), cur_unreliable ? "*" : "");
  1.1159 +  }
  1.1160 +}
  1.1161 +
  1.1162 +
  1.1163 +void PrintTopLang(Language top_lang) {
  1.1164 +  if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
  1.1165 +    fprintf(stderr, "[] ");
  1.1166 +  } else {
  1.1167 +    fprintf(stderr, "[%s] ", LanguageName(top_lang));
  1.1168 +    prior_lang = top_lang;
  1.1169 +  }
  1.1170 +}
  1.1171 +
  1.1172 +void PrintTopLangSpeculative(Language top_lang) {
  1.1173 +  fprintf(stderr, "<span style=\"color:#%06X;\">", 0xa0a0a0);
  1.1174 +  if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
  1.1175 +    fprintf(stderr, "[] ");
  1.1176 +  } else {
  1.1177 +    fprintf(stderr, "[%s] ", LanguageName(top_lang));
  1.1178 +    prior_lang = top_lang;
  1.1179 +  }
  1.1180 +  fprintf(stderr, "</span>\n");
  1.1181 +}
  1.1182 +
  1.1183 +void PrintLangs(FILE* f, const Language* language3, const int* percent3,
  1.1184 +                const int* text_bytes, const bool* is_reliable) {
  1.1185 +  fprintf(f, "<br>&nbsp;&nbsp;Initial_Languages ");
  1.1186 +  if (language3[0] != UNKNOWN_LANGUAGE) {
  1.1187 +    fprintf(f, "%s%s(%d%%)  ",
  1.1188 +            LanguageName(language3[0]),
  1.1189 +            *is_reliable ? "" : "*",
  1.1190 +            percent3[0]);
  1.1191 +  }
  1.1192 +  if (language3[1] != UNKNOWN_LANGUAGE) {
  1.1193 +    fprintf(f, "%s(%d%%)  ", LanguageName(language3[1]), percent3[1]);
  1.1194 +  }
  1.1195 +  if (language3[2] != UNKNOWN_LANGUAGE) {
  1.1196 +    fprintf(f, "%s(%d%%)  ", LanguageName(language3[2]), percent3[2]);
  1.1197 +  }
  1.1198 +  fprintf(f, "%d bytes \n", *text_bytes);
  1.1199 +
  1.1200 +  fprintf(f, "<br>\n");
  1.1201 +}
  1.1202 +
  1.1203 +
  1.1204 +// Return internal probability score (sum) per 1024 bytes
  1.1205 +double GetNormalizedScore(Language lang, ULScript ulscript,
  1.1206 +                          int bytecount, int score) {
  1.1207 +  if (bytecount <= 0) {return 0.0;}
  1.1208 +  return (score << 10) / bytecount;
  1.1209 +}
  1.1210 +
  1.1211 +// Extract return values before fixups
  1.1212 +void ExtractLangEtc(DocTote* doc_tote, int total_text_bytes,
  1.1213 +                    int* reliable_percent3, Language* language3, int* percent3,
  1.1214 +                    double*  normalized_score3,
  1.1215 +                    int* text_bytes, bool* is_reliable) {
  1.1216 +  reliable_percent3[0] = 0;
  1.1217 +  reliable_percent3[1] = 0;
  1.1218 +  reliable_percent3[2] = 0;
  1.1219 +  language3[0] = UNKNOWN_LANGUAGE;
  1.1220 +  language3[1] = UNKNOWN_LANGUAGE;
  1.1221 +  language3[2] = UNKNOWN_LANGUAGE;
  1.1222 +  percent3[0] = 0;
  1.1223 +  percent3[1] = 0;
  1.1224 +  percent3[2] = 0;
  1.1225 +  normalized_score3[0] = 0.0;
  1.1226 +  normalized_score3[1] = 0.0;
  1.1227 +  normalized_score3[2] = 0.0;
  1.1228 +
  1.1229 +  *text_bytes = total_text_bytes;
  1.1230 +  *is_reliable = false;
  1.1231 +
  1.1232 +  int bytecount1 = 0;
  1.1233 +  int bytecount2 = 0;
  1.1234 +  int bytecount3 = 0;
  1.1235 +
  1.1236 +  int lang1 = doc_tote->Key(0);
  1.1237 +  if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) {
  1.1238 +    // We have a top language
  1.1239 +    language3[0] = static_cast<Language>(lang1);
  1.1240 +    bytecount1 = doc_tote->Value(0);
  1.1241 +    int reli1 = doc_tote->Reliability(0);
  1.1242 +    reliable_percent3[0] = reli1 / (bytecount1 ? bytecount1 : 1);  // avoid zdiv
  1.1243 +    normalized_score3[0] = GetNormalizedScore(language3[0],
  1.1244 +                                                  ULScript_Common,
  1.1245 +                                                  bytecount1,
  1.1246 +                                                  doc_tote->Score(0));
  1.1247 +  }
  1.1248 +
  1.1249 +  int lang2 = doc_tote->Key(1);
  1.1250 +  if ((lang2 != DocTote::kUnusedKey) && (lang2 != UNKNOWN_LANGUAGE)) {
  1.1251 +    language3[1] = static_cast<Language>(lang2);
  1.1252 +    bytecount2 = doc_tote->Value(1);
  1.1253 +    int reli2 = doc_tote->Reliability(1);
  1.1254 +    reliable_percent3[1] = reli2 / (bytecount2 ? bytecount2 : 1);  // avoid zdiv
  1.1255 +    normalized_score3[1] = GetNormalizedScore(language3[1],
  1.1256 +                                                  ULScript_Common,
  1.1257 +                                                  bytecount2,
  1.1258 +                                                  doc_tote->Score(1));
  1.1259 +  }
  1.1260 +
  1.1261 +  int lang3 = doc_tote->Key(2);
  1.1262 +  if ((lang3 != DocTote::kUnusedKey) && (lang3 != UNKNOWN_LANGUAGE)) {
  1.1263 +    language3[2] = static_cast<Language>(lang3);
  1.1264 +    bytecount3 = doc_tote->Value(2);
  1.1265 +    int reli3 = doc_tote->Reliability(2);
  1.1266 +    reliable_percent3[2] = reli3 / (bytecount3 ? bytecount3 : 1);  // avoid zdiv
  1.1267 +    normalized_score3[2] = GetNormalizedScore(language3[2],
  1.1268 +                                                  ULScript_Common,
  1.1269 +                                                  bytecount3,
  1.1270 +                                                  doc_tote->Score(2));
  1.1271 +  }
  1.1272 +
  1.1273 +  // Increase total bytes to sum (top 3) if low for some reason
  1.1274 +  int total_bytecount12 = bytecount1 + bytecount2;
  1.1275 +  int total_bytecount123 = total_bytecount12 + bytecount3;
  1.1276 +  if (total_text_bytes < total_bytecount123) {
  1.1277 +    total_text_bytes = total_bytecount123;
  1.1278 +    *text_bytes = total_text_bytes;
  1.1279 +  }
  1.1280 +
  1.1281 +  // Sum minus previous % gives better roundoff behavior than bytecount/total
  1.1282 +  int total_text_bytes_div = maxint(1, total_text_bytes);    // Avoid zdiv
  1.1283 +  percent3[0] = (bytecount1 * 100) / total_text_bytes_div;
  1.1284 +  percent3[1] = (total_bytecount12 * 100) / total_text_bytes_div;
  1.1285 +  percent3[2] = (total_bytecount123 * 100) / total_text_bytes_div;
  1.1286 +  percent3[2] -= percent3[1];
  1.1287 +  percent3[1] -= percent3[0];
  1.1288 +
  1.1289 +  // Roundoff, say 96% 1.6% 1.4%, will produce non-obvious 96% 1% 2%
  1.1290 +  // Fix this explicitly
  1.1291 +  if (percent3[1] < percent3[2]) {
  1.1292 +    ++percent3[1];
  1.1293 +    --percent3[2];
  1.1294 +  }
  1.1295 +  if (percent3[0] < percent3[1]) {
  1.1296 +    ++percent3[0];
  1.1297 +    --percent3[1];
  1.1298 +  }
  1.1299 +
  1.1300 +  *text_bytes = total_text_bytes;
  1.1301 +
  1.1302 +  if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) {
  1.1303 +    // We have a top language
  1.1304 +    // Its reliability is overall result reliability
  1.1305 +    int bytecount = doc_tote->Value(0);
  1.1306 +    int reli = doc_tote->Reliability(0);
  1.1307 +    int reliable_percent = reli / (bytecount ? bytecount : 1);  // avoid zdiv
  1.1308 +    *is_reliable = (reliable_percent >= kMinReliableKeepPercent);
  1.1309 +  } else {
  1.1310 +    // No top language at all. This can happen with zero text or 100% Klingon
  1.1311 +    // if extended=false. Just return all UNKNOWN_LANGUAGE, unreliable.
  1.1312 +    *is_reliable = false;
  1.1313 +  }
  1.1314 +
  1.1315 +  // If ignore percent is too large, set unreliable.
  1.1316 +  int ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]);
  1.1317 +  if ((ignore_percent > kIgnoreMaxPercent)) {
  1.1318 +    *is_reliable = false;
  1.1319 +  }
  1.1320 +}
  1.1321 +
  1.1322 +bool IsFIGS(Language lang) {
  1.1323 +  if (lang == FRENCH) {return true;}
  1.1324 +  if (lang == ITALIAN) {return true;}
  1.1325 +  if (lang == GERMAN) {return true;}
  1.1326 +  if (lang == SPANISH) {return true;}
  1.1327 +  return false;
  1.1328 +}
  1.1329 +
  1.1330 +bool IsEFIGS(Language lang) {
  1.1331 +  if (lang == ENGLISH) {return true;}
  1.1332 +  if (lang == FRENCH) {return true;}
  1.1333 +  if (lang == ITALIAN) {return true;}
  1.1334 +  if (lang == GERMAN) {return true;}
  1.1335 +  if (lang == SPANISH) {return true;}
  1.1336 +  return false;
  1.1337 +}
  1.1338 +
  1.1339 +// For Tier3 languages, require more bytes of text to override
  1.1340 +// the first-place language
  1.1341 +static const int kGoodSecondT1T2MinBytes = 15;        // <this => no second
  1.1342 +static const int kGoodSecondT3MinBytes = 128;         // <this => no second
  1.1343 +
  1.1344 +// Calculate a single summary language for the document, and its reliability.
  1.1345 +// Returns language3[0] or language3[1] or ENGLISH or UNKNOWN_LANGUAGE
  1.1346 +// This is the heart of matching human-rater perception.
  1.1347 +// reliable_percent3[] is currently unused
  1.1348 +//
  1.1349 +// Do not return Tier3 second language unless there are at least 128 bytes
  1.1350 +void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes,
  1.1351 +                     const int* reliable_percent3,
  1.1352 +                     const Language* language3,
  1.1353 +                     const int* percent3,
  1.1354 +                     Language* summary_lang, bool* is_reliable,
  1.1355 +                     bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {
  1.1356 +  // Vector of active languages; changes if we delete some
  1.1357 +  int slot_count = 3;
  1.1358 +  int active_slot[3] = {0, 1, 2};
  1.1359 +
  1.1360 +  int ignore_percent = 0;
  1.1361 +  int return_percent = percent3[0];   // Default to top lang
  1.1362 +  *summary_lang = language3[0];
  1.1363 +  *is_reliable = true;
  1.1364 +  if (percent3[0] < kKeepMinPercent) {*is_reliable = false;}
  1.1365 +
  1.1366 +  // If any of top 3 is IGNORE, remove it and increment ignore_percent
  1.1367 +  for (int i = 0; i < 3; ++i) {
  1.1368 +    if (language3[i] == TG_UNKNOWN_LANGUAGE) {
  1.1369 +      ignore_percent += percent3[i];
  1.1370 +      // Move the rest up, levaing input vectors unchanged
  1.1371 +      for (int j=i+1; j < 3; ++j) {
  1.1372 +        active_slot[j - 1] = active_slot[j];
  1.1373 +      }
  1.1374 +      -- slot_count;
  1.1375 +      // Logically remove Ignore from percentage-text calculation
  1.1376 +      // (extra 1 in 101 avoids zdiv, biases slightly small)
  1.1377 +      return_percent = (percent3[0] * 100) / (101 - ignore_percent);
  1.1378 +      *summary_lang = language3[active_slot[0]];
  1.1379 +      if (percent3[active_slot[0]] < kKeepMinPercent) {*is_reliable = false;}
  1.1380 +    }
  1.1381 +  }
  1.1382 +
  1.1383 +
  1.1384 +  // If English and X, where X (not UNK) is big enough,
  1.1385 +  // assume the English is boilerplate and return X.
  1.1386 +  // Logically remove English from percentage-text calculation
  1.1387 +  int second_bytes = (total_text_bytes * percent3[active_slot[1]]) / 100;
  1.1388 +  // Require more bytes of text for Tier3 languages
  1.1389 +  int minbytesneeded = kGoodSecondT1T2MinBytes;
  1.1390 +  int plang_second = PerScriptNumber(ULScript_Latin, language3[active_slot[1]]);
  1.1391 +
  1.1392 +  if ((language3[active_slot[0]] == ENGLISH) &&
  1.1393 +      (language3[active_slot[1]] != ENGLISH) &&
  1.1394 +      (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
  1.1395 +      (percent3[active_slot[1]] >= kNonEnBoilerplateMinPercent) &&
  1.1396 +      (second_bytes >= minbytesneeded)) {
  1.1397 +    ignore_percent += percent3[active_slot[0]];
  1.1398 +    return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
  1.1399 +    *summary_lang = language3[active_slot[1]];
  1.1400 +    if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
  1.1401 +
  1.1402 +  // Else If FIGS and X, where X (not UNK, EFIGS) is big enough,
  1.1403 +  // assume the FIGS is boilerplate and return X.
  1.1404 +  // Logically remove FIGS from percentage-text calculation
  1.1405 +  } else if (IsFIGS(language3[active_slot[0]]) &&
  1.1406 +             !IsEFIGS(language3[active_slot[1]]) &&
  1.1407 +             (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
  1.1408 +             (percent3[active_slot[1]] >= kNonFIGSBoilerplateMinPercent) &&
  1.1409 +             (second_bytes >= minbytesneeded)) {
  1.1410 +    ignore_percent += percent3[active_slot[0]];
  1.1411 +    return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
  1.1412 +    *summary_lang = language3[active_slot[1]];
  1.1413 +    if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
  1.1414 +
  1.1415 +  // Else we are returning the first language, but want to improve its
  1.1416 +  // return_percent if the second language should be ignored
  1.1417 +  } else  if ((language3[active_slot[1]] == ENGLISH) &&
  1.1418 +              (language3[active_slot[0]] != ENGLISH)) {
  1.1419 +    ignore_percent += percent3[active_slot[1]];
  1.1420 +    return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
  1.1421 +  } else  if (IsFIGS(language3[active_slot[1]]) &&
  1.1422 +              !IsEFIGS(language3[active_slot[0]])) {
  1.1423 +    ignore_percent += percent3[active_slot[1]];
  1.1424 +    return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
  1.1425 +  }
  1.1426 +
  1.1427 +  // If return percent is too small (too many languages), return UNKNOWN
  1.1428 +  if ((return_percent < kGoodFirstMinPercent)) {
  1.1429 +    if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
  1.1430 +      fprintf(stderr, "{Unreli %s %d%% percent too small} ",
  1.1431 +              LanguageCode(*summary_lang), return_percent);
  1.1432 +    }
  1.1433 +    *summary_lang = UNKNOWN_LANGUAGE;
  1.1434 +    *is_reliable = false;
  1.1435 +  }
  1.1436 +
  1.1437 +  // If return percent is small, return language but set unreliable.
  1.1438 +  if ((return_percent < kGoodFirstReliableMinPercent)) {
  1.1439 +    *is_reliable = false;
  1.1440 +  }
  1.1441 +
  1.1442 +  // If ignore percent is too large, set unreliable.
  1.1443 +  ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]);
  1.1444 +  if ((ignore_percent > kIgnoreMaxPercent)) {
  1.1445 +    *is_reliable = false;
  1.1446 +  }
  1.1447 +
  1.1448 +  // If we removed all the active languages, return UNKNOWN
  1.1449 +  if (slot_count == 0) {
  1.1450 +    if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
  1.1451 +      fprintf(stderr, "{Unreli %s no languages left} ",
  1.1452 +              LanguageCode(*summary_lang));
  1.1453 +    }
  1.1454 +    *summary_lang = UNKNOWN_LANGUAGE;
  1.1455 +    *is_reliable = false;
  1.1456 +  }
  1.1457 +}
  1.1458 +
  1.1459 +void AddLangPriorBoost(Language lang, uint32 langprob,
  1.1460 +                       ScoringContext* scoringcontext) {
  1.1461 +  // This is called 0..n times with language hints
  1.1462 +  // but we don't know the script -- so boost either or both Latn, Othr.
  1.1463 +
  1.1464 +  if (IsLatnLanguage(lang)) {
  1.1465 +    LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
  1.1466 +    int n = langprior_boost->n;
  1.1467 +    langprior_boost->langprob[n] = langprob;
  1.1468 +    langprior_boost->n = langprior_boost->wrap(n + 1);
  1.1469 +  }
  1.1470 +
  1.1471 +  if (IsOthrLanguage(lang)) {
  1.1472 +    LangBoosts* langprior_boost = &scoringcontext->langprior_boost.othr;
  1.1473 +    int n = langprior_boost->n;
  1.1474 +    langprior_boost->langprob[n] = langprob;
  1.1475 +    langprior_boost->n = langprior_boost->wrap(n + 1);
  1.1476 +  }
  1.1477 +
  1.1478 +}
  1.1479 +
  1.1480 +void AddOneWhack(Language whacker_lang, Language whackee_lang,
  1.1481 +                 ScoringContext* scoringcontext) {
  1.1482 +  uint32 langprob = MakeLangProb(whackee_lang, 1);
  1.1483 +  // This logic avoids hr-Latn whacking sr-Cyrl, but still whacks sr-Latn
  1.1484 +  if (IsLatnLanguage(whacker_lang) && IsLatnLanguage(whackee_lang)) {
  1.1485 +    LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
  1.1486 +    int n = langprior_whack->n;
  1.1487 +    langprior_whack->langprob[n] = langprob;
  1.1488 +    langprior_whack->n = langprior_whack->wrap(n + 1);
  1.1489 +  }
  1.1490 +  if (IsOthrLanguage(whacker_lang) && IsOthrLanguage(whackee_lang)) {
  1.1491 +    LangBoosts* langprior_whack = &scoringcontext->langprior_whack.othr;
  1.1492 +    int n = langprior_whack->n;
  1.1493 +    langprior_whack->langprob[n] = langprob;
  1.1494 +    langprior_whack->n = langprior_whack->wrap(n + 1);
  1.1495 + }
  1.1496 +}
  1.1497 +
  1.1498 +void AddCloseLangWhack(Language lang, ScoringContext* scoringcontext) {
  1.1499 +  // We do not in general want zh-Hans and zh-Hant to be close pairs,
  1.1500 +  // but we do here.
  1.1501 +  if (lang == CLD2::CHINESE) {
  1.1502 +    AddOneWhack(lang, CLD2::CHINESE_T, scoringcontext);
  1.1503 +    return;
  1.1504 +  }
  1.1505 +  if (lang == CLD2::CHINESE_T) {
  1.1506 +    AddOneWhack(lang, CLD2::CHINESE, scoringcontext);
  1.1507 +    return;
  1.1508 +  }
  1.1509 +
  1.1510 +  int base_lang_set = LanguageCloseSet(lang);
  1.1511 +  if (base_lang_set == 0) {return;}
  1.1512 +  // TODO: add an explicit list of each set to avoid this 512-times loop
  1.1513 +  for (int i = 0; i < kLanguageToPLangSize; ++i) {
  1.1514 +    Language lang2 = static_cast<Language>(i);
  1.1515 +    if ((base_lang_set == LanguageCloseSet(lang2)) && (lang != lang2)) {
  1.1516 +      AddOneWhack(lang, lang2, scoringcontext);
  1.1517 +    }
  1.1518 +  }
  1.1519 +}
  1.1520 +
  1.1521 +
  1.1522 +void ApplyHints(const char* buffer,
  1.1523 +                int buffer_length,
  1.1524 +                bool is_plain_text,
  1.1525 +                const CLDHints* cld_hints,
  1.1526 +                ScoringContext* scoringcontext) {
  1.1527 +  CLDLangPriors lang_priors;
  1.1528 +  InitCLDLangPriors(&lang_priors);
  1.1529 +
  1.1530 +  // We now use lang= tags.
  1.1531 +  // Last look, circa 2008 found only 15% of web pages with lang= tags and
  1.1532 +  // many of those were wrong. Now (July 2011), we find 44% of web pages have
  1.1533 +  // lang= tags, and most of them are correct. So we now give them substantial
  1.1534 +  // weight in each chunk scored.
  1.1535 +  if (!is_plain_text) {
  1.1536 +    // Get any contained language tags in first n KB
  1.1537 +    int32 max_scan_bytes = FLAGS_cld_max_lang_tag_scan_kb << 10;
  1.1538 +    string lang_tags = GetLangTagsFromHtml(buffer, buffer_length,
  1.1539 +                                           max_scan_bytes);
  1.1540 +    SetCLDLangTagsHint(lang_tags, &lang_priors);
  1.1541 +    if (scoringcontext->flags_cld2_html) {
  1.1542 +      if (!lang_tags.empty()) {
  1.1543 +        fprintf(scoringcontext->debug_file, "<br>lang_tags '%s'<br>\n",
  1.1544 +                lang_tags.c_str());
  1.1545 +      }
  1.1546 +    }
  1.1547 +  }
  1.1548 +
  1.1549 +  if (cld_hints != NULL) {
  1.1550 +    if ((cld_hints->content_language_hint != NULL) &&
  1.1551 +        (cld_hints->content_language_hint[0] != '\0')) {
  1.1552 +      SetCLDContentLangHint(cld_hints->content_language_hint, &lang_priors);
  1.1553 +    }
  1.1554 +
  1.1555 +    // Input is from GetTLD(), already lowercased
  1.1556 +    if ((cld_hints->tld_hint != NULL) && (cld_hints->tld_hint[0] != '\0')) {
  1.1557 +      SetCLDTLDHint(cld_hints->tld_hint, &lang_priors);
  1.1558 +    }
  1.1559 +
  1.1560 +    if (cld_hints->encoding_hint != UNKNOWN_ENCODING) {
  1.1561 +      Encoding enc = static_cast<Encoding>(cld_hints->encoding_hint);
  1.1562 +      SetCLDEncodingHint(enc, &lang_priors);
  1.1563 +    }
  1.1564 +
  1.1565 +    if (cld_hints->language_hint != UNKNOWN_LANGUAGE) {
  1.1566 +      SetCLDLanguageHint(cld_hints->language_hint, &lang_priors);
  1.1567 +    }
  1.1568 +  }
  1.1569 +
  1.1570 +  // Keep no more than four different languages with hints
  1.1571 +  TrimCLDLangPriors(4, &lang_priors);
  1.1572 +
  1.1573 +  if (scoringcontext->flags_cld2_html) {
  1.1574 +    string print_temp = DumpCLDLangPriors(&lang_priors);
  1.1575 +    if (!print_temp.empty()) {
  1.1576 +      fprintf(scoringcontext->debug_file, "DumpCLDLangPriors %s<br>\n",
  1.1577 +              print_temp.c_str());
  1.1578 +    }
  1.1579 +  }
  1.1580 +
  1.1581 +  // Put boosts into ScoringContext
  1.1582 +  for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
  1.1583 +    Language lang = GetCLDPriorLang(lang_priors.prior[i]);
  1.1584 +    int qprob = GetCLDPriorWeight(lang_priors.prior[i]);
  1.1585 +    if (qprob > 0) {
  1.1586 +      uint32 langprob = MakeLangProb(lang, qprob);
  1.1587 +      AddLangPriorBoost(lang, langprob, scoringcontext);
  1.1588 +    }
  1.1589 +  }
  1.1590 +
  1.1591 +  // Put whacks into scoring context
  1.1592 +  // We do not in general want zh-Hans and zh-Hant to be close pairs,
  1.1593 +  // but we do here. Use close_set_count[kCloseSetSize] to count zh, zh-Hant
  1.1594 +  std::vector<int> close_set_count(kCloseSetSize + 1, 0);
  1.1595 +
  1.1596 +  for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
  1.1597 +    Language lang = GetCLDPriorLang(lang_priors.prior[i]);
  1.1598 +    ++close_set_count[LanguageCloseSet(lang)];
  1.1599 +    if (lang == CLD2::CHINESE) {++close_set_count[kCloseSetSize];}
  1.1600 +    if (lang == CLD2::CHINESE_T) {++close_set_count[kCloseSetSize];}
  1.1601 +  }
  1.1602 +
  1.1603 +  // If a boost language is in a close set, force suppressing the others in
  1.1604 +  // that set, if exactly one of the set is present
  1.1605 +  for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
  1.1606 +    Language lang = GetCLDPriorLang(lang_priors.prior[i]);
  1.1607 +    int qprob = GetCLDPriorWeight(lang_priors.prior[i]);
  1.1608 +    if (qprob > 0) {
  1.1609 +      int close_set = LanguageCloseSet(lang);
  1.1610 +      if ((close_set > 0) && (close_set_count[close_set] == 1)) {
  1.1611 +        AddCloseLangWhack(lang, scoringcontext);
  1.1612 +      }
  1.1613 +      if (((lang == CLD2::CHINESE) || (lang == CLD2::CHINESE_T)) &&
  1.1614 +          (close_set_count[kCloseSetSize] == 1)) {
  1.1615 +        AddCloseLangWhack(lang, scoringcontext);
  1.1616 +      }
  1.1617 +    }
  1.1618 +  }
  1.1619 +
  1.1620 +
  1.1621 +
  1.1622 +
  1.1623 +
  1.1624 +
  1.1625 +}
  1.1626 +
  1.1627 +
  1.1628 +
  1.1629 +// Results language3/percent3/text_bytes must be exactly three items
  1.1630 +Language DetectLanguageSummaryV2(
  1.1631 +                        const char* buffer,
  1.1632 +                        int buffer_length,
  1.1633 +                        bool is_plain_text,
  1.1634 +                        const CLDHints* cld_hints,
  1.1635 +                        bool allow_extended_lang,
  1.1636 +                        int flags,
  1.1637 +                        Language plus_one,
  1.1638 +                        Language* language3,
  1.1639 +                        int* percent3,
  1.1640 +                        double* normalized_score3,
  1.1641 +                        ResultChunkVector* resultchunkvector,
  1.1642 +                        int* text_bytes,
  1.1643 +                        bool* is_reliable) {
  1.1644 +  language3[0] = UNKNOWN_LANGUAGE;
  1.1645 +  language3[1] = UNKNOWN_LANGUAGE;
  1.1646 +  language3[2] = UNKNOWN_LANGUAGE;
  1.1647 +  percent3[0] = 0;
  1.1648 +  percent3[1] = 0;
  1.1649 +  percent3[2] = 0;
  1.1650 +  normalized_score3[0] = 0.0;
  1.1651 +  normalized_score3[1] = 0.0;
  1.1652 +  normalized_score3[2] = 0.0;
  1.1653 +  if (resultchunkvector != NULL) {
  1.1654 +    resultchunkvector->clear();
  1.1655 +  }
  1.1656 +  *text_bytes = 0;
  1.1657 +  *is_reliable = false;
  1.1658 +
  1.1659 +  if ((flags & kCLDFlagEcho) != 0) {
  1.1660 +     string temp(buffer, buffer_length);
  1.1661 +     if ((flags & kCLDFlagHtml) != 0) {
  1.1662 +        fprintf(stderr, "CLD2[%d] '%s'<br>\n",
  1.1663 +                buffer_length, GetHtmlEscapedText(temp).c_str());
  1.1664 +     } else {
  1.1665 +        fprintf(stderr, "CLD2[%d] '%s'\n",
  1.1666 +                buffer_length, GetPlainEscapedText(temp).c_str());
  1.1667 +     }
  1.1668 +  }
  1.1669 +
  1.1670 +#ifdef CLD2_DYNAMIC_MODE
  1.1671 +  // In dynamic mode, we immediately return UNKNOWN_LANGUAGE if the data file
  1.1672 +  // hasn't been loaded yet. This is the only sane thing we can do, as there
  1.1673 +  // are no scoring tables to consult.
  1.1674 +  bool dataLoaded = isDataLoaded();
  1.1675 +  if ((flags & kCLDFlagVerbose) != 0) {
  1.1676 +    fprintf(stderr, "Data loaded: %s\n", (dataLoaded ? "true" : "false"));
  1.1677 +  }
  1.1678 +  if (!dataLoaded) {
  1.1679 +    return UNKNOWN_LANGUAGE;
  1.1680 +  }
  1.1681 +#endif
  1.1682 +
  1.1683 +  // Exit now if no text
  1.1684 +  if (buffer_length == 0) {return UNKNOWN_LANGUAGE;}
  1.1685 +  if (kScoringtables.quadgram_obj == NULL) {return UNKNOWN_LANGUAGE;}
  1.1686 +
  1.1687 +  // Document totals
  1.1688 +  DocTote doc_tote;   // Reliability = 0..100
  1.1689 +
  1.1690 +  // ScoringContext carries state across scriptspans
  1.1691 +  ScoringContext scoringcontext;
  1.1692 +  scoringcontext.debug_file = stderr;
  1.1693 +  scoringcontext.flags_cld2_score_as_quads =
  1.1694 +    ((flags & kCLDFlagScoreAsQuads) != 0);
  1.1695 +  scoringcontext.flags_cld2_html = ((flags & kCLDFlagHtml) != 0);
  1.1696 +  scoringcontext.flags_cld2_cr = ((flags & kCLDFlagCr) != 0);
  1.1697 +  scoringcontext.flags_cld2_verbose = ((flags & kCLDFlagVerbose) != 0);
  1.1698 +  scoringcontext.prior_chunk_lang = UNKNOWN_LANGUAGE;
  1.1699 +  scoringcontext.ulscript = ULScript_Common;
  1.1700 +  scoringcontext.scoringtables = &kScoringtables;
  1.1701 +  scoringcontext.scanner = NULL;
  1.1702 +  scoringcontext.init();            // Clear the internal memory arrays
  1.1703 +
  1.1704 +  // Now thread safe.
  1.1705 +  bool FLAGS_cld2_html = ((flags & kCLDFlagHtml) != 0);
  1.1706 +  bool FLAGS_cld2_quiet = ((flags & kCLDFlagQuiet) != 0);
  1.1707 +
  1.1708 +  ApplyHints(buffer, buffer_length, is_plain_text, cld_hints, &scoringcontext);
  1.1709 +
  1.1710 +  // Four individual script totals, Latin, Han, other2, other3
  1.1711 +  int next_other_tote = 2;
  1.1712 +  int tote_num = 0;
  1.1713 +
  1.1714 +  // Four totes for up to four different scripts pending at once
  1.1715 +  Tote totes[4];                  // [0] Latn  [1] Hani  [2] other  [3] other
  1.1716 +  bool tote_seen[4] = {false, false, false, false};
  1.1717 +  int tote_grams[4] = {0, 0, 0, 0};     // Number in partial chunk
  1.1718 +  ULScript tote_script[4] =
  1.1719 +    {ULScript_Latin, ULScript_Hani, ULScript_Common, ULScript_Common};
  1.1720 +
  1.1721 +  // Loop through text spans in a single script
  1.1722 +  ScriptScanner ss(buffer, buffer_length, is_plain_text);
  1.1723 +  LangSpan scriptspan;
  1.1724 +
  1.1725 +  scoringcontext.scanner = &ss;
  1.1726 +
  1.1727 +  scriptspan.text = NULL;
  1.1728 +  scriptspan.text_bytes = 0;
  1.1729 +  scriptspan.offset = 0;
  1.1730 +  scriptspan.ulscript = ULScript_Common;
  1.1731 +  scriptspan.lang = UNKNOWN_LANGUAGE;
  1.1732 +
  1.1733 +  int total_text_bytes = 0;
  1.1734 +  int textlimit = FLAGS_cld_textlimit << 10;    // in KB
  1.1735 +  if (textlimit == 0) {textlimit = 0x7fffffff;}
  1.1736 +
  1.1737 +  int advance_by = 2;                   // Advance 2 bytes
  1.1738 +  int advance_limit = textlimit >> 3;   // For first 1/8 of max document
  1.1739 +
  1.1740 +  int initial_word_span = kDefaultWordSpan;
  1.1741 +  if (FLAGS_cld_forcewords) {
  1.1742 +    initial_word_span = kReallyBigWordSpan;
  1.1743 +  }
  1.1744 +
  1.1745 +  // Pick up chunk sizes
  1.1746 +  // Smoothwidth is units of quadgrams, about 2.5 chars (unigrams) each
  1.1747 +  // Sanity check -- force into a reasonable range
  1.1748 +  int chunksizequads = FLAGS_cld_smoothwidth;
  1.1749 +  chunksizequads = minint(maxint(chunksizequads, kMinChunkSizeQuads),
  1.1750 +                               kMaxChunkSizeQuads);
  1.1751 +  int chunksizeunis = (chunksizequads * 5) >> 1;
  1.1752 +
  1.1753 +  // Varying short-span limit doesn't work well -- skips too much beyond 20KB
  1.1754 +  // int spantooshortlimit = advance_by * FLAGS_cld_smoothwidth;
  1.1755 +  int spantooshortlimit = kShortSpanThresh;
  1.1756 +
  1.1757 +  // For debugging only. Not thread-safe
  1.1758 +  prior_lang = UNKNOWN_LANGUAGE;
  1.1759 +  prior_unreliable = false;
  1.1760 +
  1.1761 +  // Allocate full-document prediction table for finding repeating words
  1.1762 +  int hash = 0;
  1.1763 +  int* predict_tbl = new int[kPredictionTableSize];
  1.1764 +  if (FlagRepeats(flags)) {
  1.1765 +    memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
  1.1766 +  }
  1.1767 +
  1.1768 +
  1.1769 +
  1.1770 +  // Loop through scriptspans accumulating number of text bytes in each language
  1.1771 +  while (ss.GetOneScriptSpanLower(&scriptspan)) {
  1.1772 +    ULScript ulscript = scriptspan.ulscript;
  1.1773 +
  1.1774 +    // Squeeze out big chunks of text span if asked to
  1.1775 +    if (FlagSqueeze(flags)) {
  1.1776 +      // Remove repetitive or mostly-spaces chunks
  1.1777 +      int newlen;
  1.1778 +      int chunksize = 0;    // Use the default
  1.1779 +      if (resultchunkvector != NULL) {
  1.1780 +         newlen = CheapSqueezeInplaceOverwrite(scriptspan.text,
  1.1781 +                                               scriptspan.text_bytes,
  1.1782 +                                               chunksize);
  1.1783 +      } else {
  1.1784 +         newlen = CheapSqueezeInplace(scriptspan.text, scriptspan.text_bytes,
  1.1785 +                                      chunksize);
  1.1786 +      }
  1.1787 +      scriptspan.text_bytes = newlen;
  1.1788 +    } else {
  1.1789 +      // Check now and then to see if we should be squeezing
  1.1790 +      if (((kCheapSqueezeTestThresh >> 1) < scriptspan.text_bytes) &&
  1.1791 +          !FlagFinish(flags)) {
  1.1792 +        // fprintf(stderr, "CheapSqueezeTriggerTest, "
  1.1793 +        //                 "first %d bytes of %d (>%d/2)<br>\n",
  1.1794 +        //         kCheapSqueezeTestLen,
  1.1795 +        //         scriptspan.text_bytes,
  1.1796 +        //         kCheapSqueezeTestThresh);
  1.1797 +
  1.1798 +        if (CheapSqueezeTriggerTest(scriptspan.text,
  1.1799 +                                      scriptspan.text_bytes,
  1.1800 +                                      kCheapSqueezeTestLen)) {
  1.1801 +          // Recursive call with big-chunk squeezing set
  1.1802 +          if (FLAGS_cld2_html || FLAGS_dbgscore) {
  1.1803 +            fprintf(stderr,
  1.1804 +                    "<br>---text_bytes[%d] Recursive(Squeeze)---<br><br>\n",
  1.1805 +                    total_text_bytes);
  1.1806 +          }
  1.1807 +          // Deallocate full-document prediction table
  1.1808 +          delete[] predict_tbl;
  1.1809 +
  1.1810 +          return DetectLanguageSummaryV2(
  1.1811 +                            buffer,
  1.1812 +                            buffer_length,
  1.1813 +                            is_plain_text,
  1.1814 +                            cld_hints,
  1.1815 +                            allow_extended_lang,
  1.1816 +                            flags | kCLDFlagSqueeze,
  1.1817 +                            plus_one,
  1.1818 +                            language3,
  1.1819 +                            percent3,
  1.1820 +                            normalized_score3,
  1.1821 +                            resultchunkvector,
  1.1822 +                            text_bytes,
  1.1823 +                            is_reliable);
  1.1824 +        }
  1.1825 +      }
  1.1826 +    }
  1.1827 +
  1.1828 +    // Remove repetitive words if asked to
  1.1829 +    if (FlagRepeats(flags)) {
  1.1830 +      // Remove repetitive words
  1.1831 +      int newlen;
  1.1832 +      if (resultchunkvector != NULL) {
  1.1833 +        newlen = CheapRepWordsInplaceOverwrite(scriptspan.text,
  1.1834 +                                               scriptspan.text_bytes,
  1.1835 +                                               &hash, predict_tbl);
  1.1836 +      } else {
  1.1837 +        newlen = CheapRepWordsInplace(scriptspan.text, scriptspan.text_bytes,
  1.1838 +                                      &hash, predict_tbl);
  1.1839 +      }
  1.1840 +      scriptspan.text_bytes = newlen;
  1.1841 +    }
  1.1842 +
  1.1843 +    // Scoring depends on scriptspan buffer ALWAYS having
  1.1844 +    // leading space and off-the-end space space space NUL,
  1.1845 +    // DCHECK(scriptspan.text[0] == ' ');
  1.1846 +    // DCHECK(scriptspan.text[scriptspan.text_bytes + 0] == ' ');
  1.1847 +    // DCHECK(scriptspan.text[scriptspan.text_bytes + 1] == ' ');
  1.1848 +    // DCHECK(scriptspan.text[scriptspan.text_bytes + 2] == ' ');
  1.1849 +    // DCHECK(scriptspan.text[scriptspan.text_bytes + 3] == '\0');
  1.1850 +
  1.1851 +    // The real scoring
  1.1852 +    // Accumulate directly into the document total, or accmulate in one of four
  1.1853 +    // chunk totals. The purpose of the multiple chunk totals is to piece
  1.1854 +    // together short choppy pieces of text in alternating scripts. One total is
  1.1855 +    // dedicated to Latin text, one to Han text, and the other two are dynamicly
  1.1856 +    // assigned.
  1.1857 +
  1.1858 +    scoringcontext.ulscript = scriptspan.ulscript;
  1.1859 +    // FLAGS_cld2_html = scoringcontext.flags_cld2_html;
  1.1860 +
  1.1861 +    ScoreOneScriptSpan(scriptspan,
  1.1862 +                       &scoringcontext,
  1.1863 +                       &doc_tote,
  1.1864 +                       resultchunkvector);
  1.1865 +
  1.1866 +    total_text_bytes += scriptspan.text_bytes;
  1.1867 +  }     // End while (ss.GetOneScriptSpanLower())
  1.1868 +
  1.1869 +  // Deallocate full-document prediction table
  1.1870 +  delete[] predict_tbl;
  1.1871 +
  1.1872 +  if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
  1.1873 +    // If no forced <cr>, put one in front of dump
  1.1874 +    if (!scoringcontext.flags_cld2_cr) {fprintf(stderr, "<br>\n");}
  1.1875 +    doc_tote.Dump(stderr);
  1.1876 +  }
  1.1877 +
  1.1878 +
  1.1879 +  // If extended langauges are disallowed, remove them here
  1.1880 +  if (!allow_extended_lang) {
  1.1881 +    RemoveExtendedLanguages(&doc_tote);
  1.1882 +  }
  1.1883 +
  1.1884 +  // Force close pairs to one or the other
  1.1885 +  // If given, also update resultchunkvector
  1.1886 +  RefineScoredClosePairs(&doc_tote, resultchunkvector,
  1.1887 +                         FLAGS_cld2_html, FLAGS_cld2_quiet);
  1.1888 +
  1.1889 +
  1.1890 +  // Calculate return results
  1.1891 +  // Find top three byte counts in tote heap
  1.1892 +  int reliable_percent3[3];
  1.1893 +
  1.1894 +  // Cannot use Add, etc. after sorting
  1.1895 +  doc_tote.Sort(3);
  1.1896 +
  1.1897 +  ExtractLangEtc(&doc_tote, total_text_bytes,
  1.1898 +                 reliable_percent3, language3, percent3, normalized_score3,
  1.1899 +                 text_bytes, is_reliable);
  1.1900 +
  1.1901 +  bool have_good_answer = false;
  1.1902 +  if (FlagFinish(flags)) {
  1.1903 +    // Force a result
  1.1904 +    have_good_answer = true;
  1.1905 +  } else if (total_text_bytes <= kShortTextThresh) {
  1.1906 +    // Don't recurse on short text -- we already did word scores
  1.1907 +    have_good_answer = true;
  1.1908 +  } else if (*is_reliable &&
  1.1909 +             (percent3[0] >= kGoodLang1Percent)) {
  1.1910 +    have_good_answer = true;
  1.1911 +  } else if (*is_reliable &&
  1.1912 +             ((percent3[0] + percent3[1]) >= kGoodLang1and2Percent)) {
  1.1913 +    have_good_answer = true;
  1.1914 +  }
  1.1915 +
  1.1916 +
  1.1917 +  if (have_good_answer) {
  1.1918 +    // This is the real, non-recursive return
  1.1919 +
  1.1920 +    // Move bytes for unreliable langs to another lang or UNKNOWN
  1.1921 +    RemoveUnreliableLanguages(&doc_tote, FLAGS_cld2_html, FLAGS_cld2_quiet);
  1.1922 +
  1.1923 +    // Redo the result extraction after the removal above
  1.1924 +    doc_tote.Sort(3);
  1.1925 +    ExtractLangEtc(&doc_tote, total_text_bytes,
  1.1926 +                   reliable_percent3, language3, percent3, normalized_score3,
  1.1927 +                   text_bytes, is_reliable);
  1.1928 +
  1.1929 +
  1.1930 +
  1.1931 +    Language summary_lang;
  1.1932 +    CalcSummaryLang(&doc_tote, total_text_bytes,
  1.1933 +                    reliable_percent3, language3, percent3,
  1.1934 +                    &summary_lang, is_reliable,
  1.1935 +                    FLAGS_cld2_html, FLAGS_cld2_quiet);
  1.1936 +
  1.1937 +    if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
  1.1938 +      for (int i = 0; i < 3; ++i) {
  1.1939 +        if (language3[i] != UNKNOWN_LANGUAGE) {
  1.1940 +          fprintf(stderr, "%s.%dR(%d%%) ",
  1.1941 +                  LanguageCode(language3[i]),
  1.1942 +                  reliable_percent3[i],
  1.1943 +                  percent3[i]);
  1.1944 +        }
  1.1945 +      }
  1.1946 +
  1.1947 +      fprintf(stderr, "%d bytes ", total_text_bytes);
  1.1948 +      fprintf(stderr, "= %s%c ",
  1.1949 +              LanguageName(summary_lang), *is_reliable ? ' ' : '*');
  1.1950 +      fprintf(stderr, "<br><br>\n");
  1.1951 +    }
  1.1952 +
  1.1953 +    // Slightly condensed if quiet
  1.1954 +    if (FLAGS_cld2_html && FLAGS_cld2_quiet) {
  1.1955 +      fprintf(stderr, "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; ");
  1.1956 +      for (int i = 0; i < 3; ++i) {
  1.1957 +        if (language3[i] != UNKNOWN_LANGUAGE) {
  1.1958 +          fprintf(stderr, "&nbsp;&nbsp;%s %d%% ",
  1.1959 +                  LanguageCode(language3[i]),
  1.1960 +                  percent3[i]);
  1.1961 +        }
  1.1962 +      }
  1.1963 +      fprintf(stderr, "= %s%c ",
  1.1964 +              LanguageName(summary_lang), *is_reliable ? ' ' : '*');
  1.1965 +      fprintf(stderr, "<br>\n");
  1.1966 +    }
  1.1967 +
  1.1968 +    return summary_lang;
  1.1969 +  }
  1.1970 +
  1.1971 +  // Not a good answer -- do recursive call to refine
  1.1972 +  if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) {
  1.1973 +    // This is what we hope to improve on in the recursive call, if any
  1.1974 +    PrintLangs(stderr, language3, percent3, text_bytes, is_reliable);
  1.1975 +  }
  1.1976 +
  1.1977 +  // For restriction to Top40 + one, the one is 1st/2nd lang that is not Top40
  1.1978 +  // For this purpose, we treate "Ignore" as top40
  1.1979 +  Language new_plus_one = UNKNOWN_LANGUAGE;
  1.1980 +
  1.1981 +  if (total_text_bytes < kShortTextThresh) {
  1.1982 +      // Short text: Recursive call with top40 and short set
  1.1983 +      if (FLAGS_cld2_html || FLAGS_dbgscore) {
  1.1984 +        fprintf(stderr, "&nbsp;&nbsp;---text_bytes[%d] "
  1.1985 +                "Recursive(Top40/Rep/Short/Words)---<br><br>\n",
  1.1986 +                total_text_bytes);
  1.1987 +      }
  1.1988 +      return DetectLanguageSummaryV2(
  1.1989 +                        buffer,
  1.1990 +                        buffer_length,
  1.1991 +                        is_plain_text,
  1.1992 +                        cld_hints,
  1.1993 +                        allow_extended_lang,
  1.1994 +                        flags | kCLDFlagTop40 | kCLDFlagRepeats |
  1.1995 +                          kCLDFlagShort | kCLDFlagUseWords | kCLDFlagFinish,
  1.1996 +                        new_plus_one,
  1.1997 +                        language3,
  1.1998 +                        percent3,
  1.1999 +                        normalized_score3,
  1.2000 +                        resultchunkvector,
  1.2001 +                        text_bytes,
  1.2002 +                        is_reliable);
  1.2003 +  }
  1.2004 +
  1.2005 +  // Longer text: Recursive call with top40 set
  1.2006 +  if (FLAGS_cld2_html || FLAGS_dbgscore) {
  1.2007 +    fprintf(stderr,
  1.2008 +            "&nbsp;&nbsp;---text_bytes[%d] Recursive(Top40/Rep)---<br><br>\n",
  1.2009 +            total_text_bytes);
  1.2010 +  }
  1.2011 +  return DetectLanguageSummaryV2(
  1.2012 +                        buffer,
  1.2013 +                        buffer_length,
  1.2014 +                        is_plain_text,
  1.2015 +                        cld_hints,
  1.2016 +                        allow_extended_lang,
  1.2017 +                        flags | kCLDFlagTop40 | kCLDFlagRepeats |
  1.2018 +                          kCLDFlagFinish,
  1.2019 +                        new_plus_one,
  1.2020 +                        language3,
  1.2021 +                        percent3,
  1.2022 +                        normalized_score3,
  1.2023 +                        resultchunkvector,
  1.2024 +                        text_bytes,
  1.2025 +                        is_reliable);
  1.2026 +}
  1.2027 +
  1.2028 +
  1.2029 +// For debugging and wrappers. Not thread safe: each call rewrites this buffer.
  1.2030 +static char temp_detectlanguageversion[32];
  1.2031 +
  1.2032 +// Return version text string "code_version - data_build_date",
  1.2033 +// or "" if no quadgram table is loaded. Write is bounded by the buffer size.
  1.2034 +const char* DetectLanguageVersion() {
  1.2035 +  if (kScoringtables.quadgram_obj == NULL) {return "";}
  1.2036 +  snprintf(temp_detectlanguageversion, sizeof(temp_detectlanguageversion),
  1.2037 +           "V2.0 - %u", kScoringtables.quadgram_obj->kCLDTableBuildDate);
  1.2038 +  return temp_detectlanguageversion;
  1.2039 +}
  1.2040 +
  1.2041 +
  1.2042 +}       // End namespace CLD2

mercurial