The Tor Browser: browser/components/translation/cld2/internal/compact_lang_det

Cloned from the upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 // Copyright 2013 Google Inc. All Rights Reserved.

     2 //

     3 // Licensed under the Apache License, Version 2.0 (the "License");

     4 // you may not use this file except in compliance with the License.

     5 // You may obtain a copy of the License at

     6 //

     7 //     http://www.apache.org/licenses/LICENSE-2.0

     8 //

     9 // Unless required by applicable law or agreed to in writing, software

    10 // distributed under the License is distributed on an "AS IS" BASIS,

    11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

    12 // See the License for the specific language governing permissions and

    13 // limitations under the License.

    15 //

    16 // Author: dsites@google.com (Dick Sites)

    17 // Updated 2014.01 for dual table lookup

    18 //

    20 #include <stdio.h>

    21 #include <string.h>

    22 #include <string>

    23 #include <vector>

    25 #include "cldutil.h"

    26 #include "debug.h"

    27 #include "integral_types.h"

    28 #include "lang_script.h"

    29 #include "utf8statetable.h"

    31 #ifdef CLD2_DYNAMIC_MODE

    32 #include "cld2_dynamic_data.h"

    33 #include "cld2_dynamic_data_loader.h"

    34 #endif

    35 #include "cld2tablesummary.h"

    36 #include "compact_lang_det_impl.h"

    37 #include "compact_lang_det_hint_code.h"

    38 #include "getonescriptspan.h"

    39 #include "tote.h"

    42 namespace CLD2 {

    44 using namespace std;

    46 // Linker supplies the right tables, From files

    47 // cld_generated_cjk_uni_prop_80.cc  cld2_generated_cjk_compatible.cc

    48 // cld_generated_cjk_delta_bi_32.cc  generated_distinct_bi_0.cc

    49 // cld2_generated_quad*.cc  cld2_generated_deltaocta*.cc

    50 // cld2_generated_distinctocta*.cc

    51 // cld_generated_score_quad_octa_1024_256.cc

    53 // 2014.01 Now implementing quadgram dual lookup tables, to allow main table

    54 //   sizes that are 1/3/5 times a power of two, instead of just powers of two.

    55 //   Gives more flexibility of total footprint for CLD2.

    57 extern const int kLanguageToPLangSize;

    58 extern const int kCloseSetSize;

    60 extern const UTF8PropObj cld_generated_CjkUni_obj;

    61 extern const CLD2TableSummary kCjkCompat_obj;

    62 extern const CLD2TableSummary kCjkDeltaBi_obj;

    63 extern const CLD2TableSummary kDistinctBiTable_obj;

    64 extern const CLD2TableSummary kQuad_obj;

    65 extern const CLD2TableSummary kQuad_obj2;     // Dual lookup tables

    66 extern const CLD2TableSummary kDeltaOcta_obj;

    67 extern const CLD2TableSummary kDistinctOcta_obj;

    68 extern const short kAvgDeltaOctaScore[];

    70 #ifdef CLD2_DYNAMIC_MODE

    71   // CLD2_DYNAMIC_MODE is defined:

    72   // Data will be read from an mmap opened at runtime.

    73   static ScoringTables kScoringtables = {

    74     NULL, //&cld_generated_CjkUni_obj,

    75     NULL, //&kCjkCompat_obj,

    76     NULL, //&kCjkDeltaBi_obj,

    77     NULL, //&kDistinctBiTable_obj,

    78     NULL, //&kQuad_obj,

    79     NULL, //&kQuad_obj2,

    80     NULL, //&kDeltaOcta_obj,

    81     NULL, //&kDistinctOcta_obj,

    82     NULL, //kAvgDeltaOctaScore,

    83   };

    84   static bool dynamicDataLoaded = false;

    85   static ScoringTables* dynamicTables = NULL;

    86   static void* mmapAddress = NULL;

    87   static int mmapLength = 0;

    89   bool isDataLoaded() { return dynamicDataLoaded; }

    91   void loadData(const char* fileName) {

    92     if (isDataLoaded()) {

    93       unloadData();

    94     }

    95     dynamicTables = CLD2DynamicDataLoader::loadDataFile(fileName, &mmapAddress, &mmapLength);

    96     kScoringtables = *dynamicTables;

    97     dynamicDataLoaded = true;

    98   };

   100   void unloadData() {

   101     if (!dynamicDataLoaded) return;

   102     dynamicDataLoaded = false;

   103     // unloading will null all the pointers out.

   104     CLD2DynamicDataLoader::unloadData(&dynamicTables, &mmapAddress, &mmapLength);

   105   }

   106 #else

   107   // This initializes kScoringtables.quadgram_obj etc.

   108   static const ScoringTables kScoringtables = {

   109     &cld_generated_CjkUni_obj,

   110     &kCjkCompat_obj,

   111     &kCjkDeltaBi_obj,

   112     &kDistinctBiTable_obj,

   114     &kQuad_obj,

   115     &kQuad_obj2,                              // Dual lookup tables

   116     &kDeltaOcta_obj,

   117     &kDistinctOcta_obj,

   119     kAvgDeltaOctaScore,

   120   };

   121 #endif // #ifdef CLD2_DYNAMIC_MODE

// Build-time settings. Every FLAGS_* value below is a static const, so it is
// fixed when this file is compiled.
// NOTE(review): the FLAGS_ prefix suggests these were once runtime flags --
// confirm before relying on that.
   124 static const bool FLAGS_cld_no_minimum_bytes = false;

   125 static const bool FLAGS_cld_forcewords = true;

// When true, CheapRepWordsInplace marks each deleted word with ". ".
   126 static const bool FLAGS_cld_showme = false;

   127 static const bool FLAGS_cld_echotext = true;

   128 static const int32 FLAGS_cld_textlimit = 160;

   129 static const int32 FLAGS_cld_smoothwidth = 20;

   130 static const bool FLAGS_cld_2011_hints = true;

   131 static const int32 FLAGS_cld_max_lang_tag_scan_kb = 8;

   133 static const bool FLAGS_dbgscore = false;

// Tuning constants. Units and meaning are given by the trailing comments.
   136 static const int kLangHintInitial = 12;  // Boost language by N initially

   137 static const int kLangHintBoost = 12;    // Boost language by N/16 per quadgram

   139 static const int kShortSpanThresh = 32;       // Bytes

   140 static const int kMaxSecondChanceLen = 1024;  // Look at first 1K of short spans

   142 static const int kCheapSqueezeTestThresh = 4096;  // Only look for squeezing

   143                                                   // after this many text bytes

   144 static const int kCheapSqueezeTestLen = 256;  // Bytes to test to trigger sqz

   145 static const int kSpacesTriggerPercent = 25;  // Trigger sqz if >=25% spaces

   146 static const int kPredictTriggerPercent = 67; // Trigger sqz if >=67% predicted

   148 static const int kChunksizeDefault = 48;      // Squeeze 48-byte chunks

   149 static const int kSpacesThreshPercent = 25;   // Squeeze if >=25% spaces

   150 static const int kPredictThreshPercent = 40;  // Squeeze if >=40% predicted

// Upper bound on how many bytes BackscanToSpace and ForwardscanToSpace examine.
   152 static const int kMaxSpaceScan = 32;          // Bytes

   154 static const int kGoodLang1Percent = 70;

   155 static const int kGoodLang1and2Percent = 93;

   156 static const int kShortTextThresh = 256;      // Bytes

   158 static const int kMinChunkSizeQuads = 4;      // Chunk is at least four quads

   159 static const int kMaxChunkSizeQuads = 1024;   // Chunk is at most 1K quads

   161 static const int kDefaultWordSpan = 256;      // Scan at least this many initial

   162                                               // bytes with word scoring

   163 static const int kReallyBigWordSpan = 9999999;  // Forces word scoring all text

   165 static const int kMinReliableSeq = 50;      // Record in seq if >= 50% reliable

// The predictor functions in this file mask their hash with 0xfff, i.e. they
// index a table of exactly this many entries.
   167 static const int kPredictionTableSize = 4096;   // Must be exactly 4096 for

   168                                                 // cheap compressor

   170 static const int kNonEnBoilerplateMinPercent = 17;    // <this => no second

   171 static const int kNonFIGSBoilerplateMinPercent = 20;  // <this => no second

   172 static const int kGoodFirstMinPercent = 26;           // <this => UNK

   173 static const int kGoodFirstReliableMinPercent = 51;   // <this => unreli

   174 static const int kIgnoreMaxPercent = 20;              // >this => unreli

   175 static const int kKeepMinPercent = 2;                 // <this => unreli

   179 // Statistically closest language, based on quadgram table

   180 // Those that are far from other languges map to UNKNOWN_LANGUAGE

   181 // Subscripted by Language

   182 //

   183 // From lang_correlation.txt and hand-edits

   184 // sed 's/^\([^ ]*\) \([^ ]*\) coef=0\.\(..\).*$/

   185 //   (\3 >= kMinCorrPercent) ? \2 : UNKNOWN_LANGUAGE,

   186 //   \/\/ \1/' lang_correlation.txt >/tmp/closest_lang_decl.txt

   187 //

   188 static const int kMinCorrPercent = 24;        // Pick off how close you want

   189                                               // 24 catches PERSIAN <== ARABIC

   190                                               // but not SPANISH <== PORTUGESE

   191 static Language Unknown = UNKNOWN_LANGUAGE;

   193 // Suspect idea

   194 // Subscripted by Language

   195 static const Language kClosestAltLanguage[] = {

   196   (28 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE,  // ENGLISH

   197   (36 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // DANISH

   198   (31 >= kMinCorrPercent) ? AFRIKAANS : UNKNOWN_LANGUAGE,  // DUTCH

   199   (15 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE,  // FINNISH

   200   (11 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // FRENCH

   201   (17 >= kMinCorrPercent) ? LUXEMBOURGISH : UNKNOWN_LANGUAGE,  // GERMAN

   202   (27 >= kMinCorrPercent) ? YIDDISH : UNKNOWN_LANGUAGE,  // HEBREW

   203   (16 >= kMinCorrPercent) ? CORSICAN : UNKNOWN_LANGUAGE,  // ITALIAN

   204   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Japanese

   205   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Korean

   206   (41 >= kMinCorrPercent) ? NORWEGIAN_N : UNKNOWN_LANGUAGE,  // NORWEGIAN

   207   ( 5 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // POLISH

   208   (23 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // PORTUGUESE

   209   (33 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE,  // RUSSIAN

   210   (28 >= kMinCorrPercent) ? GALICIAN : UNKNOWN_LANGUAGE,  // SPANISH

   211   (17 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // SWEDISH

   212   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Chinese

   213   (42 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // CZECH

   214   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GREEK

   215   (35 >= kMinCorrPercent) ? FAROESE : UNKNOWN_LANGUAGE,  // ICELANDIC

   216   ( 7 >= kMinCorrPercent) ? LITHUANIAN : UNKNOWN_LANGUAGE,  // LATVIAN

   217   ( 7 >= kMinCorrPercent) ? LATVIAN : UNKNOWN_LANGUAGE,  // LITHUANIAN

   218   ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // ROMANIAN

   219   ( 4 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // HUNGARIAN

   220   (15 >= kMinCorrPercent) ? FINNISH : UNKNOWN_LANGUAGE,  // ESTONIAN

   221   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Ignore

   222   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Unknown

   223   (33 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // BULGARIAN

   224   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CROATIAN

   225   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SERBIAN

   226   (24 >= kMinCorrPercent) ? SCOTS_GAELIC : UNKNOWN_LANGUAGE,  // IRISH

   227   (28 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // GALICIAN

   228   ( 8 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE,  // TAGALOG

   229   (29 >= kMinCorrPercent) ? AZERBAIJANI : UNKNOWN_LANGUAGE,  // TURKISH

   230   (28 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // UKRAINIAN

   231   (37 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE,  // HINDI

   232   (29 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE,  // MACEDONIAN

   233   (14 >= kMinCorrPercent) ? ASSAMESE : UNKNOWN_LANGUAGE,  // BENGALI

   234   (46 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE,  // INDONESIAN

   235   ( 9 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE,  // LATIN

   236   (46 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE,  // MALAY

   237   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MALAYALAM

   238   ( 4 >= kMinCorrPercent) ? BRETON : UNKNOWN_LANGUAGE,  // WELSH

   239   ( 8 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // NEPALI

   240   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // TELUGU

   241   ( 3 >= kMinCorrPercent) ? ESPERANTO : UNKNOWN_LANGUAGE,  // ALBANIAN

   242   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // TAMIL

   243   (22 >= kMinCorrPercent) ? UKRAINIAN : UNKNOWN_LANGUAGE,  // BELARUSIAN

   244   (15 >= kMinCorrPercent) ? SUNDANESE : UNKNOWN_LANGUAGE,  // JAVANESE

   245   (19 >= kMinCorrPercent) ? CATALAN : UNKNOWN_LANGUAGE,  // OCCITAN

   246   (27 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // URDU

   247   (36 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // BIHARI

   248   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GUJARATI

   249   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // THAI

   250   (24 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // ARABIC

   251   (19 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // CATALAN

   252   ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // ESPERANTO

   253   ( 3 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // BASQUE

   254   ( 9 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // INTERLINGUA

   255   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // KANNADA

   256   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PUNJABI

   257   (24 >= kMinCorrPercent) ? IRISH : UNKNOWN_LANGUAGE,  // SCOTS_GAELIC

   258   ( 7 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // SWAHILI

   259   (28 >= kMinCorrPercent) ? SERBIAN : UNKNOWN_LANGUAGE,  // SLOVENIAN

   260   (37 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // MARATHI

   261   ( 3 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // MALTESE

   262   ( 1 >= kMinCorrPercent) ? YORUBA : UNKNOWN_LANGUAGE,  // VIETNAMESE

   263   (15 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // FRISIAN

   264   (42 >= kMinCorrPercent) ? CZECH : UNKNOWN_LANGUAGE,  // SLOVAK

   265   // Original ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ChineseT

   266   (24 >= kMinCorrPercent) ? CHINESE : UNKNOWN_LANGUAGE,  // ChineseT

   267   (35 >= kMinCorrPercent) ? ICELANDIC : UNKNOWN_LANGUAGE,  // FAROESE

   268   (15 >= kMinCorrPercent) ? JAVANESE : UNKNOWN_LANGUAGE,  // SUNDANESE

   269   (17 >= kMinCorrPercent) ? TAJIK : UNKNOWN_LANGUAGE,  // UZBEK

   270   ( 7 >= kMinCorrPercent) ? TIGRINYA : UNKNOWN_LANGUAGE,  // AMHARIC

   271   (29 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE,  // AZERBAIJANI

   272   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GEORGIAN

   273   ( 7 >= kMinCorrPercent) ? AMHARIC : UNKNOWN_LANGUAGE,  // TIGRINYA

   274   (27 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE,  // PERSIAN

   275   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // BOSNIAN

   276   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SINHALESE

   277   (41 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // NORWEGIAN_N

   278   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PORTUGUESE_P

   279   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PORTUGUESE_B

   280   (37 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE,  // XHOSA

   281   (37 >= kMinCorrPercent) ? XHOSA : UNKNOWN_LANGUAGE,  // ZULU

   282   ( 2 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // GUARANI

   283   (29 >= kMinCorrPercent) ? TSWANA : UNKNOWN_LANGUAGE,  // SESOTHO

   284   ( 7 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE,  // TURKMEN

   285   ( 8 >= kMinCorrPercent) ? KAZAKH : UNKNOWN_LANGUAGE,  // KYRGYZ

   286   ( 5 >= kMinCorrPercent) ? FRENCH : UNKNOWN_LANGUAGE,  // BRETON

   287   ( 3 >= kMinCorrPercent) ? GANDA : UNKNOWN_LANGUAGE,  // TWI

   288   (27 >= kMinCorrPercent) ? HEBREW : UNKNOWN_LANGUAGE,  // YIDDISH

   289   (28 >= kMinCorrPercent) ? SLOVENIAN : UNKNOWN_LANGUAGE,  // SERBO_CROATIAN

   290   (12 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE,  // SOMALI

   291   ( 9 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE,  // UIGHUR

   292   (15 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // KURDISH

   293   ( 6 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE,  // MONGOLIAN

   294   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ARMENIAN

   295   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // LAOTHIAN

   296   ( 8 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE,  // SINDHI

   297   (10 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // RHAETO_ROMANCE

   298   (31 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // AFRIKAANS

   299   (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // LUXEMBOURGISH

   300   ( 2 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE,  // BURMESE

   301   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // KHMER

   302   (45 >= kMinCorrPercent) ? DZONGKHA : UNKNOWN_LANGUAGE,  // TIBETAN

   303   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // DHIVEHI

   304   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CHEROKEE

   305   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SYRIAC

   306   ( 8 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // LIMBU

   307   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ORIYA

   308   (14 >= kMinCorrPercent) ? BENGALI : UNKNOWN_LANGUAGE,  // ASSAMESE

   309   (16 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // CORSICAN

   310   ( 5 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE,  // INTERLINGUE

   311   ( 8 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE,  // KAZAKH

   312   ( 4 >= kMinCorrPercent) ? SWAHILI : UNKNOWN_LANGUAGE,  // LINGALA

   313   (11 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // MOLDAVIAN

   314   (19 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // PASHTO

   315   ( 5 >= kMinCorrPercent) ? AYMARA : UNKNOWN_LANGUAGE,  // QUECHUA

   316   ( 5 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // SHONA

   317   (17 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE,  // TAJIK

   318   (13 >= kMinCorrPercent) ? BASHKIR : UNKNOWN_LANGUAGE,  // TATAR

   319   (11 >= kMinCorrPercent) ? SAMOAN : UNKNOWN_LANGUAGE,  // TONGA

   320   ( 2 >= kMinCorrPercent) ? TWI : UNKNOWN_LANGUAGE,  // YORUBA

   321   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_ENGLISH_BASED

   322   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_FRENCH_BASED

   323   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_PORTUGUESE_BASED

   324   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_OTHER

   325   ( 6 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // MAORI

   326   ( 3 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE,  // WOLOF

   327   ( 1 >= kMinCorrPercent) ? MONGOLIAN : UNKNOWN_LANGUAGE,  // ABKHAZIAN

   328   ( 8 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE,  // AFAR

   329   ( 5 >= kMinCorrPercent) ? QUECHUA : UNKNOWN_LANGUAGE,  // AYMARA

   330   (13 >= kMinCorrPercent) ? TATAR : UNKNOWN_LANGUAGE,  // BASHKIR

   331   ( 3 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // BISLAMA

   332   (45 >= kMinCorrPercent) ? TIBETAN : UNKNOWN_LANGUAGE,  // DZONGKHA

   333   ( 4 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // FIJIAN

   334   ( 7 >= kMinCorrPercent) ? INUPIAK : UNKNOWN_LANGUAGE,  // GREENLANDIC

   335   ( 3 >= kMinCorrPercent) ? AFAR : UNKNOWN_LANGUAGE,  // HAUSA

   336   ( 3 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // HAITIAN_CREOLE

   337   ( 7 >= kMinCorrPercent) ? GREENLANDIC : UNKNOWN_LANGUAGE,  // INUPIAK

   338   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // INUKTITUT

   339   ( 4 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // KASHMIRI

   340   (30 >= kMinCorrPercent) ? RUNDI : UNKNOWN_LANGUAGE,  // KINYARWANDA

   341   ( 2 >= kMinCorrPercent) ? TAGALOG : UNKNOWN_LANGUAGE,  // MALAGASY

   342   (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // NAURU

   343   (12 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE,  // OROMO

   344   (30 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // RUNDI

   345   (11 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // SAMOAN

   346   ( 1 >= kMinCorrPercent) ? LINGALA : UNKNOWN_LANGUAGE,  // SANGO

   347   (32 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE,  // SANSKRIT

   348   (16 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE,  // SISWANT

   349   ( 5 >= kMinCorrPercent) ? SISWANT : UNKNOWN_LANGUAGE,  // TSONGA

   350   (29 >= kMinCorrPercent) ? SESOTHO : UNKNOWN_LANGUAGE,  // TSWANA

   351   ( 2 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE,  // VOLAPUK

   352   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ZHUANG

   353   ( 1 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE,  // KHASI

   354   (28 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // SCOTS

   355   (15 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // GANDA

   356   ( 7 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // MANX

   357   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MONTENEGRIN

   359   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // AKAN

   360   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // IGBO

   361   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MAURITIAN_CREOLE

   362   ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // HAWAIIAN

   363 };

   365 // COMPILE_ASSERT(arraysize(kClosestAltLanguage) == NUM_LANGUAGES,

   366 //                kClosestAltLanguage_has_incorrect_size);

   369 inline bool FlagFinish(int flags) {return (flags & kCLDFlagFinish) != 0;}

   370 inline bool FlagSqueeze(int flags) {return (flags & kCLDFlagSqueeze) != 0;}

   371 inline bool FlagRepeats(int flags) {return (flags & kCLDFlagRepeats) != 0;}

   372 inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;}

   373 inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;}

   374 inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;}

   375 inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;}

   378   // Defines Top40 packed languages

   380   // Google top 40 languages

   381   //

   382   // Tier 0/1 Language enum list (16)

   383   //   ENGLISH, /*no en_GB,*/ FRENCH, ITALIAN, GERMAN, SPANISH,    // E - FIGS

   384   //   DUTCH, CHINESE, CHINESE_T, JAPANESE, KOREAN,

   385   //   PORTUGUESE, RUSSIAN, POLISH, TURKISH, THAI,

   386   //   ARABIC,

   387   //

   388   // Tier 2 Language enum list (22)

   389   //   SWEDISH, FINNISH, DANISH, /*no pt-PT,*/ ROMANIAN, HUNGARIAN,

   390   //   HEBREW, INDONESIAN, CZECH, GREEK, NORWEGIAN,

   391   //   VIETNAMESE, BULGARIAN, CROATIAN, LITHUANIAN, SLOVAK,

   392   //   TAGALOG, SLOVENIAN, SERBIAN, CATALAN, LATVIAN,

   393   //   UKRAINIAN, HINDI,

   394   //

   395   //   use SERBO_CROATIAN instead of BOSNIAN, SERBIAN, CROATIAN, MONTENEGRIN(21)

   396   //

   397   // Include IgnoreMe (TG_UNKNOWN_LANGUAGE, 25+1) as a top 40

   400 void DemoteNotTop40(Tote* chunk_tote, uint16 psplus_one) {

   401   // REVISIT

   402 }

   404 void PrintText(FILE* f, Language cur_lang, const string& temp) {

   405   if (temp.size() == 0) {return;}

   406   fprintf(f, "PrintText[%s]%s<br>\n", LanguageName(cur_lang), temp.c_str());

   407 }

   410 //------------------------------------------------------------------------------

   411 // For --cld_html debugging output. Not thread safe

   412 //------------------------------------------------------------------------------

// Mutable file-scope state, initialised to UNKNOWN_LANGUAGE and false.
// NOTE(review): nothing in the visible part of this file reads or writes
// these; presumably the debug-output code further down does -- confirm.
   413 static Language prior_lang = UNKNOWN_LANGUAGE;

   414 static bool prior_unreliable = false;

   416 //------------------------------------------------------------------------------

   417 // End For --cld_html debugging output

   418 //------------------------------------------------------------------------------

   421 // Backscan to word boundary, returning how many bytes n to go back

   422 // so that src - n is non-space ans src - n - 1 is space.

   423 // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary

   424 int BackscanToSpace(const char* src, int limit) {

   425   int n = 0;

   426   limit = minint(limit, kMaxSpaceScan);

   427   while (n < limit) {

   428     if (src[-n - 1] == ' ') {return n;}    // We are at _X

   429     ++n;

   430   }

   431   n = 0;

   432   while (n < limit) {

   433     if ((src[-n] & 0xc0) != 0x80) {return n;}    // We are at char begin

   434     ++n;

   435   }

   436   return 0;

   437 }

   439 // Forwardscan to word boundary, returning how many bytes n to go forward

   440 // so that src + n is non-space ans src + n - 1 is space.

   441 // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary

   442 int ForwardscanToSpace(const char* src, int limit) {

   443   int n = 0;

   444   limit = minint(limit, kMaxSpaceScan);

   445   while (n < limit) {

   446     if (src[n] == ' ') {return n + 1;}    // We are at _X

   447     ++n;

   448   }

   449   n = 0;

   450   while (n < limit) {

   451     if ((src[n] & 0xc0) != 0x80) {return n;}    // We are at char begin

   452     ++n;

   453   }

   454   return 0;

   455 }

   458 // This uses a cheap predictor to get a measure of compression, and

   459 // hence a measure of repetitiveness. It works on complete UTF-8 characters

   460 // instead of bytes, because three-byte UTF-8 Indic, etc. text compress highly

   461 // all the time when done with a byte-based count. Sigh.

   462 //

   463 // To allow running prediction across multiple chunks, caller passes in current

   464 // 12-bit hash value and int[4096] prediction table. Caller inits these to 0.

   465 //

   466 // Returns the number of *bytes* correctly predicted, increments by 1..4 for

   467 // each correctly-predicted character.

   468 //

   469 // NOTE: Overruns by up to three bytes. Not a problem with valid UTF-8 text

   470 //

   472 // TODO(dsites) make this use just one byte per UTF-8 char and incr by charlen

   474 int CountPredictedBytes(const char* isrc, int src_len, int* hash, int* tbl) {

   475   int p_count = 0;

   476   const uint8* src = reinterpret_cast<const uint8*>(isrc);

   477   const uint8* srclimit = src + src_len;

   478   int local_hash = *hash;

   480   while (src < srclimit) {

   481     int c = src[0];

   482     int incr = 1;

   484     // Pick up one char and length

   485     if (c < 0xc0) {

   486       // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx

   487       // Do nothing more

   488     } else if ((c & 0xe0) == 0xc0) {

   489       // Two-byte

   490       c = (c << 8) | src[1];

   491       incr = 2;

   492     } else if ((c & 0xf0) == 0xe0) {

   493       // Three-byte

   494       c = (c << 16) | (src[1] << 8) | src[2];

   495       incr = 3;

   496     } else {

   497       // Four-byte

   498       c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];

   499       incr = 4;

   500     }

   501     src += incr;

   503     int p = tbl[local_hash];            // Prediction

   504     tbl[local_hash] = c;                // Update prediction

   505     if (c == p) {

   506       p_count += incr;                  // Count bytes of good predictions

   507     }

   509     local_hash = ((local_hash << 4) ^ c) & 0xfff;

   510   }

   511   *hash = local_hash;

   512   return p_count;

   513 }

   517 // Counts number of spaces; a little faster than one-at-a-time

   518 // Doesn't count odd bytes at end

   519 int CountSpaces4(const char* src, int src_len) {

   520   int s_count = 0;

   521   for (int i = 0; i < (src_len & ~3); i += 4) {

   522     s_count += (src[i] == ' ');

   523     s_count += (src[i+1] == ' ');

   524     s_count += (src[i+2] == ' ');

   525     s_count += (src[i+3] == ' ');

   526   }

   527   return s_count;

   528 }

   531 // Remove words of text that have more than half their letters predicted

   532 // correctly by our cheap predictor, moving the remaining words in-place

   533 // to the front of the input buffer.

   534 //

   535 // To allow running prediction across multiple chunks, caller passes in current

   536 // 12-bit hash value and int[4096] prediction table. Caller inits these to 0.

   537 //

   538 // Return the new, possibly-shorter length

   539 //

   540 // Result Buffer ALWAYS has leading space and trailing space space space NUL,

   541 // if input does

   542 //

   543 int CheapRepWordsInplace(char* isrc, int src_len, int* hash, int* tbl) {

   544   const uint8* src = reinterpret_cast<const uint8*>(isrc);

   545   const uint8* srclimit = src + src_len;

   546   char* dst = isrc;

   547   int local_hash = *hash;

   548   char* word_dst = dst;           // Start of next word

   549   int good_predict_bytes = 0;

   550   int word_length_bytes = 0;

   552   while (src < srclimit) {

   553     int c = src[0];

   554     int incr = 1;

   555     *dst++ = c;

   557     if (c == ' ') {

   558       if ((good_predict_bytes * 2) > word_length_bytes) {

   559         // Word is well-predicted: backup to start of this word

   560         dst = word_dst;

   561         if (FLAGS_cld_showme) {

   562           // Mark the deletion point with period

   563           // Don't repeat multiple periods

   564           // Cannot mark with more bytes or may overwrite unseen input

   565           if ((isrc < (dst - 2)) && (dst[-2] != '.')) {

   566             *dst++ = '.';

   567             *dst++ = ' ';

   568           }

   569         }

   570       }

   571       word_dst = dst;              // Start of next word

   572       good_predict_bytes = 0;

   573       word_length_bytes = 0;

   574     }

   576     // Pick up one char and length

   577     if (c < 0xc0) {

   578       // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx

   579       // Do nothing more

   580     } else if ((c & 0xe0) == 0xc0) {

   581       // Two-byte

   582       *dst++ = src[1];

   583       c = (c << 8) | src[1];

   584       incr = 2;

   585     } else if ((c & 0xf0) == 0xe0) {

   586       // Three-byte

   587       *dst++ = src[1];

   588       *dst++ = src[2];

   589       c = (c << 16) | (src[1] << 8) | src[2];

   590       incr = 3;

   591     } else {

   592       // Four-byte

   593       *dst++ = src[1];

   594       *dst++ = src[2];

   595       *dst++ = src[3];

   596       c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];

   597       incr = 4;

   598     }

   599     src += incr;

   600     word_length_bytes += incr;

   602     int p = tbl[local_hash];            // Prediction

   603     tbl[local_hash] = c;                // Update prediction

   604     if (c == p) {

   605       good_predict_bytes += incr;       // Count good predictions

   606     }

   608     local_hash = ((local_hash << 4) ^ c) & 0xfff;

   609   }

   611   *hash = local_hash;

   613   if ((dst - isrc) < (src_len - 3)) {

   614     // Pad and make last char clean UTF-8 by putting following spaces

   615     dst[0] = ' ';

   616     dst[1] = ' ';

   617     dst[2] = ' ';

   618     dst[3] = '\0';

   619   } else  if ((dst - isrc) < src_len) {

   620     // Make last char clean UTF-8 by putting following space off the end

   621     dst[0] = ' ';

   622   }

   624   return static_cast<int>(dst - isrc);

   625 }

   // This alternate form overwrites redundant words with '.' instead of removing
   // them, so the text keeps its length; this avoids corrupting the backmap used
   // to generate a vector of original-text ranges.
   //
   // A word is the run of bytes between two spaces. Every character is compared
   // with the value tbl[] predicts for the current hash; when a space is reached
   // and more than half of the bytes counted since the previous space (that space
   // included) were predicted, the word's bytes are replaced by '.'.
   //
   //   isrc, src_len  Text to scan; modified in place.
   //   hash           Running hash: read on entry, updated value stored on return.
   //   tbl            Prediction table, read and updated; indexed by the hash,
   //                  which is kept in the range 0..0xfff after each character.
   //
   // Returns the number of bytes scanned.
   //
   // Multi-byte characters are packed with unsigned arithmetic (a lead byte
   // shifted left by 24 does not fit in a signed int), and a character cut off by
   // the end of the text is limited to the bytes that are actually there, so no
   // byte at or beyond isrc + src_len is read.
   int CheapRepWordsInplaceOverwrite(char* isrc, int src_len, int* hash, int* tbl) {
     const unsigned char* text = reinterpret_cast<const unsigned char*>(isrc);
     int local_hash = *hash;
     int pos = 0;                    // Offset of the next unread byte
     int word_start = 0;             // Offset of the first byte of the current word
     int good_predict_bytes = 0;
     int word_length_bytes = 0;

     while (pos < src_len) {
       const unsigned int lead = text[pos];

       if (lead == ' ') {
         if ((good_predict_bytes * 2) > word_length_bytes) {
           // Word [word_start..pos) is well-predicted: overwrite
           for (int i = word_start; i < pos; ++i) {isrc[i] = '.';}
         }
         word_start = pos + 1;       // Start of next word
         good_predict_bytes = 0;
         word_length_bytes = 0;
       }

       // Pick up one char and length, as announced by the lead byte
       int incr = 1;
       if (lead >= 0xc0) {
         if ((lead & 0xe0) == 0xc0) {
           incr = 2;                 // Two-byte
         } else if ((lead & 0xf0) == 0xe0) {
           incr = 3;                 // Three-byte
         } else {
           incr = 4;                 // Four-byte
         }
         // Never reach past the end of the text for continuation bytes
         if (incr > (src_len - pos)) {incr = src_len - pos;}
       }
       // Bytes below 0xc0 (one-byte or continuation) stand alone

       unsigned int packed = lead;
       for (int i = 1; i < incr; ++i) {
         packed = (packed << 8) | text[pos + i];
       }
       const int c = static_cast<int>(packed);
       pos += incr;
       word_length_bytes += incr;

       const int p = tbl[local_hash];    // Prediction
       tbl[local_hash] = c;              // Update prediction
       if (c == p) {
         good_predict_bytes += incr;     // Count good predictions
       }

       local_hash = static_cast<int>(
           ((static_cast<unsigned int>(local_hash) << 4) ^ packed) & 0xfffu);
     }

     *hash = local_hash;

     // The text is overwritten, never shortened, so no re-padding is needed
     return pos;
   }

   706 // Remove portions of text that have a high density of spaces, or that are

   707 // overly repetitive, squeezing the remaining text in-place to the front of the

   708 // input buffer.

   709 //

   710 // Squeezing looks at density of space/prediced chars in fixed-size chunks,

   711 // specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes.

   712 //

   713 // Return the new, possibly-shorter length

   714 //

   715 // Result Buffer ALWAYS has leading space and trailing space space space NUL,

   716 // if input does

   717 //

   718 int CheapSqueezeInplace(char* isrc,

   719                                             int src_len,

   720                                             int ichunksize) {

   721   char* src = isrc;

   722   char* dst = src;

   723   char* srclimit = src + src_len;

   724   bool skipping = false;

   726   int hash = 0;

   727   // Allocate local prediction table.

   728   int* predict_tbl = new int[kPredictionTableSize];

   729   memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));

   731   int chunksize = ichunksize;

   732   if (chunksize == 0) {chunksize = kChunksizeDefault;}

   733   int space_thresh = (chunksize * kSpacesThreshPercent) / 100;

   734   int predict_thresh = (chunksize * kPredictThreshPercent) / 100;

   736   while (src < srclimit) {

   737     int remaining_bytes = srclimit - src;

   738     int len = minint(chunksize, remaining_bytes);

   739     // Make len land us on a UTF-8 character boundary.

   740     // Ah. Also fixes mispredict because we could get out of phase

   741     // Loop always terminates at trailing space in buffer

   742     while ((src[len] & 0xc0) == 0x80) {++len;}  // Move past continuation bytes

   744     int space_n = CountSpaces4(src, len);

   745     int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);

   746     if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {

   747       // Skip the text

   748       if (!skipping) {

   749         // Keeping-to-skipping transition; do it at a space

   750         int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));

   751         dst -= n;

   752         if (dst == isrc) {

   753           // Force a leading space if the first chunk is deleted

   754           *dst++ = ' ';

   755         }

   756         if (FLAGS_cld_showme) {

   757           // Mark the deletion point with black square U+25A0

   758           *dst++ = static_cast<unsigned char>(0xe2);

   759           *dst++ = static_cast<unsigned char>(0x96);

   760           *dst++ = static_cast<unsigned char>(0xa0);

   761           *dst++ = ' ';

   762         }

   763         skipping = true;

   764       }

   765     } else {

   766       // Keep the text

   767       if (skipping) {

   768         // Skipping-to-keeping transition; do it at a space

   769         int n = ForwardscanToSpace(src, len);

   770         src += n;

   771         remaining_bytes -= n;   // Shrink remaining length

   772         len -= n;

   773         skipping = false;

   774       }

   775       // "len" can be negative in some cases

   776       if (len > 0) {

   777         memmove(dst, src, len);

   778         dst += len;

   779       }

   780     }

   781     src += len;

   782   }

   784   if ((dst - isrc) < (src_len - 3)) {

   785     // Pad and make last char clean UTF-8 by putting following spaces

   786     dst[0] = ' ';

   787     dst[1] = ' ';

   788     dst[2] = ' ';

   789     dst[3] = '\0';

   790   } else   if ((dst - isrc) < src_len) {

   791     // Make last char clean UTF-8 by putting following space off the end

   792     dst[0] = ' ';

   793   }

   795   // Deallocate local prediction table

   796   delete[] predict_tbl;

   797   return static_cast<int>(dst - isrc);

   798 }

   800 // This alternate form overwrites redundant words, thus avoiding corrupting the

   801 // backmap for generate a vector of original-text ranges.

   802 int CheapSqueezeInplaceOverwrite(char* isrc,

   803                                             int src_len,

   804                                             int ichunksize) {

   805   char* src = isrc;

   806   char* dst = src;

   807   char* srclimit = src + src_len;

   808   bool skipping = false;

   810   int hash = 0;

   811   // Allocate local prediction table.

   812   int* predict_tbl = new int[kPredictionTableSize];

   813   memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));

   815   int chunksize = ichunksize;

   816   if (chunksize == 0) {chunksize = kChunksizeDefault;}

   817   int space_thresh = (chunksize * kSpacesThreshPercent) / 100;

   818   int predict_thresh = (chunksize * kPredictThreshPercent) / 100;

   820   // Always keep first byte (space)

   821   ++src;

   822   ++dst;

   823   while (src < srclimit) {

   824     int remaining_bytes = srclimit - src;

   825     int len = minint(chunksize, remaining_bytes);

   826     // Make len land us on a UTF-8 character boundary.

   827     // Ah. Also fixes mispredict because we could get out of phase

   828     // Loop always terminates at trailing space in buffer

   829     while ((src[len] & 0xc0) == 0x80) {++len;}  // Move past continuation bytes

   831     int space_n = CountSpaces4(src, len);

   832     int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);

   833     if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {

   834       // Overwrite the text [dst-n..dst)

   835       if (!skipping) {

   836         // Keeping-to-skipping transition; do it at a space

   837         int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));

   838         // Text [word_dst..dst) is well-predicted: overwrite

   839         for (char* p = dst - n; p < dst; ++p) {*p = '.';}

   840         skipping = true;

   841       }

   842       // Overwrite the text [dst..dst+len)

   843       for (char* p = dst; p < dst + len; ++p) {*p = '.';}

   844       dst[len - 1] = ' ';    // Space at end so we can see what is happening

   845     } else {

   846       // Keep the text

   847       if (skipping) {

   848         // Skipping-to-keeping transition; do it at a space

   849         int n = ForwardscanToSpace(src, len);

   850         // Text [dst..dst+n) is well-predicted: overwrite

   851         for (char* p = dst; p < dst + n - 1; ++p) {*p = '.';}

   852         skipping = false;

   853       }

   854     }

   855     dst += len;

   856     src += len;

   857   }

   859   if ((dst - isrc) < (src_len - 3)) {

   860     // Pad and make last char clean UTF-8 by putting following spaces

   861     dst[0] = ' ';

   862     dst[1] = ' ';

   863     dst[2] = ' ';

   864     dst[3] = '\0';

   865   } else   if ((dst - isrc) < src_len) {

   866     // Make last char clean UTF-8 by putting following space off the end

   867     dst[0] = ' ';

   868   }

   870   // Deallocate local prediction table

   871   delete[] predict_tbl;

   872   return static_cast<int>(dst - isrc);

   873 }

   875 // Timing 2.8GHz P4 (dsites 2008.03.20) with 170KB input

   876 //  About 90 MB/sec, with or without memcpy, chunksize 48 or 4096

   877 //  Just CountSpaces is about 340 MB/sec

   878 //  Byte-only CountPredictedBytes is about 150 MB/sec

   879 //  Byte-only CountPredictedBytes, conditional tbl[] = is about 85! MB/sec

   880 //  Byte-only CountPredictedBytes is about 180 MB/sec, byte tbl, byte/int c

   881 //  Unjammed byte-only both = 170 MB/sec

   882 //  Jammed byte-only both = 120 MB/sec

   883 //  Back to original w/slight updates, 110 MB/sec

   884 //

   885 bool CheapSqueezeTriggerTest(const char* src, int src_len, int testsize) {

   886   // Don't trigger at all on short text

   887   if (src_len < testsize) {return false;}

   888   int space_thresh = (testsize * kSpacesTriggerPercent) / 100;

   889   int predict_thresh = (testsize * kPredictTriggerPercent) / 100;

   890   int hash = 0;

   891   // Allocate local prediction table.

   892   int* predict_tbl = new int[kPredictionTableSize];

   893   memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));

   895   bool retval = false;

   896   if ((CountSpaces4(src, testsize) >= space_thresh) ||

   897       (CountPredictedBytes(src, testsize, &hash, predict_tbl) >=

   898        predict_thresh)) {

   899     retval = true;

   900   }

   901   // Deallocate local prediction table

   902   delete[] predict_tbl;

   903   return retval;

   904 }

   909 // Delete any extended languages from doc_tote

   910 void RemoveExtendedLanguages(DocTote* doc_tote) {

   911   // Now a nop

   912 }

   // Minimum reliability percentage a language needs in order to be kept;
   // the same bound decides whether the top language is reported as reliable.
   static const int kMinReliableKeepPercent = 41;  // Remove lang if reli < this

   // For Tier3 languages, require a minimum number of bytes to be first-place lang
   static const int kGoodFirstT3MinBytes = 24;         // <this => no first

   919 // Move bytes for unreliable langs to another lang or UNKNOWN

   920 // doc_tote is sorted, so cannot Add

   921 //

   922 // If both CHINESE and CHINESET are present and unreliable, do not delete both;

   923 // merge both into CHINESE.

   924 //

   925 //dsites 2009.03.19

   926 // we also want to remove Tier3 languages as the first lang if there is very

   927 // little text like ej1 ej2 ej3 ej4

   928 // maybe fold this back in earlier

   929 //

   930 void RemoveUnreliableLanguages(DocTote* doc_tote,

   931                                bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {

   932   // Prepass to merge some low-reliablility languages

   933   // TODO: this shouldn't really reach in to the internal structure of doc_tote

   934   int total_bytes = 0;

   935   for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {

   936     int plang = doc_tote->Key(sub);

   937     if (plang == DocTote::kUnusedKey) {continue;}               // Empty slot

   939     Language lang = static_cast<Language>(plang);

   940     int bytes = doc_tote->Value(sub);

   941     int reli = doc_tote->Reliability(sub);

   942     if (bytes == 0) {continue;}                     // Zero bytes

   943     total_bytes += bytes;

   945     // Reliable percent = stored reliable score over stored bytecount

   946     int reliable_percent = reli / bytes;

   947     if (reliable_percent >= kMinReliableKeepPercent) {continue;}   // Keeper

   949     // This language is too unreliable to keep, but we might merge it.

   950     Language altlang = UNKNOWN_LANGUAGE;

   951     if (lang <= HAWAIIAN) {altlang = kClosestAltLanguage[lang];}

   952     if (altlang == UNKNOWN_LANGUAGE) {continue;}    // No alternative

   954     // Look for alternative in doc_tote

   955     int altsub = doc_tote->Find(altlang);

   956     if (altsub < 0) {continue;}                     // No alternative text

   958     int bytes2 = doc_tote->Value(altsub);

   959     int reli2 = doc_tote->Reliability(altsub);

   960     if (bytes2 == 0) {continue;}                    // Zero bytes

   962     // Reliable percent is stored reliable score over stored bytecount

   963     int reliable_percent2 = reli2 / bytes2;

   965     // Merge one language into the other. Break ties toward lower lang #

   966     int tosub = altsub;

   967     int fromsub = sub;

   968     bool into_lang = false;

   969     if ((reliable_percent2 < reliable_percent) ||

   970         ((reliable_percent2 == reliable_percent) && (lang < altlang))) {

   971       tosub = sub;

   972       fromsub = altsub;

   973       into_lang = true;

   974     }

   976     // Make sure merged reliability doesn't drop and is enough to avoid delete

   977     int newpercent = maxint(reliable_percent, reliable_percent2);

   978     newpercent = maxint(newpercent, kMinReliableKeepPercent);

   979     int newbytes = bytes + bytes2;

   980     int newreli = newpercent * newbytes;

   982     doc_tote->SetKey(fromsub, DocTote::kUnusedKey);

   983     doc_tote->SetScore(fromsub, 0);

   984     doc_tote->SetReliability(fromsub, 0);

   985     doc_tote->SetScore(tosub, newbytes);

   986     doc_tote->SetReliability(tosub, newreli);

   988     // Show fate of unreliable languages if at least 10 bytes

   989     if (FLAGS_cld2_html && (newbytes >= 10) &&

   990         !FLAGS_cld2_quiet) {

   991       if (into_lang) {

   992         fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ",

   993                 LanguageCode(altlang), reliable_percent2, bytes2,

   994                 LanguageCode(lang));

   995       } else {

   996         fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ",

   997                 LanguageCode(lang), reliable_percent, bytes,

   998                 LanguageCode(altlang));

   999       }

  1000     }

  1001   }

  1004   // Pass to delete any remaining unreliable languages

  1005   for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {

  1006     int plang = doc_tote->Key(sub);

  1007     if (plang == DocTote::kUnusedKey) {continue;}               // Empty slot

  1009     Language lang = static_cast<Language>(plang);

  1010     int bytes = doc_tote->Value(sub);

  1011     int reli = doc_tote->Reliability(sub);

  1012     if (bytes == 0) {continue;}                     // Zero bytes

  1014     // Reliable percent is stored as reliable score over stored bytecount

  1015     int reliable_percent = reli / bytes;

  1016     if (reliable_percent >= kMinReliableKeepPercent) {  // Keeper?

  1017        continue;                                        // yes

  1018     }

  1020     // Delete unreliable entry

  1021     doc_tote->SetKey(sub, DocTote::kUnusedKey);

  1022     doc_tote->SetScore(sub, 0);

  1023     doc_tote->SetReliability(sub, 0);

  1025     // Show fate of unreliable languages if at least 10 bytes

  1026     if (FLAGS_cld2_html && (bytes >= 10) &&

  1027         !FLAGS_cld2_quiet) {

  1028       fprintf(stderr, "{Unreli %s.%dR,%dB} ",

  1029               LanguageCode(lang), reliable_percent, bytes);

  1030     }

  1031   }

  1033   ////if (FLAGS_cld2_html) {fprintf(stderr, "<br>\n");}

  1034 }

  1037 // Move all the text bytes from lower byte-count to higher one

  1038 void MoveLang1ToLang2(Language lang1, Language lang2,

  1039                       int lang1_sub, int lang2_sub,

  1040                       DocTote* doc_tote,

  1041                       ResultChunkVector* resultchunkvector) {

  1042   // In doc_tote, move all the bytes lang1 => lang2

  1043   int sum = doc_tote->Value(lang2_sub) + doc_tote->Value(lang1_sub);

  1044   doc_tote->SetValue(lang2_sub, sum);

  1045   sum = doc_tote->Score(lang2_sub) + doc_tote->Score(lang1_sub);

  1046   doc_tote->SetScore(lang2_sub, sum);

  1047   sum = doc_tote->Reliability(lang2_sub) + doc_tote->Reliability(lang1_sub);

  1048   doc_tote->SetReliability(lang2_sub, sum);

  1050   // Delete old entry

  1051   doc_tote->SetKey(lang1_sub, DocTote::kUnusedKey);

  1052   doc_tote->SetScore(lang1_sub, 0);

  1053   doc_tote->SetReliability(lang1_sub, 0);

  1055   // In resultchunkvector, move all the bytes lang1 => lang2

  1056   if (resultchunkvector == NULL) {return;}

  1058   int k = 0;

  1059   uint16 prior_lang = UNKNOWN_LANGUAGE;

  1060   for (int i = 0; i < static_cast<int>(resultchunkvector->size()); ++i) {

  1061     ResultChunk* rc = &(*resultchunkvector)[i];

  1062     if (rc->lang1 == lang1) {

  1063       // Update entry[i] lang1 => lang2

  1064       rc->lang1 = lang2;

  1065     }

  1066     // One change may produce two merges -- entry before and entry after

  1067     if ((rc->lang1 == prior_lang) && (k > 0)) {

  1068       // Merge with previous, deleting entry[i]

  1069       ResultChunk* prior_rc = &(*resultchunkvector)[k - 1];

  1070       prior_rc->bytes += rc->bytes;

  1071       // fprintf(stderr, "MoveLang1ToLang2 merged [%d] => [%d]<br>\n", i, k-1);

  1072     } else {

  1073       // Keep entry[i]

  1074       (*resultchunkvector)[k] = (*resultchunkvector)[i];

  1075       // fprintf(stderr, "MoveLang1ToLang2 keep [%d] => [%d]<br>\n", i, k);

  1076       ++k;

  1077     }

  1078     prior_lang = rc->lang1;

  1079   }

  1080   resultchunkvector->resize(k);

  1081 }

  1085 // Move less likely byte count to more likely for close pairs of languages

  1086 // If given, also update resultchunkvector

  1087 void RefineScoredClosePairs(DocTote* doc_tote,

  1088                             ResultChunkVector* resultchunkvector,

  1089                             bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {

  1090   for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {

  1091     int close_packedlang = doc_tote->Key(sub);

  1092     int subscr = LanguageCloseSet(static_cast<Language>(close_packedlang));

  1093     if (subscr == 0) {continue;}

  1095     // We have a close pair language -- if the other one is also scored and the

  1096     // longword score differs enough, put all our eggs into one basket

  1098     // Nonzero longword score: Go look for the other of this pair

  1099     for (int sub2 = sub + 1; sub2 < doc_tote->MaxSize(); ++sub2) {

  1100       if (LanguageCloseSet(static_cast<Language>(doc_tote->Key(sub2))) == subscr) {

  1101         // We have a matching pair

  1102         int close_packedlang2 = doc_tote->Key(sub2);

  1104         // Move all the text bytes from lower byte-count to higher one

  1105         int from_sub, to_sub;

  1106         Language from_lang, to_lang;

  1107         if (doc_tote->Value(sub) < doc_tote->Value(sub2)) {

  1108           from_sub = sub;

  1109           to_sub = sub2;

  1110           from_lang = static_cast<Language>(close_packedlang);

  1111           to_lang = static_cast<Language>(close_packedlang2);

  1112         } else {

  1113           from_sub = sub2;

  1114           to_sub = sub;

  1115           from_lang = static_cast<Language>(close_packedlang2);

  1116           to_lang = static_cast<Language>(close_packedlang);

  1117         }

  1119         if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) {

  1120           // Show fate of closepair language

  1121           int val = doc_tote->Value(from_sub);           // byte count

  1122           int reli = doc_tote->Reliability(from_sub);

  1123           int reliable_percent = reli / (val ? val : 1);  // avoid zdiv

  1124           fprintf(stderr, "{CloseLangPair: %s.%dR,%dB => %s}<br>\n",

  1125                   LanguageCode(from_lang),

  1126                   reliable_percent,

  1127                   doc_tote->Value(from_sub),

  1128                   LanguageCode(to_lang));

  1129         }

  1130         MoveLang1ToLang2(from_lang, to_lang, from_sub, to_sub,

  1131                          doc_tote, resultchunkvector);

  1132         break;    // Exit inner for sub2 loop

  1133       }

  1134     }     // End for sub2

  1135   }   // End for sub

  1136 }

  1139 void ApplyAllLanguageHints(Tote* chunk_tote, int tote_grams,

  1140                         uint8* lang_hint_boost) {

  1141 }

  1144 void PrintHtmlEscapedText(FILE* f, const char* txt, int len) {

  1145    string temp(txt, len);

  1146    fprintf(f, "%s", GetHtmlEscapedText(temp).c_str());

  1147 }

  1149 void PrintLang(FILE* f, Tote* chunk_tote,

  1150               Language cur_lang, bool cur_unreliable,

  1151               Language prior_lang, bool prior_unreliable) {

  1152   if (cur_lang == prior_lang) {

  1153     fprintf(f, "[]");

  1154   } else {

  1155     fprintf(f, "[%s%s]", LanguageCode(cur_lang), cur_unreliable ? "*" : "");

  1156   }

  1157 }

  1160 void PrintTopLang(Language top_lang) {

  1161   if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {

  1162     fprintf(stderr, "[] ");

  1163   } else {

  1164     fprintf(stderr, "[%s] ", LanguageName(top_lang));

  1165     prior_lang = top_lang;

  1166   }

  1167 }

  1169 void PrintTopLangSpeculative(Language top_lang) {

  1170   fprintf(stderr, "<span style=\"color:#%06X;\">", 0xa0a0a0);

  1171   if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {

  1172     fprintf(stderr, "[] ");

  1173   } else {

  1174     fprintf(stderr, "[%s] ", LanguageName(top_lang));

  1175     prior_lang = top_lang;

  1176   }

  1177   fprintf(stderr, "</span>\n");

  1178 }

  1180 void PrintLangs(FILE* f, const Language* language3, const int* percent3,

  1181                 const int* text_bytes, const bool* is_reliable) {

  1182   fprintf(f, "<br>&nbsp;&nbsp;Initial_Languages ");

  1183   if (language3[0] != UNKNOWN_LANGUAGE) {

  1184     fprintf(f, "%s%s(%d%%)  ",

  1185             LanguageName(language3[0]),

  1186             *is_reliable ? "" : "*",

  1187             percent3[0]);

  1188   }

  1189   if (language3[1] != UNKNOWN_LANGUAGE) {

  1190     fprintf(f, "%s(%d%%)  ", LanguageName(language3[1]), percent3[1]);

  1191   }

  1192   if (language3[2] != UNKNOWN_LANGUAGE) {

  1193     fprintf(f, "%s(%d%%)  ", LanguageName(language3[2]), percent3[2]);

  1194   }

  1195   fprintf(f, "%d bytes \n", *text_bytes);

  1197   fprintf(f, "<br>\n");

  1198 }

  1201 // Return internal probability score (sum) per 1024 bytes

  1202 double GetNormalizedScore(Language lang, ULScript ulscript,

  1203                           int bytecount, int score) {

  1204   if (bytecount <= 0) {return 0.0;}

  1205   return (score << 10) / bytecount;

  1206 }

  1208 // Extract return values before fixups

  1209 void ExtractLangEtc(DocTote* doc_tote, int total_text_bytes,

  1210                     int* reliable_percent3, Language* language3, int* percent3,

  1211                     double*  normalized_score3,

  1212                     int* text_bytes, bool* is_reliable) {

  1213   reliable_percent3[0] = 0;

  1214   reliable_percent3[1] = 0;

  1215   reliable_percent3[2] = 0;

  1216   language3[0] = UNKNOWN_LANGUAGE;

  1217   language3[1] = UNKNOWN_LANGUAGE;

  1218   language3[2] = UNKNOWN_LANGUAGE;

  1219   percent3[0] = 0;

  1220   percent3[1] = 0;

  1221   percent3[2] = 0;

  1222   normalized_score3[0] = 0.0;

  1223   normalized_score3[1] = 0.0;

  1224   normalized_score3[2] = 0.0;

  1226   *text_bytes = total_text_bytes;

  1227   *is_reliable = false;

  1229   int bytecount1 = 0;

  1230   int bytecount2 = 0;

  1231   int bytecount3 = 0;

  1233   int lang1 = doc_tote->Key(0);

  1234   if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) {

  1235     // We have a top language

  1236     language3[0] = static_cast<Language>(lang1);

  1237     bytecount1 = doc_tote->Value(0);

  1238     int reli1 = doc_tote->Reliability(0);

  1239     reliable_percent3[0] = reli1 / (bytecount1 ? bytecount1 : 1);  // avoid zdiv

  1240     normalized_score3[0] = GetNormalizedScore(language3[0],

  1241                                                   ULScript_Common,

  1242                                                   bytecount1,

  1243                                                   doc_tote->Score(0));

  1244   }

  1246   int lang2 = doc_tote->Key(1);

  1247   if ((lang2 != DocTote::kUnusedKey) && (lang2 != UNKNOWN_LANGUAGE)) {

  1248     language3[1] = static_cast<Language>(lang2);

  1249     bytecount2 = doc_tote->Value(1);

  1250     int reli2 = doc_tote->Reliability(1);

  1251     reliable_percent3[1] = reli2 / (bytecount2 ? bytecount2 : 1);  // avoid zdiv

  1252     normalized_score3[1] = GetNormalizedScore(language3[1],

  1253                                                   ULScript_Common,

  1254                                                   bytecount2,

  1255                                                   doc_tote->Score(1));

  1256   }

  1258   int lang3 = doc_tote->Key(2);

  1259   if ((lang3 != DocTote::kUnusedKey) && (lang3 != UNKNOWN_LANGUAGE)) {

  1260     language3[2] = static_cast<Language>(lang3);

  1261     bytecount3 = doc_tote->Value(2);

  1262     int reli3 = doc_tote->Reliability(2);

  1263     reliable_percent3[2] = reli3 / (bytecount3 ? bytecount3 : 1);  // avoid zdiv

  1264     normalized_score3[2] = GetNormalizedScore(language3[2],

  1265                                                   ULScript_Common,

  1266                                                   bytecount3,

  1267                                                   doc_tote->Score(2));

  1268   }

  1270   // Increase total bytes to sum (top 3) if low for some reason

  1271   int total_bytecount12 = bytecount1 + bytecount2;

  1272   int total_bytecount123 = total_bytecount12 + bytecount3;

  1273   if (total_text_bytes < total_bytecount123) {

  1274     total_text_bytes = total_bytecount123;

  1275     *text_bytes = total_text_bytes;

  1276   }

  1278   // Sum minus previous % gives better roundoff behavior than bytecount/total

  1279   int total_text_bytes_div = maxint(1, total_text_bytes);    // Avoid zdiv

  1280   percent3[0] = (bytecount1 * 100) / total_text_bytes_div;

  1281   percent3[1] = (total_bytecount12 * 100) / total_text_bytes_div;

  1282   percent3[2] = (total_bytecount123 * 100) / total_text_bytes_div;

  1283   percent3[2] -= percent3[1];

  1284   percent3[1] -= percent3[0];

  1286   // Roundoff, say 96% 1.6% 1.4%, will produce non-obvious 96% 1% 2%

  1287   // Fix this explicitly

  1288   if (percent3[1] < percent3[2]) {

  1289     ++percent3[1];

  1290     --percent3[2];

  1291   }

  1292   if (percent3[0] < percent3[1]) {

  1293     ++percent3[0];

  1294     --percent3[1];

  1295   }

  1297   *text_bytes = total_text_bytes;

  1299   if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) {

  1300     // We have a top language

  1301     // Its reliability is overall result reliability

  1302     int bytecount = doc_tote->Value(0);

  1303     int reli = doc_tote->Reliability(0);

  1304     int reliable_percent = reli / (bytecount ? bytecount : 1);  // avoid zdiv

  1305     *is_reliable = (reliable_percent >= kMinReliableKeepPercent);

  1306   } else {

  1307     // No top language at all. This can happen with zero text or 100% Klingon

  1308     // if extended=false. Just return all UNKNOWN_LANGUAGE, unreliable.

  1309     *is_reliable = false;

  1310   }

  1312   // If ignore percent is too large, set unreliable.

  1313   int ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]);

  1314   if ((ignore_percent > kIgnoreMaxPercent)) {

  1315     *is_reliable = false;

  1316   }

  1317 }

  1319 bool IsFIGS(Language lang) {

  1320   if (lang == FRENCH) {return true;}

  1321   if (lang == ITALIAN) {return true;}

  1322   if (lang == GERMAN) {return true;}

  1323   if (lang == SPANISH) {return true;}

  1324   return false;

  1325 }

  1327 bool IsEFIGS(Language lang) {

  1328   if (lang == ENGLISH) {return true;}

  1329   if (lang == FRENCH) {return true;}

  1330   if (lang == ITALIAN) {return true;}

  1331   if (lang == GERMAN) {return true;}

  1332   if (lang == SPANISH) {return true;}

  1333   return false;

  1334 }

  // Minimum number of bytes a second-place language needs before it may
  // override the first-place language. For Tier3 languages, require more bytes
  // of text to override the first-place language.
  static const int kGoodSecondT1T2MinBytes = 15;        // <this => no second
  static const int kGoodSecondT3MinBytes = 128;         // <this => no second

  1341 // Calculate a single summary language for the document, and its reliability.

  1342 // Returns language3[0] or language3[1] or ENGLISH or UNKNOWN_LANGUAGE

  1343 // This is the heart of matching human-rater perception.

  1344 // reliable_percent3[] is currently unused

  1345 //

  1346 // Do not return Tier3 second language unless there are at least 128 bytes

  1347 void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes,

  1348                      const int* reliable_percent3,

  1349                      const Language* language3,

  1350                      const int* percent3,

  1351                      Language* summary_lang, bool* is_reliable,

  1352                      bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {

  1353   // Vector of active languages; changes if we delete some

  1354   int slot_count = 3;

  1355   int active_slot[3] = {0, 1, 2};

  1357   int ignore_percent = 0;

  1358   int return_percent = percent3[0];   // Default to top lang

  1359   *summary_lang = language3[0];

  1360   *is_reliable = true;

  1361   if (percent3[0] < kKeepMinPercent) {*is_reliable = false;}

  1363   // If any of top 3 is IGNORE, remove it and increment ignore_percent

  1364   for (int i = 0; i < 3; ++i) {

  1365     if (language3[i] == TG_UNKNOWN_LANGUAGE) {

  1366       ignore_percent += percent3[i];

  1367       // Move the rest up, leaving input vectors unchanged

  1368       for (int j=i+1; j < 3; ++j) {

  1369         active_slot[j - 1] = active_slot[j];

  1370       }

  1371       -- slot_count;

  1372       // Logically remove Ignore from percentage-text calculation

  1373       // (extra 1 in 101 avoids zdiv, biases slightly small)

  1374       return_percent = (percent3[0] * 100) / (101 - ignore_percent);

  1375       *summary_lang = language3[active_slot[0]];

  1376       if (percent3[active_slot[0]] < kKeepMinPercent) {*is_reliable = false;}

  1377     }

  1378   }

  1381   // If English and X, where X (not UNK) is big enough,

  1382   // assume the English is boilerplate and return X.

  1383   // Logically remove English from percentage-text calculation

  1384   int second_bytes = (total_text_bytes * percent3[active_slot[1]]) / 100;

  1385   // Require more bytes of text for Tier3 languages

  1386   int minbytesneeded = kGoodSecondT1T2MinBytes;

  1387   int plang_second = PerScriptNumber(ULScript_Latin, language3[active_slot[1]]);

  1389   if ((language3[active_slot[0]] == ENGLISH) &&

  1390       (language3[active_slot[1]] != ENGLISH) &&

  1391       (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&

  1392       (percent3[active_slot[1]] >= kNonEnBoilerplateMinPercent) &&

  1393       (second_bytes >= minbytesneeded)) {

  1394     ignore_percent += percent3[active_slot[0]];

  1395     return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);

  1396     *summary_lang = language3[active_slot[1]];

  1397     if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}

  1399   // Else If FIGS and X, where X (not UNK, EFIGS) is big enough,

  1400   // assume the FIGS is boilerplate and return X.

  1401   // Logically remove FIGS from percentage-text calculation

  1402   } else if (IsFIGS(language3[active_slot[0]]) &&

  1403              !IsEFIGS(language3[active_slot[1]]) &&

  1404              (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&

  1405              (percent3[active_slot[1]] >= kNonFIGSBoilerplateMinPercent) &&

  1406              (second_bytes >= minbytesneeded)) {

  1407     ignore_percent += percent3[active_slot[0]];

  1408     return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);

  1409     *summary_lang = language3[active_slot[1]];

  1410     if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}

  1412   // Else we are returning the first language, but want to improve its

  1413   // return_percent if the second language should be ignored

  1414   } else  if ((language3[active_slot[1]] == ENGLISH) &&

  1415               (language3[active_slot[0]] != ENGLISH)) {

  1416     ignore_percent += percent3[active_slot[1]];

  1417     return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);

  1418   } else  if (IsFIGS(language3[active_slot[1]]) &&

  1419               !IsEFIGS(language3[active_slot[0]])) {

  1420     ignore_percent += percent3[active_slot[1]];

  1421     return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);

  1422   }

  1424   // If return percent is too small (too many languages), return UNKNOWN

  1425   if ((return_percent < kGoodFirstMinPercent)) {

  1426     if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {

  1427       fprintf(stderr, "{Unreli %s %d%% percent too small} ",

  1428               LanguageCode(*summary_lang), return_percent);

  1429     }

  1430     *summary_lang = UNKNOWN_LANGUAGE;

  1431     *is_reliable = false;

  1432   }

  1434   // If return percent is small, return language but set unreliable.

  1435   if ((return_percent < kGoodFirstReliableMinPercent)) {

  1436     *is_reliable = false;

  1437   }

  1439   // If ignore percent is too large, set unreliable.

  1440   ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]);

  1441   if ((ignore_percent > kIgnoreMaxPercent)) {

  1442     *is_reliable = false;

  1443   }

  1445   // If we removed all the active languages, return UNKNOWN

  1446   if (slot_count == 0) {

  1447     if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {

  1448       fprintf(stderr, "{Unreli %s no languages left} ",

  1449               LanguageCode(*summary_lang));

  1450     }

  1451     *summary_lang = UNKNOWN_LANGUAGE;

  1452     *is_reliable = false;

  1453   }

  1454 }

  1456 void AddLangPriorBoost(Language lang, uint32 langprob,

  1457                        ScoringContext* scoringcontext) {

  1458   // This is called 0..n times with language hints

  1459   // but we don't know the script -- so boost either or both Latn, Othr.

  1461   if (IsLatnLanguage(lang)) {

  1462     LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;

  1463     int n = langprior_boost->n;

  1464     langprior_boost->langprob[n] = langprob;

  1465     langprior_boost->n = langprior_boost->wrap(n + 1);

  1466   }

  1468   if (IsOthrLanguage(lang)) {

  1469     LangBoosts* langprior_boost = &scoringcontext->langprior_boost.othr;

  1470     int n = langprior_boost->n;

  1471     langprior_boost->langprob[n] = langprob;

  1472     langprior_boost->n = langprior_boost->wrap(n + 1);

  1473   }

  1475 }

  1477 void AddOneWhack(Language whacker_lang, Language whackee_lang,

  1478                  ScoringContext* scoringcontext) {

  1479   uint32 langprob = MakeLangProb(whackee_lang, 1);

  1480   // This logic avoids hr-Latn whacking sr-Cyrl, but still whacks sr-Latn

  1481   if (IsLatnLanguage(whacker_lang) && IsLatnLanguage(whackee_lang)) {

  1482     LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;

  1483     int n = langprior_whack->n;

  1484     langprior_whack->langprob[n] = langprob;

  1485     langprior_whack->n = langprior_whack->wrap(n + 1);

  1486   }

  1487   if (IsOthrLanguage(whacker_lang) && IsOthrLanguage(whackee_lang)) {

  1488     LangBoosts* langprior_whack = &scoringcontext->langprior_whack.othr;

  1489     int n = langprior_whack->n;

  1490     langprior_whack->langprob[n] = langprob;

  1491     langprior_whack->n = langprior_whack->wrap(n + 1);

  1492  }

  1493 }

  1495 void AddCloseLangWhack(Language lang, ScoringContext* scoringcontext) {

  1496   // We do not in general want zh-Hans and zh-Hant to be close pairs,

  1497   // but we do here.

  1498   if (lang == CLD2::CHINESE) {

  1499     AddOneWhack(lang, CLD2::CHINESE_T, scoringcontext);

  1500     return;

  1501   }

  1502   if (lang == CLD2::CHINESE_T) {

  1503     AddOneWhack(lang, CLD2::CHINESE, scoringcontext);

  1504     return;

  1505   }

  1507   int base_lang_set = LanguageCloseSet(lang);

  1508   if (base_lang_set == 0) {return;}

  1509   // TODO: add an explicit list of each set to avoid this 512-times loop

  1510   for (int i = 0; i < kLanguageToPLangSize; ++i) {

  1511     Language lang2 = static_cast<Language>(i);

  1512     if ((base_lang_set == LanguageCloseSet(lang2)) && (lang != lang2)) {

  1513       AddOneWhack(lang, lang2, scoringcontext);

  1514     }

  1515   }

  1516 }

  1519 void ApplyHints(const char* buffer,

  1520                 int buffer_length,

  1521                 bool is_plain_text,

  1522                 const CLDHints* cld_hints,

  1523                 ScoringContext* scoringcontext) {

  1524   CLDLangPriors lang_priors;

  1525   InitCLDLangPriors(&lang_priors);

  1527   // We now use lang= tags.

  1528   // Last look, circa 2008 found only 15% of web pages with lang= tags and

  1529   // many of those were wrong. Now (July 2011), we find 44% of web pages have

  1530   // lang= tags, and most of them are correct. So we now give them substantial

  1531   // weight in each chunk scored.

  1532   if (!is_plain_text) {

  1533     // Get any contained language tags in first n KB

  1534     int32 max_scan_bytes = FLAGS_cld_max_lang_tag_scan_kb << 10;

  1535     string lang_tags = GetLangTagsFromHtml(buffer, buffer_length,

  1536                                            max_scan_bytes);

  1537     SetCLDLangTagsHint(lang_tags, &lang_priors);

  1538     if (scoringcontext->flags_cld2_html) {

  1539       if (!lang_tags.empty()) {

  1540         fprintf(scoringcontext->debug_file, "<br>lang_tags '%s'<br>\n",

  1541                 lang_tags.c_str());

  1542       }

  1543     }

  1544   }

  1546   if (cld_hints != NULL) {

  1547     if ((cld_hints->content_language_hint != NULL) &&

  1548         (cld_hints->content_language_hint[0] != '\0')) {

  1549       SetCLDContentLangHint(cld_hints->content_language_hint, &lang_priors);

  1550     }

  1552     // Input is from GetTLD(), already lowercased

  1553     if ((cld_hints->tld_hint != NULL) && (cld_hints->tld_hint[0] != '\0')) {

  1554       SetCLDTLDHint(cld_hints->tld_hint, &lang_priors);

  1555     }

  1557     if (cld_hints->encoding_hint != UNKNOWN_ENCODING) {

  1558       Encoding enc = static_cast<Encoding>(cld_hints->encoding_hint);

  1559       SetCLDEncodingHint(enc, &lang_priors);

  1560     }

  1562     if (cld_hints->language_hint != UNKNOWN_LANGUAGE) {

  1563       SetCLDLanguageHint(cld_hints->language_hint, &lang_priors);

  1564     }

  1565   }

  1567   // Keep no more than four different languages with hints

  1568   TrimCLDLangPriors(4, &lang_priors);

  1570   if (scoringcontext->flags_cld2_html) {

  1571     string print_temp = DumpCLDLangPriors(&lang_priors);

  1572     if (!print_temp.empty()) {

  1573       fprintf(scoringcontext->debug_file, "DumpCLDLangPriors %s<br>\n",

  1574               print_temp.c_str());

  1575     }

  1576   }

  1578   // Put boosts into ScoringContext

  1579   for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {

  1580     Language lang = GetCLDPriorLang(lang_priors.prior[i]);

  1581     int qprob = GetCLDPriorWeight(lang_priors.prior[i]);

  1582     if (qprob > 0) {

  1583       uint32 langprob = MakeLangProb(lang, qprob);

  1584       AddLangPriorBoost(lang, langprob, scoringcontext);

  1585     }

  1586   }

  1588   // Put whacks into scoring context

  1589   // We do not in general want zh-Hans and zh-Hant to be close pairs,

  1590   // but we do here. Use close_set_count[kCloseSetSize] to count zh, zh-Hant

  1591   std::vector<int> close_set_count(kCloseSetSize + 1, 0);

  1593   for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {

  1594     Language lang = GetCLDPriorLang(lang_priors.prior[i]);

  1595     ++close_set_count[LanguageCloseSet(lang)];

  1596     if (lang == CLD2::CHINESE) {++close_set_count[kCloseSetSize];}

  1597     if (lang == CLD2::CHINESE_T) {++close_set_count[kCloseSetSize];}

  1598   }

  1600   // If a boost language is in a close set, force suppressing the others in

  1601   // that set, if exactly one of the set is present

  1602   for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {

  1603     Language lang = GetCLDPriorLang(lang_priors.prior[i]);

  1604     int qprob = GetCLDPriorWeight(lang_priors.prior[i]);

  1605     if (qprob > 0) {

  1606       int close_set = LanguageCloseSet(lang);

  1607       if ((close_set > 0) && (close_set_count[close_set] == 1)) {

  1608         AddCloseLangWhack(lang, scoringcontext);

  1609       }

  1610       if (((lang == CLD2::CHINESE) || (lang == CLD2::CHINESE_T)) &&

  1611           (close_set_count[kCloseSetSize] == 1)) {

  1612         AddCloseLangWhack(lang, scoringcontext);

  1613       }

  1614     }

  1615   }

  1622 }

  1626 // Results language3/percent3/text_bytes must be exactly three items

  1627 Language DetectLanguageSummaryV2(

  1628                         const char* buffer,

  1629                         int buffer_length,

  1630                         bool is_plain_text,

  1631                         const CLDHints* cld_hints,

  1632                         bool allow_extended_lang,

  1633                         int flags,

  1634                         Language plus_one,

  1635                         Language* language3,

  1636                         int* percent3,

  1637                         double* normalized_score3,

  1638                         ResultChunkVector* resultchunkvector,

  1639                         int* text_bytes,

  1640                         bool* is_reliable) {

  1641   language3[0] = UNKNOWN_LANGUAGE;

  1642   language3[1] = UNKNOWN_LANGUAGE;

  1643   language3[2] = UNKNOWN_LANGUAGE;

  1644   percent3[0] = 0;

  1645   percent3[1] = 0;

  1646   percent3[2] = 0;

  1647   normalized_score3[0] = 0.0;

  1648   normalized_score3[1] = 0.0;

  1649   normalized_score3[2] = 0.0;

  1650   if (resultchunkvector != NULL) {

  1651     resultchunkvector->clear();

  1652   }

  1653   *text_bytes = 0;

  1654   *is_reliable = false;

  1656   if ((flags & kCLDFlagEcho) != 0) {

  1657      string temp(buffer, buffer_length);

  1658      if ((flags & kCLDFlagHtml) != 0) {

  1659         fprintf(stderr, "CLD2[%d] '%s'<br>\n",

  1660                 buffer_length, GetHtmlEscapedText(temp).c_str());

  1661      } else {

  1662         fprintf(stderr, "CLD2[%d] '%s'\n",

  1663                 buffer_length, GetPlainEscapedText(temp).c_str());

  1664      }

  1665   }

  1667 #ifdef CLD2_DYNAMIC_MODE

  1668   // In dynamic mode, we immediately return UNKNOWN_LANGUAGE if the data file

  1669   // hasn't been loaded yet. This is the only sane thing we can do, as there

  1670   // are no scoring tables to consult.

  1671   bool dataLoaded = isDataLoaded();

  1672   if ((flags & kCLDFlagVerbose) != 0) {

  1673     fprintf(stderr, "Data loaded: %s\n", (dataLoaded ? "true" : "false"));

  1674   }

  1675   if (!dataLoaded) {

  1676     return UNKNOWN_LANGUAGE;

  1677   }

  1678 #endif

  1680   // Exit now if no text

  1681   if (buffer_length == 0) {return UNKNOWN_LANGUAGE;}

  1682   if (kScoringtables.quadgram_obj == NULL) {return UNKNOWN_LANGUAGE;}

  1684   // Document totals

  1685   DocTote doc_tote;   // Reliability = 0..100

  1687   // ScoringContext carries state across scriptspans

  1688   ScoringContext scoringcontext;

  1689   scoringcontext.debug_file = stderr;

  1690   scoringcontext.flags_cld2_score_as_quads =

  1691     ((flags & kCLDFlagScoreAsQuads) != 0);

  1692   scoringcontext.flags_cld2_html = ((flags & kCLDFlagHtml) != 0);

  1693   scoringcontext.flags_cld2_cr = ((flags & kCLDFlagCr) != 0);

  1694   scoringcontext.flags_cld2_verbose = ((flags & kCLDFlagVerbose) != 0);

  1695   scoringcontext.prior_chunk_lang = UNKNOWN_LANGUAGE;

  1696   scoringcontext.ulscript = ULScript_Common;

  1697   scoringcontext.scoringtables = &kScoringtables;

  1698   scoringcontext.scanner = NULL;

  1699   scoringcontext.init();            // Clear the internal memory arrays

  1701   // Now thread safe.

  1702   bool FLAGS_cld2_html = ((flags & kCLDFlagHtml) != 0);

  1703   bool FLAGS_cld2_quiet = ((flags & kCLDFlagQuiet) != 0);

  1705   ApplyHints(buffer, buffer_length, is_plain_text, cld_hints, &scoringcontext);

  1707   // Four individual script totals, Latin, Han, other2, other3

  1708   int next_other_tote = 2;

  1709   int tote_num = 0;

  1711   // Four totes for up to four different scripts pending at once

  1712   Tote totes[4];                  // [0] Latn  [1] Hani  [2] other  [3] other

  1713   bool tote_seen[4] = {false, false, false, false};

  1714   int tote_grams[4] = {0, 0, 0, 0};     // Number in partial chunk

  1715   ULScript tote_script[4] =

  1716     {ULScript_Latin, ULScript_Hani, ULScript_Common, ULScript_Common};

  1718   // Loop through text spans in a single script

  1719   ScriptScanner ss(buffer, buffer_length, is_plain_text);

  1720   LangSpan scriptspan;

  1722   scoringcontext.scanner = &ss;

  1724   scriptspan.text = NULL;

  1725   scriptspan.text_bytes = 0;

  1726   scriptspan.offset = 0;

  1727   scriptspan.ulscript = ULScript_Common;

  1728   scriptspan.lang = UNKNOWN_LANGUAGE;

  1730   int total_text_bytes = 0;

  1731   int textlimit = FLAGS_cld_textlimit << 10;    // in KB

  1732   if (textlimit == 0) {textlimit = 0x7fffffff;}

  1734   int advance_by = 2;                   // Advance 2 bytes

  1735   int advance_limit = textlimit >> 3;   // For first 1/8 of max document

  1737   int initial_word_span = kDefaultWordSpan;

  1738   if (FLAGS_cld_forcewords) {

  1739     initial_word_span = kReallyBigWordSpan;

  1740   }

  1742   // Pick up chunk sizes

  1743   // Smoothwidth is units of quadgrams, about 2.5 chars (unigrams) each

  1744   // Sanity check -- force into a reasonable range

  1745   int chunksizequads = FLAGS_cld_smoothwidth;

  1746   chunksizequads = minint(maxint(chunksizequads, kMinChunkSizeQuads),

  1747                                kMaxChunkSizeQuads);

  1748   int chunksizeunis = (chunksizequads * 5) >> 1;

  1750   // Varying short-span limit doesn't work well -- skips too much beyond 20KB

  1751   // int spantooshortlimit = advance_by * FLAGS_cld_smoothwidth;

  1752   int spantooshortlimit = kShortSpanThresh;

  1754   // For debugging only. Not thread-safe

  1755   prior_lang = UNKNOWN_LANGUAGE;

  1756   prior_unreliable = false;

  1758   // Allocate full-document prediction table for finding repeating words

  1759   int hash = 0;

  1760   int* predict_tbl = new int[kPredictionTableSize];

  1761   if (FlagRepeats(flags)) {

  1762     memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));

  1763   }

  1767   // Loop through scriptspans accumulating number of text bytes in each language

  1768   while (ss.GetOneScriptSpanLower(&scriptspan)) {

  1769     ULScript ulscript = scriptspan.ulscript;

  1771     // Squeeze out big chunks of text span if asked to

  1772     if (FlagSqueeze(flags)) {

  1773       // Remove repetitive or mostly-spaces chunks

  1774       int newlen;

  1775       int chunksize = 0;    // Use the default

  1776       if (resultchunkvector != NULL) {

  1777          newlen = CheapSqueezeInplaceOverwrite(scriptspan.text,

  1778                                                scriptspan.text_bytes,

  1779                                                chunksize);

  1780       } else {

  1781          newlen = CheapSqueezeInplace(scriptspan.text, scriptspan.text_bytes,

  1782                                       chunksize);

  1783       }

  1784       scriptspan.text_bytes = newlen;

  1785     } else {

  1786       // Check now and then to see if we should be squeezing

  1787       if (((kCheapSqueezeTestThresh >> 1) < scriptspan.text_bytes) &&

  1788           !FlagFinish(flags)) {

  1789         // fprintf(stderr, "CheapSqueezeTriggerTest, "

  1790         //                 "first %d bytes of %d (>%d/2)<br>\n",

  1791         //         kCheapSqueezeTestLen,

  1792         //         scriptspan.text_bytes,

  1793         //         kCheapSqueezeTestThresh);

  1795         if (CheapSqueezeTriggerTest(scriptspan.text,

  1796                                       scriptspan.text_bytes,

  1797                                       kCheapSqueezeTestLen)) {

  1798           // Recursive call with big-chunk squeezing set

  1799           if (FLAGS_cld2_html || FLAGS_dbgscore) {

  1800             fprintf(stderr,

  1801                     "<br>---text_bytes[%d] Recursive(Squeeze)---<br><br>\n",

  1802                     total_text_bytes);

  1803           }

  1804           // Deallocate full-document prediction table

  1805           delete[] predict_tbl;

  1807           return DetectLanguageSummaryV2(

  1808                             buffer,

  1809                             buffer_length,

  1810                             is_plain_text,

  1811                             cld_hints,

  1812                             allow_extended_lang,

  1813                             flags | kCLDFlagSqueeze,

  1814                             plus_one,

  1815                             language3,

  1816                             percent3,

  1817                             normalized_score3,

  1818                             resultchunkvector,

  1819                             text_bytes,

  1820                             is_reliable);

  1821         }

  1822       }

  1823     }

  1825     // Remove repetitive words if asked to

  1826     if (FlagRepeats(flags)) {

  1827       // Remove repetitive words

  1828       int newlen;

  1829       if (resultchunkvector != NULL) {

  1830         newlen = CheapRepWordsInplaceOverwrite(scriptspan.text,

  1831                                                scriptspan.text_bytes,

  1832                                                &hash, predict_tbl);

  1833       } else {

  1834         newlen = CheapRepWordsInplace(scriptspan.text, scriptspan.text_bytes,

  1835                                       &hash, predict_tbl);

  1836       }

  1837       scriptspan.text_bytes = newlen;

  1838     }

  1840     // Scoring depends on scriptspan buffer ALWAYS having

  1841     // leading space and off-the-end space space space NUL,

  1842     // DCHECK(scriptspan.text[0] == ' ');

  1843     // DCHECK(scriptspan.text[scriptspan.text_bytes + 0] == ' ');

  1844     // DCHECK(scriptspan.text[scriptspan.text_bytes + 1] == ' ');

  1845     // DCHECK(scriptspan.text[scriptspan.text_bytes + 2] == ' ');

  1846     // DCHECK(scriptspan.text[scriptspan.text_bytes + 3] == '\0');

  1848     // The real scoring

  1849     // Accumulate directly into the document total, or accmulate in one of four

  1850     // chunk totals. The purpose of the multiple chunk totals is to piece

  1851     // together short choppy pieces of text in alternating scripts. One total is

  1852     // dedicated to Latin text, one to Han text, and the other two are dynamicly

  1853     // assigned.

  1855     scoringcontext.ulscript = scriptspan.ulscript;

  1856     // FLAGS_cld2_html = scoringcontext.flags_cld2_html;

  1858     ScoreOneScriptSpan(scriptspan,

  1859                        &scoringcontext,

  1860                        &doc_tote,

  1861                        resultchunkvector);

  1863     total_text_bytes += scriptspan.text_bytes;

  1864   }     // End while (ss.GetOneScriptSpanLower())

  1866   // Deallocate full-document prediction table

  1867   delete[] predict_tbl;

  1869   if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {

  1870     // If no forced <cr>, put one in front of dump

  1871     if (!scoringcontext.flags_cld2_cr) {fprintf(stderr, "<br>\n");}

  1872     doc_tote.Dump(stderr);

  1873   }

  1876   // If extended langauges are disallowed, remove them here

  1877   if (!allow_extended_lang) {

  1878     RemoveExtendedLanguages(&doc_tote);

  1879   }

  1881   // Force close pairs to one or the other

  1882   // If given, also update resultchunkvector

  1883   RefineScoredClosePairs(&doc_tote, resultchunkvector,

  1884                          FLAGS_cld2_html, FLAGS_cld2_quiet);

  1887   // Calculate return results

  1888   // Find top three byte counts in tote heap

  1889   int reliable_percent3[3];

  1891   // Cannot use Add, etc. after sorting

  1892   doc_tote.Sort(3);

  1894   ExtractLangEtc(&doc_tote, total_text_bytes,

  1895                  reliable_percent3, language3, percent3, normalized_score3,

  1896                  text_bytes, is_reliable);

  1898   bool have_good_answer = false;

  1899   if (FlagFinish(flags)) {

  1900     // Force a result

  1901     have_good_answer = true;

  1902   } else if (total_text_bytes <= kShortTextThresh) {

  1903     // Don't recurse on short text -- we already did word scores

  1904     have_good_answer = true;

  1905   } else if (*is_reliable &&

  1906              (percent3[0] >= kGoodLang1Percent)) {

  1907     have_good_answer = true;

  1908   } else if (*is_reliable &&

  1909              ((percent3[0] + percent3[1]) >= kGoodLang1and2Percent)) {

  1910     have_good_answer = true;

  1911   }

  1914   if (have_good_answer) {

  1915     // This is the real, non-recursive return

  1917     // Move bytes for unreliable langs to another lang or UNKNOWN

  1918     RemoveUnreliableLanguages(&doc_tote, FLAGS_cld2_html, FLAGS_cld2_quiet);

  1920     // Redo the result extraction after the removal above

  1921     doc_tote.Sort(3);

  1922     ExtractLangEtc(&doc_tote, total_text_bytes,

  1923                    reliable_percent3, language3, percent3, normalized_score3,

  1924                    text_bytes, is_reliable);

  1928     Language summary_lang;

  1929     CalcSummaryLang(&doc_tote, total_text_bytes,

  1930                     reliable_percent3, language3, percent3,

  1931                     &summary_lang, is_reliable,

  1932                     FLAGS_cld2_html, FLAGS_cld2_quiet);

  1934     if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {

  1935       for (int i = 0; i < 3; ++i) {

  1936         if (language3[i] != UNKNOWN_LANGUAGE) {

  1937           fprintf(stderr, "%s.%dR(%d%%) ",

  1938                   LanguageCode(language3[i]),

  1939                   reliable_percent3[i],

  1940                   percent3[i]);

  1941         }

  1942       }

  1944       fprintf(stderr, "%d bytes ", total_text_bytes);

  1945       fprintf(stderr, "= %s%c ",

  1946               LanguageName(summary_lang), *is_reliable ? ' ' : '*');

  1947       fprintf(stderr, "<br><br>\n");

  1948     }

  1950     // Slightly condensed if quiet

  1951     if (FLAGS_cld2_html && FLAGS_cld2_quiet) {

  1952       fprintf(stderr, "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; ");

  1953       for (int i = 0; i < 3; ++i) {

  1954         if (language3[i] != UNKNOWN_LANGUAGE) {

  1955           fprintf(stderr, "&nbsp;&nbsp;%s %d%% ",

  1956                   LanguageCode(language3[i]),

  1957                   percent3[i]);

  1958         }

  1959       }

  1960       fprintf(stderr, "= %s%c ",

  1961               LanguageName(summary_lang), *is_reliable ? ' ' : '*');

  1962       fprintf(stderr, "<br>\n");

  1963     }

  1965     return summary_lang;

  1966   }

  1968   // Not a good answer -- do recursive call to refine

  1969   if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) {

  1970     // This is what we hope to improve on in the recursive call, if any

  1971     PrintLangs(stderr, language3, percent3, text_bytes, is_reliable);

  1972   }

  1974   // For restriction to Top40 + one, the one is 1st/2nd lang that is not Top40

  1975   // For this purpose, we treate "Ignore" as top40

  1976   Language new_plus_one = UNKNOWN_LANGUAGE;

  1978   if (total_text_bytes < kShortTextThresh) {

  1979       // Short text: Recursive call with top40 and short set

  1980       if (FLAGS_cld2_html || FLAGS_dbgscore) {

  1981         fprintf(stderr, "&nbsp;&nbsp;---text_bytes[%d] "

  1982                 "Recursive(Top40/Rep/Short/Words)---<br><br>\n",

  1983                 total_text_bytes);

  1984       }

  1985       return DetectLanguageSummaryV2(

  1986                         buffer,

  1987                         buffer_length,

  1988                         is_plain_text,

  1989                         cld_hints,

  1990                         allow_extended_lang,

  1991                         flags | kCLDFlagTop40 | kCLDFlagRepeats |

  1992                           kCLDFlagShort | kCLDFlagUseWords | kCLDFlagFinish,

  1993                         new_plus_one,

  1994                         language3,

  1995                         percent3,

  1996                         normalized_score3,

  1997                         resultchunkvector,

  1998                         text_bytes,

  1999                         is_reliable);

  2000   }

  2002   // Longer text: Recursive call with top40 set

  2003   if (FLAGS_cld2_html || FLAGS_dbgscore) {

  2004     fprintf(stderr,

  2005             "&nbsp;&nbsp;---text_bytes[%d] Recursive(Top40/Rep)---<br><br>\n",

  2006             total_text_bytes);

  2007   }

  2008   return DetectLanguageSummaryV2(

  2009                         buffer,

  2010                         buffer_length,

  2011                         is_plain_text,

  2012                         cld_hints,

  2013                         allow_extended_lang,

  2014                         flags | kCLDFlagTop40 | kCLDFlagRepeats |

  2015                           kCLDFlagFinish,

  2016                         new_plus_one,

  2017                         language3,

  2018                         percent3,

  2019                         normalized_score3,

  2020                         resultchunkvector,

  2021                         text_bytes,

  2022                         is_reliable);

  2023 }

  2026 // For debugging and wrappers. Not thread safe.

  2027 static char temp_detectlanguageversion[32];

  2029 // Return version text string

  2030 // String is "code_version - data_build_date"

  2031 const char* DetectLanguageVersion() {

  2032   if (kScoringtables.quadgram_obj == NULL) {return "";}

  2033   sprintf(temp_detectlanguageversion,

  2034           "V2.0 - %u", kScoringtables.quadgram_obj->kCLDTableBuildDate);

  2035   return temp_detectlanguageversion;

  2036 }

  2039 }       // End namespace CLD2

The Tor Browser / file revision

browser/components/translation/cld2/internal/compact_lang_det_impl.cc@6474c204b198

browser/components/translation/cld2/internal/compact_lang_det_impl.cc