michael@0: // Copyright 2013 Google Inc. All Rights Reserved. michael@0: // michael@0: // Licensed under the Apache License, Version 2.0 (the "License"); michael@0: // you may not use this file except in compliance with the License. michael@0: // You may obtain a copy of the License at michael@0: // michael@0: // http://www.apache.org/licenses/LICENSE-2.0 michael@0: // michael@0: // Unless required by applicable law or agreed to in writing, software michael@0: // distributed under the License is distributed on an "AS IS" BASIS, michael@0: // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. michael@0: // See the License for the specific language governing permissions and michael@0: // limitations under the License. michael@0: michael@0: // michael@0: // Author: dsites@google.com (Dick Sites) michael@0: // Updated 2014.01 for dual table lookup michael@0: // michael@0: michael@0: #include michael@0: #include michael@0: #include michael@0: #include michael@0: michael@0: #include "cldutil.h" michael@0: #include "debug.h" michael@0: #include "integral_types.h" michael@0: #include "lang_script.h" michael@0: #include "utf8statetable.h" michael@0: michael@0: #ifdef CLD2_DYNAMIC_MODE michael@0: #include "cld2_dynamic_data.h" michael@0: #include "cld2_dynamic_data_loader.h" michael@0: #endif michael@0: #include "cld2tablesummary.h" michael@0: #include "compact_lang_det_impl.h" michael@0: #include "compact_lang_det_hint_code.h" michael@0: #include "getonescriptspan.h" michael@0: #include "tote.h" michael@0: michael@0: michael@0: namespace CLD2 { michael@0: michael@0: using namespace std; michael@0: michael@0: // Linker supplies the right tables, From files michael@0: // cld_generated_cjk_uni_prop_80.cc cld2_generated_cjk_compatible.cc michael@0: // cld_generated_cjk_delta_bi_32.cc generated_distinct_bi_0.cc michael@0: // cld2_generated_quad*.cc cld2_generated_deltaocta*.cc michael@0: // cld2_generated_distinctocta*.cc michael@0: // cld_generated_score_quad_octa_1024_256.cc michael@0: michael@0: // 2014.01 Now implementing quadgram dual lookup tables, to allow main table michael@0: // sizes that are 1/3/5 times a power of two, instead of just powers of two. michael@0: // Gives more flexibility of total footprint for CLD2. michael@0: michael@0: extern const int kLanguageToPLangSize; michael@0: extern const int kCloseSetSize; michael@0: michael@0: extern const UTF8PropObj cld_generated_CjkUni_obj; michael@0: extern const CLD2TableSummary kCjkCompat_obj; michael@0: extern const CLD2TableSummary kCjkDeltaBi_obj; michael@0: extern const CLD2TableSummary kDistinctBiTable_obj; michael@0: extern const CLD2TableSummary kQuad_obj; michael@0: extern const CLD2TableSummary kQuad_obj2; // Dual lookup tables michael@0: extern const CLD2TableSummary kDeltaOcta_obj; michael@0: extern const CLD2TableSummary kDistinctOcta_obj; michael@0: extern const short kAvgDeltaOctaScore[]; michael@0: michael@0: #ifdef CLD2_DYNAMIC_MODE michael@0: // CLD2_DYNAMIC_MODE is defined: michael@0: // Data will be read from an mmap opened at runtime. michael@0: static ScoringTables kScoringtables = { michael@0: NULL, //&cld_generated_CjkUni_obj, michael@0: NULL, //&kCjkCompat_obj, michael@0: NULL, //&kCjkDeltaBi_obj, michael@0: NULL, //&kDistinctBiTable_obj, michael@0: NULL, //&kQuad_obj, michael@0: NULL, //&kQuad_obj2, michael@0: NULL, //&kDeltaOcta_obj, michael@0: NULL, //&kDistinctOcta_obj, michael@0: NULL, //kAvgDeltaOctaScore, michael@0: }; michael@0: static bool dynamicDataLoaded = false; michael@0: static ScoringTables* dynamicTables = NULL; michael@0: static void* mmapAddress = NULL; michael@0: static int mmapLength = 0; michael@0: michael@0: bool isDataLoaded() { return dynamicDataLoaded; } michael@0: michael@0: void loadData(const char* fileName) { michael@0: if (isDataLoaded()) { michael@0: unloadData(); michael@0: } michael@0: dynamicTables = CLD2DynamicDataLoader::loadDataFile(fileName, &mmapAddress, &mmapLength); michael@0: kScoringtables = *dynamicTables; michael@0: dynamicDataLoaded = true; michael@0: }; michael@0: michael@0: void unloadData() { michael@0: if (!dynamicDataLoaded) return; michael@0: dynamicDataLoaded = false; michael@0: // unloading will null all the pointers out. michael@0: CLD2DynamicDataLoader::unloadData(&dynamicTables, &mmapAddress, &mmapLength); michael@0: } michael@0: #else michael@0: // This initializes kScoringtables.quadgram_obj etc. michael@0: static const ScoringTables kScoringtables = { michael@0: &cld_generated_CjkUni_obj, michael@0: &kCjkCompat_obj, michael@0: &kCjkDeltaBi_obj, michael@0: &kDistinctBiTable_obj, michael@0: michael@0: &kQuad_obj, michael@0: &kQuad_obj2, // Dual lookup tables michael@0: &kDeltaOcta_obj, michael@0: &kDistinctOcta_obj, michael@0: michael@0: kAvgDeltaOctaScore, michael@0: }; michael@0: #endif // #ifdef CLD2_DYNAMIC_MODE michael@0: michael@0: michael@0: static const bool FLAGS_cld_no_minimum_bytes = false; michael@0: static const bool FLAGS_cld_forcewords = true; michael@0: static const bool FLAGS_cld_showme = false; michael@0: static const bool FLAGS_cld_echotext = true; michael@0: static const int32 FLAGS_cld_textlimit = 160; michael@0: static const int32 FLAGS_cld_smoothwidth = 20; michael@0: static const bool FLAGS_cld_2011_hints = true; michael@0: static const int32 FLAGS_cld_max_lang_tag_scan_kb = 8; michael@0: michael@0: static const bool FLAGS_dbgscore = false; michael@0: michael@0: michael@0: static const int kLangHintInitial = 12; // Boost language by N initially michael@0: static const int kLangHintBoost = 12; // Boost language by N/16 per quadgram michael@0: michael@0: static const int kShortSpanThresh = 32; // Bytes michael@0: static const int kMaxSecondChanceLen = 1024; // Look at first 1K of short spans michael@0: michael@0: static const int kCheapSqueezeTestThresh = 4096; // Only look for squeezing michael@0: // after this many text bytes michael@0: static const int kCheapSqueezeTestLen = 256; // Bytes to test to trigger sqz michael@0: static const int kSpacesTriggerPercent = 25; // Trigger sqz if >=25% spaces michael@0: static const int kPredictTriggerPercent = 67; // Trigger sqz if >=67% predicted michael@0: michael@0: static const int kChunksizeDefault = 48; // Squeeze 48-byte chunks michael@0: static const int kSpacesThreshPercent = 25; // Squeeze if >=25% spaces michael@0: static const int kPredictThreshPercent = 40; // Squeeze if >=40% predicted michael@0: michael@0: static const int kMaxSpaceScan = 32; // Bytes michael@0: michael@0: static const int kGoodLang1Percent = 70; michael@0: static const int kGoodLang1and2Percent = 93; michael@0: static const int kShortTextThresh = 256; // Bytes michael@0: michael@0: static const int kMinChunkSizeQuads = 4; // Chunk is at least four quads michael@0: static const int kMaxChunkSizeQuads = 1024; // Chunk is at most 1K quads michael@0: michael@0: static const int kDefaultWordSpan = 256; // Scan at least this many initial michael@0: // bytes with word scoring michael@0: static const int kReallyBigWordSpan = 9999999; // Forces word scoring all text michael@0: michael@0: static const int kMinReliableSeq = 50; // Record in seq if >= 50% reliable michael@0: michael@0: static const int kPredictionTableSize = 4096; // Must be exactly 4096 for michael@0: // cheap compressor michael@0: michael@0: static const int kNonEnBoilerplateMinPercent = 17; // no second michael@0: static const int kNonFIGSBoilerplateMinPercent = 20; // no second michael@0: static const int kGoodFirstMinPercent = 26; // UNK michael@0: static const int kGoodFirstReliableMinPercent = 51; // unreli michael@0: static const int kIgnoreMaxPercent = 20; // >this => unreli michael@0: static const int kKeepMinPercent = 2; // unreli michael@0: michael@0: michael@0: michael@0: // Statistically closest language, based on quadgram table michael@0: // Those that are far from other languges map to UNKNOWN_LANGUAGE michael@0: // Subscripted by Language michael@0: // michael@0: // From lang_correlation.txt and hand-edits michael@0: // sed 's/^$[^ ]*$ $[^ ]*$ coef=0\.$..$.*$/ michael@0: // (\3 >= kMinCorrPercent) ? \2 : UNKNOWN_LANGUAGE, michael@0: // \/\/ \1/' lang_correlation.txt >/tmp/closest_lang_decl.txt michael@0: // michael@0: static const int kMinCorrPercent = 24; // Pick off how close you want michael@0: // 24 catches PERSIAN <== ARABIC michael@0: // but not SPANISH <== PORTUGESE michael@0: static Language Unknown = UNKNOWN_LANGUAGE; michael@0: michael@0: // Suspect idea michael@0: // Subscripted by Language michael@0: static const Language kClosestAltLanguage[] = { michael@0: (28 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE, // ENGLISH michael@0: (36 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // DANISH michael@0: (31 >= kMinCorrPercent) ? AFRIKAANS : UNKNOWN_LANGUAGE, // DUTCH michael@0: (15 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // FINNISH michael@0: (11 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // FRENCH michael@0: (17 >= kMinCorrPercent) ? LUXEMBOURGISH : UNKNOWN_LANGUAGE, // GERMAN michael@0: (27 >= kMinCorrPercent) ? YIDDISH : UNKNOWN_LANGUAGE, // HEBREW michael@0: (16 >= kMinCorrPercent) ? CORSICAN : UNKNOWN_LANGUAGE, // ITALIAN michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Japanese michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Korean michael@0: (41 >= kMinCorrPercent) ? NORWEGIAN_N : UNKNOWN_LANGUAGE, // NORWEGIAN michael@0: ( 5 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // POLISH michael@0: (23 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // PORTUGUESE michael@0: (33 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // RUSSIAN michael@0: (28 >= kMinCorrPercent) ? GALICIAN : UNKNOWN_LANGUAGE, // SPANISH michael@0: (17 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // SWEDISH michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Chinese michael@0: (42 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // CZECH michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GREEK michael@0: (35 >= kMinCorrPercent) ? FAROESE : UNKNOWN_LANGUAGE, // ICELANDIC michael@0: ( 7 >= kMinCorrPercent) ? LITHUANIAN : UNKNOWN_LANGUAGE, // LATVIAN michael@0: ( 7 >= kMinCorrPercent) ? LATVIAN : UNKNOWN_LANGUAGE, // LITHUANIAN michael@0: ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ROMANIAN michael@0: ( 4 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // HUNGARIAN michael@0: (15 >= kMinCorrPercent) ? FINNISH : UNKNOWN_LANGUAGE, // ESTONIAN michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Ignore michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Unknown michael@0: (33 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // BULGARIAN michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CROATIAN michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SERBIAN michael@0: (24 >= kMinCorrPercent) ? SCOTS_GAELIC : UNKNOWN_LANGUAGE, // IRISH michael@0: (28 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GALICIAN michael@0: ( 8 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // TAGALOG michael@0: (29 >= kMinCorrPercent) ? AZERBAIJANI : UNKNOWN_LANGUAGE, // TURKISH michael@0: (28 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // UKRAINIAN michael@0: (37 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // HINDI michael@0: (29 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // MACEDONIAN michael@0: (14 >= kMinCorrPercent) ? ASSAMESE : UNKNOWN_LANGUAGE, // BENGALI michael@0: (46 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE, // INDONESIAN michael@0: ( 9 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // LATIN michael@0: (46 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // MALAY michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MALAYALAM michael@0: ( 4 >= kMinCorrPercent) ? BRETON : UNKNOWN_LANGUAGE, // WELSH michael@0: ( 8 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // NEPALI michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // TELUGU michael@0: ( 3 >= kMinCorrPercent) ? ESPERANTO : UNKNOWN_LANGUAGE, // ALBANIAN michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // TAMIL michael@0: (22 >= kMinCorrPercent) ? UKRAINIAN : UNKNOWN_LANGUAGE, // BELARUSIAN michael@0: (15 >= kMinCorrPercent) ? SUNDANESE : UNKNOWN_LANGUAGE, // JAVANESE michael@0: (19 >= kMinCorrPercent) ? CATALAN : UNKNOWN_LANGUAGE, // OCCITAN michael@0: (27 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // URDU michael@0: (36 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // BIHARI michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GUJARATI michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // THAI michael@0: (24 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // ARABIC michael@0: (19 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // CATALAN michael@0: ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ESPERANTO michael@0: ( 3 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // BASQUE michael@0: ( 9 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // INTERLINGUA michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KANNADA michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PUNJABI michael@0: (24 >= kMinCorrPercent) ? IRISH : UNKNOWN_LANGUAGE, // SCOTS_GAELIC michael@0: ( 7 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SWAHILI michael@0: (28 >= kMinCorrPercent) ? SERBIAN : UNKNOWN_LANGUAGE, // SLOVENIAN michael@0: (37 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // MARATHI michael@0: ( 3 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // MALTESE michael@0: ( 1 >= kMinCorrPercent) ? YORUBA : UNKNOWN_LANGUAGE, // VIETNAMESE michael@0: (15 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // FRISIAN michael@0: (42 >= kMinCorrPercent) ? CZECH : UNKNOWN_LANGUAGE, // SLOVAK michael@0: // Original ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ChineseT michael@0: (24 >= kMinCorrPercent) ? CHINESE : UNKNOWN_LANGUAGE, // ChineseT michael@0: (35 >= kMinCorrPercent) ? ICELANDIC : UNKNOWN_LANGUAGE, // FAROESE michael@0: (15 >= kMinCorrPercent) ? JAVANESE : UNKNOWN_LANGUAGE, // SUNDANESE michael@0: (17 >= kMinCorrPercent) ? TAJIK : UNKNOWN_LANGUAGE, // UZBEK michael@0: ( 7 >= kMinCorrPercent) ? TIGRINYA : UNKNOWN_LANGUAGE, // AMHARIC michael@0: (29 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // AZERBAIJANI michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GEORGIAN michael@0: ( 7 >= kMinCorrPercent) ? AMHARIC : UNKNOWN_LANGUAGE, // TIGRINYA michael@0: (27 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // PERSIAN michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // BOSNIAN michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SINHALESE michael@0: (41 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // NORWEGIAN_N michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_P michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_B michael@0: (37 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // XHOSA michael@0: (37 >= kMinCorrPercent) ? XHOSA : UNKNOWN_LANGUAGE, // ZULU michael@0: ( 2 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GUARANI michael@0: (29 >= kMinCorrPercent) ? TSWANA : UNKNOWN_LANGUAGE, // SESOTHO michael@0: ( 7 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // TURKMEN michael@0: ( 8 >= kMinCorrPercent) ? KAZAKH : UNKNOWN_LANGUAGE, // KYRGYZ michael@0: ( 5 >= kMinCorrPercent) ? FRENCH : UNKNOWN_LANGUAGE, // BRETON michael@0: ( 3 >= kMinCorrPercent) ? GANDA : UNKNOWN_LANGUAGE, // TWI michael@0: (27 >= kMinCorrPercent) ? HEBREW : UNKNOWN_LANGUAGE, // YIDDISH michael@0: (28 >= kMinCorrPercent) ? SLOVENIAN : UNKNOWN_LANGUAGE, // SERBO_CROATIAN michael@0: (12 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE, // SOMALI michael@0: ( 9 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // UIGHUR michael@0: (15 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // KURDISH michael@0: ( 6 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // MONGOLIAN michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ARMENIAN michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // LAOTHIAN michael@0: ( 8 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // SINDHI michael@0: (10 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // RHAETO_ROMANCE michael@0: (31 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // AFRIKAANS michael@0: (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // LUXEMBOURGISH michael@0: ( 2 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE, // BURMESE michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KHMER michael@0: (45 >= kMinCorrPercent) ? DZONGKHA : UNKNOWN_LANGUAGE, // TIBETAN michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // DHIVEHI michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CHEROKEE michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SYRIAC michael@0: ( 8 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // LIMBU michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ORIYA michael@0: (14 >= kMinCorrPercent) ? BENGALI : UNKNOWN_LANGUAGE, // ASSAMESE michael@0: (16 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // CORSICAN michael@0: ( 5 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // INTERLINGUE michael@0: ( 8 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // KAZAKH michael@0: ( 4 >= kMinCorrPercent) ? SWAHILI : UNKNOWN_LANGUAGE, // LINGALA michael@0: (11 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // MOLDAVIAN michael@0: (19 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // PASHTO michael@0: ( 5 >= kMinCorrPercent) ? AYMARA : UNKNOWN_LANGUAGE, // QUECHUA michael@0: ( 5 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SHONA michael@0: (17 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // TAJIK michael@0: (13 >= kMinCorrPercent) ? BASHKIR : UNKNOWN_LANGUAGE, // TATAR michael@0: (11 >= kMinCorrPercent) ? SAMOAN : UNKNOWN_LANGUAGE, // TONGA michael@0: ( 2 >= kMinCorrPercent) ? TWI : UNKNOWN_LANGUAGE, // YORUBA michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_ENGLISH_BASED michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_FRENCH_BASED michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_PORTUGUESE_BASED michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_OTHER michael@0: ( 6 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // MAORI michael@0: ( 3 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE, // WOLOF michael@0: ( 1 >= kMinCorrPercent) ? MONGOLIAN : UNKNOWN_LANGUAGE, // ABKHAZIAN michael@0: ( 8 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // AFAR michael@0: ( 5 >= kMinCorrPercent) ? QUECHUA : UNKNOWN_LANGUAGE, // AYMARA michael@0: (13 >= kMinCorrPercent) ? TATAR : UNKNOWN_LANGUAGE, // BASHKIR michael@0: ( 3 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // BISLAMA michael@0: (45 >= kMinCorrPercent) ? TIBETAN : UNKNOWN_LANGUAGE, // DZONGKHA michael@0: ( 4 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // FIJIAN michael@0: ( 7 >= kMinCorrPercent) ? INUPIAK : UNKNOWN_LANGUAGE, // GREENLANDIC michael@0: ( 3 >= kMinCorrPercent) ? AFAR : UNKNOWN_LANGUAGE, // HAUSA michael@0: ( 3 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // HAITIAN_CREOLE michael@0: ( 7 >= kMinCorrPercent) ? GREENLANDIC : UNKNOWN_LANGUAGE, // INUPIAK michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // INUKTITUT michael@0: ( 4 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // KASHMIRI michael@0: (30 >= kMinCorrPercent) ? RUNDI : UNKNOWN_LANGUAGE, // KINYARWANDA michael@0: ( 2 >= kMinCorrPercent) ? TAGALOG : UNKNOWN_LANGUAGE, // MALAGASY michael@0: (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // NAURU michael@0: (12 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // OROMO michael@0: (30 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // RUNDI michael@0: (11 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // SAMOAN michael@0: ( 1 >= kMinCorrPercent) ? LINGALA : UNKNOWN_LANGUAGE, // SANGO michael@0: (32 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // SANSKRIT michael@0: (16 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // SISWANT michael@0: ( 5 >= kMinCorrPercent) ? SISWANT : UNKNOWN_LANGUAGE, // TSONGA michael@0: (29 >= kMinCorrPercent) ? SESOTHO : UNKNOWN_LANGUAGE, // TSWANA michael@0: ( 2 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // VOLAPUK michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ZHUANG michael@0: ( 1 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE, // KHASI michael@0: (28 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // SCOTS michael@0: (15 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // GANDA michael@0: ( 7 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // MANX michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MONTENEGRIN michael@0: michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // AKAN michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // IGBO michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MAURITIAN_CREOLE michael@0: ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // HAWAIIAN michael@0: }; michael@0: michael@0: // COMPILE_ASSERT(arraysize(kClosestAltLanguage) == NUM_LANGUAGES, michael@0: // kClosestAltLanguage_has_incorrect_size); michael@0: michael@0: michael@0: inline bool FlagFinish(int flags) {return (flags & kCLDFlagFinish) != 0;} michael@0: inline bool FlagSqueeze(int flags) {return (flags & kCLDFlagSqueeze) != 0;} michael@0: inline bool FlagRepeats(int flags) {return (flags & kCLDFlagRepeats) != 0;} michael@0: inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;} michael@0: inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;} michael@0: inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;} michael@0: inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;} michael@0: michael@0: michael@0: // Defines Top40 packed languages michael@0: michael@0: // Google top 40 languages michael@0: // michael@0: // Tier 0/1 Language enum list (16) michael@0: // ENGLISH, /*no en_GB,*/ FRENCH, ITALIAN, GERMAN, SPANISH, // E - FIGS michael@0: // DUTCH, CHINESE, CHINESE_T, JAPANESE, KOREAN, michael@0: // PORTUGUESE, RUSSIAN, POLISH, TURKISH, THAI, michael@0: // ARABIC, michael@0: // michael@0: // Tier 2 Language enum list (22) michael@0: // SWEDISH, FINNISH, DANISH, /*no pt-PT,*/ ROMANIAN, HUNGARIAN, michael@0: // HEBREW, INDONESIAN, CZECH, GREEK, NORWEGIAN, michael@0: // VIETNAMESE, BULGARIAN, CROATIAN, LITHUANIAN, SLOVAK, michael@0: // TAGALOG, SLOVENIAN, SERBIAN, CATALAN, LATVIAN, michael@0: // UKRAINIAN, HINDI, michael@0: // michael@0: // use SERBO_CROATIAN instead of BOSNIAN, SERBIAN, CROATIAN, MONTENEGRIN(21) michael@0: // michael@0: // Include IgnoreMe (TG_UNKNOWN_LANGUAGE, 25+1) as a top 40 michael@0: michael@0: michael@0: void DemoteNotTop40(Tote* chunk_tote, uint16 psplus_one) { michael@0: // REVISIT michael@0: } michael@0: michael@0: void PrintText(FILE* f, Language cur_lang, const string& temp) { michael@0: if (temp.size() == 0) {return;} michael@0: fprintf(f, "PrintText[%s]%s
\n", LanguageName(cur_lang), temp.c_str()); michael@0: } michael@0: michael@0: michael@0: //------------------------------------------------------------------------------ michael@0: // For --cld_html debugging output. Not thread safe michael@0: //------------------------------------------------------------------------------ michael@0: static Language prior_lang = UNKNOWN_LANGUAGE; michael@0: static bool prior_unreliable = false; michael@0: michael@0: //------------------------------------------------------------------------------ michael@0: // End For --cld_html debugging output michael@0: //------------------------------------------------------------------------------ michael@0: michael@0: michael@0: // Backscan to word boundary, returning how many bytes n to go back michael@0: // so that src - n is non-space ans src - n - 1 is space. michael@0: // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary michael@0: int BackscanToSpace(const char* src, int limit) { michael@0: int n = 0; michael@0: limit = minint(limit, kMaxSpaceScan); michael@0: while (n < limit) { michael@0: if (src[-n - 1] == ' ') {return n;} // We are at _X michael@0: ++n; michael@0: } michael@0: n = 0; michael@0: while (n < limit) { michael@0: if ((src[-n] & 0xc0) != 0x80) {return n;} // We are at char begin michael@0: ++n; michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: // Forwardscan to word boundary, returning how many bytes n to go forward michael@0: // so that src + n is non-space ans src + n - 1 is space. michael@0: // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary michael@0: int ForwardscanToSpace(const char* src, int limit) { michael@0: int n = 0; michael@0: limit = minint(limit, kMaxSpaceScan); michael@0: while (n < limit) { michael@0: if (src[n] == ' ') {return n + 1;} // We are at _X michael@0: ++n; michael@0: } michael@0: n = 0; michael@0: while (n < limit) { michael@0: if ((src[n] & 0xc0) != 0x80) {return n;} // We are at char begin michael@0: ++n; michael@0: } michael@0: return 0; michael@0: } michael@0: michael@0: michael@0: // This uses a cheap predictor to get a measure of compression, and michael@0: // hence a measure of repetitiveness. It works on complete UTF-8 characters michael@0: // instead of bytes, because three-byte UTF-8 Indic, etc. text compress highly michael@0: // all the time when done with a byte-based count. Sigh. michael@0: // michael@0: // To allow running prediction across multiple chunks, caller passes in current michael@0: // 12-bit hash value and int[4096] prediction table. Caller inits these to 0. michael@0: // michael@0: // Returns the number of *bytes* correctly predicted, increments by 1..4 for michael@0: // each correctly-predicted character. michael@0: // michael@0: // NOTE: Overruns by up to three bytes. Not a problem with valid UTF-8 text michael@0: // michael@0: michael@0: // TODO(dsites) make this use just one byte per UTF-8 char and incr by charlen michael@0: michael@0: int CountPredictedBytes(const char* isrc, int src_len, int* hash, int* tbl) { michael@0: int p_count = 0; michael@0: const uint8* src = reinterpret_cast(isrc); michael@0: const uint8* srclimit = src + src_len; michael@0: int local_hash = *hash; michael@0: michael@0: while (src < srclimit) { michael@0: int c = src[0]; michael@0: int incr = 1; michael@0: michael@0: // Pick up one char and length michael@0: if (c < 0xc0) { michael@0: // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx michael@0: // Do nothing more michael@0: } else if ((c & 0xe0) == 0xc0) { michael@0: // Two-byte michael@0: c = (c << 8) | src[1]; michael@0: incr = 2; michael@0: } else if ((c & 0xf0) == 0xe0) { michael@0: // Three-byte michael@0: c = (c << 16) | (src[1] << 8) | src[2]; michael@0: incr = 3; michael@0: } else { michael@0: // Four-byte michael@0: c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3]; michael@0: incr = 4; michael@0: } michael@0: src += incr; michael@0: michael@0: int p = tbl[local_hash]; // Prediction michael@0: tbl[local_hash] = c; // Update prediction michael@0: if (c == p) { michael@0: p_count += incr; // Count bytes of good predictions michael@0: } michael@0: michael@0: local_hash = ((local_hash << 4) ^ c) & 0xfff; michael@0: } michael@0: *hash = local_hash; michael@0: return p_count; michael@0: } michael@0: michael@0: michael@0: michael@0: // Counts number of spaces; a little faster than one-at-a-time michael@0: // Doesn't count odd bytes at end michael@0: int CountSpaces4(const char* src, int src_len) { michael@0: int s_count = 0; michael@0: for (int i = 0; i < (src_len & ~3); i += 4) { michael@0: s_count += (src[i] == ' '); michael@0: s_count += (src[i+1] == ' '); michael@0: s_count += (src[i+2] == ' '); michael@0: s_count += (src[i+3] == ' '); michael@0: } michael@0: return s_count; michael@0: } michael@0: michael@0: michael@0: // Remove words of text that have more than half their letters predicted michael@0: // correctly by our cheap predictor, moving the remaining words in-place michael@0: // to the front of the input buffer. michael@0: // michael@0: // To allow running prediction across multiple chunks, caller passes in current michael@0: // 12-bit hash value and int[4096] prediction table. Caller inits these to 0. michael@0: // michael@0: // Return the new, possibly-shorter length michael@0: // michael@0: // Result Buffer ALWAYS has leading space and trailing space space space NUL, michael@0: // if input does michael@0: // michael@0: int CheapRepWordsInplace(char* isrc, int src_len, int* hash, int* tbl) { michael@0: const uint8* src = reinterpret_cast(isrc); michael@0: const uint8* srclimit = src + src_len; michael@0: char* dst = isrc; michael@0: int local_hash = *hash; michael@0: char* word_dst = dst; // Start of next word michael@0: int good_predict_bytes = 0; michael@0: int word_length_bytes = 0; michael@0: michael@0: while (src < srclimit) { michael@0: int c = src[0]; michael@0: int incr = 1; michael@0: *dst++ = c; michael@0: michael@0: if (c == ' ') { michael@0: if ((good_predict_bytes * 2) > word_length_bytes) { michael@0: // Word is well-predicted: backup to start of this word michael@0: dst = word_dst; michael@0: if (FLAGS_cld_showme) { michael@0: // Mark the deletion point with period michael@0: // Don't repeat multiple periods michael@0: // Cannot mark with more bytes or may overwrite unseen input michael@0: if ((isrc < (dst - 2)) && (dst[-2] != '.')) { michael@0: *dst++ = '.'; michael@0: *dst++ = ' '; michael@0: } michael@0: } michael@0: } michael@0: word_dst = dst; // Start of next word michael@0: good_predict_bytes = 0; michael@0: word_length_bytes = 0; michael@0: } michael@0: michael@0: // Pick up one char and length michael@0: if (c < 0xc0) { michael@0: // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx michael@0: // Do nothing more michael@0: } else if ((c & 0xe0) == 0xc0) { michael@0: // Two-byte michael@0: *dst++ = src[1]; michael@0: c = (c << 8) | src[1]; michael@0: incr = 2; michael@0: } else if ((c & 0xf0) == 0xe0) { michael@0: // Three-byte michael@0: *dst++ = src[1]; michael@0: *dst++ = src[2]; michael@0: c = (c << 16) | (src[1] << 8) | src[2]; michael@0: incr = 3; michael@0: } else { michael@0: // Four-byte michael@0: *dst++ = src[1]; michael@0: *dst++ = src[2]; michael@0: *dst++ = src[3]; michael@0: c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3]; michael@0: incr = 4; michael@0: } michael@0: src += incr; michael@0: word_length_bytes += incr; michael@0: michael@0: int p = tbl[local_hash]; // Prediction michael@0: tbl[local_hash] = c; // Update prediction michael@0: if (c == p) { michael@0: good_predict_bytes += incr; // Count good predictions michael@0: } michael@0: michael@0: local_hash = ((local_hash << 4) ^ c) & 0xfff; michael@0: } michael@0: michael@0: *hash = local_hash; michael@0: michael@0: if ((dst - isrc) < (src_len - 3)) { michael@0: // Pad and make last char clean UTF-8 by putting following spaces michael@0: dst[0] = ' '; michael@0: dst[1] = ' '; michael@0: dst[2] = ' '; michael@0: dst[3] = '\0'; michael@0: } else if ((dst - isrc) < src_len) { michael@0: // Make last char clean UTF-8 by putting following space off the end michael@0: dst[0] = ' '; michael@0: } michael@0: michael@0: return static_cast(dst - isrc); michael@0: } michael@0: michael@0: michael@0: // This alternate form overwrites redundant words, thus avoiding corrupting the michael@0: // backmap for generate a vector of original-text ranges. michael@0: int CheapRepWordsInplaceOverwrite(char* isrc, int src_len, int* hash, int* tbl) { michael@0: const uint8* src = reinterpret_cast(isrc); michael@0: const uint8* srclimit = src + src_len; michael@0: char* dst = isrc; michael@0: int local_hash = *hash; michael@0: char* word_dst = dst; // Start of next word michael@0: int good_predict_bytes = 0; michael@0: int word_length_bytes = 0; michael@0: michael@0: while (src < srclimit) { michael@0: int c = src[0]; michael@0: int incr = 1; michael@0: *dst++ = c; michael@0: michael@0: if (c == ' ') { michael@0: if ((good_predict_bytes * 2) > word_length_bytes) { michael@0: // Word [word_dst..dst-1) is well-predicted: overwrite michael@0: for (char* p = word_dst; p < dst - 1; ++p) {*p = '.';} michael@0: } michael@0: word_dst = dst; // Start of next word michael@0: good_predict_bytes = 0; michael@0: word_length_bytes = 0; michael@0: } michael@0: michael@0: // Pick up one char and length michael@0: if (c < 0xc0) { michael@0: // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx michael@0: // Do nothing more michael@0: } else if ((c & 0xe0) == 0xc0) { michael@0: // Two-byte michael@0: *dst++ = src[1]; michael@0: c = (c << 8) | src[1]; michael@0: incr = 2; michael@0: } else if ((c & 0xf0) == 0xe0) { michael@0: // Three-byte michael@0: *dst++ = src[1]; michael@0: *dst++ = src[2]; michael@0: c = (c << 16) | (src[1] << 8) | src[2]; michael@0: incr = 3; michael@0: } else { michael@0: // Four-byte michael@0: *dst++ = src[1]; michael@0: *dst++ = src[2]; michael@0: *dst++ = src[3]; michael@0: c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3]; michael@0: incr = 4; michael@0: } michael@0: src += incr; michael@0: word_length_bytes += incr; michael@0: michael@0: int p = tbl[local_hash]; // Prediction michael@0: tbl[local_hash] = c; // Update prediction michael@0: if (c == p) { michael@0: good_predict_bytes += incr; // Count good predictions michael@0: } michael@0: michael@0: local_hash = ((local_hash << 4) ^ c) & 0xfff; michael@0: } michael@0: michael@0: *hash = local_hash; michael@0: michael@0: if ((dst - isrc) < (src_len - 3)) { michael@0: // Pad and make last char clean UTF-8 by putting following spaces michael@0: dst[0] = ' '; michael@0: dst[1] = ' '; michael@0: dst[2] = ' '; michael@0: dst[3] = '\0'; michael@0: } else if ((dst - isrc) < src_len) { michael@0: // Make last char clean UTF-8 by putting following space off the end michael@0: dst[0] = ' '; michael@0: } michael@0: michael@0: return static_cast(dst - isrc); michael@0: } michael@0: michael@0: michael@0: // Remove portions of text that have a high density of spaces, or that are michael@0: // overly repetitive, squeezing the remaining text in-place to the front of the michael@0: // input buffer. michael@0: // michael@0: // Squeezing looks at density of space/prediced chars in fixed-size chunks, michael@0: // specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes. michael@0: // michael@0: // Return the new, possibly-shorter length michael@0: // michael@0: // Result Buffer ALWAYS has leading space and trailing space space space NUL, michael@0: // if input does michael@0: // michael@0: int CheapSqueezeInplace(char* isrc, michael@0: int src_len, michael@0: int ichunksize) { michael@0: char* src = isrc; michael@0: char* dst = src; michael@0: char* srclimit = src + src_len; michael@0: bool skipping = false; michael@0: michael@0: int hash = 0; michael@0: // Allocate local prediction table. michael@0: int* predict_tbl = new int[kPredictionTableSize]; michael@0: memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); michael@0: michael@0: int chunksize = ichunksize; michael@0: if (chunksize == 0) {chunksize = kChunksizeDefault;} michael@0: int space_thresh = (chunksize * kSpacesThreshPercent) / 100; michael@0: int predict_thresh = (chunksize * kPredictThreshPercent) / 100; michael@0: michael@0: while (src < srclimit) { michael@0: int remaining_bytes = srclimit - src; michael@0: int len = minint(chunksize, remaining_bytes); michael@0: // Make len land us on a UTF-8 character boundary. michael@0: // Ah. Also fixes mispredict because we could get out of phase michael@0: // Loop always terminates at trailing space in buffer michael@0: while ((src[len] & 0xc0) == 0x80) {++len;} // Move past continuation bytes michael@0: michael@0: int space_n = CountSpaces4(src, len); michael@0: int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl); michael@0: if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) { michael@0: // Skip the text michael@0: if (!skipping) { michael@0: // Keeping-to-skipping transition; do it at a space michael@0: int n = BackscanToSpace(dst, static_cast(dst - isrc)); michael@0: dst -= n; michael@0: if (dst == isrc) { michael@0: // Force a leading space if the first chunk is deleted michael@0: *dst++ = ' '; michael@0: } michael@0: if (FLAGS_cld_showme) { michael@0: // Mark the deletion point with black square U+25A0 michael@0: *dst++ = static_cast(0xe2); michael@0: *dst++ = static_cast(0x96); michael@0: *dst++ = static_cast(0xa0); michael@0: *dst++ = ' '; michael@0: } michael@0: skipping = true; michael@0: } michael@0: } else { michael@0: // Keep the text michael@0: if (skipping) { michael@0: // Skipping-to-keeping transition; do it at a space michael@0: int n = ForwardscanToSpace(src, len); michael@0: src += n; michael@0: remaining_bytes -= n; // Shrink remaining length michael@0: len -= n; michael@0: skipping = false; michael@0: } michael@0: // "len" can be negative in some cases michael@0: if (len > 0) { michael@0: memmove(dst, src, len); michael@0: dst += len; michael@0: } michael@0: } michael@0: src += len; michael@0: } michael@0: michael@0: if ((dst - isrc) < (src_len - 3)) { michael@0: // Pad and make last char clean UTF-8 by putting following spaces michael@0: dst[0] = ' '; michael@0: dst[1] = ' '; michael@0: dst[2] = ' '; michael@0: dst[3] = '\0'; michael@0: } else if ((dst - isrc) < src_len) { michael@0: // Make last char clean UTF-8 by putting following space off the end michael@0: dst[0] = ' '; michael@0: } michael@0: michael@0: // Deallocate local prediction table michael@0: delete[] predict_tbl; michael@0: return static_cast(dst - isrc); michael@0: } michael@0: michael@0: // This alternate form overwrites redundant words, thus avoiding corrupting the michael@0: // backmap for generate a vector of original-text ranges. michael@0: int CheapSqueezeInplaceOverwrite(char* isrc, michael@0: int src_len, michael@0: int ichunksize) { michael@0: char* src = isrc; michael@0: char* dst = src; michael@0: char* srclimit = src + src_len; michael@0: bool skipping = false; michael@0: michael@0: int hash = 0; michael@0: // Allocate local prediction table. michael@0: int* predict_tbl = new int[kPredictionTableSize]; michael@0: memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); michael@0: michael@0: int chunksize = ichunksize; michael@0: if (chunksize == 0) {chunksize = kChunksizeDefault;} michael@0: int space_thresh = (chunksize * kSpacesThreshPercent) / 100; michael@0: int predict_thresh = (chunksize * kPredictThreshPercent) / 100; michael@0: michael@0: // Always keep first byte (space) michael@0: ++src; michael@0: ++dst; michael@0: while (src < srclimit) { michael@0: int remaining_bytes = srclimit - src; michael@0: int len = minint(chunksize, remaining_bytes); michael@0: // Make len land us on a UTF-8 character boundary. michael@0: // Ah. Also fixes mispredict because we could get out of phase michael@0: // Loop always terminates at trailing space in buffer michael@0: while ((src[len] & 0xc0) == 0x80) {++len;} // Move past continuation bytes michael@0: michael@0: int space_n = CountSpaces4(src, len); michael@0: int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl); michael@0: if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) { michael@0: // Overwrite the text [dst-n..dst) michael@0: if (!skipping) { michael@0: // Keeping-to-skipping transition; do it at a space michael@0: int n = BackscanToSpace(dst, static_cast(dst - isrc)); michael@0: // Text [word_dst..dst) is well-predicted: overwrite michael@0: for (char* p = dst - n; p < dst; ++p) {*p = '.';} michael@0: skipping = true; michael@0: } michael@0: // Overwrite the text [dst..dst+len) michael@0: for (char* p = dst; p < dst + len; ++p) {*p = '.';} michael@0: dst[len - 1] = ' '; // Space at end so we can see what is happening michael@0: } else { michael@0: // Keep the text michael@0: if (skipping) { michael@0: // Skipping-to-keeping transition; do it at a space michael@0: int n = ForwardscanToSpace(src, len); michael@0: // Text [dst..dst+n) is well-predicted: overwrite michael@0: for (char* p = dst; p < dst + n - 1; ++p) {*p = '.';} michael@0: skipping = false; michael@0: } michael@0: } michael@0: dst += len; michael@0: src += len; michael@0: } michael@0: michael@0: if ((dst - isrc) < (src_len - 3)) { michael@0: // Pad and make last char clean UTF-8 by putting following spaces michael@0: dst[0] = ' '; michael@0: dst[1] = ' '; michael@0: dst[2] = ' '; michael@0: dst[3] = '\0'; michael@0: } else if ((dst - isrc) < src_len) { michael@0: // Make last char clean UTF-8 by putting following space off the end michael@0: dst[0] = ' '; michael@0: } michael@0: michael@0: // Deallocate local prediction table michael@0: delete[] predict_tbl; michael@0: return static_cast(dst - isrc); michael@0: } michael@0: michael@0: // Timing 2.8GHz P4 (dsites 2008.03.20) with 170KB input michael@0: // About 90 MB/sec, with or without memcpy, chunksize 48 or 4096 michael@0: // Just CountSpaces is about 340 MB/sec michael@0: // Byte-only CountPredictedBytes is about 150 MB/sec michael@0: // Byte-only CountPredictedBytes, conditional tbl[] = is about 85! MB/sec michael@0: // Byte-only CountPredictedBytes is about 180 MB/sec, byte tbl, byte/int c michael@0: // Unjammed byte-only both = 170 MB/sec michael@0: // Jammed byte-only both = 120 MB/sec michael@0: // Back to original w/slight updates, 110 MB/sec michael@0: // michael@0: bool CheapSqueezeTriggerTest(const char* src, int src_len, int testsize) { michael@0: // Don't trigger at all on short text michael@0: if (src_len < testsize) {return false;} michael@0: int space_thresh = (testsize * kSpacesTriggerPercent) / 100; michael@0: int predict_thresh = (testsize * kPredictTriggerPercent) / 100; michael@0: int hash = 0; michael@0: // Allocate local prediction table. michael@0: int* predict_tbl = new int[kPredictionTableSize]; michael@0: memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); michael@0: michael@0: bool retval = false; michael@0: if ((CountSpaces4(src, testsize) >= space_thresh) || michael@0: (CountPredictedBytes(src, testsize, &hash, predict_tbl) >= michael@0: predict_thresh)) { michael@0: retval = true; michael@0: } michael@0: // Deallocate local prediction table michael@0: delete[] predict_tbl; michael@0: return retval; michael@0: } michael@0: michael@0: michael@0: michael@0: michael@0: // Delete any extended languages from doc_tote michael@0: void RemoveExtendedLanguages(DocTote* doc_tote) { michael@0: // Now a nop michael@0: } michael@0: michael@0: static const int kMinReliableKeepPercent = 41; // Remove lang if reli < this michael@0: michael@0: // For Tier3 languages, require a minimum number of bytes to be first-place lang michael@0: static const int kGoodFirstT3MinBytes = 24; // no first michael@0: michael@0: // Move bytes for unreliable langs to another lang or UNKNOWN michael@0: // doc_tote is sorted, so cannot Add michael@0: // michael@0: // If both CHINESE and CHINESET are present and unreliable, do not delete both; michael@0: // merge both into CHINESE. michael@0: // michael@0: //dsites 2009.03.19 michael@0: // we also want to remove Tier3 languages as the first lang if there is very michael@0: // little text like ej1 ej2 ej3 ej4 michael@0: // maybe fold this back in earlier michael@0: // michael@0: void RemoveUnreliableLanguages(DocTote* doc_tote, michael@0: bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) { michael@0: // Prepass to merge some low-reliablility languages michael@0: // TODO: this shouldn't really reach in to the internal structure of doc_tote michael@0: int total_bytes = 0; michael@0: for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) { michael@0: int plang = doc_tote->Key(sub); michael@0: if (plang == DocTote::kUnusedKey) {continue;} // Empty slot michael@0: michael@0: Language lang = static_cast(plang); michael@0: int bytes = doc_tote->Value(sub); michael@0: int reli = doc_tote->Reliability(sub); michael@0: if (bytes == 0) {continue;} // Zero bytes michael@0: total_bytes += bytes; michael@0: michael@0: // Reliable percent = stored reliable score over stored bytecount michael@0: int reliable_percent = reli / bytes; michael@0: if (reliable_percent >= kMinReliableKeepPercent) {continue;} // Keeper michael@0: michael@0: // This language is too unreliable to keep, but we might merge it. michael@0: Language altlang = UNKNOWN_LANGUAGE; michael@0: if (lang <= HAWAIIAN) {altlang = kClosestAltLanguage[lang];} michael@0: if (altlang == UNKNOWN_LANGUAGE) {continue;} // No alternative michael@0: michael@0: // Look for alternative in doc_tote michael@0: int altsub = doc_tote->Find(altlang); michael@0: if (altsub < 0) {continue;} // No alternative text michael@0: michael@0: int bytes2 = doc_tote->Value(altsub); michael@0: int reli2 = doc_tote->Reliability(altsub); michael@0: if (bytes2 == 0) {continue;} // Zero bytes michael@0: michael@0: // Reliable percent is stored reliable score over stored bytecount michael@0: int reliable_percent2 = reli2 / bytes2; michael@0: michael@0: // Merge one language into the other. Break ties toward lower lang # michael@0: int tosub = altsub; michael@0: int fromsub = sub; michael@0: bool into_lang = false; michael@0: if ((reliable_percent2 < reliable_percent) || michael@0: ((reliable_percent2 == reliable_percent) && (lang < altlang))) { michael@0: tosub = sub; michael@0: fromsub = altsub; michael@0: into_lang = true; michael@0: } michael@0: michael@0: // Make sure merged reliability doesn't drop and is enough to avoid delete michael@0: int newpercent = maxint(reliable_percent, reliable_percent2); michael@0: newpercent = maxint(newpercent, kMinReliableKeepPercent); michael@0: int newbytes = bytes + bytes2; michael@0: int newreli = newpercent * newbytes; michael@0: michael@0: doc_tote->SetKey(fromsub, DocTote::kUnusedKey); michael@0: doc_tote->SetScore(fromsub, 0); michael@0: doc_tote->SetReliability(fromsub, 0); michael@0: doc_tote->SetScore(tosub, newbytes); michael@0: doc_tote->SetReliability(tosub, newreli); michael@0: michael@0: // Show fate of unreliable languages if at least 10 bytes michael@0: if (FLAGS_cld2_html && (newbytes >= 10) && michael@0: !FLAGS_cld2_quiet) { michael@0: if (into_lang) { michael@0: fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ", michael@0: LanguageCode(altlang), reliable_percent2, bytes2, michael@0: LanguageCode(lang)); michael@0: } else { michael@0: fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ", michael@0: LanguageCode(lang), reliable_percent, bytes, michael@0: LanguageCode(altlang)); michael@0: } michael@0: } michael@0: } michael@0: michael@0: michael@0: // Pass to delete any remaining unreliable languages michael@0: for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) { michael@0: int plang = doc_tote->Key(sub); michael@0: if (plang == DocTote::kUnusedKey) {continue;} // Empty slot michael@0: michael@0: Language lang = static_cast(plang); michael@0: int bytes = doc_tote->Value(sub); michael@0: int reli = doc_tote->Reliability(sub); michael@0: if (bytes == 0) {continue;} // Zero bytes michael@0: michael@0: // Reliable percent is stored as reliable score over stored bytecount michael@0: int reliable_percent = reli / bytes; michael@0: if (reliable_percent >= kMinReliableKeepPercent) { // Keeper? michael@0: continue; // yes michael@0: } michael@0: michael@0: // Delete unreliable entry michael@0: doc_tote->SetKey(sub, DocTote::kUnusedKey); michael@0: doc_tote->SetScore(sub, 0); michael@0: doc_tote->SetReliability(sub, 0); michael@0: michael@0: // Show fate of unreliable languages if at least 10 bytes michael@0: if (FLAGS_cld2_html && (bytes >= 10) && michael@0: !FLAGS_cld2_quiet) { michael@0: fprintf(stderr, "{Unreli %s.%dR,%dB} ", michael@0: LanguageCode(lang), reliable_percent, bytes); michael@0: } michael@0: } michael@0: michael@0: ////if (FLAGS_cld2_html) {fprintf(stderr, "
\n");} michael@0: } michael@0: michael@0: michael@0: // Move all the text bytes from lower byte-count to higher one michael@0: void MoveLang1ToLang2(Language lang1, Language lang2, michael@0: int lang1_sub, int lang2_sub, michael@0: DocTote* doc_tote, michael@0: ResultChunkVector* resultchunkvector) { michael@0: // In doc_tote, move all the bytes lang1 => lang2 michael@0: int sum = doc_tote->Value(lang2_sub) + doc_tote->Value(lang1_sub); michael@0: doc_tote->SetValue(lang2_sub, sum); michael@0: sum = doc_tote->Score(lang2_sub) + doc_tote->Score(lang1_sub); michael@0: doc_tote->SetScore(lang2_sub, sum); michael@0: sum = doc_tote->Reliability(lang2_sub) + doc_tote->Reliability(lang1_sub); michael@0: doc_tote->SetReliability(lang2_sub, sum); michael@0: michael@0: // Delete old entry michael@0: doc_tote->SetKey(lang1_sub, DocTote::kUnusedKey); michael@0: doc_tote->SetScore(lang1_sub, 0); michael@0: doc_tote->SetReliability(lang1_sub, 0); michael@0: michael@0: // In resultchunkvector, move all the bytes lang1 => lang2 michael@0: if (resultchunkvector == NULL) {return;} michael@0: michael@0: int k = 0; michael@0: uint16 prior_lang = UNKNOWN_LANGUAGE; michael@0: for (int i = 0; i < static_cast(resultchunkvector->size()); ++i) { michael@0: ResultChunk* rc = &(*resultchunkvector)[i]; michael@0: if (rc->lang1 == lang1) { michael@0: // Update entry[i] lang1 => lang2 michael@0: rc->lang1 = lang2; michael@0: } michael@0: // One change may produce two merges -- entry before and entry after michael@0: if ((rc->lang1 == prior_lang) && (k > 0)) { michael@0: // Merge with previous, deleting entry[i] michael@0: ResultChunk* prior_rc = &(*resultchunkvector)[k - 1]; michael@0: prior_rc->bytes += rc->bytes; michael@0: // fprintf(stderr, "MoveLang1ToLang2 merged [%d] => [%d]
\n", i, k-1); michael@0: } else { michael@0: // Keep entry[i] michael@0: (*resultchunkvector)[k] = (*resultchunkvector)[i]; michael@0: // fprintf(stderr, "MoveLang1ToLang2 keep [%d] => [%d]
\n", i, k); michael@0: ++k; michael@0: } michael@0: prior_lang = rc->lang1; michael@0: } michael@0: resultchunkvector->resize(k); michael@0: } michael@0: michael@0: michael@0: michael@0: // Move less likely byte count to more likely for close pairs of languages michael@0: // If given, also update resultchunkvector michael@0: void RefineScoredClosePairs(DocTote* doc_tote, michael@0: ResultChunkVector* resultchunkvector, michael@0: bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) { michael@0: for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) { michael@0: int close_packedlang = doc_tote->Key(sub); michael@0: int subscr = LanguageCloseSet(static_cast(close_packedlang)); michael@0: if (subscr == 0) {continue;} michael@0: michael@0: // We have a close pair language -- if the other one is also scored and the michael@0: // longword score differs enough, put all our eggs into one basket michael@0: michael@0: // Nonzero longword score: Go look for the other of this pair michael@0: for (int sub2 = sub + 1; sub2 < doc_tote->MaxSize(); ++sub2) { michael@0: if (LanguageCloseSet(static_cast(doc_tote->Key(sub2))) == subscr) { michael@0: // We have a matching pair michael@0: int close_packedlang2 = doc_tote->Key(sub2); michael@0: michael@0: // Move all the text bytes from lower byte-count to higher one michael@0: int from_sub, to_sub; michael@0: Language from_lang, to_lang; michael@0: if (doc_tote->Value(sub) < doc_tote->Value(sub2)) { michael@0: from_sub = sub; michael@0: to_sub = sub2; michael@0: from_lang = static_cast(close_packedlang); michael@0: to_lang = static_cast(close_packedlang2); michael@0: } else { michael@0: from_sub = sub2; michael@0: to_sub = sub; michael@0: from_lang = static_cast(close_packedlang2); michael@0: to_lang = static_cast(close_packedlang); michael@0: } michael@0: michael@0: if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) { michael@0: // Show fate of closepair language michael@0: int val = doc_tote->Value(from_sub); // byte count michael@0: int reli = doc_tote->Reliability(from_sub); michael@0: int reliable_percent = reli / (val ? val : 1); // avoid zdiv michael@0: fprintf(stderr, "{CloseLangPair: %s.%dR,%dB => %s}
\n", michael@0: LanguageCode(from_lang), michael@0: reliable_percent, michael@0: doc_tote->Value(from_sub), michael@0: LanguageCode(to_lang)); michael@0: } michael@0: MoveLang1ToLang2(from_lang, to_lang, from_sub, to_sub, michael@0: doc_tote, resultchunkvector); michael@0: break; // Exit inner for sub2 loop michael@0: } michael@0: } // End for sub2 michael@0: } // End for sub michael@0: } michael@0: michael@0: michael@0: void ApplyAllLanguageHints(Tote* chunk_tote, int tote_grams, michael@0: uint8* lang_hint_boost) { michael@0: } michael@0: michael@0: michael@0: void PrintHtmlEscapedText(FILE* f, const char* txt, int len) { michael@0: string temp(txt, len); michael@0: fprintf(f, "%s", GetHtmlEscapedText(temp).c_str()); michael@0: } michael@0: michael@0: void PrintLang(FILE* f, Tote* chunk_tote, michael@0: Language cur_lang, bool cur_unreliable, michael@0: Language prior_lang, bool prior_unreliable) { michael@0: if (cur_lang == prior_lang) { michael@0: fprintf(f, "[]"); michael@0: } else { michael@0: fprintf(f, "[%s%s]", LanguageCode(cur_lang), cur_unreliable ? "*" : ""); michael@0: } michael@0: } michael@0: michael@0: michael@0: void PrintTopLang(Language top_lang) { michael@0: if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) { michael@0: fprintf(stderr, "[] "); michael@0: } else { michael@0: fprintf(stderr, "[%s] ", LanguageName(top_lang)); michael@0: prior_lang = top_lang; michael@0: } michael@0: } michael@0: michael@0: void PrintTopLangSpeculative(Language top_lang) { michael@0: fprintf(stderr, "", 0xa0a0a0); michael@0: if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) { michael@0: fprintf(stderr, "[] "); michael@0: } else { michael@0: fprintf(stderr, "[%s] ", LanguageName(top_lang)); michael@0: prior_lang = top_lang; michael@0: } michael@0: fprintf(stderr, "\n"); michael@0: } michael@0: michael@0: void PrintLangs(FILE* f, const Language* language3, const int* percent3, michael@0: const int* text_bytes, const bool* is_reliable) { michael@0: fprintf(f, "
  Initial_Languages "); michael@0: if (language3[0] != UNKNOWN_LANGUAGE) { michael@0: fprintf(f, "%s%s(%d%%) ", michael@0: LanguageName(language3[0]), michael@0: *is_reliable ? "" : "*", michael@0: percent3[0]); michael@0: } michael@0: if (language3[1] != UNKNOWN_LANGUAGE) { michael@0: fprintf(f, "%s(%d%%) ", LanguageName(language3[1]), percent3[1]); michael@0: } michael@0: if (language3[2] != UNKNOWN_LANGUAGE) { michael@0: fprintf(f, "%s(%d%%) ", LanguageName(language3[2]), percent3[2]); michael@0: } michael@0: fprintf(f, "%d bytes \n", *text_bytes); michael@0: michael@0: fprintf(f, "
\n"); michael@0: } michael@0: michael@0: michael@0: // Return internal probability score (sum) per 1024 bytes michael@0: double GetNormalizedScore(Language lang, ULScript ulscript, michael@0: int bytecount, int score) { michael@0: if (bytecount <= 0) {return 0.0;} michael@0: return (score << 10) / bytecount; michael@0: } michael@0: michael@0: // Extract return values before fixups michael@0: void ExtractLangEtc(DocTote* doc_tote, int total_text_bytes, michael@0: int* reliable_percent3, Language* language3, int* percent3, michael@0: double* normalized_score3, michael@0: int* text_bytes, bool* is_reliable) { michael@0: reliable_percent3[0] = 0; michael@0: reliable_percent3[1] = 0; michael@0: reliable_percent3[2] = 0; michael@0: language3[0] = UNKNOWN_LANGUAGE; michael@0: language3[1] = UNKNOWN_LANGUAGE; michael@0: language3[2] = UNKNOWN_LANGUAGE; michael@0: percent3[0] = 0; michael@0: percent3[1] = 0; michael@0: percent3[2] = 0; michael@0: normalized_score3[0] = 0.0; michael@0: normalized_score3[1] = 0.0; michael@0: normalized_score3[2] = 0.0; michael@0: michael@0: *text_bytes = total_text_bytes; michael@0: *is_reliable = false; michael@0: michael@0: int bytecount1 = 0; michael@0: int bytecount2 = 0; michael@0: int bytecount3 = 0; michael@0: michael@0: int lang1 = doc_tote->Key(0); michael@0: if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) { michael@0: // We have a top language michael@0: language3[0] = static_cast(lang1); michael@0: bytecount1 = doc_tote->Value(0); michael@0: int reli1 = doc_tote->Reliability(0); michael@0: reliable_percent3[0] = reli1 / (bytecount1 ? bytecount1 : 1); // avoid zdiv michael@0: normalized_score3[0] = GetNormalizedScore(language3[0], michael@0: ULScript_Common, michael@0: bytecount1, michael@0: doc_tote->Score(0)); michael@0: } michael@0: michael@0: int lang2 = doc_tote->Key(1); michael@0: if ((lang2 != DocTote::kUnusedKey) && (lang2 != UNKNOWN_LANGUAGE)) { michael@0: language3[1] = static_cast(lang2); michael@0: bytecount2 = doc_tote->Value(1); michael@0: int reli2 = doc_tote->Reliability(1); michael@0: reliable_percent3[1] = reli2 / (bytecount2 ? bytecount2 : 1); // avoid zdiv michael@0: normalized_score3[1] = GetNormalizedScore(language3[1], michael@0: ULScript_Common, michael@0: bytecount2, michael@0: doc_tote->Score(1)); michael@0: } michael@0: michael@0: int lang3 = doc_tote->Key(2); michael@0: if ((lang3 != DocTote::kUnusedKey) && (lang3 != UNKNOWN_LANGUAGE)) { michael@0: language3[2] = static_cast(lang3); michael@0: bytecount3 = doc_tote->Value(2); michael@0: int reli3 = doc_tote->Reliability(2); michael@0: reliable_percent3[2] = reli3 / (bytecount3 ? bytecount3 : 1); // avoid zdiv michael@0: normalized_score3[2] = GetNormalizedScore(language3[2], michael@0: ULScript_Common, michael@0: bytecount3, michael@0: doc_tote->Score(2)); michael@0: } michael@0: michael@0: // Increase total bytes to sum (top 3) if low for some reason michael@0: int total_bytecount12 = bytecount1 + bytecount2; michael@0: int total_bytecount123 = total_bytecount12 + bytecount3; michael@0: if (total_text_bytes < total_bytecount123) { michael@0: total_text_bytes = total_bytecount123; michael@0: *text_bytes = total_text_bytes; michael@0: } michael@0: michael@0: // Sum minus previous % gives better roundoff behavior than bytecount/total michael@0: int total_text_bytes_div = maxint(1, total_text_bytes); // Avoid zdiv michael@0: percent3[0] = (bytecount1 * 100) / total_text_bytes_div; michael@0: percent3[1] = (total_bytecount12 * 100) / total_text_bytes_div; michael@0: percent3[2] = (total_bytecount123 * 100) / total_text_bytes_div; michael@0: percent3[2] -= percent3[1]; michael@0: percent3[1] -= percent3[0]; michael@0: michael@0: // Roundoff, say 96% 1.6% 1.4%, will produce non-obvious 96% 1% 2% michael@0: // Fix this explicitly michael@0: if (percent3[1] < percent3[2]) { michael@0: ++percent3[1]; michael@0: --percent3[2]; michael@0: } michael@0: if (percent3[0] < percent3[1]) { michael@0: ++percent3[0]; michael@0: --percent3[1]; michael@0: } michael@0: michael@0: *text_bytes = total_text_bytes; michael@0: michael@0: if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) { michael@0: // We have a top language michael@0: // Its reliability is overall result reliability michael@0: int bytecount = doc_tote->Value(0); michael@0: int reli = doc_tote->Reliability(0); michael@0: int reliable_percent = reli / (bytecount ? bytecount : 1); // avoid zdiv michael@0: *is_reliable = (reliable_percent >= kMinReliableKeepPercent); michael@0: } else { michael@0: // No top language at all. This can happen with zero text or 100% Klingon michael@0: // if extended=false. Just return all UNKNOWN_LANGUAGE, unreliable. michael@0: *is_reliable = false; michael@0: } michael@0: michael@0: // If ignore percent is too large, set unreliable. michael@0: int ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]); michael@0: if ((ignore_percent > kIgnoreMaxPercent)) { michael@0: *is_reliable = false; michael@0: } michael@0: } michael@0: michael@0: bool IsFIGS(Language lang) { michael@0: if (lang == FRENCH) {return true;} michael@0: if (lang == ITALIAN) {return true;} michael@0: if (lang == GERMAN) {return true;} michael@0: if (lang == SPANISH) {return true;} michael@0: return false; michael@0: } michael@0: michael@0: bool IsEFIGS(Language lang) { michael@0: if (lang == ENGLISH) {return true;} michael@0: if (lang == FRENCH) {return true;} michael@0: if (lang == ITALIAN) {return true;} michael@0: if (lang == GERMAN) {return true;} michael@0: if (lang == SPANISH) {return true;} michael@0: return false; michael@0: } michael@0: michael@0: // For Tier3 languages, require more bytes of text to override michael@0: // the first-place language michael@0: static const int kGoodSecondT1T2MinBytes = 15; // no second michael@0: static const int kGoodSecondT3MinBytes = 128; // no second michael@0: michael@0: // Calculate a single summary language for the document, and its reliability. michael@0: // Returns language3[0] or language3[1] or ENGLISH or UNKNOWN_LANGUAGE michael@0: // This is the heart of matching human-rater perception. michael@0: // reliable_percent3[] is currently unused michael@0: // michael@0: // Do not return Tier3 second language unless there are at least 128 bytes michael@0: void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes, michael@0: const int* reliable_percent3, michael@0: const Language* language3, michael@0: const int* percent3, michael@0: Language* summary_lang, bool* is_reliable, michael@0: bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) { michael@0: // Vector of active languages; changes if we delete some michael@0: int slot_count = 3; michael@0: int active_slot[3] = {0, 1, 2}; michael@0: michael@0: int ignore_percent = 0; michael@0: int return_percent = percent3[0]; // Default to top lang michael@0: *summary_lang = language3[0]; michael@0: *is_reliable = true; michael@0: if (percent3[0] < kKeepMinPercent) {*is_reliable = false;} michael@0: michael@0: // If any of top 3 is IGNORE, remove it and increment ignore_percent michael@0: for (int i = 0; i < 3; ++i) { michael@0: if (language3[i] == TG_UNKNOWN_LANGUAGE) { michael@0: ignore_percent += percent3[i]; michael@0: // Move the rest up, levaing input vectors unchanged michael@0: for (int j=i+1; j < 3; ++j) { michael@0: active_slot[j - 1] = active_slot[j]; michael@0: } michael@0: -- slot_count; michael@0: // Logically remove Ignore from percentage-text calculation michael@0: // (extra 1 in 101 avoids zdiv, biases slightly small) michael@0: return_percent = (percent3[0] * 100) / (101 - ignore_percent); michael@0: *summary_lang = language3[active_slot[0]]; michael@0: if (percent3[active_slot[0]] < kKeepMinPercent) {*is_reliable = false;} michael@0: } michael@0: } michael@0: michael@0: michael@0: // If English and X, where X (not UNK) is big enough, michael@0: // assume the English is boilerplate and return X. michael@0: // Logically remove English from percentage-text calculation michael@0: int second_bytes = (total_text_bytes * percent3[active_slot[1]]) / 100; michael@0: // Require more bytes of text for Tier3 languages michael@0: int minbytesneeded = kGoodSecondT1T2MinBytes; michael@0: int plang_second = PerScriptNumber(ULScript_Latin, language3[active_slot[1]]); michael@0: michael@0: if ((language3[active_slot[0]] == ENGLISH) && michael@0: (language3[active_slot[1]] != ENGLISH) && michael@0: (language3[active_slot[1]] != UNKNOWN_LANGUAGE) && michael@0: (percent3[active_slot[1]] >= kNonEnBoilerplateMinPercent) && michael@0: (second_bytes >= minbytesneeded)) { michael@0: ignore_percent += percent3[active_slot[0]]; michael@0: return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent); michael@0: *summary_lang = language3[active_slot[1]]; michael@0: if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;} michael@0: michael@0: // Else If FIGS and X, where X (not UNK, EFIGS) is big enough, michael@0: // assume the FIGS is boilerplate and return X. michael@0: // Logically remove FIGS from percentage-text calculation michael@0: } else if (IsFIGS(language3[active_slot[0]]) && michael@0: !IsEFIGS(language3[active_slot[1]]) && michael@0: (language3[active_slot[1]] != UNKNOWN_LANGUAGE) && michael@0: (percent3[active_slot[1]] >= kNonFIGSBoilerplateMinPercent) && michael@0: (second_bytes >= minbytesneeded)) { michael@0: ignore_percent += percent3[active_slot[0]]; michael@0: return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent); michael@0: *summary_lang = language3[active_slot[1]]; michael@0: if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;} michael@0: michael@0: // Else we are returning the first language, but want to improve its michael@0: // return_percent if the second language should be ignored michael@0: } else if ((language3[active_slot[1]] == ENGLISH) && michael@0: (language3[active_slot[0]] != ENGLISH)) { michael@0: ignore_percent += percent3[active_slot[1]]; michael@0: return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent); michael@0: } else if (IsFIGS(language3[active_slot[1]]) && michael@0: !IsEFIGS(language3[active_slot[0]])) { michael@0: ignore_percent += percent3[active_slot[1]]; michael@0: return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent); michael@0: } michael@0: michael@0: // If return percent is too small (too many languages), return UNKNOWN michael@0: if ((return_percent < kGoodFirstMinPercent)) { michael@0: if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { michael@0: fprintf(stderr, "{Unreli %s %d%% percent too small} ", michael@0: LanguageCode(*summary_lang), return_percent); michael@0: } michael@0: *summary_lang = UNKNOWN_LANGUAGE; michael@0: *is_reliable = false; michael@0: } michael@0: michael@0: // If return percent is small, return language but set unreliable. michael@0: if ((return_percent < kGoodFirstReliableMinPercent)) { michael@0: *is_reliable = false; michael@0: } michael@0: michael@0: // If ignore percent is too large, set unreliable. michael@0: ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]); michael@0: if ((ignore_percent > kIgnoreMaxPercent)) { michael@0: *is_reliable = false; michael@0: } michael@0: michael@0: // If we removed all the active languages, return UNKNOWN michael@0: if (slot_count == 0) { michael@0: if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { michael@0: fprintf(stderr, "{Unreli %s no languages left} ", michael@0: LanguageCode(*summary_lang)); michael@0: } michael@0: *summary_lang = UNKNOWN_LANGUAGE; michael@0: *is_reliable = false; michael@0: } michael@0: } michael@0: michael@0: void AddLangPriorBoost(Language lang, uint32 langprob, michael@0: ScoringContext* scoringcontext) { michael@0: // This is called 0..n times with language hints michael@0: // but we don't know the script -- so boost either or both Latn, Othr. michael@0: michael@0: if (IsLatnLanguage(lang)) { michael@0: LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn; michael@0: int n = langprior_boost->n; michael@0: langprior_boost->langprob[n] = langprob; michael@0: langprior_boost->n = langprior_boost->wrap(n + 1); michael@0: } michael@0: michael@0: if (IsOthrLanguage(lang)) { michael@0: LangBoosts* langprior_boost = &scoringcontext->langprior_boost.othr; michael@0: int n = langprior_boost->n; michael@0: langprior_boost->langprob[n] = langprob; michael@0: langprior_boost->n = langprior_boost->wrap(n + 1); michael@0: } michael@0: michael@0: } michael@0: michael@0: void AddOneWhack(Language whacker_lang, Language whackee_lang, michael@0: ScoringContext* scoringcontext) { michael@0: uint32 langprob = MakeLangProb(whackee_lang, 1); michael@0: // This logic avoids hr-Latn whacking sr-Cyrl, but still whacks sr-Latn michael@0: if (IsLatnLanguage(whacker_lang) && IsLatnLanguage(whackee_lang)) { michael@0: LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn; michael@0: int n = langprior_whack->n; michael@0: langprior_whack->langprob[n] = langprob; michael@0: langprior_whack->n = langprior_whack->wrap(n + 1); michael@0: } michael@0: if (IsOthrLanguage(whacker_lang) && IsOthrLanguage(whackee_lang)) { michael@0: LangBoosts* langprior_whack = &scoringcontext->langprior_whack.othr; michael@0: int n = langprior_whack->n; michael@0: langprior_whack->langprob[n] = langprob; michael@0: langprior_whack->n = langprior_whack->wrap(n + 1); michael@0: } michael@0: } michael@0: michael@0: void AddCloseLangWhack(Language lang, ScoringContext* scoringcontext) { michael@0: // We do not in general want zh-Hans and zh-Hant to be close pairs, michael@0: // but we do here. michael@0: if (lang == CLD2::CHINESE) { michael@0: AddOneWhack(lang, CLD2::CHINESE_T, scoringcontext); michael@0: return; michael@0: } michael@0: if (lang == CLD2::CHINESE_T) { michael@0: AddOneWhack(lang, CLD2::CHINESE, scoringcontext); michael@0: return; michael@0: } michael@0: michael@0: int base_lang_set = LanguageCloseSet(lang); michael@0: if (base_lang_set == 0) {return;} michael@0: // TODO: add an explicit list of each set to avoid this 512-times loop michael@0: for (int i = 0; i < kLanguageToPLangSize; ++i) { michael@0: Language lang2 = static_cast(i); michael@0: if ((base_lang_set == LanguageCloseSet(lang2)) && (lang != lang2)) { michael@0: AddOneWhack(lang, lang2, scoringcontext); michael@0: } michael@0: } michael@0: } michael@0: michael@0: michael@0: void ApplyHints(const char* buffer, michael@0: int buffer_length, michael@0: bool is_plain_text, michael@0: const CLDHints* cld_hints, michael@0: ScoringContext* scoringcontext) { michael@0: CLDLangPriors lang_priors; michael@0: InitCLDLangPriors(&lang_priors); michael@0: michael@0: // We now use lang= tags. michael@0: // Last look, circa 2008 found only 15% of web pages with lang= tags and michael@0: // many of those were wrong. Now (July 2011), we find 44% of web pages have michael@0: // lang= tags, and most of them are correct. So we now give them substantial michael@0: // weight in each chunk scored. michael@0: if (!is_plain_text) { michael@0: // Get any contained language tags in first n KB michael@0: int32 max_scan_bytes = FLAGS_cld_max_lang_tag_scan_kb << 10; michael@0: string lang_tags = GetLangTagsFromHtml(buffer, buffer_length, michael@0: max_scan_bytes); michael@0: SetCLDLangTagsHint(lang_tags, &lang_priors); michael@0: if (scoringcontext->flags_cld2_html) { michael@0: if (!lang_tags.empty()) { michael@0: fprintf(scoringcontext->debug_file, "
lang_tags '%s'
\n", michael@0: lang_tags.c_str()); michael@0: } michael@0: } michael@0: } michael@0: michael@0: if (cld_hints != NULL) { michael@0: if ((cld_hints->content_language_hint != NULL) && michael@0: (cld_hints->content_language_hint[0] != '\0')) { michael@0: SetCLDContentLangHint(cld_hints->content_language_hint, &lang_priors); michael@0: } michael@0: michael@0: // Input is from GetTLD(), already lowercased michael@0: if ((cld_hints->tld_hint != NULL) && (cld_hints->tld_hint[0] != '\0')) { michael@0: SetCLDTLDHint(cld_hints->tld_hint, &lang_priors); michael@0: } michael@0: michael@0: if (cld_hints->encoding_hint != UNKNOWN_ENCODING) { michael@0: Encoding enc = static_cast(cld_hints->encoding_hint); michael@0: SetCLDEncodingHint(enc, &lang_priors); michael@0: } michael@0: michael@0: if (cld_hints->language_hint != UNKNOWN_LANGUAGE) { michael@0: SetCLDLanguageHint(cld_hints->language_hint, &lang_priors); michael@0: } michael@0: } michael@0: michael@0: // Keep no more than four different languages with hints michael@0: TrimCLDLangPriors(4, &lang_priors); michael@0: michael@0: if (scoringcontext->flags_cld2_html) { michael@0: string print_temp = DumpCLDLangPriors(&lang_priors); michael@0: if (!print_temp.empty()) { michael@0: fprintf(scoringcontext->debug_file, "DumpCLDLangPriors %s
\n", michael@0: print_temp.c_str()); michael@0: } michael@0: } michael@0: michael@0: // Put boosts into ScoringContext michael@0: for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) { michael@0: Language lang = GetCLDPriorLang(lang_priors.prior[i]); michael@0: int qprob = GetCLDPriorWeight(lang_priors.prior[i]); michael@0: if (qprob > 0) { michael@0: uint32 langprob = MakeLangProb(lang, qprob); michael@0: AddLangPriorBoost(lang, langprob, scoringcontext); michael@0: } michael@0: } michael@0: michael@0: // Put whacks into scoring context michael@0: // We do not in general want zh-Hans and zh-Hant to be close pairs, michael@0: // but we do here. Use close_set_count[kCloseSetSize] to count zh, zh-Hant michael@0: std::vector close_set_count(kCloseSetSize + 1, 0); michael@0: michael@0: for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) { michael@0: Language lang = GetCLDPriorLang(lang_priors.prior[i]); michael@0: ++close_set_count[LanguageCloseSet(lang)]; michael@0: if (lang == CLD2::CHINESE) {++close_set_count[kCloseSetSize];} michael@0: if (lang == CLD2::CHINESE_T) {++close_set_count[kCloseSetSize];} michael@0: } michael@0: michael@0: // If a boost language is in a close set, force suppressing the others in michael@0: // that set, if exactly one of the set is present michael@0: for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) { michael@0: Language lang = GetCLDPriorLang(lang_priors.prior[i]); michael@0: int qprob = GetCLDPriorWeight(lang_priors.prior[i]); michael@0: if (qprob > 0) { michael@0: int close_set = LanguageCloseSet(lang); michael@0: if ((close_set > 0) && (close_set_count[close_set] == 1)) { michael@0: AddCloseLangWhack(lang, scoringcontext); michael@0: } michael@0: if (((lang == CLD2::CHINESE) || (lang == CLD2::CHINESE_T)) && michael@0: (close_set_count[kCloseSetSize] == 1)) { michael@0: AddCloseLangWhack(lang, scoringcontext); michael@0: } michael@0: } michael@0: } michael@0: michael@0: michael@0: michael@0: michael@0: michael@0: michael@0: } michael@0: michael@0: michael@0: michael@0: // Results language3/percent3/text_bytes must be exactly three items michael@0: Language DetectLanguageSummaryV2( michael@0: const char* buffer, michael@0: int buffer_length, michael@0: bool is_plain_text, michael@0: const CLDHints* cld_hints, michael@0: bool allow_extended_lang, michael@0: int flags, michael@0: Language plus_one, michael@0: Language* language3, michael@0: int* percent3, michael@0: double* normalized_score3, michael@0: ResultChunkVector* resultchunkvector, michael@0: int* text_bytes, michael@0: bool* is_reliable) { michael@0: language3[0] = UNKNOWN_LANGUAGE; michael@0: language3[1] = UNKNOWN_LANGUAGE; michael@0: language3[2] = UNKNOWN_LANGUAGE; michael@0: percent3[0] = 0; michael@0: percent3[1] = 0; michael@0: percent3[2] = 0; michael@0: normalized_score3[0] = 0.0; michael@0: normalized_score3[1] = 0.0; michael@0: normalized_score3[2] = 0.0; michael@0: if (resultchunkvector != NULL) { michael@0: resultchunkvector->clear(); michael@0: } michael@0: *text_bytes = 0; michael@0: *is_reliable = false; michael@0: michael@0: if ((flags & kCLDFlagEcho) != 0) { michael@0: string temp(buffer, buffer_length); michael@0: if ((flags & kCLDFlagHtml) != 0) { michael@0: fprintf(stderr, "CLD2[%d] '%s'
\n", michael@0: buffer_length, GetHtmlEscapedText(temp).c_str()); michael@0: } else { michael@0: fprintf(stderr, "CLD2[%d] '%s'\n", michael@0: buffer_length, GetPlainEscapedText(temp).c_str()); michael@0: } michael@0: } michael@0: michael@0: #ifdef CLD2_DYNAMIC_MODE michael@0: // In dynamic mode, we immediately return UNKNOWN_LANGUAGE if the data file michael@0: // hasn't been loaded yet. This is the only sane thing we can do, as there michael@0: // are no scoring tables to consult. michael@0: bool dataLoaded = isDataLoaded(); michael@0: if ((flags & kCLDFlagVerbose) != 0) { michael@0: fprintf(stderr, "Data loaded: %s\n", (dataLoaded ? "true" : "false")); michael@0: } michael@0: if (!dataLoaded) { michael@0: return UNKNOWN_LANGUAGE; michael@0: } michael@0: #endif michael@0: michael@0: // Exit now if no text michael@0: if (buffer_length == 0) {return UNKNOWN_LANGUAGE;} michael@0: if (kScoringtables.quadgram_obj == NULL) {return UNKNOWN_LANGUAGE;} michael@0: michael@0: // Document totals michael@0: DocTote doc_tote; // Reliability = 0..100 michael@0: michael@0: // ScoringContext carries state across scriptspans michael@0: ScoringContext scoringcontext; michael@0: scoringcontext.debug_file = stderr; michael@0: scoringcontext.flags_cld2_score_as_quads = michael@0: ((flags & kCLDFlagScoreAsQuads) != 0); michael@0: scoringcontext.flags_cld2_html = ((flags & kCLDFlagHtml) != 0); michael@0: scoringcontext.flags_cld2_cr = ((flags & kCLDFlagCr) != 0); michael@0: scoringcontext.flags_cld2_verbose = ((flags & kCLDFlagVerbose) != 0); michael@0: scoringcontext.prior_chunk_lang = UNKNOWN_LANGUAGE; michael@0: scoringcontext.ulscript = ULScript_Common; michael@0: scoringcontext.scoringtables = &kScoringtables; michael@0: scoringcontext.scanner = NULL; michael@0: scoringcontext.init(); // Clear the internal memory arrays michael@0: michael@0: // Now thread safe. michael@0: bool FLAGS_cld2_html = ((flags & kCLDFlagHtml) != 0); michael@0: bool FLAGS_cld2_quiet = ((flags & kCLDFlagQuiet) != 0); michael@0: michael@0: ApplyHints(buffer, buffer_length, is_plain_text, cld_hints, &scoringcontext); michael@0: michael@0: // Four individual script totals, Latin, Han, other2, other3 michael@0: int next_other_tote = 2; michael@0: int tote_num = 0; michael@0: michael@0: // Four totes for up to four different scripts pending at once michael@0: Tote totes[4]; // [0] Latn [1] Hani [2] other [3] other michael@0: bool tote_seen[4] = {false, false, false, false}; michael@0: int tote_grams[4] = {0, 0, 0, 0}; // Number in partial chunk michael@0: ULScript tote_script[4] = michael@0: {ULScript_Latin, ULScript_Hani, ULScript_Common, ULScript_Common}; michael@0: michael@0: // Loop through text spans in a single script michael@0: ScriptScanner ss(buffer, buffer_length, is_plain_text); michael@0: LangSpan scriptspan; michael@0: michael@0: scoringcontext.scanner = &ss; michael@0: michael@0: scriptspan.text = NULL; michael@0: scriptspan.text_bytes = 0; michael@0: scriptspan.offset = 0; michael@0: scriptspan.ulscript = ULScript_Common; michael@0: scriptspan.lang = UNKNOWN_LANGUAGE; michael@0: michael@0: int total_text_bytes = 0; michael@0: int textlimit = FLAGS_cld_textlimit << 10; // in KB michael@0: if (textlimit == 0) {textlimit = 0x7fffffff;} michael@0: michael@0: int advance_by = 2; // Advance 2 bytes michael@0: int advance_limit = textlimit >> 3; // For first 1/8 of max document michael@0: michael@0: int initial_word_span = kDefaultWordSpan; michael@0: if (FLAGS_cld_forcewords) { michael@0: initial_word_span = kReallyBigWordSpan; michael@0: } michael@0: michael@0: // Pick up chunk sizes michael@0: // Smoothwidth is units of quadgrams, about 2.5 chars (unigrams) each michael@0: // Sanity check -- force into a reasonable range michael@0: int chunksizequads = FLAGS_cld_smoothwidth; michael@0: chunksizequads = minint(maxint(chunksizequads, kMinChunkSizeQuads), michael@0: kMaxChunkSizeQuads); michael@0: int chunksizeunis = (chunksizequads * 5) >> 1; michael@0: michael@0: // Varying short-span limit doesn't work well -- skips too much beyond 20KB michael@0: // int spantooshortlimit = advance_by * FLAGS_cld_smoothwidth; michael@0: int spantooshortlimit = kShortSpanThresh; michael@0: michael@0: // For debugging only. Not thread-safe michael@0: prior_lang = UNKNOWN_LANGUAGE; michael@0: prior_unreliable = false; michael@0: michael@0: // Allocate full-document prediction table for finding repeating words michael@0: int hash = 0; michael@0: int* predict_tbl = new int[kPredictionTableSize]; michael@0: if (FlagRepeats(flags)) { michael@0: memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); michael@0: } michael@0: michael@0: michael@0: michael@0: // Loop through scriptspans accumulating number of text bytes in each language michael@0: while (ss.GetOneScriptSpanLower(&scriptspan)) { michael@0: ULScript ulscript = scriptspan.ulscript; michael@0: michael@0: // Squeeze out big chunks of text span if asked to michael@0: if (FlagSqueeze(flags)) { michael@0: // Remove repetitive or mostly-spaces chunks michael@0: int newlen; michael@0: int chunksize = 0; // Use the default michael@0: if (resultchunkvector != NULL) { michael@0: newlen = CheapSqueezeInplaceOverwrite(scriptspan.text, michael@0: scriptspan.text_bytes, michael@0: chunksize); michael@0: } else { michael@0: newlen = CheapSqueezeInplace(scriptspan.text, scriptspan.text_bytes, michael@0: chunksize); michael@0: } michael@0: scriptspan.text_bytes = newlen; michael@0: } else { michael@0: // Check now and then to see if we should be squeezing michael@0: if (((kCheapSqueezeTestThresh >> 1) < scriptspan.text_bytes) && michael@0: !FlagFinish(flags)) { michael@0: // fprintf(stderr, "CheapSqueezeTriggerTest, " michael@0: // "first %d bytes of %d (>%d/2)
\n", michael@0: // kCheapSqueezeTestLen, michael@0: // scriptspan.text_bytes, michael@0: // kCheapSqueezeTestThresh); michael@0: michael@0: if (CheapSqueezeTriggerTest(scriptspan.text, michael@0: scriptspan.text_bytes, michael@0: kCheapSqueezeTestLen)) { michael@0: // Recursive call with big-chunk squeezing set michael@0: if (FLAGS_cld2_html || FLAGS_dbgscore) { michael@0: fprintf(stderr, michael@0: "
---text_bytes[%d] Recursive(Squeeze)---

\n", michael@0: total_text_bytes); michael@0: } michael@0: // Deallocate full-document prediction table michael@0: delete[] predict_tbl; michael@0: michael@0: return DetectLanguageSummaryV2( michael@0: buffer, michael@0: buffer_length, michael@0: is_plain_text, michael@0: cld_hints, michael@0: allow_extended_lang, michael@0: flags | kCLDFlagSqueeze, michael@0: plus_one, michael@0: language3, michael@0: percent3, michael@0: normalized_score3, michael@0: resultchunkvector, michael@0: text_bytes, michael@0: is_reliable); michael@0: } michael@0: } michael@0: } michael@0: michael@0: // Remove repetitive words if asked to michael@0: if (FlagRepeats(flags)) { michael@0: // Remove repetitive words michael@0: int newlen; michael@0: if (resultchunkvector != NULL) { michael@0: newlen = CheapRepWordsInplaceOverwrite(scriptspan.text, michael@0: scriptspan.text_bytes, michael@0: &hash, predict_tbl); michael@0: } else { michael@0: newlen = CheapRepWordsInplace(scriptspan.text, scriptspan.text_bytes, michael@0: &hash, predict_tbl); michael@0: } michael@0: scriptspan.text_bytes = newlen; michael@0: } michael@0: michael@0: // Scoring depends on scriptspan buffer ALWAYS having michael@0: // leading space and off-the-end space space space NUL, michael@0: // DCHECK(scriptspan.text[0] == ' '); michael@0: // DCHECK(scriptspan.text[scriptspan.text_bytes + 0] == ' '); michael@0: // DCHECK(scriptspan.text[scriptspan.text_bytes + 1] == ' '); michael@0: // DCHECK(scriptspan.text[scriptspan.text_bytes + 2] == ' '); michael@0: // DCHECK(scriptspan.text[scriptspan.text_bytes + 3] == '\0'); michael@0: michael@0: // The real scoring michael@0: // Accumulate directly into the document total, or accmulate in one of four michael@0: // chunk totals. The purpose of the multiple chunk totals is to piece michael@0: // together short choppy pieces of text in alternating scripts. One total is michael@0: // dedicated to Latin text, one to Han text, and the other two are dynamicly michael@0: // assigned. michael@0: michael@0: scoringcontext.ulscript = scriptspan.ulscript; michael@0: // FLAGS_cld2_html = scoringcontext.flags_cld2_html; michael@0: michael@0: ScoreOneScriptSpan(scriptspan, michael@0: &scoringcontext, michael@0: &doc_tote, michael@0: resultchunkvector); michael@0: michael@0: total_text_bytes += scriptspan.text_bytes; michael@0: } // End while (ss.GetOneScriptSpanLower()) michael@0: michael@0: // Deallocate full-document prediction table michael@0: delete[] predict_tbl; michael@0: michael@0: if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { michael@0: // If no forced , put one in front of dump michael@0: if (!scoringcontext.flags_cld2_cr) {fprintf(stderr, "
\n");} michael@0: doc_tote.Dump(stderr); michael@0: } michael@0: michael@0: michael@0: // If extended langauges are disallowed, remove them here michael@0: if (!allow_extended_lang) { michael@0: RemoveExtendedLanguages(&doc_tote); michael@0: } michael@0: michael@0: // Force close pairs to one or the other michael@0: // If given, also update resultchunkvector michael@0: RefineScoredClosePairs(&doc_tote, resultchunkvector, michael@0: FLAGS_cld2_html, FLAGS_cld2_quiet); michael@0: michael@0: michael@0: // Calculate return results michael@0: // Find top three byte counts in tote heap michael@0: int reliable_percent3[3]; michael@0: michael@0: // Cannot use Add, etc. after sorting michael@0: doc_tote.Sort(3); michael@0: michael@0: ExtractLangEtc(&doc_tote, total_text_bytes, michael@0: reliable_percent3, language3, percent3, normalized_score3, michael@0: text_bytes, is_reliable); michael@0: michael@0: bool have_good_answer = false; michael@0: if (FlagFinish(flags)) { michael@0: // Force a result michael@0: have_good_answer = true; michael@0: } else if (total_text_bytes <= kShortTextThresh) { michael@0: // Don't recurse on short text -- we already did word scores michael@0: have_good_answer = true; michael@0: } else if (*is_reliable && michael@0: (percent3[0] >= kGoodLang1Percent)) { michael@0: have_good_answer = true; michael@0: } else if (*is_reliable && michael@0: ((percent3[0] + percent3[1]) >= kGoodLang1and2Percent)) { michael@0: have_good_answer = true; michael@0: } michael@0: michael@0: michael@0: if (have_good_answer) { michael@0: // This is the real, non-recursive return michael@0: michael@0: // Move bytes for unreliable langs to another lang or UNKNOWN michael@0: RemoveUnreliableLanguages(&doc_tote, FLAGS_cld2_html, FLAGS_cld2_quiet); michael@0: michael@0: // Redo the result extraction after the removal above michael@0: doc_tote.Sort(3); michael@0: ExtractLangEtc(&doc_tote, total_text_bytes, michael@0: reliable_percent3, language3, percent3, normalized_score3, michael@0: text_bytes, is_reliable); michael@0: michael@0: michael@0: michael@0: Language summary_lang; michael@0: CalcSummaryLang(&doc_tote, total_text_bytes, michael@0: reliable_percent3, language3, percent3, michael@0: &summary_lang, is_reliable, michael@0: FLAGS_cld2_html, FLAGS_cld2_quiet); michael@0: michael@0: if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { michael@0: for (int i = 0; i < 3; ++i) { michael@0: if (language3[i] != UNKNOWN_LANGUAGE) { michael@0: fprintf(stderr, "%s.%dR(%d%%) ", michael@0: LanguageCode(language3[i]), michael@0: reliable_percent3[i], michael@0: percent3[i]); michael@0: } michael@0: } michael@0: michael@0: fprintf(stderr, "%d bytes ", total_text_bytes); michael@0: fprintf(stderr, "= %s%c ", michael@0: LanguageName(summary_lang), *is_reliable ? ' ' : '*'); michael@0: fprintf(stderr, "

\n"); michael@0: } michael@0: michael@0: // Slightly condensed if quiet michael@0: if (FLAGS_cld2_html && FLAGS_cld2_quiet) { michael@0: fprintf(stderr, "       "); michael@0: for (int i = 0; i < 3; ++i) { michael@0: if (language3[i] != UNKNOWN_LANGUAGE) { michael@0: fprintf(stderr, "  %s %d%% ", michael@0: LanguageCode(language3[i]), michael@0: percent3[i]); michael@0: } michael@0: } michael@0: fprintf(stderr, "= %s%c ", michael@0: LanguageName(summary_lang), *is_reliable ? ' ' : '*'); michael@0: fprintf(stderr, "
\n"); michael@0: } michael@0: michael@0: return summary_lang; michael@0: } michael@0: michael@0: // Not a good answer -- do recursive call to refine michael@0: if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) { michael@0: // This is what we hope to improve on in the recursive call, if any michael@0: PrintLangs(stderr, language3, percent3, text_bytes, is_reliable); michael@0: } michael@0: michael@0: // For restriction to Top40 + one, the one is 1st/2nd lang that is not Top40 michael@0: // For this purpose, we treate "Ignore" as top40 michael@0: Language new_plus_one = UNKNOWN_LANGUAGE; michael@0: michael@0: if (total_text_bytes < kShortTextThresh) { michael@0: // Short text: Recursive call with top40 and short set michael@0: if (FLAGS_cld2_html || FLAGS_dbgscore) { michael@0: fprintf(stderr, "  ---text_bytes[%d] " michael@0: "Recursive(Top40/Rep/Short/Words)---

\n", michael@0: total_text_bytes); michael@0: } michael@0: return DetectLanguageSummaryV2( michael@0: buffer, michael@0: buffer_length, michael@0: is_plain_text, michael@0: cld_hints, michael@0: allow_extended_lang, michael@0: flags | kCLDFlagTop40 | kCLDFlagRepeats | michael@0: kCLDFlagShort | kCLDFlagUseWords | kCLDFlagFinish, michael@0: new_plus_one, michael@0: language3, michael@0: percent3, michael@0: normalized_score3, michael@0: resultchunkvector, michael@0: text_bytes, michael@0: is_reliable); michael@0: } michael@0: michael@0: // Longer text: Recursive call with top40 set michael@0: if (FLAGS_cld2_html || FLAGS_dbgscore) { michael@0: fprintf(stderr, michael@0: "  ---text_bytes[%d] Recursive(Top40/Rep)---

\n", michael@0: total_text_bytes); michael@0: } michael@0: return DetectLanguageSummaryV2( michael@0: buffer, michael@0: buffer_length, michael@0: is_plain_text, michael@0: cld_hints, michael@0: allow_extended_lang, michael@0: flags | kCLDFlagTop40 | kCLDFlagRepeats | michael@0: kCLDFlagFinish, michael@0: new_plus_one, michael@0: language3, michael@0: percent3, michael@0: normalized_score3, michael@0: resultchunkvector, michael@0: text_bytes, michael@0: is_reliable); michael@0: } michael@0: michael@0: michael@0: // For debugging and wrappers. Not thread safe. michael@0: static char temp_detectlanguageversion[32]; michael@0: michael@0: // Return version text string michael@0: // String is "code_version - data_build_date" michael@0: const char* DetectLanguageVersion() { michael@0: if (kScoringtables.quadgram_obj == NULL) {return "";} michael@0: sprintf(temp_detectlanguageversion, michael@0: "V2.0 - %u", kScoringtables.quadgram_obj->kCLDTableBuildDate); michael@0: return temp_detectlanguageversion; michael@0: } michael@0: michael@0: michael@0: } // End namespace CLD2