browser/components/translation/cld2/internal/compact_lang_det_hint_code.cc

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 // Copyright 2013 Google Inc. All Rights Reserved.
     2 //
     3 // Licensed under the Apache License, Version 2.0 (the "License");
     4 // you may not use this file except in compliance with the License.
     5 // You may obtain a copy of the License at
     6 //
     7 //     http://www.apache.org/licenses/LICENSE-2.0
     8 //
     9 // Unless required by applicable law or agreed to in writing, software
    10 // distributed under the License is distributed on an "AS IS" BASIS,
    11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12 // See the License for the specific language governing permissions and
    13 // limitations under the License.
    15 //
    16 // Author: dsites@google.com (Dick Sites)
    17 //
    19 #include "compact_lang_det_hint_code.h"
    21 #include <stdlib.h>     // for abs()
    22 #include <stdio.h>      // for sprintf()
    23 #include <string.h>     //
    24 #include "lang_script.h"
    25 #include "port.h"
    27 using namespace std;
    29 namespace CLD2 {
    31 static const int kCLDPriorEncodingWeight = 4;   // Same scale as W4: nominally 100x more likely (presumably for encoding-derived priors; use not visible here)
    32 static const int kCLDPriorLanguageWeight = 8;   // Same scale as W8: nominally 10000x more likely (presumably for language-derived priors; use not visible here)
    35 // Tables to map lang="..." language code lists to actual languages.
    36 // based on scraping and hand-edits, dsites June 2011
    38 // n = f(string, &a) gives list of n<=4 language pairs: primary, secondary
    40 // For close pairs like ms/id, more weight on TLD and lang=
    41 // Alternately, weaker boost but mark others of set as negative;
    42 // makes "neither" an easier result.
    43 // lang=en low weight 4
    44 // tld=lu boost lu maybe 4, but lang= always overcomes tld and encoding
    45 // (except maybe en)
    47 // TLD to separate, e.g., burundi from rwanda
    49 // Encoding lookup: OneLangProb array
    50 // TLD lookup:   tld OneLangProb pairs
    53 struct LangTagLookup {            // One row of a lang= tag hint table
    54   const char* const langtag;      // Search key: lowercased tag with '-' as its only separator
    55   const char* const langcode;     // Canonical language code(s); two, comma-separated, when the tag is ambiguous
    56   OneCLDLangPrior onelangprior1;  // First prior: language plus a W-scaled weight
    57   OneCLDLangPrior onelangprior2;  // Second prior, or 0 when the row has only one
    58 };
    60 struct TLDLookup {                // One row of the country-TLD hint table
    61   const char* const tld;          // Search key: lowercased TLD, no leading dot
    62   OneCLDLangPrior onelangprior1;  // First prior: language plus a W-scaled weight
    63   OneCLDLangPrior onelangprior2;  // Second prior, or 0 when the row has only one
    64 };
    67 #define W2 (2 << 10)            // Weight 2, shifted past the low 10 bits so it can be added to or subtracted from a language value; 3**2 = 9, nominally 10x more likely
    68 #define W4 (4 << 10)            // 3**4 = 81, nominally 100x more likely
    69 #define W6 (6 << 10)            // 3**6 = 729, nominally 1000x more likely
    70 #define W8 (8 << 10)            // 3**8 = 6561, nominally 10K x more likely
    71 #define W10 (10 << 10)          // 3**10 = 59049, nominally 100K x more likely
    72 #define W12 (12 << 10)          // 3**12 = 531441, nominally 1M x more likely
    74 // TODO: more about ba hr sr sr-ME and sl
    75 // Temporary state of affairs:
    76 //   BOSNIAN CROATIAN MONTENEGRIN SERBIAN detecting just CROATIAN SERBIAN
    77 // Eventually, we want to do all four, but it requires a CLD change to handle
    78 // up to six languages per quadgram.
    81 // Close pairs boost one of pair, demote other.
    82 //   Statistically close pairs:
    83 //   INDONESIAN/MALAY difficult to distinguish -- extra word-based lookups used
    84 //
    85 //   INDONESIAN MALAY coef=0.4698        Problematic w/o extra words
    86 //   TIBETAN DZONGKHA coef=0.4571
    87 //   CZECH SLOVAK coef=0.4273
    88 //   NORWEGIAN NORWEGIAN_N coef=0.4182
    89 //
    90 //   HINDI MARATHI coef=0.3795
    91 //   ZULU XHOSA coef=0.3716
    92 //
    93 //   DANISH NORWEGIAN coef=0.3672        Usually OK
    94 //   BIHARI HINDI coef=0.3668            Usually OK
    95 //   ICELANDIC FAROESE coef=0.3519       Usually OK
    97 //
    98 // Table to look up lang= tags longer than three characters
    99 // Overrides table below, which is truncated at first hyphen
   100 // In alphabetical order for binary search
   101 static const int kCLDTable1Size = 213;  // Row count of the table below; update together with the rows
   102 static const LangTagLookup kCLDLangTagsHintTable1[kCLDTable1Size] = {  // Row: {tag, canonical code(s), prior1, prior2}; rows are in ASCII order ('-' before letters); "LANG - W4" demotes the close-pair rival
   103   {"abkhazian", "ab", ABKHAZIAN + W10, 0},
   104   {"afar", "aa", AFAR + W10, 0},
   105   {"afrikaans", "af", AFRIKAANS + W10, 0},
   106   {"akan", "ak", AKAN + W10, 0},
   107   {"albanian", "sq", ALBANIAN + W10, 0},
   108   {"am-am", "hy", ARMENIAN + W10, 0},        // 1:2 Armenian, not ambiguous
   109   {"amharic", "am", AMHARIC + W10, 0},
   110   {"arabic", "ar", ARABIC + W10, 0},
   111   {"argentina", "es", SPANISH + W10, 0},
   112   {"armenian", "hy", ARMENIAN + W10, 0},
   113   {"assamese", "as", ASSAMESE + W10, 0},
   114   {"aymara", "ay", AYMARA + W10, 0},
   115   {"azerbaijani", "az", AZERBAIJANI + W10, 0},
   117   {"bangla", "bn", BENGALI + W10, 0},
   118   {"bashkir", "ba", BASHKIR + W10, 0},
   119   {"basque", "eu", BASQUE + W10, 0},
   120   {"belarusian", "be", BELARUSIAN + W10, 0},
   121   {"bengali", "bn", BENGALI + W10, 0},
   122   {"bihari", "bh", BIHARI + W10, HINDI - W4},
   123   {"bislama", "bi", BISLAMA + W10, 0},
   124   {"bosnian", "bs", BOSNIAN + W10, 0},      // Bosnian => Bosnian
   125   {"br-br", "pt", PORTUGUESE + W10, 0},     // 1:2 Portuguese, not ambiguous
   126   {"br-fr", "br", BRETON + W10, 0},         // 1:2 Breton, not ambiguous
   127   {"breton", "br", BRETON + W10, 0},
   128   {"bulgarian", "bg", BULGARIAN + W10, 0},
   129   {"burmese", "my", BURMESE + W10, 0},      // Myanmar
   131   {"catalan", "ca", CATALAN + W10, 0},
   132   {"cherokee", "chr", CHEROKEE + W10, 0},
   133   {"chichewa", "ny", NYANJA + W10, 0},
   135   {"chinese", "zh", CHINESE + W10, 0},
   136   {"chinese-t", "zhT", CHINESE_T + W10, 0},
   137   {"chineset", "zhT", CHINESE_T + W10, 0},
   138   {"corsican", "co", CORSICAN + W10, 0},
   139   {"cpf-hat", "ht", HAITIAN_CREOLE + W10, 0}, // Creole, French-based
   140   {"croatian", "hr", CROATIAN + W10, 0},
   141   {"czech", "cs", CZECH + W10, SLOVAK - W4},
   143   {"danish", "da", DANISH + W10, NORWEGIAN - W4},
   144   {"deutsch", "de", GERMAN + W10, 0},
   145   {"dhivehi", "dv", DHIVEHI + W10, 0},
   146   {"dutch", "nl", DUTCH + W10, 0},
   147   {"dzongkha", "dz", DZONGKHA + W10,  TIBETAN - W4},
   149   {"ell-gr", "el", GREEK + W10, 0},
   150   {"english", "en", ENGLISH + W4, 0},
   151   {"esperanto", "eo", ESPERANTO + W10, 0},
   152   {"estonian", "et", ESTONIAN + W10, 0},
   153   {"euc-jp", "ja", JAPANESE + W10, 0},       // Japanese encoding
   154   {"euc-kr", "ko", KOREAN + W10, 0},         // Korean encoding
   156   {"faroese", "fo", FAROESE + W10, ICELANDIC - W4},
   157   {"fijian", "fj", FIJIAN + W10, 0},
   158   {"finnish", "fi", FINNISH + W10, 0},
   159   {"fran", "fr", FRENCH + W10, 0},            // Truncated at non-ASCII
   160   {"francais", "fr", FRENCH + W10, 0},
   161   {"french", "fr", FRENCH + W10, 0},
   162   {"frisian", "fy", FRISIAN + W10, 0},
   164   {"ga-es", "gl", GALICIAN + W10, 0},         // 1:2 Galician, not ambiguous
   165   {"galician", "gl", GALICIAN + W10, 0},
   166   {"ganda", "lg", GANDA + W10, 0},
   167   {"georgian", "ka", GEORGIAN + W10, 0},
   168   {"german", "de", GERMAN + W10, 0},
   169   {"greek", "el", GREEK + W10, 0},
   170   {"greenlandic", "kl", GREENLANDIC + W10, 0},
   171   {"guarani", "gn", GUARANI + W10, 0},
   172   {"gujarati", "gu", GUJARATI + W10, 0},
   174   {"haitian_creole", "ht", HAITIAN_CREOLE + W10, 0},
   175   {"hausa", "ha", HAUSA + W10, 0},
   176   {"hawaiian", "haw", HAWAIIAN + W10, 0},
   177   {"hebrew", "iw", HEBREW + W10, 0},
   178   {"hindi", "hi", HINDI + W10, MARATHI - W4},
   179   {"hn-in", "hi", HINDI + W10, MARATHI - W4},
   180   {"hungarian", "hu", HUNGARIAN + W10, 0},
   182   {"icelandic", "is", ICELANDIC + W10, FAROESE - W4},
   183   {"igbo", "ig", IGBO + W10, 0},
   184   {"indonesian", "id", INDONESIAN + W10, MALAY - W4},
   185   {"interlingua", "ia", INTERLINGUA + W10, 0},
   186   {"interlingue", "ie", INTERLINGUE + W10, 0},
   187   // 1:2 iu-Cans ik-Latn
   188   {"inuktitut", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2
   189   {"inupiak", "ik,iu", INUPIAK + W10, INUKTITUT + W10},   // 1:2
   190   {"ir-ie", "ga", IRISH + W10, 0},          // Irish
   191   {"irish", "ga", IRISH + W10, 0},
   192   {"italian", "it", ITALIAN + W10, 0},
   194   {"ja-euc", "ja", JAPANESE + W10, 0},      // Japanese encoding
   195   {"jan-jp", "ja", JAPANESE + W10, 0},      // Japanese encoding
   196   {"japanese", "ja", JAPANESE + W10, 0},
   197   {"javanese", "jw", JAVANESE + W10, 0},
   199   {"kannada", "kn", KANNADA + W10, 0},
   200   {"kashmiri", "ks", KASHMIRI + W10, 0},
   201   {"kazakh", "kk", KAZAKH + W10, 0},
   202   {"khasi", "kha", KHASI + W10, 0},
   203   {"khmer", "km", KHMER + W10, 0},
   204   {"kinyarwanda", "rw", KINYARWANDA + W10, 0},
   205   {"klingon", "tlh", X_KLINGON + W10, 0},
   206   {"korean", "ko", KOREAN + W10, 0},
   207   {"kurdish", "ku", KURDISH + W10, 0},
   208   {"kyrgyz", "ky", KYRGYZ + W10, 0},
   210   {"laothian", "lo", LAOTHIAN + W10, 0},
   211   {"latin", "la", LATIN + W10, 0},
   212   {"latvian", "lv", LATVIAN + W10, 0},
   213   {"limbu", "sit", LIMBU + W10, 0},
   214   {"lingala", "ln", LINGALA + W10, 0},
   215   {"lithuanian", "lt", LITHUANIAN + W10, 0},
   216   {"luxembourgish", "lb", LUXEMBOURGISH + W10, 0},
   218   {"macedonian", "mk", MACEDONIAN + W10, 0},
   219   {"malagasy", "mg", MALAGASY + W10, 0},
   220   {"malay", "ms", MALAY + W10, INDONESIAN - W4},
   221   {"malayalam", "ml", MALAYALAM + W10, 0},
   222   {"maltese", "mt", MALTESE + W10, 0},
   223   {"manx", "gv", MANX + W10, 0},
   224   {"maori", "mi", MAORI + W10, 0},
   225   {"marathi", "mr", MARATHI + W10, HINDI - W4},
   226   {"mauritian_creole", "mfe", MAURITIAN_CREOLE + W10, 0},
   227   {"moldavian", "mo", ROMANIAN + W10, 0},
   228   {"mongolian", "mn", MONGOLIAN + W10, 0},
   229   {"montenegrin", "sr-me", MONTENEGRIN + W10, 0},
   230   {"myanmar", "my", BURMESE + W10, 0},      // Myanmar
   231   {"nauru", "na", NAURU + W10, 0},
   232   {"ndebele", "nr", NDEBELE + W10, 0},
   233   {"nepali", "ne", NEPALI + W10, 0},
   234   {"no-bok", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},       // Bokmaal
   235   {"no-bokmaal", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
   236   {"no-nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},        // Bokmaal
   237   {"no-no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
   238   {"no-nyn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},       // Nynorsk
   239   {"no-nynorsk", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},
   240   {"norwegian", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
   241   {"norwegian_n", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},
   242   {"nyanja", "ny", NYANJA + W10, 0},
   244   {"occitan", "oc", OCCITAN + W10, 0},
   245   {"oriya", "or", ORIYA + W10, 0},
   246   {"oromo", "om", OROMO + W10, 0},
   247   {"parsi", "fa", PERSIAN + W10, 0},
   249   {"pashto", "ps", PASHTO + W10, 0},
   250   {"pedi", "nso", PEDI + W10, 0},
   251   {"persian", "fa", PERSIAN + W10, 0},
   252   {"polish", "pl", POLISH + W10, 0},
   253   {"polska", "pl", POLISH + W10, 0},
   254   {"polski", "pl", POLISH + W10, 0},
   255   {"portugu", "pt", PORTUGUESE + W10, 0},     // Truncated at non-ASCII
   256   {"portuguese", "pt", PORTUGUESE + W10, 0},
   257   {"punjabi", "pa", PUNJABI + W10, 0},
   259   {"quechua", "qu", QUECHUA + W10, 0},
   261   {"rhaeto_romance", "rm", RHAETO_ROMANCE + W10, 0},
   262   {"romanian", "ro", ROMANIAN + W10, 0},
   263   {"rundi", "rn", RUNDI + W10, 0},
   264   {"russian", "ru", RUSSIAN + W10, 0},
   266   {"samoan", "sm", SAMOAN + W10, 0},
   267   {"sango", "sg", SANGO + W10, 0},
   268   {"sanskrit", "sa", SANSKRIT + W10, 0},
   269   {"scots", "sco", SCOTS + W10, ENGLISH - W4},
   270   {"scots_gaelic", "gd", SCOTS_GAELIC + W10, 0},
   271   {"serbian", "sr", SERBIAN + W10, 0},
   272   {"seselwa", "crs", SESELWA + W10, 0},
   273   {"sesotho", "st", SESOTHO + W10, 0},
   274   {"shift-jis", "ja", JAPANESE + W10, 0},   // Japanese encoding
   275   {"shift-js", "ja", JAPANESE + W10, 0},    // Japanese encoding
   276   {"shona", "sn", SHONA + W10, 0},
   277   {"si-lk", "si", SINHALESE + W10, 0},      // 1:2 Sri Lanka, not ambiguous
   278   {"si-si", "sl", SLOVENIAN + W10, 0},      // 1:2 Slovenia, not ambiguous
   279   {"si-sl", "sl", SLOVENIAN + W10, 0},      // 1:2 Slovenia, not ambiguous
   280   {"sindhi", "sd", SINDHI + W10, 0},
   281   {"sinhalese", "si", SINHALESE + W10, 0},
   282   {"siswant", "ss", SISWANT + W10, 0},
   283   {"sit-np", "sit", LIMBU + W10, 0},
   284   {"slovak", "sk", SLOVAK + W10, CZECH - W4},
   285   {"slovenian", "sl", SLOVENIAN + W10, 0},
   286   {"somali", "so", SOMALI + W10, 0},
   287   {"spanish", "es", SPANISH + W10, 0},
   288   {"sr-me", "sr-me", MONTENEGRIN + W10, 0}, // Montenegrin => Montenegrin
   289   {"sundanese", "su", SUNDANESE + W10, 0},
   290   {"suomi", "fi", FINNISH + W10, 0},        // Finnish
   291   {"swahili", "sw", SWAHILI + W10, 0},
   292   {"swedish", "sv", SWEDISH + W10, 0},
   293   {"syriac", "syr", SYRIAC + W10, 0},
   295   {"tagalog", "tl", TAGALOG + W10, 0},
   296   {"tajik", "tg", TAJIK + W10, 0},
   297   {"tamil", "ta", TAMIL + W10, 0},
   298   {"tatar", "tt", TATAR + W10, 0},
   299   {"tb-tb", "bo", TIBETAN + W10, DZONGKHA - W4},        // Tibet
   300   {"tchinese", "zhT", CHINESE_T + W10, 0},
   301   {"telugu", "te", TELUGU + W10, 0},
   302   {"thai", "th", THAI + W10, 0},
   303   {"tibetan", "bo", TIBETAN + W10, DZONGKHA - W4},
   304   {"tigrinya", "ti", TIGRINYA + W10, 0},
   305   {"tonga", "to", TONGA + W10, 0},
   306   {"tsonga", "ts", TSONGA + W10, 0},
   307   {"tswana", "tn", TSWANA + W10, 0},
   308   {"tt-ru", "tt", TATAR + W10, 0},
   309   {"tur-tr", "tr", TURKISH + W10, 0},
   310   {"turkish", "tr", TURKISH + W10, 0},
   311   {"turkmen", "tk", TURKMEN + W10, 0},
   312   {"uighur", "ug", UIGHUR + W10, 0},
   313   {"ukrainian", "uk", UKRAINIAN + W10, 0},
   314   {"urdu", "ur", URDU + W10, 0},
   315   {"uzbek", "uz", UZBEK + W10, 0},
   317   {"venda", "ve", VENDA + W10, 0},
   318   {"vietnam", "vi", VIETNAMESE + W10, 0},
   319   {"vietnamese", "vi", VIETNAMESE + W10, 0},
   320   {"volapuk", "vo", VOLAPUK + W10, 0},
   322   {"welsh", "cy", WELSH + W10, 0},
   323   {"wolof", "wo", WOLOF + W10, 0},
   325   {"xhosa", "xh", XHOSA + W10, ZULU - W4},
   327   {"yiddish", "yi", YIDDISH + W10, 0},
   328   {"yoruba", "yo", YORUBA + W10, 0},
   330   {"zh-classical", "zhT", CHINESE_T + W10, 0},
   331   {"zh-cn", "zh", CHINESE + W10, 0},
   332   {"zh-hans", "zh", CHINESE + W10, 0},
   333   {"zh-hant", "zhT", CHINESE_T + W10, 0},
   334   {"zh-hk", "zhT", CHINESE_T + W10, 0},
   335   {"zh-min-nan", "zhT", CHINESE_T + W10, 0}, // Min Nan => ChineseT
   336   {"zh-sg", "zhT", CHINESE_T + W10, 0},
   337   {"zh-tw", "zhT", CHINESE_T + W10, 0},
   338   {"zh-yue", "zh", CHINESE + W10, 0},       // Yue (Cantonese) => Chinese
   339   {"zhuang", "za", ZHUANG + W10, 0},
   340   {"zulu", "zu", ZULU + W10, XHOSA - W4},
   341 };
   345 // Table to look up lang= tags of two/three characters after truncate at hyphen
   346 // In alphabetical order for binary search
   347 static const int kCLDTable2Size = 257;  // Row count of the table below; update together with the rows
   348 static const LangTagLookup kCLDLangTagsHintTable2[kCLDTable2Size] = {  // Row: {2-3 letter tag, canonical code(s), prior1, prior2}; rows are in ASCII order; "LANG - W4" demotes the close-pair rival
   349   {"aa", "aa", AFAR + W10, 0},
   350   {"ab", "ab", ABKHAZIAN + W10, 0},
   351   {"af", "af", AFRIKAANS + W10, 0},
   352   {"ak", "ak", AKAN + W10, 0},
   353   {"al", "sq", ALBANIAN + W10, 0},          // Albania
   354   {"am", "am,hy", AMHARIC + W10, ARMENIAN + W10},  // 1:2 Amharic Armenian
   355   {"ar", "ar", ARABIC + W10, 0},
   356   {"ara", "ar", ARABIC + W10, 0},
   357   {"arm", "hy", ARMENIAN + W10, 0},         // Armenia
   358   {"arz", "ar", ARABIC + W10, 0},           // Egyptian Arabic
   359   {"as", "as", ASSAMESE + W10, 0},
   360   {"at", "de", GERMAN + W10, 0},            // Austria
   361   {"au", "de", GERMAN + W10, 0},            // Austria
   362   {"ay", "ay", AYMARA + W10, 0},
   363   {"az", "az", AZERBAIJANI + W10, 0},
   364   {"aze", "az", AZERBAIJANI + W10, 0},
   366   {"ba", "ba,bs", BASHKIR + W10, BOSNIAN + W10},  // 1:2  Bashkir Bosnia
   367   {"be", "be", BELARUSIAN + W10, 0},
   368   {"bel", "be", BELARUSIAN + W10, 0},
   369   {"bg", "bg", BULGARIAN + W10, 0},
   370   {"bh", "bh", BIHARI + W10, HINDI - W4},
   371   {"bi", "bi", BISLAMA + W10, 0},
   372   {"big", "zhT", CHINESE_T + W10, 0},        // Big5 encoding
   373   {"bm", "ms", MALAY + W10, INDONESIAN - W4},             // Bahasa Malaysia
   374   {"bn", "bn", BENGALI + W10, 0},
   375   {"bo", "bo", TIBETAN + W10, DZONGKHA - W4},
   376   // 1:2 Breton, Brazil country code, both Latn .br TLD enough for pt to win
   377   {"br", "br,pt", BRETON + W10, PORTUGUESE + W8}, // 1:2 Breton, Brazil
   378   {"bs", "bs", BOSNIAN + W10, 0},           // Bosnian => Bosnian
   380   {"ca", "ca", CATALAN + W10, 0},
   381   {"cat", "ca", CATALAN + W10, 0},
   382   {"ch", "de,fr", GERMAN + W10, FRENCH + W10},    // 1:2 Switzerland
   383   {"chn", "zh", CHINESE + W10, 0},
   384   {"chr", "chr", CHEROKEE + W10, 0},
   385   {"ckb", "ku", KURDISH + W10, 0},          // Central Kurdish
   386   {"cn", "zh,zhT", CHINESE + W6, CHINESE_T + W4},   // Ambiguous, so weaker.
   387                                                 // Offset by 2 so that TLD=tw or
   388                                                 // enc=big5 will put zhT ahead
   389   {"co", "co", CORSICAN + W10, 0},
   390   {"cro", "hr", CROATIAN + W10, 0},          // Croatia
   391   {"crs", "crs", SESELWA + W10, 0},
   392   {"cs", "cs", CZECH + W10, SLOVAK - W4},
   393   {"ct", "ca", CATALAN + W10, 0},
   394   {"cy", "cy", WELSH + W10, 0},
   395   {"cym", "cy", WELSH + W10, 0},
   396   {"cz", "cs", CZECH + W10, SLOVAK - W4},
   398   {"da", "da", DANISH + W10, NORWEGIAN - W4},
   399   {"dan", "da", DANISH + W10, NORWEGIAN - W4},
   400   {"de", "de", GERMAN + W10, 0},
   401   {"deu", "de", GERMAN + W10, 0},
   402   {"div", "dv", DHIVEHI + W10, 0},
   403   {"dk", "da", DANISH + W10, NORWEGIAN - W4},            // Denmark
   404   {"dut", "nl", DUTCH + W10, 0},            // Dutch
   405   {"dv", "dv", DHIVEHI + W10, 0},
   406   {"dz", "dz", DZONGKHA + W10, TIBETAN - W4},
   408   {"ee", "et", ESTONIAN + W10, 0},          // Estonia
   409   {"eg", "ar", ARABIC + W10, 0},            // Egypt
   410   {"el", "el", GREEK + W10, 0},
   411   {"en", "en", ENGLISH + W4, 0},
   412   {"eng", "en", ENGLISH + W4, 0},
   413   {"eo", "eo", ESPERANTO + W10, 0},
   414   {"er", "ur", URDU + W10, 0},              // "Erdu"
   415   {"es", "es", SPANISH + W10, 0},
   416   {"esp", "es", SPANISH + W10, 0},
   417   {"est", "et", ESTONIAN + W10, 0},
   418   {"et", "et", ESTONIAN + W10, 0},
   419   {"eu", "eu", BASQUE + W10, 0},
   421   {"fa", "fa", PERSIAN + W10, 0},
   422   {"far", "fa", PERSIAN + W10, 0},
   423   {"fi", "fi", FINNISH + W10, 0},
   424   {"fil", "tl", TAGALOG + W10, 0},          // Philippines
   425   {"fj", "fj", FIJIAN + W10, 0},
   426   {"fo", "fo", FAROESE + W10, ICELANDIC - W4},
   427   {"fr", "fr", FRENCH + W10, 0},
   428   {"fra", "fr", FRENCH + W10, 0},
   429   {"fre", "fr", FRENCH + W10, 0},
   430   {"fy", "fy", FRISIAN + W10, 0},
   432   {"ga", "ga,gl", IRISH + W10, GALICIAN + W10},       // 1:2 Irish, Galician
   433   {"gae", "gd,ga", SCOTS_GAELIC + W10, IRISH + W10},  // 1:2 Gaelic, either
   434   {"gal", "gl", GALICIAN + W10, 0},
   435   {"gb", "zh", CHINESE + W10, 0},           // GB2312 encoding
   436   {"gbk", "zh", CHINESE + W10, 0},          // GBK encoding
   437   {"gd", "gd", SCOTS_GAELIC + W10, 0},
   438   {"ge", "ka", GEORGIAN + W10, 0},          // Georgia
   439   {"geo", "ka", GEORGIAN + W10, 0},
   440   {"ger", "de", GERMAN + W10, 0},
   441   {"gl", "gl", GALICIAN + W10, 0},          // Also Greenland; hard to confuse
   442   {"gn", "gn", GUARANI + W10, 0},
   443   {"gr", "el", GREEK + W10, 0},             // Greece
   444   {"gu", "gu", GUJARATI + W10, 0},
   445   {"gv", "gv", MANX + W10, 0},
   447   {"ha", "ha", HAUSA + W10, 0},
   448   {"hat", "ht", HAITIAN_CREOLE + W10, 0},   // Haiti
   449   {"haw", "haw", HAWAIIAN + W10, 0},
   450   {"hb", "iw", HEBREW + W10, 0},
   451   {"he", "iw", HEBREW + W10, 0},
   452   {"heb", "iw", HEBREW + W10, 0},
   453   {"hi", "hi", HINDI + W10, MARATHI - W4},
   454   {"hk", "zhT", CHINESE_T + W10, 0},          // Hong Kong
   455   {"hr", "hr", CROATIAN + W10, 0},
   456   {"ht", "ht", HAITIAN_CREOLE + W10, 0},
   457   {"hu", "hu", HUNGARIAN + W10, 0},
   458   {"hun", "hu", HUNGARIAN + W10, 0},
   459   {"hy", "hy", ARMENIAN + W10, 0},
   461   {"ia", "ia", INTERLINGUA + W10, 0},
   462   {"ice", "is", ICELANDIC + W10, FAROESE - W4},        // Iceland
   463   {"id", "id", INDONESIAN + W10, MALAY - W4},
   464   {"ids", "id", INDONESIAN + W10, MALAY - W4},
   465   {"ie", "ie", INTERLINGUE + W10, 0},
   466   {"ig", "ig", IGBO + W10, 0},
   467   // 1:2 iu-Cans ik-Latn
   468   {"ik", "ik,iu", INUPIAK + W10, INUKTITUT + W10},        // 1:2
   469   {"in", "id", INDONESIAN + W10, MALAY - W4},
   470   {"ind", "id", INDONESIAN + W10, MALAY - W4},       // Indonesia
   471   {"inu", "iu,ik", INUKTITUT + W10, INUPIAK + W10},       // 1:2
   472   {"is", "is", ICELANDIC + W10, FAROESE - W4},
   473   {"it", "it", ITALIAN + W10, 0},
   474   {"ita", "it", ITALIAN + W10, 0},
   475   {"iu", "iu,ik", INUKTITUT + W10, INUPIAK + W10},        // 1:2
   476   {"iw", "iw", HEBREW + W10, 0},
   478   {"ja", "ja", JAPANESE + W10, 0},
   479   {"jp", "ja", JAPANESE + W10, 0},          // Japan
   480   {"jpn", "ja", JAPANESE + W10, 0},
   481   {"jv", "jw", JAVANESE + W10, 0},
   482   {"jw", "jw", JAVANESE + W10, 0},
   484   {"ka", "ka", GEORGIAN + W10, 0},
   485   {"kc", "qu", QUECHUA + W10, 0},           // (K)Quechua
   486   {"kg", "ky", KYRGYZ + W10, 0},            // Kyrgyzstan
   487   {"kh", "km", KHMER + W10, 0},             // Country code Khmer (Cambodia)
   488   {"kha", "kha", KHASI + W10, 0},
   489   {"kk", "kk", KAZAKH + W10, 0},            // Kazakh
   490   {"kl", "kl", GREENLANDIC + W10, 0},
   491   {"km", "km", KHMER + W10, 0},
   492   {"kn", "kn", KANNADA + W10, 0},
   493   {"ko", "ko", KOREAN + W10, 0},
   494   {"kor", "ko", KOREAN + W10, 0},
   495   {"kr", "ko", KOREAN + W10, 0},            // Country code Korea
   496   {"ks", "ks", KASHMIRI + W10, 0},
   497   {"ksc", "ko", KOREAN + W10, 0},           // KSC encoding
   498   {"ku", "ku", KURDISH + W10, 0},
   499   {"ky", "ky", KYRGYZ + W10, 0},
   500   {"kz", "kk", KAZAKH + W10, 0},            // Kazakhstan
   501   {"la", "la", LATIN + W10, 0},
   502   {"lao", "lo", LAOTHIAN + W10, 0},         // Laos
   504   {"lb", "lb", LUXEMBOURGISH + W10, 0},
   505   {"lg", "lg", GANDA + W10, 0},
   506   {"lit", "lt", LITHUANIAN + W10, 0},
   507   {"ln", "ln", LINGALA + W10, 0},
   508   {"lo", "lo", LAOTHIAN + W10, 0},
   509   {"lt", "lt", LITHUANIAN + W10, 0},
   510   {"ltu", "lt", LITHUANIAN + W10, 0},
   511   {"lv", "lv", LATVIAN + W10, 0},
   513   {"mfe", "mfe", MAURITIAN_CREOLE + W10, 0},
   514   {"mg", "mg", MALAGASY + W10, 0},
   515   {"mi", "mi", MAORI + W10, 0},
   516   {"mk", "mk", MACEDONIAN + W10, 0},
   517   {"ml", "ml", MALAYALAM + W10, 0},
   518   {"mn", "mn", MONGOLIAN + W10, 0},
   519   {"mo", "mo", ROMANIAN + W10, 0},
   520   {"mon", "mn", MONGOLIAN + W10, 0},        // Mongolian
   521   {"mr", "mr", MARATHI + W10, HINDI - W4},
   522   {"ms", "ms", MALAY + W10, INDONESIAN - W4},
   523   {"mt", "mt", MALTESE + W10, 0},
   524   {"mx", "es", SPANISH + W10, 0},           // Mexico
   525   {"my", "my,ms", BURMESE + W10, MALAY + W10}, // Myanmar, Malaysia
   527   {"na", "na", NAURU + W10, 0},
   528   {"nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
   529   {"ne", "ne", NEPALI + W10, 0},
   530   {"nl", "nl", DUTCH + W10, 0},
   531   {"nn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},
   532   {"no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
   533   {"nr", "nr", NDEBELE + W10, 0},
   534   {"nso", "nso", PEDI + W10, 0},
   535   {"ny", "ny", NYANJA + W10, 0},
   537   {"oc", "oc", OCCITAN + W10, 0},
   538   {"om", "om", OROMO + W10, 0},
   539   {"or", "or", ORIYA + W10, 0},
   541   {"pa", "pa,ps", PUNJABI + W10, PASHTO + W10},   // 1:2 pa-Guru ps-Arab
   542   {"per", "fa", PERSIAN + W10, 0},
   543   {"ph", "tl", TAGALOG + W10, 0},           // Philippines
   544   {"pk", "ur", URDU + W10, 0},              // Pakistan
   545   {"pl", "pl", POLISH + W10, 0},
   546   {"pnb", "pa", PUNJABI + W10, 0},          // Western Punjabi
   547   {"pol", "pl", POLISH + W10, 0},
   548   {"por", "pt", PORTUGUESE + W10, 0},
   549   {"ps", "ps", PASHTO + W10, 0},
   550   {"pt", "pt", PORTUGUESE + W10, 0},
   551   {"ptg", "pt", PORTUGUESE + W10, 0},
   552   {"qc", "fr", FRENCH + W10, 0},            // Quebec "country" code
   553   {"qu", "qu", QUECHUA + W10, 0},
   555   {"rm", "rm", RHAETO_ROMANCE + W10, 0},
   556   {"rn", "rn", RUNDI + W10, 0},
   557   {"ro", "ro", ROMANIAN + W10, 0},
   558   {"rs", "sr", SERBIAN + W10, 0},           // Serbia country code
   559   {"ru", "ru", RUSSIAN + W10, 0},
   560   {"rus", "ru", RUSSIAN + W10, 0},
   561   {"rw", "rw", KINYARWANDA + W10, 0},
   563   {"sa", "sa", SANSKRIT + W10, 0},
   564   {"sco", "sco", SCOTS + W10, ENGLISH - W4},
   565   {"sd", "sd", SINDHI + W10, 0},
   566   {"se", "sv", SWEDISH + W10, 0},
   567   {"sg", "sg", SANGO + W10, 0},
   568   {"si", "si,sl", SINHALESE + W10, SLOVENIAN + W10},  // 1:2 Sinhalese, Slovenia
   569   {"sk", "sk", SLOVAK + W10, CZECH - W4},
   570   {"sl", "sl", SLOVENIAN + W10, 0},
   571   {"slo", "sl", SLOVENIAN + W10, 0},
   572   {"sm", "sm", SAMOAN + W10, 0},
   573   {"sn", "sn", SHONA + W10, 0},
   574   {"so", "so", SOMALI + W10, 0},
   575   {"sp", "es", SPANISH + W10, 0},
   576   {"sq", "sq", ALBANIAN + W10, 0},
   577   {"sr", "sr", SERBIAN + W10, 0},
   578   {"srb", "sr", SERBIAN + W10, 0},
   579   {"srl", "sr", SERBIAN + W10, 0},          // Serbian Latin
   580   {"srp", "sr", SERBIAN + W10, 0},
   581   {"ss", "ss", SISWANT + W10, 0},
   582   {"st", "st", SESOTHO + W10, 0},
   583   {"su", "su", SUNDANESE + W10, 0},
   584   {"sv", "sv", SWEDISH + W10, 0},
   585   {"sve", "sv", SWEDISH + W10, 0},
   586   {"sw", "sw", SWAHILI + W10, 0},
   587   {"swe", "sv", SWEDISH + W10, 0},
   588   {"sy", "syr", SYRIAC + W10, 0},
   589   {"syr", "syr", SYRIAC + W10, 0},
   591   {"ta", "ta", TAMIL + W10, 0},
   592   {"te", "te", TELUGU + W10, 0},
   593   {"tg", "tg", TAJIK + W10, 0},
   594   {"th", "th", THAI + W10, 0},
   595   {"ti", "ti,bo", TIGRINYA + W10, TIBETAN + W10},    // 1:2 Tigrinya, Tibet
   596   {"tj", "tg", TAJIK + W10, 0},             // Tajikistan
   597   {"tk", "tk", TURKMEN + W10, 0},
   598   {"tl", "tl", TAGALOG + W10, 0},
   599   {"tlh", "tlh", X_KLINGON + W10, 0},
   600   {"tn", "tn", TSWANA + W10, 0},
   601   {"to", "to", TONGA + W10, 0},
   602   {"tr", "tr", TURKISH + W10, 0},
   603   {"ts", "ts", TSONGA + W10, 0},
   604   {"tt", "tt", TATAR + W10, 0},
   605   {"tw", "ak,zhT", AKAN + W10, CHINESE_T + W10},   // 1:2 Twi => Akan, Taiwan
   606   {"twi", "ak", AKAN + W10, 0},             // Twi => Akan
   608   {"ua", "uk", UKRAINIAN + W10, 0},         // Ukraine
   609   {"ug", "ug", UIGHUR + W10, 0},
   610   {"uk", "uk", UKRAINIAN + W10, 0},
   611   {"ur", "ur", URDU + W10, 0},
   612   {"uz", "uz", UZBEK + W10, 0},
   614   {"va", "ca", CATALAN + W10, 0},           // Valencia => Catalan
   615   {"val", "ca", CATALAN + W10, 0},          // Valencia => Catalan
   616   {"ve", "ve", VENDA + W10, 0},
   617   {"vi", "vi", VIETNAMESE + W10, 0},
   618   {"vie", "vi", VIETNAMESE + W10, 0},
   619   {"vn", "vi", VIETNAMESE + W10, 0},
   620   {"vo", "vo", VOLAPUK + W10, 0},
   622   {"wo", "wo", WOLOF + W10, 0},
   624   {"xh", "xh", XHOSA + W10, ZULU - W4},
   625   {"xho", "xh", XHOSA + W10, ZULU - W4},
   627   {"yi", "yi", YIDDISH + W10, 0},
   628   {"yo", "yo", YORUBA + W10, 0},
   630   {"za", "za", ZHUANG + W10, 0},
   631   {"zh", "zh", CHINESE + W10, 0},
   632   {"zht", "zhT", CHINESE_T + W10, 0},
   633   {"zu", "zu", ZULU + W10, XHOSA - W4},
   634 };
   637 // Possibly map to tl:
   638 // -LangTags tl-Latn /7val.com/ ,bcl 2 Central Bicolano
   639 // -LangTags tl-Latn /7val.com/ ,ceb 6 Cebuano
   640 // -LangTags tl-Latn /7val.com/ ,war 1 Waray
   644 // Table to look up country TLD (no general TLD)
   645 // In alphabetical order for binary search
   646 static const int kCLDTable3Size = 181;
   647 static const TLDLookup kCLDTLDHintTable[kCLDTable3Size] = {
   648   {"ac", JAPANESE + W2, 0},
   649   {"ad", CATALAN + W4, 0},
   650   {"ae", ARABIC + W4, 0},
   651   {"af", PASHTO + W4, PERSIAN + W4},
   652   {"ag", GERMAN + W2, 0},                // meager
   653   // {"ai", 0, 0},                          // meager
   654   {"al", ALBANIAN + W4, 0},
   655   {"am", ARMENIAN + W4, 0},
   656   {"an", DUTCH + W4, 0},                 // meager
   657   {"ao", PORTUGUESE + W4, 0},
   658   // {"aq", 0, 0},                          // meager
   659   {"ar", SPANISH + W4, 0},
   660   // {"as", 0, 0},
   661   {"at", GERMAN + W4, 0},
   662   {"au", ENGLISH + W2, 0},
   663   {"aw", DUTCH + W4, 0},
   664   {"ax", SWEDISH + W4, 0},
   665   {"az", AZERBAIJANI + W4, 0},
   667   {"ba", BOSNIAN + W8, CROATIAN - W4},
   668   // {"bb", 0, 0},
   669   {"bd", BENGALI + W4, 0},
   670   {"be", DUTCH + W4, FRENCH + W4},
   671   {"bf", FRENCH + W4, 0},
   672   {"bg", BULGARIAN + W4, 0},
   673   {"bh", ARABIC + W4, 0},
   674   {"bi", RUNDI + W4, FRENCH + W4},
   675   {"bj", FRENCH + W4, 0},
   676   {"bm", ENGLISH + W2, 0},
   677   {"bn", MALAY + W4, INDONESIAN - W4},
   678   {"bo", SPANISH + W4, AYMARA + W2},   // and GUARANI QUECHUA
   679   {"br", PORTUGUESE + W4, 0},
   680   // {"bs", 0, 0},
   681   {"bt", DZONGKHA + W10, TIBETAN - W10},      // Strong presumption of Dzongha
   682   {"bw", TSWANA + W4, 0},
   683   {"by", BELARUSIAN + W4, 0},
   684   // {"bz", 0, 0},
   686   {"ca", FRENCH + W4, ENGLISH + W2},
   687   {"cat", CATALAN + W4, 0},
   688   {"cc", 0, 0},
   689   {"cd", FRENCH + W4, 0},
   690   {"cf", FRENCH + W4, 0},
   691   {"cg", FRENCH + W4, 0},
   692   {"ch", GERMAN + W4, FRENCH + W4},
   693   {"ci", FRENCH + W4, 0},
   694   // {"ck", 0, 0},
   695   {"cl", SPANISH + W4, 0},
   696   {"cm", FRENCH + W4, 0},
   697   {"cn", CHINESE + W4, 0},
   698   {"co", SPANISH + W4, 0},
   699   {"cr", SPANISH + W4, 0},
   700   {"cu", SPANISH + W4, 0},
   701   {"cv", PORTUGUESE + W4, 0},
   702   // {"cx", 0, 0},
   703   {"cy", GREEK + W4, TURKISH + W4},
   704   {"cz", CZECH + W4, SLOVAK - W4},
   706   {"de", GERMAN + W4, 0},
   707   {"dj", 0, 0},
   708   {"dk", DANISH + W4, NORWEGIAN - W4},
   709   {"dm", 0, 0},
   710   {"do", SPANISH + W4, 0},
   711   {"dz", FRENCH + W4, ARABIC + W4},
   713   {"ec", SPANISH + W4, 0},
   714   {"ee", ESTONIAN + W4, 0},
   715   {"eg", ARABIC + W4, 0},
   716   {"er", AFAR + W4, 0},
   717   {"es", SPANISH + W4, 0},
   718   {"et", AMHARIC + W4, AFAR + W4},
   720   {"fi", FINNISH + W4, 0},
   721   {"fj", FIJIAN + W4, 0},
   722   // {"fk", 0, 0},
   723   // {"fm", 0, 0},
   724   {"fo", FAROESE + W4, ICELANDIC - W4},
   725   {"fr", FRENCH + W4, 0},
   727   {"ga", FRENCH + W4, 0},
   728   {"gd", 0, 0},
   729   {"ge", GEORGIAN + W4, 0},
   730   {"gf", FRENCH + W4, 0},
   731   // {"gg", 0, 0},
   732   // {"gh", 0, 0},
   733   // {"gi", 0, 0},
   734   {"gl", GREENLANDIC + W4, DANISH + W4},
   735   // {"gm", 0, 0},
   736   {"gn", FRENCH + W4, 0},
   737   // {"gp", 0, 0},
   738   // {"gq", 0, 0},
   739   {"gr", GREEK + W4, 0},
   740   // {"gs", 0, 0},
   741   {"gt", SPANISH + W4, 0},
   742   // {"gu", 0, 0},
   743   // {"gy", 0, 0},
   745   {"hk", CHINESE_T + W4, 0},
   746   // {"hm", 0, 0},
   747   {"hn", SPANISH + W4, 0},
   748   {"hr", CROATIAN + W8, BOSNIAN - W4},
   749   {"ht", HAITIAN_CREOLE + W4, FRENCH + W4},
   750   {"hu", HUNGARIAN + W4, 0},
   752   {"id", INDONESIAN + W4, MALAY - W4},
   753   {"ie", IRISH + W4, 0},
   754   {"il", HEBREW + W4, 0},
   755   {"im", MANX + W4, 0},
   756   // {"in", 0, 0},
   757   // {"io", 0, 0},
   758   {"iq", ARABIC + W4, 0},
   759   {"ir", PERSIAN + W4, 0},
   760   {"is", ICELANDIC + W4, FAROESE - W4},
   761   {"it", ITALIAN + W4, 0},
   763   // {"je", 0, 0},
   764   // {"jm", 0, 0},
   765   {"jo", ARABIC + W4, 0},
   766   {"jp", JAPANESE + W4, 0},
   768   // {"ke", 0, 0},
   769   {"kg", KYRGYZ + W4, 0},
   770   {"kh", KHMER + W4, 0},
   771   // {"ki", 0, 0},
   772   {"km", FRENCH + W4, 0},
   773   // {"kn", 0, 0},
   774   {"kp", KOREAN + W4, 0},
   775   {"kr", KOREAN + W4, 0},
   776   {"kw", ARABIC + W4, 0},
   777   // {"ky", 0, 0},
   778   {"kz", KAZAKH + W4, 0},
   780   {"la", LAOTHIAN + W4, 0},
   781   {"lb", ARABIC + W4, FRENCH + W4},
   782   // {"lc", 0, 0},
   783   {"li", GERMAN + W4, 0},
   784   {"lk", SINHALESE + W4, 0},
   785   // {"lr", 0, 0},
   786   {"ls", SESOTHO + W4, 0},
   787   {"lt", LITHUANIAN + W4, 0},
   788   {"lu", LUXEMBOURGISH + W4},
   789   {"lv", LATVIAN + W4, 0},
   790   {"ly", ARABIC + W4, 0},
   792   {"ma", FRENCH + W4, 0},
   793   {"mc", FRENCH + W4, 0},
   794   {"md", ROMANIAN + W4, 0},
   795   {"me", MONTENEGRIN + W8, SERBIAN - W4},
   796   {"mg", FRENCH + W4, 0},
   797   {"mk", MACEDONIAN + W4, 0},
   798   {"ml", FRENCH + W4, 0},
   799   {"mm", BURMESE + W4, 0},
   800   {"mn", MONGOLIAN + W4, 0},
   801   {"mo", CHINESE_T + W4, PORTUGUESE + W4},
   802   // {"mp", 0, 0},
   803   {"mq", FRENCH + W4, 0},
   804   {"mr", FRENCH + W4, ARABIC + W4},
   805   // {"ms", 0, 0},
   806   {"mt", MALTESE + W4, 0},
   807   // {"mu", 0, 0},
   808   {"mv", DHIVEHI + W4, 0},
   809   // {"mw", 0, 0},
   810   {"mx", SPANISH + W4, 0},
   811   {"my", MALAY + W4, INDONESIAN - W4},
   812   {"mz", PORTUGUESE + W4, 0},
   814   {"na", 0, 0},            // Namibia
   815   {"nc", FRENCH + W4, 0},
   816   {"ne", FRENCH + W4, 0},
   817   {"nf", FRENCH + W4, 0},
   818   // {"ng", 0, 0},
   819   {"ni", SPANISH + W4, 0},
   820   {"nl", DUTCH + W4, 0},
   821   {"no", NORWEGIAN + W4, NORWEGIAN_N + W2},
   822   {"np", NEPALI + W4, 0},
   823   {"nr", NAURU + W4, 0},
   824   {"nu", SWEDISH + W4, 0},
   825   {"nz", MAORI + W4, ENGLISH + W2},
   827   {"om", ARABIC + W4, 0},
   829   {"pa", SPANISH + W4, 0},
   830   {"pe", SPANISH + W4, QUECHUA + W2},   // also AYMARA
   831   {"pf", FRENCH + W4, 0},
   832   // {"pg", 0, 0},
   833   {"ph", TAGALOG + W4, 0},
   834   {"pk", URDU + W4, 0},
   835   {"pl", POLISH + W4, 0},
   836   // {"pn", 0, 0},
   837   {"pr", SPANISH + W4, 0},
   838   {"ps", ARABIC + W4, 0},
   839   {"pt", PORTUGUESE + W4, 0},
   840   {"py", SPANISH + W4, GUARANI + W2},
   842   {"qa", ARABIC + W4, 0},
   844   {"re", FRENCH + W4, 0},
   845   {"ro", ROMANIAN + W4, 0},
   846   {"rs", SERBIAN + W8, MONTENEGRIN - W4},
   847   {"ru", RUSSIAN + W4, 0},
   848   {"rw", KINYARWANDA + W4, FRENCH + W2},
   850   {"sa", ARABIC + W4, 0},
   851   // {"sb", 0, 0},
   852   {"sc", SESELWA + W4, 0},
   853   {"sd", ARABIC + W4, 0},
   854   {"se", SWEDISH + W4, 0},
   855   // {"sg", 0, 0},
   856   // {"sh", 0, 0},
   857   {"si", SLOVENIAN + W4, 0},
   858   {"sk", SLOVAK + W4, CZECH - W4},
   859   // {"sl", 0, 0},
   860   {"sm", ITALIAN + W4, 0},
   861   {"sn", FRENCH + W4, 0},
   862   // {"sr", 0, 0},
   863   {"ss", ARABIC + W4, 0},     // Presumed South Sudan TLD. dsites 2011.07.07
   864   // {"st", 0, 0},
   865   {"su", RUSSIAN + W4, 0},
   866   {"sv", SPANISH + W4, 0},
   867   {"sy", ARABIC + W4, 0},
   868   // {"sz", 0, 0},
   870   // {"tc", 0, 0},
   871   {"td", FRENCH + W4, 0},
   872   // {"tf", 0, 0},
   873   {"tg", FRENCH + W4, 0},
   874   {"th", THAI + W4, 0},
   875                               // Tibet has no country code (see .cn)
   876   {"tj", TAJIK + W4, 0},
   877   // {"tk", 0, 0},
   878   // {"tl", 0, 0},
   879   {"tm", TURKISH + W4, 0},
   880   {"tn", FRENCH + W4, ARABIC + W4},
   881   // {"to", 0, 0},
   882   {"tp", JAPANESE + W4, 0},
   883   {"tr", TURKISH + W4, 0},
   884   // {"tt", 0, 0},
   885   // {"tv", 0, 0},
   886   {"tw", CHINESE_T + W4, 0},
   887   {"tz", SWAHILI + W4, AKAN + W4},
   889   {"ua", UKRAINIAN + W4, 0},
   890   {"ug", GANDA + W4, 0},
   891   {"uk", ENGLISH + W2, 0},
   892   {"us", ENGLISH + W2, 0},
   893   {"uy", SPANISH + W4, 0},
   894   {"uz", UZBEK + W4, 0},
   896   {"va", ITALIAN + W4, LATIN + W2},
   897   // {"vc", 0, 0},
   898   {"ve", SPANISH + W4, 0},
   899   // {"vg", 0, 0},
   900   // {"vi", 0, 0},
   901   {"vn", VIETNAMESE + W4, 0},
   902   // {"vu", 0, 0},
   904   {"wf", FRENCH + W4, 0},
   905   // {"ws", 0, 0},
   907   {"ye", ARABIC + W4, 0},
   909   {"za", AFRIKAANS + W4, 0},
   910   // {"zm", 0, 0},
   911   // {"zw", 0, 0},
   912 };
   914 #undef W2
   915 #undef W4
   916 #undef W6
   917 #undef W8
   918 #undef W10
   919 #undef W12
   925 inline void SetCLDPriorWeight(int w, OneCLDLangPrior* olp) {
   926   *olp = (*olp & 0x3ff) + (w << 10);
   927 }
   928 inline void SetCLDPriorLang(Language lang, OneCLDLangPrior* olp) {
   929   *olp = (*olp & ~0x3ff) + lang;
   930 }
   932 OneCLDLangPrior PackCLDPriorLangWeight(Language lang, int w) {
   933   return (w << 10) + lang;
   934 }
   // Return the larger of a and b (a when they are equal).
   inline int MaxInt(int a, int b) {
     return (a >= b) ? a : b;
   }
   940 // Merge in another language prior, taking max if already there
   941 void MergeCLDLangPriorsMax(OneCLDLangPrior olp, CLDLangPriors* lps) {
   942   if (olp == 0) {return;}
   943   Language target_lang = GetCLDPriorLang(olp);
   944   for (int i = 0; i < lps->n; ++i) {
   945     if (GetCLDPriorLang(lps->prior[i]) == target_lang) {
   946       int new_weight = MaxInt(GetCLDPriorWeight(lps->prior[i]),
   947                               GetCLDPriorWeight(olp));
   948       SetCLDPriorWeight(new_weight, &lps->prior[i]);
   949       return;
   950     }
   951   }
   952   // Not found; add it if room
   953   if (lps->n >= kMaxOneCLDLangPrior) {return;}
   954   lps->prior[lps->n++] = olp;
   955 }
   957 // Merge in another language prior, boosting 10x if already there
   958 void MergeCLDLangPriorsBoost(OneCLDLangPrior olp, CLDLangPriors* lps) {
   959   if (olp == 0) {return;}
   960   Language target_lang = GetCLDPriorLang(olp);
   961   for (int i = 0; i < lps->n; ++i) {
   962     if (GetCLDPriorLang(lps->prior[i]) == target_lang) {
   963       int new_weight = GetCLDPriorWeight(lps->prior[i]) + 2;
   964       SetCLDPriorWeight(new_weight, &lps->prior[i]);
   965       return;
   966     }
   967   }
   968   // Not found; add it if room
   969   if (lps->n >= kMaxOneCLDLangPrior) {return;}
   970   lps->prior[lps->n++] = olp;
   971 }
   974 // Trim language priors to no more than max_entries, keeping largest abs weights
   975 void TrimCLDLangPriors(int max_entries, CLDLangPriors* lps) {
   976   if (lps->n <= max_entries) {return;}
   978   // Insertion sort in-place by abs(weight)
   979   for (int i = 0; i < lps->n; ++i) {
   980     OneCLDLangPrior temp_olp = lps->prior[i];
   981     int w = abs(GetCLDPriorWeight(temp_olp));
   982     int kk = i;
   983     for (; kk > 0; --kk) {
   984       if (abs(GetCLDPriorWeight(lps->prior[kk - 1])) < w) {
   985         // Move down and continue
   986         lps->prior[kk] = lps->prior[kk - 1];
   987       } else {
   988         // abs(weight[kk - 1]) >= w, time to stop
   989         break;
   990       }
   991     }
   992     lps->prior[kk] = temp_olp;
   993   }
   995   lps->n = max_entries;
   996 }
   // Return the number of ',' characters in langtags (0 for an empty string).
   int CountCommas(const std::string& langtags) {
     int commas = 0;
     for (std::string::size_type i = 0; i < langtags.size(); ++i) {
       if (langtags[i] == ',') {++commas;}
     }
     return commas;
   }
  1006 // Binary lookup on language tag
  1007 const LangTagLookup* DoLangTagLookup(const char* key,
  1008                                      const LangTagLookup* tbl, int tbl_size) {
  1009   // Key is always in range [lo..hi)
  1010   int lo = 0;
  1011   int hi = tbl_size;
  1012   while (lo < hi) {
  1013     int mid = (lo + hi) >> 1;
  1014     int comp = strcmp(tbl[mid].langtag, key);
  1015     if (comp < 0) {
  1016       lo = mid + 1;
  1017     } else if (comp > 0) {
  1018       hi = mid;
  1019     } else {
  1020       return &tbl[mid];
  1023   return NULL;
  1026 // Binary lookup on tld
  1027 const TLDLookup* DoTLDLookup(const char* key,
  1028                              const TLDLookup* tbl, int tbl_size) {
  1029   // Key is always in range [lo..hi)
  1030   int lo = 0;
  1031   int hi = tbl_size;
  1032   while (lo < hi) {
  1033     int mid = (lo + hi) >> 1;
  1034     int comp = strcmp(tbl[mid].tld, key);
  1035     if (comp < 0) {
  1036       lo = mid + 1;
  1037     } else if (comp > 0) {
  1038       hi = mid;
  1039     } else {
  1040       return &tbl[mid];
  1043   return NULL;
  1048 // Trim language tag string to canonical form for each language
  1049 // Input is from GetLangTagsFromHtml(), already lowercased
  1050 string TrimCLDLangTagsHint(const string& langtags) {
  1051   string retval;
  1052   if (langtags.empty()) {return retval;}
  1053   int commas = CountCommas(langtags);
  1054   if (commas > 4) {return retval;}       // Ignore if too many language tags
  1056   char temp[20];
  1057   int pos = 0;
  1058   while (pos < static_cast<int>(langtags.size())) {
  1059     int comma = langtags.find(',', pos);
  1060     if (comma == string::npos) {comma = langtags.size();} // fake trailing comma
  1061     int len = comma - pos;
  1062     if (len <= 16) {
  1063       // Short enough to use
  1064       memcpy(temp, &langtags[pos], len);
  1065       temp[len] = '\0';
  1066       const LangTagLookup* entry = DoLangTagLookup(temp,
  1067                                                    kCLDLangTagsHintTable1,
  1068                                                    kCLDTable1Size);
  1069       if (entry != NULL) {
  1070         // First table hit
  1071         retval.append(entry->langcode);     // may be "code1,code2"
  1072         retval.append(1, ',');
  1073       } else {
  1074         // Try second table with language code truncated at first hyphen
  1075         char* hyphen = strchr(temp, '-');
  1076         if (hyphen != NULL) {*hyphen = '\0';}
  1077         len = strlen(temp);
  1078         if (len <= 3) {                 // Short enough to use
  1079           entry = DoLangTagLookup(temp,
  1080                                   kCLDLangTagsHintTable2,
  1081                                   kCLDTable2Size);
  1082           if (entry != NULL) {
  1083             // Second table hit
  1084             retval.append(entry->langcode);     // may be "code1,code2"
  1085             retval.append(1, ',');
  1090     pos = comma + 1;
  1093   // Remove trainling comma, if any
  1094   if (!retval.empty()) {retval.resize(retval.size() - 1);}
  1095   return retval;
  1100 //==============================================================================
  1102 // Little state machine to scan insides of language attribute quoted-string.
  1103 // Each language code is lowercased and copied to the output string. Underscore
  1104 // is mapped to minus. Space, tab, and comma are all mapped to comma, and
  1105 // multiple consecutive commas are removed.
  1106 // Each language code in the output list will be followed by a single comma.
  1108 // There are three states, and we start in state 1:
  1109 // State 0: After a letter.
  1110 //  Copy all letters/minus[0], copy comma[1]; all others copy comma and skip [2]
  1111 // State 1: Just after a comma.
  1112 //  Copy letter [0], ignore subsequent commas [1]; minus and all others copy a comma, then skip [2]
  1113 // State 2: Skipping.
  1114 //  All characters except comma skip and stay in [2]. comma goes to [1]
  1116 // The thing that is copied is kLangCodeRemap[c] when going to state 0,
  1117 // and always comma when going to state 1 or 2. The design depends on copying
  1118 // a comma at the *beginning* of skipping, and in state 2 never doing a copy.
  1120 // We pack all this into 8 bits:
  1121 //    +--+---+---+
  1122 //    |78|654|321|
  1123 //    +--+---+---+
  1124 //
  1125 // Shift byte right by 3*state, giving [0] 321, [1] 654, [2] .78
  1126 // where . is always zero
  1127 // Of these 3 bits, low two are next state ss, high bit is copy bit C.
  1128 // If C=1 and ss == 0, copy kLangCodeRemap[c], else copy a comma
  // Per-state 3-bit actions: low two bits are the next state, bit 2 (value 4)
  // is the copy bit.
  #define SKIP0 0
  #define SKIP1 1
  #define SKIP2 2
  #define COPY0 4   // copy kLangCodeRemap[c]
  #define COPY1 5   // copy ','
  #define COPY2 6   // copy ','

  // These combined actions pack three states into one byte.
  // Ninth bit must be zero, so all state 2 values must be skips.
  //              state[2]       state[1]      state[0]
  #define LTR   ((SKIP2 << 6) + (COPY0 << 3) + COPY0)
  #define MINUS ((SKIP2 << 6) + (COPY2 << 3) + COPY0)
  #define COMMA ((SKIP1 << 6) + (SKIP1 << 3) + COPY1)
  #define Bad   ((SKIP2 << 6) + (COPY2 << 3) + COPY2)

  // Treat as letter: a-z,  A-Z
  // Treat as minus:  2D minus,  5F underscore
  // Treat as comma:  09 tab,  20 space,  2C comma

  // Action byte for each input byte value; the action for the current state
  // is obtained by shifting the byte right by 3 * state.
  static const unsigned char kLangCodeAction[256] = {
    // 00..3F: tab, space and comma are COMMA; 2D minus is MINUS
    Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,COMMA,Bad,Bad,Bad,Bad,Bad,Bad,
    Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
    COMMA,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,COMMA,MINUS,Bad,Bad,
    Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,

    // 40..7F: A-Z and a-z are LTR; 5F underscore is MINUS
    Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR,  LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,
    LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,  LTR,LTR,LTR,Bad,Bad,Bad,Bad,MINUS,
    Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR,  LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,
    LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,  LTR,LTR,LTR,Bad,Bad,Bad,Bad,Bad,

    // 80..BF: all Bad
    Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
    Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
    Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
    Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,

    // C0..FF: all Bad
    Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
    Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
    Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
    Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  };
  // This does lowercasing, maps underscore to minus, and maps tab/space to comma
  // Indexed by input byte value; every byte that is not a letter, minus,
  // underscore, comma, tab, or space maps to 0.
  static const unsigned char kLangCodeRemap[256] = {
    0,0,0,0,0,0,0,0,  0,',',0,0,0,0,0,0,          // 09 tab
    0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
    ',',0,0,0,0,0,0,0,  0,0,0,0,',','-',0,0,      // 20 space 2C comma 2D minus
    0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,

    0,'a','b','c','d','e','f','g',  'h','i','j','k','l','m','n','o',
    'p','q','r','s','t','u','v','w',  'x','y','z',0,0,0,0,'-',  // 5F underscore
    0,'a','b','c','d','e','f','g',  'h','i','j','k','l','m','n','o',
    'p','q','r','s','t','u','v','w',  'x','y','z',0,0,0,0,0,

    0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,

    0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
  };
  1194 #undef LTR
  1195 #undef MINUS
  1196 #undef COMMA
  1197 #undef Bad
  1199 #undef SKIP0
  1200 #undef SKIP1
  1201 #undef SKIP2
  1202 #undef COPY0
  1203 #undef COPY1
  1204 #undef COPY2
  1207 // Find opening '<' for HTML tag
  1208 // Note: this is all somewhat insensitive to mismatched quotes
  1209 int32 FindTagStart(const char* utf8_body, int32 pos, int32 max_pos) {
  1210   int i = pos;
  1211   // Advance i by 4 if none of the next 4 bytes are '<'
  1212   for (i = pos; i < (max_pos - 3); i += 4) {
  1213     // Fast check for any <
  1214     const char* p = &utf8_body[i];
  1215     uint32 s0123 = UNALIGNED_LOAD32(p);
  1216     uint32 temp = s0123 ^ 0x3c3c3c3c;    // <<<<
  1217     if (((temp - 0x01010101) & (~temp & 0x80808080)) != 0) {
  1218       // At least one byte is '<'
  1219       break;
  1222   // Continue, advancing i by 1
  1223   for (; i < max_pos; ++i) {
  1224     if (utf8_body[i] == '<') {return i;}
  1226   return -1;
  1230 // Find closing '>' for HTML tag. Also stop on < and & (simplistic parsing)
  1231 int32 FindTagEnd(const char* utf8_body, int32 pos, int32 max_pos) {
  1232   // Always outside quotes
  1233   for (int i = pos; i < max_pos; ++i) {
  1234     char c = utf8_body[i];
  1235     if (c == '>') {return i;}
  1236     if (c == '<') {return i - 1;}
  1237     if (c == '&') {return i - 1;}
  1239   return -1;              // nothing found
  1242 // Find opening quote or apostrophe, skipping spaces
  1243 // Note: this is all somewhat insensitive to mismatched quotes
  1244 int32 FindQuoteStart(const char* utf8_body, int32 pos, int32 max_pos) {
  1245   for (int i = pos; i < max_pos; ++i) {
  1246     char c = utf8_body[i];
  1247     if (c == '"') {return i;}
  1248     if (c == '\'') {return i;}
  1249     if (c != ' ') {return -1;}
  1251   return -1;
  1254 // Find closing quot/apos. Also stop on = > < and & (simplistic parsing)
  1255 int32 FindQuoteEnd(const char* utf8_body, int32 pos, int32 max_pos) {
  1256   // Always outside quotes
  1257   for (int i = pos; i < max_pos; ++i) {
  1258     char c = utf8_body[i];
  1259     if (c == '"') {return i;}
  1260     if (c == '\'') {return i;}
  1261     if (c == '>') {return i - 1;}
  1262     if (c == '=') {return i - 1;}
  1263     if (c == '<') {return i - 1;}
  1264     if (c == '&') {return i - 1;}
  1266   return -1;              // nothing found
  1269 int32 FindEqualSign(const char* utf8_body, int32 pos, int32 max_pos) {
  1270   // Outside quotes/apostrophes loop
  1271   for (int i = pos; i < max_pos; ++i) {
  1272     char c = utf8_body[i];
  1273     if (c == '=') {       // Found bare equal sign inside tag
  1274       return i;
  1275     } else if (c == '"') {
  1276       // Inside quotes loop
  1277       int j;
  1278       for (j = i + 1; j < max_pos; ++j) {
  1279         if (utf8_body[j] == '"') {
  1280           break;
  1281         } else if (utf8_body[j] == '\\') {
  1282           ++j;
  1285       i = j;
  1286     } else if (c == '\'') {
  1287       // Inside apostrophes loop
  1288       int j;
  1289       for (j = i + 1; j < max_pos; ++j) {
  1290         if (utf8_body[j] == '\'') {
  1291           break;
  1292         } else if (utf8_body[j] == '\\') {
  1293           ++j;
  1296       i = j;
  1300   return -1;              // nothing found
  1303 // Scan backwards for case-insensitive string s in [min_pos..pos)
  1304 // Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f]
  1305 // Cheap lowercase. Control codes will masquerade as 20..3f
  1306 bool FindBefore(const char* utf8_body,
  1307                  int32 min_pos, int32 pos, const char* s) {
  1308   int len = strlen(s);
  1309   if ((pos - min_pos) < len) {return false;}     // Too small to fit s
  1311   // Skip trailing spaces
  1312   int i = pos;
  1313   while ((i > (min_pos + len)) && (utf8_body[i - 1] == ' ')) {--i;}
  1314   i -= len;
  1315   if (i < min_pos) {return false;}   // pos - min_pos < len, so s can't be found
  1317   const char* p = &utf8_body[i];
  1318   for (int j = 0; j < len; ++j) {
  1319     if ((p[j] | 0x20) != s[j])  {return false;}    // Unequal byte
  1321   return true;                                     // All bytes equal at i
  1324 // Scan forwards for case-insensitive string s in [pos..max_pos)
  1325 // Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f]
  1326 // Cheap lowercase. Control codes will masquerade as 20..3f
  1327 // Allows but does not require quoted/apostrophe string
  1328 bool FindAfter(const char* utf8_body,
  1329                  int32 pos, int32 max_pos, const char* s) {
  1330   int len = strlen(s);
  1331   if ((max_pos - pos) < len) {return false;}     // Too small to fit s
  1333   // Skip leading spaces, quote, apostrophe
  1334   int i = pos;
  1335   while (i < (max_pos - len)) {
  1336     unsigned char c = utf8_body[i];
  1337     if ((c == ' ') || (c == '"') || (c == '\'')) {++i;}
  1338     else {break;}
  1341   const char* p = &utf8_body[i];
  1342   for (int j = 0; j < len; ++j) {
  1343     if ((p[j] | 0x20) != s[j])  {return false;}    // Unequal byte
  1345   return true;                                     // All bytes equal
  1350 // Copy attribute value in [pos..max_pos)
  1351 // pos is just after an opening quote/apostrophe and max_pos is the ending one
  1352 // String must all be on a single line.
  1353 // Return slightly-normalized language list, empty or ending in comma
  1354 // Does lowercasing and removes excess punctuation/space
  1355 string CopyOneQuotedString(const char* utf8_body,
  1356                          int32 pos, int32 max_pos) {
  1357   string s;
  1358   int state = 1;        // Front is logically just after a comma
  1359   for (int i = pos; i < max_pos; ++i) {
  1360     unsigned char c = utf8_body[i];
  1361     int e = kLangCodeAction[c] >> (3 * state);
  1362     state = e & 3;      // Update to next state
  1363     if ((e & 4) != 0) {
  1364       // Copy a remapped byte if going to state 0, else copy a comma
  1365       if (state == 0) {
  1366         s.append(1, kLangCodeRemap[c]);
  1367       } else {
  1368         s.append(1, ',');
  1373   // Add final comma if needed
  1374   if (state == 0) {
  1375     s.append(1, ',');
  1377   return s;
  1380 // Find and copy attribute value: quoted string in [pos..max_pos)
  1381 // Return slightly-normalized language list, empty or ending in comma
  1382 string CopyQuotedString(const char* utf8_body,
  1383                          int32 pos, int32 max_pos) {
  1384   int32 start_quote = FindQuoteStart(utf8_body, pos, max_pos);
  1385   if (start_quote < 0) {return string("");}
  1386   int32 end_quote = FindQuoteEnd(utf8_body, start_quote + 1, max_pos);
  1387   if (end_quote < 0) {return string("");}
  1389   return CopyOneQuotedString(utf8_body, start_quote + 1, end_quote);
  1392 // Add hints to vector of langpriors
  1393 // Input is from GetLangTagsFromHtml(), already lowercased
  1394 void SetCLDLangTagsHint(const string& langtags, CLDLangPriors* langpriors) {
  1395   if (langtags.empty()) {return;}
  1396   int commas = CountCommas(langtags);
  1397   if (commas > 4) {return;}       // Ignore if too many language tags
  1399   char temp[20];
  1400   int pos = 0;
  1401   while (pos < static_cast<int>(langtags.size())) {
  1402     int comma = langtags.find(',', pos);
  1403     if (comma == string::npos) {comma = langtags.size();} // fake trailing comma
  1404     int len = comma - pos;
  1405     if (len <= 16) {
  1406       // Short enough to use
  1407       memcpy(temp, &langtags[pos], len);
  1408       temp[len] = '\0';
  1409       const LangTagLookup* entry = DoLangTagLookup(temp,
  1410                                                    kCLDLangTagsHintTable1,
  1411                                                    kCLDTable1Size);
  1412       if (entry != NULL) {
  1413         // First table hit
  1414         MergeCLDLangPriorsMax(entry->onelangprior1, langpriors);
  1415         MergeCLDLangPriorsMax(entry->onelangprior2, langpriors);
  1416       } else {
  1417         // Try second table with language code truncated at first hyphen
  1418         char* hyphen = strchr(temp, '-');
  1419         if (hyphen != NULL) {*hyphen = '\0';}
  1420         len = strlen(temp);
  1421         if (len <= 3) {                 // Short enough to use
  1422           entry = DoLangTagLookup(temp,
  1423                                   kCLDLangTagsHintTable2,
  1424                                   kCLDTable2Size);
  1425           if (entry != NULL) {
  1426             // Second table hit
  1427             MergeCLDLangPriorsMax(entry->onelangprior1, langpriors);
  1428             MergeCLDLangPriorsMax(entry->onelangprior2, langpriors);
  1433     pos = comma + 1;
  1437 // Add hints to vector of langpriors
  1438 // Input is string after HTTP header Content-Language:
  1439 void SetCLDContentLangHint(const char* contentlang, CLDLangPriors* langpriors) {
  1440   string langtags = CopyOneQuotedString(contentlang, 0, strlen(contentlang));
  1441   SetCLDLangTagsHint(langtags, langpriors);
  1444 // Add hints to vector of langpriors
  1445 // Input is last element of hostname (no dot), e.g. from GetTLD()
  1446 void SetCLDTLDHint(const char* tld, CLDLangPriors* langpriors) {
  1447   int len = strlen(tld);
  1448   if (len > 3) {return;}        // Ignore if more than three letters
  1449   char local_tld[4];
  1450   strncpy(local_tld, tld, 4);
  1451   local_tld[3] = '\0';          // Safety move
  1452   // Lowercase
  1453   for (int i = 0; i < len; ++i) {local_tld[i] |= 0x20;}
  1454   const TLDLookup* entry = DoTLDLookup(local_tld,
  1455                                        kCLDTLDHintTable,
  1456                                        kCLDTable3Size);
  1457   if (entry != NULL) {
  1458     // Table hit
  1459     MergeCLDLangPriorsBoost(entry->onelangprior1, langpriors);
  1460     MergeCLDLangPriorsBoost(entry->onelangprior2, langpriors);
  1464 // Add hints to vector of langpriors
  1465 // Input is from DetectEncoding()
  1466 void SetCLDEncodingHint(Encoding enc, CLDLangPriors* langpriors) {
  1467   OneCLDLangPrior olp;
  1468   switch (enc) {
  1469   case CHINESE_GB:
  1470   case GBK:
  1471   case GB18030:
  1472   case ISO_2022_CN:
  1473   case HZ_GB_2312:
  1474     olp = PackCLDPriorLangWeight(CHINESE, kCLDPriorEncodingWeight);
  1475     MergeCLDLangPriorsBoost(olp, langpriors);
  1476     break;
  1477   case CHINESE_BIG5:
  1478   case CHINESE_BIG5_CP950:
  1479   case BIG5_HKSCS:
  1480     olp = PackCLDPriorLangWeight(CHINESE_T, kCLDPriorEncodingWeight);
  1481     MergeCLDLangPriorsBoost(olp, langpriors);
  1482     break;
  1483   case JAPANESE_EUC_JP:
  1484   case JAPANESE_SHIFT_JIS:
  1485   case JAPANESE_CP932:
  1486   case JAPANESE_JIS:          // ISO-2022-JP
  1487     olp = PackCLDPriorLangWeight(JAPANESE, kCLDPriorEncodingWeight);
  1488     MergeCLDLangPriorsBoost(olp, langpriors);
  1489     break;
  1490   case KOREAN_EUC_KR:
  1491   case ISO_2022_KR:
  1492     olp = PackCLDPriorLangWeight(KOREAN, kCLDPriorEncodingWeight);
  1493     MergeCLDLangPriorsBoost(olp, langpriors);
  1494     break;
  1496   default:
  1497     break;
  1501 // Add hints to vector of langpriors
  1502 // Input is from random source
  1503 void SetCLDLanguageHint(Language lang, CLDLangPriors* langpriors) {
  1504   OneCLDLangPrior olp = PackCLDPriorLangWeight(lang, kCLDPriorLanguageWeight);
  1505   MergeCLDLangPriorsBoost(olp, langpriors);
  1509 // Make printable string of priors
  1510 string DumpCLDLangPriors(const CLDLangPriors* langpriors) {
  1511   string retval;
  1512   for (int i = 0; i < langpriors->n; ++i) {
  1513     char temp[64];
  1514     sprintf(temp, "%s.%d ",
  1515              LanguageCode(GetCLDPriorLang(langpriors->prior[i])),
  1516              GetCLDPriorWeight(langpriors->prior[i]));
  1517     retval.append(temp);
  1519   return retval;
  1525 // Look for
  1526 //  <html lang="en">
  1527 //  <doc xml:lang="en">
  1528 //  <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en-US">
  1529 //  <meta http-equiv="content-language" content="en-GB" />
  1530 //  <meta name="language" content="Srpski">
  1531 //  <meta name="DC.language" scheme="RFC1766" content="en">
  1532 //  <SPAN id="msg1" class="info" lang='en'>
  1533 //
  1534 // Do not trigger on
  1535 //  <!-- lang=french ...-->
  1536 //  <font lang=postscript ...>
  1537 //  <link href="index.fr.html" hreflang="fr-FR" xml:lang="fr-FR" />
  1538 //  <META name="Author" lang="fr" content="Arnaud Le Hors">
  1539 //
  1540 // Stop fairly quickly on mismatched quotes
  1541 //
  1542 // Allowed language characters
  1543 //  a-z A-Z -_ , space\t
  1544 // Think about: GB2312, big5, shift-jis, euc-jp, ksc euc-kr
  1545 //  zh-hans zh-TW cmn-Hani zh_cn.gb18030_CN zh-min-nan zh-yue
  1546 //  de-x-mtfrom-en  zh-tw-x-mtfrom-en  (machine translation)
  1547 // GB2312 => gb
  1548 // Big5 => big
  1549 // zh_CN.gb18030_C => zh-cn
  1550 //
  1551 // Remove duplicates and extra spaces as we go
  1552 // Lowercase as we go.
  1554 // Get language tag hints from HTML body
  1555 // Normalize: remove spaces and make lowercase comma list
  1557 string GetLangTagsFromHtml(const char* utf8_body, int32 utf8_body_len,
  1558                            int32 max_scan_bytes) {
  1559   string retval;
  1560   if (max_scan_bytes > utf8_body_len) {
  1561     max_scan_bytes = utf8_body_len;
  1564   int32 k = 0;
  1565   while (k < max_scan_bytes) {
  1566     int32 start_tag = FindTagStart(utf8_body, k, max_scan_bytes);
  1567     if (start_tag < 0) {break;}
  1568     int32 end_tag = FindTagEnd(utf8_body, start_tag + 1, max_scan_bytes);
  1569     // FindTagEnd exits on < > &
  1570     if (end_tag < 0) {break;}
  1572     // Skip <!--...>
  1573     // Skip <font ...>
  1574     // Skip <script ...>
  1575     // Skip <link ...>
  1576     // Skip <img ...>
  1577     // Skip <a ...>
  1578     if (FindAfter(utf8_body, start_tag + 1, end_tag, "!--") ||
  1579         FindAfter(utf8_body, start_tag + 1, end_tag, "font ") ||
  1580         FindAfter(utf8_body, start_tag + 1, end_tag, "script ") ||
  1581         FindAfter(utf8_body, start_tag + 1, end_tag, "link ") ||
  1582         FindAfter(utf8_body, start_tag + 1, end_tag, "img ") ||
  1583         FindAfter(utf8_body, start_tag + 1, end_tag, "a ")) {
  1584       k = end_tag + 1;
  1585       continue;
  1588     // Remember <meta ...>
  1589     bool in_meta = false;
  1590     if (FindAfter(utf8_body, start_tag + 1, end_tag, "meta ")) {
  1591       in_meta = true;
  1594     // Scan for each equal sign inside tag
  1595     bool content_is_lang = false;
  1596     int32 kk = start_tag + 1;
  1597     int32 equal_sign;
  1598     while ((equal_sign = FindEqualSign(utf8_body, kk, end_tag)) >= 0) {
  1599       // eq exits on < > &
  1601       // Look inside a meta tag
  1602       // <meta ... http-equiv="content-language" ...>
  1603       // <meta ... name="language" ...>
  1604       // <meta ... name="dc.language" ...>
  1605       if (in_meta) {
  1606         if (FindBefore(utf8_body, kk, equal_sign, " http-equiv") &&
  1607             FindAfter(utf8_body, equal_sign + 1, end_tag,
  1608                       "content-language ")) {
  1609           content_is_lang = true;
  1610         } else if (FindBefore(utf8_body, kk, equal_sign, " name") &&
  1611                    (FindAfter(utf8_body, equal_sign + 1, end_tag,
  1612                               "dc.language ") ||
  1613                     FindAfter(utf8_body, equal_sign + 1, end_tag,
  1614                               "language "))) {
  1615           content_is_lang = true;
  1619       // Look inside any tag
  1620       // <meta ... content="lang-list" ...>
  1621       // <... lang="lang-list" ...>
  1622       // <... xml:lang="lang-list" ...>
  1623       if ((content_is_lang && FindBefore(utf8_body, kk, equal_sign,
  1624                                          " content")) ||
  1625           FindBefore(utf8_body, kk, equal_sign, " lang") ||
  1626           FindBefore(utf8_body, kk, equal_sign, ":lang")) {
  1627         string temp = CopyQuotedString(utf8_body, equal_sign + 1, end_tag);
  1629         // Append new lang tag(s) if not a duplicate
  1630         if (!temp.empty() && (retval.find(temp) == string::npos)) {
  1631           retval.append(temp);
  1635       kk = equal_sign + 1;
  1637     k = end_tag + 1;
  1640   // Strip last comma
  1641   if (retval.size() > 1) {
  1642     retval.erase(retval.size() - 1);
  1644   return retval;
  1647 }       // End namespace CLD2
  1649 //==============================================================================

mercurial