michael@0: // Copyright 2013 Google Inc. All Rights Reserved.
michael@0: //
michael@0: // Licensed under the Apache License, Version 2.0 (the "License");
michael@0: // you may not use this file except in compliance with the License.
michael@0: // You may obtain a copy of the License at
michael@0: //
michael@0: //     http://www.apache.org/licenses/LICENSE-2.0
michael@0: //
michael@0: // Unless required by applicable law or agreed to in writing, software
michael@0: // distributed under the License is distributed on an "AS IS" BASIS,
michael@0: // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
michael@0: // See the License for the specific language governing permissions and
michael@0: // limitations under the License.
michael@0: 
michael@0: //
michael@0: // Author: dsites@google.com (Dick Sites)
michael@0: //
michael@0: 
michael@0: #include "compact_lang_det_hint_code.h"
michael@0: 
michael@0: #include <stdlib.h>     // for abs()
michael@0: #include <stdio.h>      // for sprintf()
michael@0: #include <string.h>     //
michael@0: #include "lang_script.h"
michael@0: #include "port.h"
michael@0: 
michael@0: using namespace std;
michael@0: 
michael@0: namespace CLD2 {
michael@0: 
michael@0: static const int kCLDPriorEncodingWeight = 4;   // 100x more likely (3**4; same exponent as W4 below)
michael@0: static const int kCLDPriorLanguageWeight = 8;   // 10000x more likely (3**8; same exponent as W8 below)
michael@0: 
michael@0: 
michael@0: // Tables to map lang="..." language code lists to actual languages.
michael@0: // based on scraping and hand-edits, dsites June 2011
michael@0: 
michael@0: // n = f(string, &a) gives list of n<=4 language pairs: primary, secondary
michael@0: 
michael@0: // For close pairs like ms/id, more weight on TLD and lang=
michael@0: // Alternately, weaker boost but mark others of set as negative;
michael@0: // makes "neither" an easier result.
michael@0: // lang=en low weight 4
michael@0: // tld=lu boost lu maybe 4. but lang= always overcomes tld and encoding
michael@0: // (except maybe en)
michael@0: 
michael@0: // TLD to separate, e.g., burundi from rwanda
michael@0: 
michael@0: // Encoding lookup: OneLangProb array
michael@0: // TLD lookup:   tld OneLangProb pairs
michael@0: 
michael@0: 
michael@0: // One row of a lang= tag hint table: a lookup key, the canonical code(s)
michael@0: // it stands for, and up to two language priors.
michael@0: struct LangTagLookup {
michael@0:   // Lookup key: lowercased, hyphen only
michael@0:   const char* const langtag;
michael@0:   // Canonical language codes; two (comma-separated) if the key is ambiguous
michael@0:   const char* const langcode;
michael@0:   // First prior for this key
michael@0:   OneCLDLangPrior onelangprior1;
michael@0:   // Second prior for this key; the tables use 0 when there is none
michael@0:   OneCLDLangPrior onelangprior2;
michael@0: };
michael@0: 
michael@0: // One row of the TLD hint table: a lookup key and up to two language priors.
michael@0: struct TLDLookup {
michael@0:   // Lookup key: lowercased, hyphen only
michael@0:   const char* const tld;
michael@0:   // First prior for this TLD
michael@0:   OneCLDLangPrior onelangprior1;
michael@0:   // Second prior for this TLD; the table uses 0 when there is none
michael@0:   OneCLDLangPrior onelangprior2;
michael@0: };
michael@0: 
michael@0: // Prior weights Wn = n << 10; table rows add one to a language to boost it or subtract one to demote it. Presumably the weight sits above a 10-bit language field -- TODO confirm against OneCLDLangPrior.
michael@0: #define W2 (2 << 10)            // 3**2 = 9, roughly 10x more likely
michael@0: #define W4 (4 << 10)            // 3**4 = 81, roughly 100x more likely
michael@0: #define W6 (6 << 10)            // 3**6 = 729, roughly 1000x more likely
michael@0: #define W8 (8 << 10)            // 3**8 = 6561, roughly 10K x more likely
michael@0: #define W10 (10 << 10)          // 3**10 = 59049, nominally 100K x more likely
michael@0: #define W12 (12 << 10)          // 3**12 = 531441, nominally 1M x more likely
michael@0: 
michael@0: // TODO: more about ba hr sr sr-ME and sl
michael@0: // Temporary state of affairs:
michael@0: //   BOSNIAN CROATIAN MONTENEGRIN SERBIAN detecting just CROATIAN SERBIAN
michael@0: // Eventually, we want to do all four, but it requires a CLD change to handle
michael@0: // up to six languages per quadgram.
michael@0: 
michael@0: 
michael@0: // Close pairs boost one of pair, demote other.
michael@0: //   Statistically close pairs:
michael@0: //   INDONESIAN/MALAY difficult to distinguish -- extra word-based lookups used
michael@0: //
michael@0: //   INDONESIAN MALAY coef=0.4698        Problematic w/o extra words
michael@0: //   TIBETAN DZONGKHA coef=0.4571
michael@0: //   CZECH SLOVAK coef=0.4273
michael@0: //   NORWEGIAN NORWEGIAN_N coef=0.4182
michael@0: //
michael@0: //   HINDI MARATHI coef=0.3795
michael@0: //   ZULU XHOSA coef=0.3716
michael@0: //
michael@0: //   DANISH NORWEGIAN coef=0.3672        Usually OK
michael@0: //   BIHARI HINDI coef=0.3668            Usually OK
michael@0: //   ICELANDIC FAROESE coef=0.3519       Usually OK
michael@0: 
michael@0: // Hint rows: {lookup key, canonical code(s), first prior, second prior (0 in single-language rows)}; "+ Wn" boosts, "- Wn" demotes.
michael@0: // Table to look up lang= tags longer than three characters
michael@0: // Overrides table below, which is truncated at first hyphen
michael@0: // In alphabetical order for binary search ('-' sorts before letters); kCLDTable1Size must match the row count
michael@0: static const int kCLDTable1Size = 213;
michael@0: static const LangTagLookup kCLDLangTagsHintTable1[kCLDTable1Size] = {
michael@0:   {"abkhazian", "ab", ABKHAZIAN + W10, 0},
michael@0:   {"afar", "aa", AFAR + W10, 0},
michael@0:   {"afrikaans", "af", AFRIKAANS + W10, 0},
michael@0:   {"akan", "ak", AKAN + W10, 0},
michael@0:   {"albanian", "sq", ALBANIAN + W10, 0},
michael@0:   {"am-am", "hy", ARMENIAN + W10, 0},        // 1:2 Armenian, not ambiguous
michael@0:   {"amharic", "am", AMHARIC + W10, 0},
michael@0:   {"arabic", "ar", ARABIC + W10, 0},
michael@0:   {"argentina", "es", SPANISH + W10, 0},
michael@0:   {"armenian", "hy", ARMENIAN + W10, 0},
michael@0:   {"assamese", "as", ASSAMESE + W10, 0},
michael@0:   {"aymara", "ay", AYMARA + W10, 0},
michael@0:   {"azerbaijani", "az", AZERBAIJANI + W10, 0},
michael@0: 
michael@0:   {"bangla", "bn", BENGALI + W10, 0},
michael@0:   {"bashkir", "ba", BASHKIR + W10, 0},
michael@0:   {"basque", "eu", BASQUE + W10, 0},
michael@0:   {"belarusian", "be", BELARUSIAN + W10, 0},
michael@0:   {"bengali", "bn", BENGALI + W10, 0},
michael@0:   {"bihari", "bh", BIHARI + W10, HINDI - W4},
michael@0:   {"bislama", "bi", BISLAMA + W10, 0},
michael@0:   {"bosnian", "bs", BOSNIAN + W10, 0},      // Bosnian => Bosnian
michael@0:   {"br-br", "pt", PORTUGUESE + W10, 0},     // 1:2 Portuguese, not ambiguous
michael@0:   {"br-fr", "br", BRETON + W10, 0},         // 1:2 Breton, not ambiguous
michael@0:   {"breton", "br", BRETON + W10, 0},
michael@0:   {"bulgarian", "bg", BULGARIAN + W10, 0},
michael@0:   {"burmese", "my", BURMESE + W10, 0},      // Myanmar
michael@0: 
michael@0:   {"catalan", "ca", CATALAN + W10, 0},
michael@0:   {"cherokee", "chr", CHEROKEE + W10, 0},
michael@0:   {"chichewa", "ny", NYANJA + W10, 0},
michael@0: 
michael@0:   {"chinese", "zh", CHINESE + W10, 0},
michael@0:   {"chinese-t", "zhT", CHINESE_T + W10, 0},
michael@0:   {"chineset", "zhT", CHINESE_T + W10, 0},
michael@0:   {"corsican", "co", CORSICAN + W10, 0},
michael@0:   {"cpf-hat", "ht", HAITIAN_CREOLE + W10, 0}, // Creole, French-based
michael@0:   {"croatian", "hr", CROATIAN + W10, 0},
michael@0:   {"czech", "cs", CZECH + W10, SLOVAK - W4},
michael@0: 
michael@0:   {"danish", "da", DANISH + W10, NORWEGIAN - W4},
michael@0:   {"deutsch", "de", GERMAN + W10, 0},
michael@0:   {"dhivehi", "dv", DHIVEHI + W10, 0},
michael@0:   {"dutch", "nl", DUTCH + W10, 0},
michael@0:   {"dzongkha", "dz", DZONGKHA + W10,  TIBETAN - W4},
michael@0: 
michael@0:   {"ell-gr", "el", GREEK + W10, 0},
michael@0:   {"english", "en", ENGLISH + W4, 0},         // Weaker W4: lang=en is given low weight
michael@0:   {"esperanto", "eo", ESPERANTO + W10, 0},
michael@0:   {"estonian", "et", ESTONIAN + W10, 0},
michael@0:   {"euc-jp", "ja", JAPANESE + W10, 0},       // Japanese encoding
michael@0:   {"euc-kr", "ko", KOREAN + W10, 0},         // Korean encoding
michael@0: 
michael@0:   {"faroese", "fo", FAROESE + W10, ICELANDIC - W4},
michael@0:   {"fijian", "fj", FIJIAN + W10, 0},
michael@0:   {"finnish", "fi", FINNISH + W10, 0},
michael@0:   {"fran", "fr", FRENCH + W10, 0},            // Truncated at non-ASCII
michael@0:   {"francais", "fr", FRENCH + W10, 0},
michael@0:   {"french", "fr", FRENCH + W10, 0},
michael@0:   {"frisian", "fy", FRISIAN + W10, 0},
michael@0: 
michael@0:   {"ga-es", "gl", GALICIAN + W10, 0},         // 1:2 Galician, not ambiguous
michael@0:   {"galician", "gl", GALICIAN + W10, 0},
michael@0:   {"ganda", "lg", GANDA + W10, 0},
michael@0:   {"georgian", "ka", GEORGIAN + W10, 0},
michael@0:   {"german", "de", GERMAN + W10, 0},
michael@0:   {"greek", "el", GREEK + W10, 0},
michael@0:   {"greenlandic", "kl", GREENLANDIC + W10, 0},
michael@0:   {"guarani", "gn", GUARANI + W10, 0},
michael@0:   {"gujarati", "gu", GUJARATI + W10, 0},
michael@0: 
michael@0:   {"haitian_creole", "ht", HAITIAN_CREOLE + W10, 0},
michael@0:   {"hausa", "ha", HAUSA + W10, 0},
michael@0:   {"hawaiian", "haw", HAWAIIAN + W10, 0},
michael@0:   {"hebrew", "iw", HEBREW + W10, 0},
michael@0:   {"hindi", "hi", HINDI + W10, MARATHI - W4},
michael@0:   {"hn-in", "hi", HINDI + W10, MARATHI - W4},
michael@0:   {"hungarian", "hu", HUNGARIAN + W10, 0},
michael@0: 
michael@0:   {"icelandic", "is", ICELANDIC + W10, FAROESE - W4},
michael@0:   {"igbo", "ig", IGBO + W10, 0},
michael@0:   {"indonesian", "id", INDONESIAN + W10, MALAY - W4},
michael@0:   {"interlingua", "ia", INTERLINGUA + W10, 0},
michael@0:   {"interlingue", "ie", INTERLINGUE + W10, 0},
michael@0:   // 1:2 iu-Cans ik-Latn
michael@0:   {"inuktitut", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2
michael@0:   {"inupiak", "ik,iu", INUPIAK + W10, INUKTITUT + W10},   // 1:2
michael@0:   {"ir-ie", "ga", IRISH + W10, 0},          // Irish
michael@0:   {"irish", "ga", IRISH + W10, 0},
michael@0:   {"italian", "it", ITALIAN + W10, 0},
michael@0: 
michael@0:   {"ja-euc", "ja", JAPANESE + W10, 0},      // Japanese encoding
michael@0:   {"jan-jp", "ja", JAPANESE + W10, 0},      // Japanese encoding
michael@0:   {"japanese", "ja", JAPANESE + W10, 0},
michael@0:   {"javanese", "jw", JAVANESE + W10, 0},
michael@0: 
michael@0:   {"kannada", "kn", KANNADA + W10, 0},
michael@0:   {"kashmiri", "ks", KASHMIRI + W10, 0},
michael@0:   {"kazakh", "kk", KAZAKH + W10, 0},
michael@0:   {"khasi", "kha", KHASI + W10, 0},
michael@0:   {"khmer", "km", KHMER + W10, 0},
michael@0:   {"kinyarwanda", "rw", KINYARWANDA + W10, 0},
michael@0:   {"klingon", "tlh", X_KLINGON + W10, 0},
michael@0:   {"korean", "ko", KOREAN + W10, 0},
michael@0:   {"kurdish", "ku", KURDISH + W10, 0},
michael@0:   {"kyrgyz", "ky", KYRGYZ + W10, 0},
michael@0: 
michael@0:   {"laothian", "lo", LAOTHIAN + W10, 0},
michael@0:   {"latin", "la", LATIN + W10, 0},
michael@0:   {"latvian", "lv", LATVIAN + W10, 0},
michael@0:   {"limbu", "sit", LIMBU + W10, 0},
michael@0:   {"lingala", "ln", LINGALA + W10, 0},
michael@0:   {"lithuanian", "lt", LITHUANIAN + W10, 0},
michael@0:   {"luxembourgish", "lb", LUXEMBOURGISH + W10, 0},
michael@0: 
michael@0:   {"macedonian", "mk", MACEDONIAN + W10, 0},
michael@0:   {"malagasy", "mg", MALAGASY + W10, 0},
michael@0:   {"malay", "ms", MALAY + W10, INDONESIAN - W4},
michael@0:   {"malayalam", "ml", MALAYALAM + W10, 0},
michael@0:   {"maltese", "mt", MALTESE + W10, 0},
michael@0:   {"manx", "gv", MANX + W10, 0},
michael@0:   {"maori", "mi", MAORI + W10, 0},
michael@0:   {"marathi", "mr", MARATHI + W10, HINDI - W4},
michael@0:   {"mauritian_creole", "mfe", MAURITIAN_CREOLE + W10, 0},
michael@0:   {"moldavian", "mo", ROMANIAN + W10, 0},
michael@0:   {"mongolian", "mn", MONGOLIAN + W10, 0},
michael@0:   {"montenegrin", "sr-me", MONTENEGRIN + W10, 0},
michael@0:   {"myanmar", "my", BURMESE + W10, 0},      // Myanmar
michael@0:   {"nauru", "na", NAURU + W10, 0},
michael@0:   {"ndebele", "nr", NDEBELE + W10, 0},
michael@0:   {"nepali", "ne", NEPALI + W10, 0},
michael@0:   {"no-bok", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},       // Bokmaal
michael@0:   {"no-bokmaal", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
michael@0:   {"no-nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},        // Bokmaal
michael@0:   {"no-no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
michael@0:   {"no-nyn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},       // Nynorsk
michael@0:   {"no-nynorsk", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},
michael@0:   {"norwegian", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
michael@0:   {"norwegian_n", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},
michael@0:   {"nyanja", "ny", NYANJA + W10, 0},
michael@0: 
michael@0:   {"occitan", "oc", OCCITAN + W10, 0},
michael@0:   {"oriya", "or", ORIYA + W10, 0},
michael@0:   {"oromo", "om", OROMO + W10, 0},
michael@0:   {"parsi", "fa", PERSIAN + W10, 0},
michael@0: 
michael@0:   {"pashto", "ps", PASHTO + W10, 0},
michael@0:   {"pedi", "nso", PEDI + W10, 0},
michael@0:   {"persian", "fa", PERSIAN + W10, 0},
michael@0:   {"polish", "pl", POLISH + W10, 0},
michael@0:   {"polska", "pl", POLISH + W10, 0},
michael@0:   {"polski", "pl", POLISH + W10, 0},
michael@0:   {"portugu", "pt", PORTUGUESE + W10, 0},     // Truncated at non-ASCII
michael@0:   {"portuguese", "pt", PORTUGUESE + W10, 0},
michael@0:   {"punjabi", "pa", PUNJABI + W10, 0},
michael@0: 
michael@0:   {"quechua", "qu", QUECHUA + W10, 0},
michael@0: 
michael@0:   {"rhaeto_romance", "rm", RHAETO_ROMANCE + W10, 0},
michael@0:   {"romanian", "ro", ROMANIAN + W10, 0},
michael@0:   {"rundi", "rn", RUNDI + W10, 0},
michael@0:   {"russian", "ru", RUSSIAN + W10, 0},
michael@0: 
michael@0:   {"samoan", "sm", SAMOAN + W10, 0},
michael@0:   {"sango", "sg", SANGO + W10, 0},
michael@0:   {"sanskrit", "sa", SANSKRIT + W10, 0},
michael@0:   {"scots", "sco", SCOTS + W10, ENGLISH - W4},
michael@0:   {"scots_gaelic", "gd", SCOTS_GAELIC + W10, 0},
michael@0:   {"serbian", "sr", SERBIAN + W10, 0},
michael@0:   {"seselwa", "crs", SESELWA + W10, 0},
michael@0:   {"sesotho", "st", SESOTHO + W10, 0},
michael@0:   {"shift-jis", "ja", JAPANESE + W10, 0},   // Japanese encoding
michael@0:   {"shift-js", "ja", JAPANESE + W10, 0},    // Japanese encoding
michael@0:   {"shona", "sn", SHONA + W10, 0},
michael@0:   {"si-lk", "si", SINHALESE + W10, 0},      // 1:2 Sri Lanka, not ambiguous
michael@0:   {"si-si", "sl", SLOVENIAN + W10, 0},      // 1:2 Slovenia, not ambiguous
michael@0:   {"si-sl", "sl", SLOVENIAN + W10, 0},      // 1:2 Slovenia, not ambiguous
michael@0:   {"sindhi", "sd", SINDHI + W10, 0},
michael@0:   {"sinhalese", "si", SINHALESE + W10, 0},
michael@0:   {"siswant", "ss", SISWANT + W10, 0},
michael@0:   {"sit-np", "sit", LIMBU + W10, 0},
michael@0:   {"slovak", "sk", SLOVAK + W10, CZECH - W4},
michael@0:   {"slovenian", "sl", SLOVENIAN + W10, 0},
michael@0:   {"somali", "so", SOMALI + W10, 0},
michael@0:   {"spanish", "es", SPANISH + W10, 0},
michael@0:   {"sr-me", "sr-me", MONTENEGRIN + W10, 0}, // Montenegrin => Montenegrin
michael@0:   {"sundanese", "su", SUNDANESE + W10, 0},
michael@0:   {"suomi", "fi", FINNISH + W10, 0},        // Finnish
michael@0:   {"swahili", "sw", SWAHILI + W10, 0},
michael@0:   {"swedish", "sv", SWEDISH + W10, 0},
michael@0:   {"syriac", "syr", SYRIAC + W10, 0},
michael@0: 
michael@0:   {"tagalog", "tl", TAGALOG + W10, 0},
michael@0:   {"tajik", "tg", TAJIK + W10, 0},
michael@0:   {"tamil", "ta", TAMIL + W10, 0},
michael@0:   {"tatar", "tt", TATAR + W10, 0},
michael@0:   {"tb-tb", "bo", TIBETAN + W10, DZONGKHA - W4},        // Tibet
michael@0:   {"tchinese", "zhT", CHINESE_T + W10, 0},
michael@0:   {"telugu", "te", TELUGU + W10, 0},
michael@0:   {"thai", "th", THAI + W10, 0},
michael@0:   {"tibetan", "bo", TIBETAN + W10, DZONGKHA - W4},
michael@0:   {"tigrinya", "ti", TIGRINYA + W10, 0},
michael@0:   {"tonga", "to", TONGA + W10, 0},
michael@0:   {"tsonga", "ts", TSONGA + W10, 0},
michael@0:   {"tswana", "tn", TSWANA + W10, 0},
michael@0:   {"tt-ru", "tt", TATAR + W10, 0},
michael@0:   {"tur-tr", "tr", TURKISH + W10, 0},
michael@0:   {"turkish", "tr", TURKISH + W10, 0},
michael@0:   {"turkmen", "tk", TURKMEN + W10, 0},
michael@0:   {"uighur", "ug", UIGHUR + W10, 0},
michael@0:   {"ukrainian", "uk", UKRAINIAN + W10, 0},
michael@0:   {"urdu", "ur", URDU + W10, 0},
michael@0:   {"uzbek", "uz", UZBEK + W10, 0},
michael@0: 
michael@0:   {"venda", "ve", VENDA + W10, 0},
michael@0:   {"vietnam", "vi", VIETNAMESE + W10, 0},
michael@0:   {"vietnamese", "vi", VIETNAMESE + W10, 0},
michael@0:   {"volapuk", "vo", VOLAPUK + W10, 0},
michael@0: 
michael@0:   {"welsh", "cy", WELSH + W10, 0},
michael@0:   {"wolof", "wo", WOLOF + W10, 0},
michael@0: 
michael@0:   {"xhosa", "xh", XHOSA + W10, ZULU - W4},
michael@0: 
michael@0:   {"yiddish", "yi", YIDDISH + W10, 0},
michael@0:   {"yoruba", "yo", YORUBA + W10, 0},
michael@0: 
michael@0:   {"zh-classical", "zhT", CHINESE_T + W10, 0},
michael@0:   {"zh-cn", "zh", CHINESE + W10, 0},
michael@0:   {"zh-hans", "zh", CHINESE + W10, 0},
michael@0:   {"zh-hant", "zhT", CHINESE_T + W10, 0},
michael@0:   {"zh-hk", "zhT", CHINESE_T + W10, 0},
michael@0:   {"zh-min-nan", "zhT", CHINESE_T + W10, 0}, // Min Nan => ChineseT
michael@0:   {"zh-sg", "zhT", CHINESE_T + W10, 0},
michael@0:   {"zh-tw", "zhT", CHINESE_T + W10, 0},
michael@0:   {"zh-yue", "zh", CHINESE + W10, 0},       // Yue (Cantonese) => Chinese
michael@0:   {"zhuang", "za", ZHUANG + W10, 0},
michael@0:   {"zulu", "zu", ZULU + W10, XHOSA - W4},
michael@0: };
michael@0: 
michael@0: 
michael@0: 
michael@0: // Table to look up lang= tags of two/three characters after truncation at the first hyphen. Keys mix language codes, country codes and encoding abbreviations; rows marked 1:2 are ambiguous keys that list two candidate languages.
michael@0: // In alphabetical order for binary search; kCLDTable2Size must match the row count
michael@0: static const int kCLDTable2Size = 257;
michael@0: static const LangTagLookup kCLDLangTagsHintTable2[kCLDTable2Size] = {
michael@0:   {"aa", "aa", AFAR + W10, 0},
michael@0:   {"ab", "ab", ABKHAZIAN + W10, 0},
michael@0:   {"af", "af", AFRIKAANS + W10, 0},
michael@0:   {"ak", "ak", AKAN + W10, 0},
michael@0:   {"al", "sq", ALBANIAN + W10, 0},          // Albania
michael@0:   {"am", "am,hy", AMHARIC + W10, ARMENIAN + W10},  // 1:2 Amharic Armenian
michael@0:   {"ar", "ar", ARABIC + W10, 0},
michael@0:   {"ara", "ar", ARABIC + W10, 0},
michael@0:   {"arm", "hy", ARMENIAN + W10, 0},         // Armenia
michael@0:   {"arz", "ar", ARABIC + W10, 0},           // Egyptian Arabic
michael@0:   {"as", "as", ASSAMESE + W10, 0},
michael@0:   {"at", "de", GERMAN + W10, 0},            // Austria
michael@0:   {"au", "de", GERMAN + W10, 0},            // Austria
michael@0:   {"ay", "ay", AYMARA + W10, 0},
michael@0:   {"az", "az", AZERBAIJANI + W10, 0},
michael@0:   {"aze", "az", AZERBAIJANI + W10, 0},
michael@0: 
michael@0:   {"ba", "ba,bs", BASHKIR + W10, BOSNIAN + W10},  // 1:2  Bashkir Bosnia
michael@0:   {"be", "be", BELARUSIAN + W10, 0},
michael@0:   {"bel", "be", BELARUSIAN + W10, 0},
michael@0:   {"bg", "bg", BULGARIAN + W10, 0},
michael@0:   {"bh", "bh", BIHARI + W10, HINDI - W4},
michael@0:   {"bi", "bi", BISLAMA + W10, 0},
michael@0:   {"big", "zhT", CHINESE_T + W10, 0},        // Big5 encoding
michael@0:   {"bm", "ms", MALAY + W10, INDONESIAN - W4},             // Bahasa Malaysia
michael@0:   {"bn", "bn", BENGALI + W10, 0},
michael@0:   {"bo", "bo", TIBETAN + W10, DZONGKHA - W4},
michael@0:   // 1:2 Breton, Brazil country code, both Latn .br TLD enough for pt to win
michael@0:   {"br", "br,pt", BRETON + W10, PORTUGUESE + W8}, // 1:2 Breton, Brazil
michael@0:   {"bs", "bs", BOSNIAN + W10, 0},           // Bosnian => Bosnian
michael@0: 
michael@0:   {"ca", "ca", CATALAN + W10, 0},
michael@0:   {"cat", "ca", CATALAN + W10, 0},
michael@0:   {"ch", "de,fr", GERMAN + W10, FRENCH + W10},    // 1:2 Switzerland
michael@0:   {"chn", "zh", CHINESE + W10, 0},
michael@0:   {"chr", "chr", CHEROKEE + W10, 0},
michael@0:   {"ckb", "ku", KURDISH + W10, 0},          // Central Kurdish
michael@0:   {"cn", "zh,zhT", CHINESE + W6, CHINESE_T + W4},   // Ambiguous, so weaker.
michael@0:                                                 // Offset by 2 so that TLD=tw or
michael@0:                                                 // enc=big5 will put zhT ahead
michael@0:   {"co", "co", CORSICAN + W10, 0},
michael@0:   {"cro", "hr", CROATIAN + W10, 0},          // Croatia
michael@0:   {"crs", "crs", SESELWA + W10, 0},
michael@0:   {"cs", "cs", CZECH + W10, SLOVAK - W4},
michael@0:   {"ct", "ca", CATALAN + W10, 0},
michael@0:   {"cy", "cy", WELSH + W10, 0},
michael@0:   {"cym", "cy", WELSH + W10, 0},
michael@0:   {"cz", "cs", CZECH + W10, SLOVAK - W4},
michael@0: 
michael@0:   {"da", "da", DANISH + W10, NORWEGIAN - W4},
michael@0:   {"dan", "da", DANISH + W10, NORWEGIAN - W4},
michael@0:   {"de", "de", GERMAN + W10, 0},
michael@0:   {"deu", "de", GERMAN + W10, 0},
michael@0:   {"div", "dv", DHIVEHI + W10, 0},
michael@0:   {"dk", "da", DANISH + W10, NORWEGIAN - W4},            // Denmark
michael@0:   {"dut", "nl", DUTCH + W10, 0},            // Dutch
michael@0:   {"dv", "dv", DHIVEHI + W10, 0},
michael@0:   {"dz", "dz", DZONGKHA + W10, TIBETAN - W4},
michael@0: 
michael@0:   {"ee", "et", ESTONIAN + W10, 0},          // Estonia
michael@0:   {"eg", "ar", ARABIC + W10, 0},            // Egypt
michael@0:   {"el", "el", GREEK + W10, 0},
michael@0:   {"en", "en", ENGLISH + W4, 0},            // Weaker W4: lang=en is given low weight
michael@0:   {"eng", "en", ENGLISH + W4, 0},           // Weaker W4: lang=en is given low weight
michael@0:   {"eo", "eo", ESPERANTO + W10, 0},
michael@0:   {"er", "ur", URDU + W10, 0},              // "Erdu"
michael@0:   {"es", "es", SPANISH + W10, 0},
michael@0:   {"esp", "es", SPANISH + W10, 0},
michael@0:   {"est", "et", ESTONIAN + W10, 0},
michael@0:   {"et", "et", ESTONIAN + W10, 0},
michael@0:   {"eu", "eu", BASQUE + W10, 0},
michael@0: 
michael@0:   {"fa", "fa", PERSIAN + W10, 0},
michael@0:   {"far", "fa", PERSIAN + W10, 0},
michael@0:   {"fi", "fi", FINNISH + W10, 0},
michael@0:   {"fil", "tl", TAGALOG + W10, 0},          // Philippines
michael@0:   {"fj", "fj", FIJIAN + W10, 0},
michael@0:   {"fo", "fo", FAROESE + W10, ICELANDIC - W4},
michael@0:   {"fr", "fr", FRENCH + W10, 0},
michael@0:   {"fra", "fr", FRENCH + W10, 0},
michael@0:   {"fre", "fr", FRENCH + W10, 0},
michael@0:   {"fy", "fy", FRISIAN + W10, 0},
michael@0: 
michael@0:   {"ga", "ga,gl", IRISH + W10, GALICIAN + W10},       // 1:2 Irish, Galician
michael@0:   {"gae", "gd,ga", SCOTS_GAELIC + W10, IRISH + W10},  // 1:2 Gaelic, either
michael@0:   {"gal", "gl", GALICIAN + W10, 0},
michael@0:   {"gb", "zh", CHINESE + W10, 0},           // GB2312 encoding
michael@0:   {"gbk", "zh", CHINESE + W10, 0},          // GBK encoding
michael@0:   {"gd", "gd", SCOTS_GAELIC + W10, 0},
michael@0:   {"ge", "ka", GEORGIAN + W10, 0},          // Georgia
michael@0:   {"geo", "ka", GEORGIAN + W10, 0},
michael@0:   {"ger", "de", GERMAN + W10, 0},
michael@0:   {"gl", "gl", GALICIAN + W10, 0},          // Also Greenland; hard to confuse
michael@0:   {"gn", "gn", GUARANI + W10, 0},
michael@0:   {"gr", "el", GREEK + W10, 0},             // Greece
michael@0:   {"gu", "gu", GUJARATI + W10, 0},
michael@0:   {"gv", "gv", MANX + W10, 0},
michael@0: 
michael@0:   {"ha", "ha", HAUSA + W10, 0},
michael@0:   {"hat", "ht", HAITIAN_CREOLE + W10, 0},   // Haiti
michael@0:   {"haw", "haw", HAWAIIAN + W10, 0},
michael@0:   {"hb", "iw", HEBREW + W10, 0},
michael@0:   {"he", "iw", HEBREW + W10, 0},
michael@0:   {"heb", "iw", HEBREW + W10, 0},
michael@0:   {"hi", "hi", HINDI + W10, MARATHI - W4},
michael@0:   {"hk", "zhT", CHINESE_T + W10, 0},          // Hong Kong
michael@0:   {"hr", "hr", CROATIAN + W10, 0},
michael@0:   {"ht", "ht", HAITIAN_CREOLE + W10, 0},
michael@0:   {"hu", "hu", HUNGARIAN + W10, 0},
michael@0:   {"hun", "hu", HUNGARIAN + W10, 0},
michael@0:   {"hy", "hy", ARMENIAN + W10, 0},
michael@0: 
michael@0:   {"ia", "ia", INTERLINGUA + W10, 0},
michael@0:   {"ice", "is", ICELANDIC + W10, FAROESE - W4},        // Iceland
michael@0:   {"id", "id", INDONESIAN + W10, MALAY - W4},
michael@0:   {"ids", "id", INDONESIAN + W10, MALAY - W4},
michael@0:   {"ie", "ie", INTERLINGUE + W10, 0},
michael@0:   {"ig", "ig", IGBO + W10, 0},
michael@0:   // 1:2 iu-Cans ik-Latn
michael@0:   {"ik", "ik,iu", INUPIAK + W10, INUKTITUT + W10},        // 1:2
michael@0:   {"in", "id", INDONESIAN + W10, MALAY - W4},
michael@0:   {"ind", "id", INDONESIAN + W10, MALAY - W4},       // Indonesia
michael@0:   {"inu", "iu,ik", INUKTITUT + W10, INUPIAK + W10},       // 1:2
michael@0:   {"is", "is", ICELANDIC + W10, FAROESE - W4},
michael@0:   {"it", "it", ITALIAN + W10, 0},
michael@0:   {"ita", "it", ITALIAN + W10, 0},
michael@0:   {"iu", "iu,ik", INUKTITUT + W10, INUPIAK + W10},        // 1:2
michael@0:   {"iw", "iw", HEBREW + W10, 0},
michael@0: 
michael@0:   {"ja", "ja", JAPANESE + W10, 0},
michael@0:   {"jp", "ja", JAPANESE + W10, 0},          // Japan
michael@0:   {"jpn", "ja", JAPANESE + W10, 0},
michael@0:   {"jv", "jw", JAVANESE + W10, 0},
michael@0:   {"jw", "jw", JAVANESE + W10, 0},
michael@0: 
michael@0:   {"ka", "ka", GEORGIAN + W10, 0},
michael@0:   {"kc", "qu", QUECHUA + W10, 0},           // (K)Quechua
michael@0:   {"kg", "ky", KYRGYZ + W10, 0},            // Kyrgyzstan
michael@0:   {"kh", "km", KHMER + W10, 0},             // Country code Khmer (Cambodia)
michael@0:   {"kha", "kha", KHASI + W10, 0},
michael@0:   {"kk", "kk", KAZAKH + W10, 0},            // Kazakh
michael@0:   {"kl", "kl", GREENLANDIC + W10, 0},
michael@0:   {"km", "km", KHMER + W10, 0},
michael@0:   {"kn", "kn", KANNADA + W10, 0},
michael@0:   {"ko", "ko", KOREAN + W10, 0},
michael@0:   {"kor", "ko", KOREAN + W10, 0},
michael@0:   {"kr", "ko", KOREAN + W10, 0},            // Country code Korea
michael@0:   {"ks", "ks", KASHMIRI + W10, 0},
michael@0:   {"ksc", "ko", KOREAN + W10, 0},           // KSC encoding
michael@0:   {"ku", "ku", KURDISH + W10, 0},
michael@0:   {"ky", "ky", KYRGYZ + W10, 0},
michael@0:   {"kz", "kk", KAZAKH + W10, 0},            // Kazakhstan
michael@0:   {"la", "la", LATIN + W10, 0},
michael@0:   {"lao", "lo", LAOTHIAN + W10, 0},         // Laos
michael@0: 
michael@0:   {"lb", "lb", LUXEMBOURGISH + W10, 0},
michael@0:   {"lg", "lg", GANDA + W10, 0},
michael@0:   {"lit", "lt", LITHUANIAN + W10, 0},
michael@0:   {"ln", "ln", LINGALA + W10, 0},
michael@0:   {"lo", "lo", LAOTHIAN + W10, 0},
michael@0:   {"lt", "lt", LITHUANIAN + W10, 0},
michael@0:   {"ltu", "lt", LITHUANIAN + W10, 0},
michael@0:   {"lv", "lv", LATVIAN + W10, 0},
michael@0: 
michael@0:   {"mfe", "mfe", MAURITIAN_CREOLE + W10, 0},
michael@0:   {"mg", "mg", MALAGASY + W10, 0},
michael@0:   {"mi", "mi", MAORI + W10, 0},
michael@0:   {"mk", "mk", MACEDONIAN + W10, 0},
michael@0:   {"ml", "ml", MALAYALAM + W10, 0},
michael@0:   {"mn", "mn", MONGOLIAN + W10, 0},
michael@0:   {"mo", "mo", ROMANIAN + W10, 0},
michael@0:   {"mon", "mn", MONGOLIAN + W10, 0},        // Mongolian
michael@0:   {"mr", "mr", MARATHI + W10, HINDI - W4},
michael@0:   {"ms", "ms", MALAY + W10, INDONESIAN - W4},
michael@0:   {"mt", "mt", MALTESE + W10, 0},
michael@0:   {"mx", "es", SPANISH + W10, 0},           // Mexico
michael@0:   {"my", "my,ms", BURMESE + W10, MALAY + W10}, // Myanmar, Malaysia
michael@0: 
michael@0:   {"na", "na", NAURU + W10, 0},
michael@0:   {"nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
michael@0:   {"ne", "ne", NEPALI + W10, 0},
michael@0:   {"nl", "nl", DUTCH + W10, 0},
michael@0:   {"nn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},
michael@0:   {"no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
michael@0:   {"nr", "nr", NDEBELE + W10, 0},
michael@0:   {"nso", "nso", PEDI + W10, 0},
michael@0:   {"ny", "ny", NYANJA + W10, 0},
michael@0: 
michael@0:   {"oc", "oc", OCCITAN + W10, 0},
michael@0:   {"om", "om", OROMO + W10, 0},
michael@0:   {"or", "or", ORIYA + W10, 0},
michael@0: 
michael@0:   {"pa", "pa,ps", PUNJABI + W10, PASHTO + W10},   // 1:2 pa-Guru ps-Arab
michael@0:   {"per", "fa", PERSIAN + W10, 0},
michael@0:   {"ph", "tl", TAGALOG + W10, 0},           // Philippines
michael@0:   {"pk", "ur", URDU + W10, 0},              // Pakistan
michael@0:   {"pl", "pl", POLISH + W10, 0},
michael@0:   {"pnb", "pa", PUNJABI + W10, 0},          // Western Punjabi
michael@0:   {"pol", "pl", POLISH + W10, 0},
michael@0:   {"por", "pt", PORTUGUESE + W10, 0},
michael@0:   {"ps", "ps", PASHTO + W10, 0},
michael@0:   {"pt", "pt", PORTUGUESE + W10, 0},
michael@0:   {"ptg", "pt", PORTUGUESE + W10, 0},
michael@0:   {"qc", "fr", FRENCH + W10, 0},            // Quebec "country" code
michael@0:   {"qu", "qu", QUECHUA + W10, 0},
michael@0: 
michael@0:   {"rm", "rm", RHAETO_ROMANCE + W10, 0},
michael@0:   {"rn", "rn", RUNDI + W10, 0},
michael@0:   {"ro", "ro", ROMANIAN + W10, 0},
michael@0:   {"rs", "sr", SERBIAN + W10, 0},           // Serbia country code
michael@0:   {"ru", "ru", RUSSIAN + W10, 0},
michael@0:   {"rus", "ru", RUSSIAN + W10, 0},
michael@0:   {"rw", "rw", KINYARWANDA + W10, 0},
michael@0: 
michael@0:   {"sa", "sa", SANSKRIT + W10, 0},
michael@0:   {"sco", "sco", SCOTS + W10, ENGLISH - W4},
michael@0:   {"sd", "sd", SINDHI + W10, 0},
michael@0:   {"se", "sv", SWEDISH + W10, 0},
michael@0:   {"sg", "sg", SANGO + W10, 0},
michael@0:   {"si", "si,sl", SINHALESE + W10, SLOVENIAN + W10},  // 1:2 Sinhalese, Slovenia
michael@0:   {"sk", "sk", SLOVAK + W10, CZECH - W4},
michael@0:   {"sl", "sl", SLOVENIAN + W10, 0},
michael@0:   {"slo", "sl", SLOVENIAN + W10, 0},
michael@0:   {"sm", "sm", SAMOAN + W10, 0},
michael@0:   {"sn", "sn", SHONA + W10, 0},
michael@0:   {"so", "so", SOMALI + W10, 0},
michael@0:   {"sp", "es", SPANISH + W10, 0},
michael@0:   {"sq", "sq", ALBANIAN + W10, 0},
michael@0:   {"sr", "sr", SERBIAN + W10, 0},
michael@0:   {"srb", "sr", SERBIAN + W10, 0},
michael@0:   {"srl", "sr", SERBIAN + W10, 0},          // Serbian Latin
michael@0:   {"srp", "sr", SERBIAN + W10, 0},
michael@0:   {"ss", "ss", SISWANT + W10, 0},
michael@0:   {"st", "st", SESOTHO + W10, 0},
michael@0:   {"su", "su", SUNDANESE + W10, 0},
michael@0:   {"sv", "sv", SWEDISH + W10, 0},
michael@0:   {"sve", "sv", SWEDISH + W10, 0},
michael@0:   {"sw", "sw", SWAHILI + W10, 0},
michael@0:   {"swe", "sv", SWEDISH + W10, 0},
michael@0:   {"sy", "syr", SYRIAC + W10, 0},
michael@0:   {"syr", "syr", SYRIAC + W10, 0},
michael@0: 
michael@0:   {"ta", "ta", TAMIL + W10, 0},
michael@0:   {"te", "te", TELUGU + W10, 0},
michael@0:   {"tg", "tg", TAJIK + W10, 0},
michael@0:   {"th", "th", THAI + W10, 0},
michael@0:   {"ti", "ti,bo", TIGRINYA + W10, TIBETAN + W10},    // 1:2 Tigrinya, Tibet
michael@0:   {"tj", "tg", TAJIK + W10, 0},             // Tajikistan
michael@0:   {"tk", "tk", TURKMEN + W10, 0},
michael@0:   {"tl", "tl", TAGALOG + W10, 0},
michael@0:   {"tlh", "tlh", X_KLINGON + W10, 0},
michael@0:   {"tn", "tn", TSWANA + W10, 0},
michael@0:   {"to", "to", TONGA + W10, 0},
michael@0:   {"tr", "tr", TURKISH + W10, 0},
michael@0:   {"ts", "ts", TSONGA + W10, 0},
michael@0:   {"tt", "tt", TATAR + W10, 0},
michael@0:   {"tw", "ak,zhT", AKAN + W10, CHINESE_T + W10},   // 1:2 Twi => Akan, Taiwan
michael@0:   {"twi", "ak", AKAN + W10, 0},             // Twi => Akan
michael@0: 
michael@0:   {"ua", "uk", UKRAINIAN + W10, 0},         // Ukraine
michael@0:   {"ug", "ug", UIGHUR + W10, 0},
michael@0:   {"uk", "uk", UKRAINIAN + W10, 0},
michael@0:   {"ur", "ur", URDU + W10, 0},
michael@0:   {"uz", "uz", UZBEK + W10, 0},
michael@0: 
michael@0:   {"va", "ca", CATALAN + W10, 0},           // Valencia => Catalan
michael@0:   {"val", "ca", CATALAN + W10, 0},          // Valencia => Catalan
michael@0:   {"ve", "ve", VENDA + W10, 0},
michael@0:   {"vi", "vi", VIETNAMESE + W10, 0},
michael@0:   {"vie", "vi", VIETNAMESE + W10, 0},
michael@0:   {"vn", "vi", VIETNAMESE + W10, 0},
michael@0:   {"vo", "vo", VOLAPUK + W10, 0},
michael@0: 
michael@0:   {"wo", "wo", WOLOF + W10, 0},
michael@0: 
michael@0:   {"xh", "xh", XHOSA + W10, ZULU - W4},
michael@0:   {"xho", "xh", XHOSA + W10, ZULU - W4},
michael@0: 
michael@0:   {"yi", "yi", YIDDISH + W10, 0},
michael@0:   {"yo", "yo", YORUBA + W10, 0},
michael@0: 
michael@0:   {"za", "za", ZHUANG + W10, 0},
michael@0:   {"zh", "zh", CHINESE + W10, 0},
michael@0:   {"zht", "zhT", CHINESE_T + W10, 0},
michael@0:   {"zu", "zu", ZULU + W10, XHOSA - W4},
michael@0: };
michael@0: 
michael@0: 
michael@0: // Possibly map to tl:
michael@0: // -LangTags tl-Latn /7val.com/ ,bcl 2 Central Bicolano
michael@0: // -LangTags tl-Latn /7val.com/ ,ceb 6 Cebuano
michael@0: // -LangTags tl-Latn /7val.com/ ,war 1 Waray
michael@0: 
michael@0: 
michael@0: 
michael@0: // Table to look up country TLD (no general TLD)
michael@0: // In alphabetical order for binary search
michael@0: static const int kCLDTable3Size = 181;
michael@0: static const TLDLookup kCLDTLDHintTable[kCLDTable3Size] = {
michael@0:   {"ac", JAPANESE + W2, 0},
michael@0:   {"ad", CATALAN + W4, 0},
michael@0:   {"ae", ARABIC + W4, 0},
michael@0:   {"af", PASHTO + W4, PERSIAN + W4},
michael@0:   {"ag", GERMAN + W2, 0},                // meager
michael@0:   // {"ai", 0, 0},                          // meager
michael@0:   {"al", ALBANIAN + W4, 0},
michael@0:   {"am", ARMENIAN + W4, 0},
michael@0:   {"an", DUTCH + W4, 0},                 // meager
michael@0:   {"ao", PORTUGUESE + W4, 0},
michael@0:   // {"aq", 0, 0},                          // meager
michael@0:   {"ar", SPANISH + W4, 0},
michael@0:   // {"as", 0, 0},
michael@0:   {"at", GERMAN + W4, 0},
michael@0:   {"au", ENGLISH + W2, 0},
michael@0:   {"aw", DUTCH + W4, 0},
michael@0:   {"ax", SWEDISH + W4, 0},
michael@0:   {"az", AZERBAIJANI + W4, 0},
michael@0: 
michael@0:   {"ba", BOSNIAN + W8, CROATIAN - W4},
michael@0:   // {"bb", 0, 0},
michael@0:   {"bd", BENGALI + W4, 0},
michael@0:   {"be", DUTCH + W4, FRENCH + W4},
michael@0:   {"bf", FRENCH + W4, 0},
michael@0:   {"bg", BULGARIAN + W4, 0},
michael@0:   {"bh", ARABIC + W4, 0},
michael@0:   {"bi", RUNDI + W4, FRENCH + W4},
michael@0:   {"bj", FRENCH + W4, 0},
michael@0:   {"bm", ENGLISH + W2, 0},
michael@0:   {"bn", MALAY + W4, INDONESIAN - W4},
michael@0:   {"bo", SPANISH + W4, AYMARA + W2},   // and GUARANI QUECHUA
michael@0:   {"br", PORTUGUESE + W4, 0},
michael@0:   // {"bs", 0, 0},
michael@0:   {"bt", DZONGKHA + W10, TIBETAN - W10},      // Strong presumption of Dzongha
michael@0:   {"bw", TSWANA + W4, 0},
michael@0:   {"by", BELARUSIAN + W4, 0},
michael@0:   // {"bz", 0, 0},
michael@0: 
michael@0:   {"ca", FRENCH + W4, ENGLISH + W2},
michael@0:   {"cat", CATALAN + W4, 0},
michael@0:   {"cc", 0, 0},
michael@0:   {"cd", FRENCH + W4, 0},
michael@0:   {"cf", FRENCH + W4, 0},
michael@0:   {"cg", FRENCH + W4, 0},
michael@0:   {"ch", GERMAN + W4, FRENCH + W4},
michael@0:   {"ci", FRENCH + W4, 0},
michael@0:   // {"ck", 0, 0},
michael@0:   {"cl", SPANISH + W4, 0},
michael@0:   {"cm", FRENCH + W4, 0},
michael@0:   {"cn", CHINESE + W4, 0},
michael@0:   {"co", SPANISH + W4, 0},
michael@0:   {"cr", SPANISH + W4, 0},
michael@0:   {"cu", SPANISH + W4, 0},
michael@0:   {"cv", PORTUGUESE + W4, 0},
michael@0:   // {"cx", 0, 0},
michael@0:   {"cy", GREEK + W4, TURKISH + W4},
michael@0:   {"cz", CZECH + W4, SLOVAK - W4},
michael@0: 
michael@0:   {"de", GERMAN + W4, 0},
michael@0:   {"dj", 0, 0},
michael@0:   {"dk", DANISH + W4, NORWEGIAN - W4},
michael@0:   {"dm", 0, 0},
michael@0:   {"do", SPANISH + W4, 0},
michael@0:   {"dz", FRENCH + W4, ARABIC + W4},
michael@0: 
michael@0:   {"ec", SPANISH + W4, 0},
michael@0:   {"ee", ESTONIAN + W4, 0},
michael@0:   {"eg", ARABIC + W4, 0},
michael@0:   {"er", AFAR + W4, 0},
michael@0:   {"es", SPANISH + W4, 0},
michael@0:   {"et", AMHARIC + W4, AFAR + W4},
michael@0: 
michael@0:   {"fi", FINNISH + W4, 0},
michael@0:   {"fj", FIJIAN + W4, 0},
michael@0:   // {"fk", 0, 0},
michael@0:   // {"fm", 0, 0},
michael@0:   {"fo", FAROESE + W4, ICELANDIC - W4},
michael@0:   {"fr", FRENCH + W4, 0},
michael@0: 
michael@0:   {"ga", FRENCH + W4, 0},
michael@0:   {"gd", 0, 0},
michael@0:   {"ge", GEORGIAN + W4, 0},
michael@0:   {"gf", FRENCH + W4, 0},
michael@0:   // {"gg", 0, 0},
michael@0:   // {"gh", 0, 0},
michael@0:   // {"gi", 0, 0},
michael@0:   {"gl", GREENLANDIC + W4, DANISH + W4},
michael@0:   // {"gm", 0, 0},
michael@0:   {"gn", FRENCH + W4, 0},
michael@0:   // {"gp", 0, 0},
michael@0:   // {"gq", 0, 0},
michael@0:   {"gr", GREEK + W4, 0},
michael@0:   // {"gs", 0, 0},
michael@0:   {"gt", SPANISH + W4, 0},
michael@0:   // {"gu", 0, 0},
michael@0:   // {"gy", 0, 0},
michael@0: 
michael@0:   {"hk", CHINESE_T + W4, 0},
michael@0:   // {"hm", 0, 0},
michael@0:   {"hn", SPANISH + W4, 0},
michael@0:   {"hr", CROATIAN + W8, BOSNIAN - W4},
michael@0:   {"ht", HAITIAN_CREOLE + W4, FRENCH + W4},
michael@0:   {"hu", HUNGARIAN + W4, 0},
michael@0: 
michael@0:   {"id", INDONESIAN + W4, MALAY - W4},
michael@0:   {"ie", IRISH + W4, 0},
michael@0:   {"il", HEBREW + W4, 0},
michael@0:   {"im", MANX + W4, 0},
michael@0:   // {"in", 0, 0},
michael@0:   // {"io", 0, 0},
michael@0:   {"iq", ARABIC + W4, 0},
michael@0:   {"ir", PERSIAN + W4, 0},
michael@0:   {"is", ICELANDIC + W4, FAROESE - W4},
michael@0:   {"it", ITALIAN + W4, 0},
michael@0: 
michael@0:   // {"je", 0, 0},
michael@0:   // {"jm", 0, 0},
michael@0:   {"jo", ARABIC + W4, 0},
michael@0:   {"jp", JAPANESE + W4, 0},
michael@0: 
michael@0:   // {"ke", 0, 0},
michael@0:   {"kg", KYRGYZ + W4, 0},
michael@0:   {"kh", KHMER + W4, 0},
michael@0:   // {"ki", 0, 0},
michael@0:   {"km", FRENCH + W4, 0},
michael@0:   // {"kn", 0, 0},
michael@0:   {"kp", KOREAN + W4, 0},
michael@0:   {"kr", KOREAN + W4, 0},
michael@0:   {"kw", ARABIC + W4, 0},
michael@0:   // {"ky", 0, 0},
michael@0:   {"kz", KAZAKH + W4, 0},
michael@0: 
michael@0:   {"la", LAOTHIAN + W4, 0},
michael@0:   {"lb", ARABIC + W4, FRENCH + W4},
michael@0:   // {"lc", 0, 0},
michael@0:   {"li", GERMAN + W4, 0},
michael@0:   {"lk", SINHALESE + W4, 0},
michael@0:   // {"lr", 0, 0},
michael@0:   {"ls", SESOTHO + W4, 0},
michael@0:   {"lt", LITHUANIAN + W4, 0},
michael@0:   {"lu", LUXEMBOURGISH + W4},
michael@0:   {"lv", LATVIAN + W4, 0},
michael@0:   {"ly", ARABIC + W4, 0},
michael@0: 
michael@0:   {"ma", FRENCH + W4, 0},
michael@0:   {"mc", FRENCH + W4, 0},
michael@0:   {"md", ROMANIAN + W4, 0},
michael@0:   {"me", MONTENEGRIN + W8, SERBIAN - W4},
michael@0:   {"mg", FRENCH + W4, 0},
michael@0:   {"mk", MACEDONIAN + W4, 0},
michael@0:   {"ml", FRENCH + W4, 0},
michael@0:   {"mm", BURMESE + W4, 0},
michael@0:   {"mn", MONGOLIAN + W4, 0},
michael@0:   {"mo", CHINESE_T + W4, PORTUGUESE + W4},
michael@0:   // {"mp", 0, 0},
michael@0:   {"mq", FRENCH + W4, 0},
michael@0:   {"mr", FRENCH + W4, ARABIC + W4},
michael@0:   // {"ms", 0, 0},
michael@0:   {"mt", MALTESE + W4, 0},
michael@0:   // {"mu", 0, 0},
michael@0:   {"mv", DHIVEHI + W4, 0},
michael@0:   // {"mw", 0, 0},
michael@0:   {"mx", SPANISH + W4, 0},
michael@0:   {"my", MALAY + W4, INDONESIAN - W4},
michael@0:   {"mz", PORTUGUESE + W4, 0},
michael@0: 
michael@0:   {"na", 0, 0},            // Namibia
michael@0:   {"nc", FRENCH + W4, 0},
michael@0:   {"ne", FRENCH + W4, 0},
michael@0:   {"nf", FRENCH + W4, 0},
michael@0:   // {"ng", 0, 0},
michael@0:   {"ni", SPANISH + W4, 0},
michael@0:   {"nl", DUTCH + W4, 0},
michael@0:   {"no", NORWEGIAN + W4, NORWEGIAN_N + W2},
michael@0:   {"np", NEPALI + W4, 0},
michael@0:   {"nr", NAURU + W4, 0},
michael@0:   {"nu", SWEDISH + W4, 0},
michael@0:   {"nz", MAORI + W4, ENGLISH + W2},
michael@0: 
michael@0:   {"om", ARABIC + W4, 0},
michael@0: 
michael@0:   {"pa", SPANISH + W4, 0},
michael@0:   {"pe", SPANISH + W4, QUECHUA + W2},   // also AYMARA
michael@0:   {"pf", FRENCH + W4, 0},
michael@0:   // {"pg", 0, 0},
michael@0:   {"ph", TAGALOG + W4, 0},
michael@0:   {"pk", URDU + W4, 0},
michael@0:   {"pl", POLISH + W4, 0},
michael@0:   // {"pn", 0, 0},
michael@0:   {"pr", SPANISH + W4, 0},
michael@0:   {"ps", ARABIC + W4, 0},
michael@0:   {"pt", PORTUGUESE + W4, 0},
michael@0:   {"py", SPANISH + W4, GUARANI + W2},
michael@0: 
michael@0:   {"qa", ARABIC + W4, 0},
michael@0: 
michael@0:   {"re", FRENCH + W4, 0},
michael@0:   {"ro", ROMANIAN + W4, 0},
michael@0:   {"rs", SERBIAN + W8, MONTENEGRIN - W4},
michael@0:   {"ru", RUSSIAN + W4, 0},
michael@0:   {"rw", KINYARWANDA + W4, FRENCH + W2},
michael@0: 
michael@0:   {"sa", ARABIC + W4, 0},
michael@0:   // {"sb", 0, 0},
michael@0:   {"sc", SESELWA + W4, 0},
michael@0:   {"sd", ARABIC + W4, 0},
michael@0:   {"se", SWEDISH + W4, 0},
michael@0:   // {"sg", 0, 0},
michael@0:   // {"sh", 0, 0},
michael@0:   {"si", SLOVENIAN + W4, 0},
michael@0:   {"sk", SLOVAK + W4, CZECH - W4},
michael@0:   // {"sl", 0, 0},
michael@0:   {"sm", ITALIAN + W4, 0},
michael@0:   {"sn", FRENCH + W4, 0},
michael@0:   // {"sr", 0, 0},
michael@0:   {"ss", ARABIC + W4, 0},     // Presumed South Sudan TLD. dsites 2011.07.07
michael@0:   // {"st", 0, 0},
michael@0:   {"su", RUSSIAN + W4, 0},
michael@0:   {"sv", SPANISH + W4, 0},
michael@0:   {"sy", ARABIC + W4, 0},
michael@0:   // {"sz", 0, 0},
michael@0: 
michael@0:   // {"tc", 0, 0},
michael@0:   {"td", FRENCH + W4, 0},
michael@0:   // {"tf", 0, 0},
michael@0:   {"tg", FRENCH + W4, 0},
michael@0:   {"th", THAI + W4, 0},
michael@0:                               // Tibet has no country code (see .cn)
michael@0:   {"tj", TAJIK + W4, 0},
michael@0:   // {"tk", 0, 0},
michael@0:   // {"tl", 0, 0},
michael@0:   {"tm", TURKISH + W4, 0},
michael@0:   {"tn", FRENCH + W4, ARABIC + W4},
michael@0:   // {"to", 0, 0},
michael@0:   {"tp", JAPANESE + W4, 0},
michael@0:   {"tr", TURKISH + W4, 0},
michael@0:   // {"tt", 0, 0},
michael@0:   // {"tv", 0, 0},
michael@0:   {"tw", CHINESE_T + W4, 0},
michael@0:   {"tz", SWAHILI + W4, AKAN + W4},
michael@0: 
michael@0:   {"ua", UKRAINIAN + W4, 0},
michael@0:   {"ug", GANDA + W4, 0},
michael@0:   {"uk", ENGLISH + W2, 0},
michael@0:   {"us", ENGLISH + W2, 0},
michael@0:   {"uy", SPANISH + W4, 0},
michael@0:   {"uz", UZBEK + W4, 0},
michael@0: 
michael@0:   {"va", ITALIAN + W4, LATIN + W2},
michael@0:   // {"vc", 0, 0},
michael@0:   {"ve", SPANISH + W4, 0},
michael@0:   // {"vg", 0, 0},
michael@0:   // {"vi", 0, 0},
michael@0:   {"vn", VIETNAMESE + W4, 0},
michael@0:   // {"vu", 0, 0},
michael@0: 
michael@0:   {"wf", FRENCH + W4, 0},
michael@0:   // {"ws", 0, 0},
michael@0: 
michael@0:   {"ye", ARABIC + W4, 0},
michael@0: 
michael@0:   {"za", AFRIKAANS + W4, 0},
michael@0:   // {"zm", 0, 0},
michael@0:   // {"zw", 0, 0},
michael@0: };
michael@0: 
michael@0: #undef W2
michael@0: #undef W4
michael@0: #undef W6
michael@0: #undef W8
michael@0: #undef W10
michael@0: #undef W12
michael@0: 
michael@0: 
michael@0: 
michael@0: 
michael@0: 
michael@0: inline void SetCLDPriorWeight(int w, OneCLDLangPrior* olp) {
michael@0:   *olp = (*olp & 0x3ff) + (w << 10);
michael@0: }
michael@0: // Overwrite the language stored in the low ten bits of *olp with lang,
michael@0: // keeping every bit above those ten (the weight) unchanged.
michael@0: inline void SetCLDPriorLang(Language lang, OneCLDLangPrior* olp) {
michael@0:   const int upper_bits = *olp & ~0x3ff;
michael@0:   *olp = upper_bits + lang;
michael@0: }
michael@0: 
michael@0: OneCLDLangPrior PackCLDPriorLangWeight(Language lang, int w) {
michael@0:   return (w << 10) + lang;
michael@0: }
michael@0: 
michael@0: // Return the larger of a and b (a when they are equal).
michael@0: inline int MaxInt(int a, int b) {
michael@0:   if (a < b) {return b;}
michael@0:   return a;
michael@0: }
michael@0: 
michael@0: // Fold one packed prior into lps.
michael@0: // A zero olp is ignored. If the language of olp is already listed, that
michael@0: // entry keeps the larger of its own weight and the weight of olp. Otherwise
michael@0: // olp is appended, unless the list already holds kMaxOneCLDLangPrior entries,
michael@0: // in which case it is silently dropped.
michael@0: void MergeCLDLangPriorsMax(OneCLDLangPrior olp, CLDLangPriors* lps) {
michael@0:   if (olp == 0) {return;}
michael@0: 
michael@0:   // Locate an existing entry for the same language, if any
michael@0:   const Language wanted = GetCLDPriorLang(olp);
michael@0:   int slot = 0;
michael@0:   while ((slot < lps->n) && (GetCLDPriorLang(lps->prior[slot]) != wanted)) {
michael@0:     ++slot;
michael@0:   }
michael@0: 
michael@0:   if (slot < lps->n) {
michael@0:     // Already listed: keep the stronger of the two weights
michael@0:     int merged_weight = MaxInt(GetCLDPriorWeight(lps->prior[slot]),
michael@0:                                GetCLDPriorWeight(olp));
michael@0:     SetCLDPriorWeight(merged_weight, &lps->prior[slot]);
michael@0:     return;
michael@0:   }
michael@0: 
michael@0:   // New language: append only while there is room
michael@0:   if (lps->n < kMaxOneCLDLangPrior) {
michael@0:     lps->prior[lps->n] = olp;
michael@0:     ++lps->n;
michael@0:   }
michael@0: }
michael@0: 
michael@0: // Fold one packed prior into lps, boosting 10x if already there.
michael@0: // A zero olp is ignored. If the language of olp is already listed, that
michael@0: // entry's weight is raised by 2 (the weight of olp itself is not used).
michael@0: // Otherwise olp is appended, unless the list already holds
michael@0: // kMaxOneCLDLangPrior entries, in which case it is silently dropped.
michael@0: void MergeCLDLangPriorsBoost(OneCLDLangPrior olp, CLDLangPriors* lps) {
michael@0:   if (olp == 0) {return;}
michael@0: 
michael@0:   // Locate an existing entry for the same language, if any
michael@0:   const Language wanted = GetCLDPriorLang(olp);
michael@0:   int slot = 0;
michael@0:   while ((slot < lps->n) && (GetCLDPriorLang(lps->prior[slot]) != wanted)) {
michael@0:     ++slot;
michael@0:   }
michael@0: 
michael@0:   if (slot < lps->n) {
michael@0:     // Already listed: bump its weight
michael@0:     int boosted_weight = GetCLDPriorWeight(lps->prior[slot]) + 2;
michael@0:     SetCLDPriorWeight(boosted_weight, &lps->prior[slot]);
michael@0:     return;
michael@0:   }
michael@0: 
michael@0:   // New language: append only while there is room
michael@0:   if (lps->n < kMaxOneCLDLangPrior) {
michael@0:     lps->prior[lps->n] = olp;
michael@0:     ++lps->n;
michael@0:   }
michael@0: }
michael@0: 
michael@0: 
michael@0: // Cut lps down to at most max_entries priors, keeping those whose weights
michael@0: // have the largest absolute value. Does nothing if lps is already that short.
michael@0: // Otherwise the whole list is reordered by descending abs(weight); entries
michael@0: // with equal abs(weight) keep their relative order.
michael@0: void TrimCLDLangPriors(int max_entries, CLDLangPriors* lps) {
michael@0:   if (lps->n <= max_entries) {return;}
michael@0: 
michael@0:   // In-place insertion sort, largest abs(weight) first
michael@0:   for (int next = 0; next < lps->n; ++next) {
michael@0:     const OneCLDLangPrior moving = lps->prior[next];
michael@0:     const int moving_size = abs(GetCLDPriorWeight(moving));
michael@0:     int hole = next;
michael@0:     // Slide strictly smaller entries one slot toward the back
michael@0:     while ((hole > 0) &&
michael@0:            (abs(GetCLDPriorWeight(lps->prior[hole - 1])) < moving_size)) {
michael@0:       lps->prior[hole] = lps->prior[hole - 1];
michael@0:       --hole;
michael@0:     }
michael@0:     lps->prior[hole] = moving;
michael@0:   }
michael@0: 
michael@0:   // Everything past max_entries is now the weakest; drop it
michael@0:   lps->n = max_entries;
michael@0: }
michael@0: 
michael@0: // Return how many ',' characters langtags contains.
michael@0: int CountCommas(const string& langtags) {
michael@0:   int count = 0;
michael@0:   // Hop from one comma to the next until find() runs out
michael@0:   for (string::size_type at = langtags.find(',');
michael@0:        at != string::npos;
michael@0:        at = langtags.find(',', at + 1)) {
michael@0:     ++count;
michael@0:   }
michael@0:   return count;
michael@0: }
michael@0: 
michael@0: // Binary search of tbl[0..tbl_size) for the entry whose langtag equals key.
michael@0: // tbl must be sorted by langtag in strcmp() order.
michael@0: // Returns a pointer to the matching entry, or NULL if there is none.
michael@0: const LangTagLookup* DoLangTagLookup(const char* key,
michael@0:                                      const LangTagLookup* tbl, int tbl_size) {
michael@0:   // Any match lies in the half-open range [first..limit)
michael@0:   int first = 0;
michael@0:   int limit = tbl_size;
michael@0:   while (first < limit) {
michael@0:     const int probe = (first + limit) >> 1;
michael@0:     const int order = strcmp(key, tbl[probe].langtag);
michael@0:     if (order == 0) {return &tbl[probe];}
michael@0:     if (order > 0) {
michael@0:       first = probe + 1;      // key sorts after the probed entry
michael@0:     } else {
michael@0:       limit = probe;          // key sorts before the probed entry
michael@0:     }
michael@0:   }
michael@0:   return NULL;
michael@0: }
michael@0: 
michael@0: // Binary search of tbl[0..tbl_size) for the entry whose tld equals key.
michael@0: // tbl must be sorted by tld in strcmp() order.
michael@0: // Returns a pointer to the matching entry, or NULL if there is none.
michael@0: const TLDLookup* DoTLDLookup(const char* key,
michael@0:                              const TLDLookup* tbl, int tbl_size) {
michael@0:   // Any match lies in the half-open range [first..limit)
michael@0:   int first = 0;
michael@0:   int limit = tbl_size;
michael@0:   while (first < limit) {
michael@0:     const int probe = (first + limit) >> 1;
michael@0:     const int order = strcmp(key, tbl[probe].tld);
michael@0:     if (order == 0) {return &tbl[probe];}
michael@0:     if (order > 0) {
michael@0:       first = probe + 1;      // key sorts after the probed entry
michael@0:     } else {
michael@0:       limit = probe;          // key sorts before the probed entry
michael@0:     }
michael@0:   }
michael@0:   return NULL;
michael@0: }
michael@0: 
michael@0: 
michael@0: 
michael@0: // Trim language tag string to canonical form for each language
michael@0: // Input is from GetLangTagsFromHtml(), already lowercased
michael@0: // Returns the comma-separated langcode strings of every tag that hits a
michael@0: // table, in input order, with no trailing comma. Returns an empty string if
michael@0: // langtags is empty, has more than 4 commas, or nothing matches.
michael@0: // Each tag is tried whole against kCLDLangTagsHintTable1; on a miss it is
michael@0: // cut at its first hyphen and, if at most 3 characters remain, tried against
michael@0: // kCLDLangTagsHintTable2. Tags longer than 16 characters are skipped.
michael@0: string TrimCLDLangTagsHint(const string& langtags) {
michael@0:   string retval;
michael@0:   if (langtags.empty()) {return retval;}
michael@0:   if (CountCommas(langtags) > 4) {return retval;}   // Too many language tags
michael@0: 
michael@0:   char temp[20];
michael@0:   // Positions stay in string::size_type so the result of find() can be
michael@0:   // compared with string::npos without a narrowing round trip through int.
michael@0:   string::size_type pos = 0;
michael@0:   while (pos < langtags.size()) {
michael@0:     string::size_type comma = langtags.find(',', pos);
michael@0:     if (comma == string::npos) {comma = langtags.size();} // fake trailing comma
michael@0:     const string::size_type len = comma - pos;
michael@0:     if (len <= 16) {
michael@0:       // Short enough to use; temp[20] leaves room for the terminating NUL
michael@0:       memcpy(temp, langtags.data() + pos, len);
michael@0:       temp[len] = '\0';
michael@0:       const LangTagLookup* entry = DoLangTagLookup(temp,
michael@0:                                                    kCLDLangTagsHintTable1,
michael@0:                                                    kCLDTable1Size);
michael@0:       if (entry == NULL) {
michael@0:         // Try second table with language code truncated at first hyphen
michael@0:         char* hyphen = strchr(temp, '-');
michael@0:         if (hyphen != NULL) {*hyphen = '\0';}
michael@0:         if (strlen(temp) <= 3) {        // Short enough to use
michael@0:           entry = DoLangTagLookup(temp,
michael@0:                                   kCLDLangTagsHintTable2,
michael@0:                                   kCLDTable2Size);
michael@0:         }
michael@0:       }
michael@0:       if (entry != NULL) {
michael@0:         // Hit in either table
michael@0:         retval.append(entry->langcode);     // may be "code1,code2"
michael@0:         retval.append(1, ',');
michael@0:       }
michael@0:     }
michael@0:     pos = comma + 1;
michael@0:   }
michael@0: 
michael@0:   // Remove trailing comma, if any
michael@0:   if (!retval.empty()) {retval.resize(retval.size() - 1);}
michael@0:   return retval;
michael@0: }
michael@0: 
michael@0: 
michael@0: 
michael@0: //==============================================================================
michael@0: 
michael@0: // Little state machine to scan insides of language attribute quoted-string.
michael@0: // Each language code is lowercased and copied to the output string. Underscore
michael@0: // is mapped to minus. Space, tab, and comma are all mapped to comma, and
michael@0: // multiple consecutive commas are removed.
michael@0: // Each language code in the output list will be followed by a single comma.
michael@0: 
michael@0: // There are three states, and we start in state 1:
michael@0: // State 0: After a letter.
michael@0: //  Copy all letters/minus[0], copy comma[1]; all others copy comma and skip [2]
michael@0: // State 1: Just after a comma.
michael@0: //  Copy letter [0], Ignore subsequent commas[1]. minus and all others copy comma and skip [2]
michael@0: // State 2: Skipping.
michael@0: //  All characters except comma skip and stay in [2]. comma goes to [1]
michael@0: 
michael@0: // The thing that is copied is kLangCodeRemap[c] when going to state 0,
michael@0: // and always comma when going to state 1 or 2. The design depends on copying
michael@0: // a comma at the *beginning* of skipping, and in state 2 never doing a copy.
michael@0: 
michael@0: // We pack all this into 8 bits:
michael@0: //    +--+---+---+
michael@0: //    |78|654|321|
michael@0: //    +--+---+---+
michael@0: //
michael@0: // Shift byte right by 3*state, giving [0] 321, [1] 654, [2] .78
michael@0: // where . is always zero
michael@0: // Of these 3 bits, low two are next state ss, high bit is copy bit C.
michael@0: // If C=1 and ss == 0, copy kLangCodeRemap[c], else copy a comma
michael@0: 
michael@0: #define SKIP0 0   // copy nothing, next state 0
michael@0: #define SKIP1 1   // copy nothing, next state 1
michael@0: #define SKIP2 2   // copy nothing, next state 2
michael@0: #define COPY0 4   // copy kLangCodeRemap[c], next state 0
michael@0: #define COPY1 5   // copy ',', next state 1
michael@0: #define COPY2 6   // copy ',', next state 2
michael@0: 
michael@0: // These combined actions pack three states into one byte: the 3-bit action for state 0 in the low bits, state 1 shifted left by 3, state 2 shifted left by 6.
michael@0: // Ninth bit must be zero, so all state 2 values must be skips (a COPYn shifted left by 6 would set bit value 256).
michael@0: //              state[2]       state[1]      state[0]
michael@0: #define LTR   ((SKIP2 << 6) + (COPY0 << 3) + COPY0)   // letter: copied in states 0 and 1
michael@0: #define MINUS ((SKIP2 << 6) + (COPY2 << 3) + COPY0)   // minus/underscore: copied only in state 0
michael@0: #define COMMA ((SKIP1 << 6) + (SKIP1 << 3) + COPY1)   // separator: always leads to state 1
michael@0: #define Bad   ((SKIP2 << 6) + (COPY2 << 3) + COPY2)   // anything else: always leads to state 2
michael@0: 
michael@0: // Treat as letter: a-z,  A-Z
michael@0: // Treat as minus:  2D minus,  5F underscore
michael@0: // Treat as comma:  09 tab,  20 space,  2C comma
michael@0: 
michael@0: static const unsigned char kLangCodeAction[256] = {   // indexed by input byte; one row per 16 byte values
michael@0:   Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,COMMA,Bad,Bad,Bad,Bad,Bad,Bad,  // 00-0F: 09 tab
michael@0:   Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  // 10-1F
michael@0:   COMMA,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,COMMA,MINUS,Bad,Bad,  // 20-2F: 20 space, 2C comma, 2D minus
michael@0:   Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  // 30-3F
michael@0: 
michael@0:   Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR,  LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,  // 40-4F: A-O
michael@0:   LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,  LTR,LTR,LTR,Bad,Bad,Bad,Bad,MINUS,  // 50-5F: P-Z, 5F underscore
michael@0:   Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR,  LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,  // 60-6F: a-o
michael@0:   LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,  LTR,LTR,LTR,Bad,Bad,Bad,Bad,Bad,  // 70-7F: p-z
michael@0: 
michael@0:   Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  // 80-8F
michael@0:   Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  // 90-9F
michael@0:   Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  // A0-AF
michael@0:   Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  // B0-BF
michael@0: 
michael@0:   Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  // C0-CF
michael@0:   Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  // D0-DF
michael@0:   Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  // E0-EF
michael@0:   Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  // F0-FF
michael@0: };
michael@0: 
michael@0: // This does lowercasing, maps underscore to minus, and maps tab/space to comma. Indexed by input byte; every other byte maps to 0.
michael@0: static const unsigned char kLangCodeRemap[256] = {
michael@0:   0,0,0,0,0,0,0,0,  0,',',0,0,0,0,0,0,          // 09 tab
michael@0:   0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
michael@0:   ',',0,0,0,0,0,0,0,  0,0,0,0,',','-',0,0,      // 20 space 2C comma 2D minus
michael@0:   0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
michael@0: 
michael@0:     0,'a','b','c','d','e','f','g',  'h','i','j','k','l','m','n','o',  // 40-4F: A-O lowercased
michael@0:   'p','q','r','s','t','u','v','w',  'x','y','z',0,0,0,0,'-',  // 5F underscore
michael@0:     0,'a','b','c','d','e','f','g',  'h','i','j','k','l','m','n','o',  // 60-6F: a-o unchanged
michael@0:   'p','q','r','s','t','u','v','w',  'x','y','z',0,0,0,0,0,  // 70-7F: p-z unchanged
michael@0: 
michael@0:   0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,  // 80-FF: all zero
michael@0:   0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
michael@0:   0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
michael@0:   0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
michael@0: 
michael@0:   0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
michael@0:   0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
michael@0:   0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
michael@0:   0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
michael@0: };
michael@0: 
michael@0: #undef LTR     // packed per-character actions: no longer needed once kLangCodeAction is built
michael@0: #undef MINUS
michael@0: #undef COMMA
michael@0: #undef Bad
michael@0: 
michael@0: #undef SKIP0   // single-state action codes that the packed actions were built from
michael@0: #undef SKIP1
michael@0: #undef SKIP2
michael@0: #undef COPY0
michael@0: #undef COPY1
michael@0: #undef COPY2
michael@0: 
michael@0: 
michael@0: // Locate the first '<' in utf8_body[pos..max_pos)
michael@0: // Returns its index, or -1 if that range holds no '<'.
michael@0: // Note: this is all somewhat insensitive to mismatched quotes
michael@0: int32 FindTagStart(const char* utf8_body, int32 pos, int32 max_pos) {
michael@0:   int cur = pos;
michael@0: 
michael@0:   // Fast pass: examine four bytes per step while a full word still fits
michael@0:   const int word_limit = max_pos - 3;
michael@0:   while (cur < word_limit) {
michael@0:     const char* word_ptr = &utf8_body[cur];
michael@0:     uint32 word = UNALIGNED_LOAD32(word_ptr);
michael@0:     // XOR turns every '<' byte (0x3c) into a zero byte ...
michael@0:     uint32 diff = word ^ 0x3c3c3c3c;    // <<<<
michael@0:     // ... and this mask comes out nonzero when some byte of diff is zero
michael@0:     uint32 zero_byte_mask = (diff - 0x01010101) & (~diff & 0x80808080);
michael@0:     if (zero_byte_mask != 0) {break;}   // At least one byte is '<'
michael@0:     cur += 4;
michael@0:   }
michael@0: 
michael@0:   // Slow pass: one byte per step, from the flagged word or the short tail
michael@0:   while (cur < max_pos) {
michael@0:     if (utf8_body[cur] == '<') {return cur;}
michael@0:     ++cur;
michael@0:   }
michael@0:   return -1;
michael@0: }
michael@0: 
michael@0: 
michael@0: // Scan utf8_body[pos..max_pos) for the end of an HTML tag (simplistic parsing)
michael@0: // Returns the index of the first '>', or the index just before the first
michael@0: // '<' or '&', whichever of these characters comes first. Returns -1 if none
michael@0: // of them occurs. Quotes are not tracked.
michael@0: int32 FindTagEnd(const char* utf8_body, int32 pos, int32 max_pos) {
michael@0:   for (int at = pos; at < max_pos; ++at) {
michael@0:     switch (utf8_body[at]) {
michael@0:       case '>':
michael@0:         return at;
michael@0:       case '<':
michael@0:       case '&':
michael@0:         return at - 1;
michael@0:       default:
michael@0:         break;
michael@0:     }
michael@0:   }
michael@0:   return -1;              // nothing found
michael@0: }
michael@0: 
michael@0: // Skip spaces in utf8_body[pos..max_pos) and expect a quote or apostrophe
michael@0: // Returns the index of that opening quote/apostrophe, or -1 if the first
michael@0: // non-space byte is anything else or the range ends first.
michael@0: // Note: this is all somewhat insensitive to mismatched quotes
michael@0: int32 FindQuoteStart(const char* utf8_body, int32 pos, int32 max_pos) {
michael@0:   int at = pos;
michael@0:   while ((at < max_pos) && (utf8_body[at] == ' ')) {++at;}
michael@0:   if (at >= max_pos) {return -1;}         // Ran out of input
michael@0: 
michael@0:   const char first_nonspace = utf8_body[at];
michael@0:   if ((first_nonspace == '"') || (first_nonspace == '\'')) {return at;}
michael@0:   return -1;
michael@0: }
michael@0: 
michael@0: // Scan utf8_body[pos..max_pos) for the end of a quoted string (simplistic
michael@0: // parsing). Returns the index of the first quote or apostrophe, or the index
michael@0: // just before the first > = < or &, whichever of these comes first.
michael@0: // Returns -1 if none of them occurs. Either quote character ends the string,
michael@0: // whatever character opened it.
michael@0: int32 FindQuoteEnd(const char* utf8_body, int32 pos, int32 max_pos) {
michael@0:   for (int at = pos; at < max_pos; ++at) {
michael@0:     switch (utf8_body[at]) {
michael@0:       case '"':
michael@0:       case '\'':
michael@0:         return at;
michael@0:       case '>':
michael@0:       case '=':
michael@0:       case '<':
michael@0:       case '&':
michael@0:         return at - 1;
michael@0:       default:
michael@0:         break;
michael@0:     }
michael@0:   }
michael@0:   return -1;              // nothing found
michael@0: }
michael@0: 
michael@0: // Find the first '=' in utf8_body[pos..max_pos) that is outside any
michael@0: // "..." or '...' string. Returns its index, or -1 if there is none.
michael@0: // Inside a string, a backslash causes the byte after it to be skipped, and
michael@0: // only the same character that opened the string closes it. An unterminated
michael@0: // string swallows the rest of the range.
michael@0: int32 FindEqualSign(const char* utf8_body, int32 pos, int32 max_pos) {
michael@0:   for (int at = pos; at < max_pos; ++at) {
michael@0:     const char c = utf8_body[at];
michael@0:     if (c == '=') {return at;}      // Found bare equal sign inside tag
michael@0:     if ((c != '"') && (c != '\'')) {continue;}
michael@0: 
michael@0:     // c opened a string; advance to the matching closer or end of range
michael@0:     int closer = at + 1;
michael@0:     while ((closer < max_pos) && (utf8_body[closer] != c)) {
michael@0:       if (utf8_body[closer] == '\\') {++closer;}    // Skip the escaped byte
michael@0:       ++closer;
michael@0:     }
michael@0:     at = closer;                    // Loop increment then steps past it
michael@0:   }
michael@0:   return -1;              // nothing found
michael@0: }
michael@0: 
michael@0: // Test whether utf8_body[min_pos..pos) ends with string s, ignoring case and
michael@0: // ignoring spaces just before pos.
michael@0: // Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f]
michael@0: // Case is folded cheaply by OR-ing 0x20 into each body byte, so control
michael@0: // codes will masquerade as 20..3f
michael@0: bool FindBefore(const char* utf8_body,
michael@0:                  int32 min_pos, int32 pos, const char* s) {
michael@0:   const int needle_len = strlen(s);
michael@0:   if ((pos - min_pos) < needle_len) {return false;}     // Too small to fit s
michael@0: 
michael@0:   // Back up over trailing spaces, always leaving room for s
michael@0:   const int lowest_end = min_pos + needle_len;
michael@0:   int end = pos;
michael@0:   while ((end > lowest_end) && (utf8_body[end - 1] == ' ')) {--end;}
michael@0: 
michael@0:   const int start = end - needle_len;
michael@0:   if (start < min_pos) {return false;}   // Guard: s would begin before min_pos
michael@0: 
michael@0:   for (int k = 0; k < needle_len; ++k) {
michael@0:     if ((utf8_body[start + k] | 0x20) != s[k]) {return false;}  // Unequal byte
michael@0:   }
michael@0:   return true;                           // All bytes equal at start
michael@0: }
michael@0: 
michael@0: // Test whether utf8_body[pos..max_pos) begins with string s, ignoring case
michael@0: // and ignoring leading spaces, quotes and apostrophes.
michael@0: // Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f]
michael@0: // Case is folded cheaply by OR-ing 0x20 into each body byte, so control
michael@0: // codes will masquerade as 20..3f
michael@0: // Allows but does not require quoted/apostrophe string
michael@0: bool FindAfter(const char* utf8_body,
michael@0:                  int32 pos, int32 max_pos, const char* s) {
michael@0:   const int needle_len = strlen(s);
michael@0:   if ((max_pos - pos) < needle_len) {return false;}     // Too small to fit s
michael@0: 
michael@0:   // Step over leading spaces, quote, apostrophe, always leaving room for s
michael@0:   const int last_start = max_pos - needle_len;
michael@0:   int start = pos;
michael@0:   for (; start < last_start; ++start) {
michael@0:     const unsigned char c = utf8_body[start];
michael@0:     if ((c != ' ') && (c != '"') && (c != '\'')) {break;}
michael@0:   }
michael@0: 
michael@0:   for (int k = 0; k < needle_len; ++k) {
michael@0:     if ((utf8_body[start + k] | 0x20) != s[k]) {return false;}  // Unequal byte
michael@0:   }
michael@0:   return true;                                     // All bytes equal
michael@0: }
michael@0: 
michael@0: 
michael@0: 
michael@0: // Normalize the attribute value in utf8_body[pos..max_pos)
michael@0: // pos is just after an opening quote/apostrophe and max_pos is the ending one
michael@0: // String must all be on a single line.
michael@0: // Runs every byte through the kLangCodeAction state machine, which decides
michael@0: // per byte whether to emit kLangCodeRemap[byte], emit a comma, or emit
michael@0: // nothing.
michael@0: // Return slightly-normalized language list, empty or ending in comma
michael@0: string CopyOneQuotedString(const char* utf8_body,
michael@0:                          int32 pos, int32 max_pos) {
michael@0:   string out;
michael@0:   int state = 1;        // Front is logically just after a comma
michael@0:   for (int at = pos; at < max_pos; ++at) {
michael@0:     const unsigned char byte = utf8_body[at];
michael@0:     // Select the 3-bit action that belongs to the current state
michael@0:     const int action = kLangCodeAction[byte] >> (3 * state);
michael@0:     state = action & 3;                 // Low two bits: next state
michael@0:     if ((action & 4) == 0) {continue;}  // Copy bit clear: emit nothing
michael@0: 
michael@0:     // Entering state 0 emits the remapped byte; anything else emits a comma
michael@0:     char emitted = ',';
michael@0:     if (state == 0) {emitted = kLangCodeRemap[byte];}
michael@0:     out.push_back(emitted);
michael@0:   }
michael@0: 
michael@0:   // Input ended inside a language code: terminate it with a comma
michael@0:   if (state == 0) {out.push_back(',');}
michael@0:   return out;
michael@0: }
michael@0: 
michael@0: // Find the quoted string in utf8_body[pos..max_pos) and copy its contents
michael@0: // Returns an empty string if FindQuoteStart() or FindQuoteEnd() fails.
michael@0: // Return slightly-normalized language list, empty or ending in comma
michael@0: string CopyQuotedString(const char* utf8_body,
michael@0:                          int32 pos, int32 max_pos) {
michael@0:   const int32 opening = FindQuoteStart(utf8_body, pos, max_pos);
michael@0:   if (opening < 0) {return string("");}
michael@0: 
michael@0:   const int32 contents_start = opening + 1;
michael@0:   const int32 closing = FindQuoteEnd(utf8_body, contents_start, max_pos);
michael@0:   if (closing < 0) {return string("");}
michael@0: 
michael@0:   return CopyOneQuotedString(utf8_body, contents_start, closing);
michael@0: }
michael@0: 
michael@0: // Add hints to vector of langpriors
michael@0: // Input is from GetLangTagsFromHtml(), already lowercased
michael@0: // Does nothing if langtags is empty or has more than 4 commas.
michael@0: // Each tag is tried whole against kCLDLangTagsHintTable1; on a miss it is
michael@0: // cut at its first hyphen and, if at most 3 characters remain, tried against
michael@0: // kCLDLangTagsHintTable2. Tags longer than 16 characters are skipped.
michael@0: // Both priors of a matching entry are merged with MergeCLDLangPriorsMax().
michael@0: void SetCLDLangTagsHint(const string& langtags, CLDLangPriors* langpriors) {
michael@0:   if (langtags.empty()) {return;}
michael@0:   if (CountCommas(langtags) > 4) {return;}    // Too many language tags
michael@0: 
michael@0:   char temp[20];
michael@0:   // Positions stay in string::size_type so the result of find() can be
michael@0:   // compared with string::npos without a narrowing round trip through int.
michael@0:   string::size_type pos = 0;
michael@0:   while (pos < langtags.size()) {
michael@0:     string::size_type comma = langtags.find(',', pos);
michael@0:     if (comma == string::npos) {comma = langtags.size();} // fake trailing comma
michael@0:     const string::size_type len = comma - pos;
michael@0:     if (len <= 16) {
michael@0:       // Short enough to use; temp[20] leaves room for the terminating NUL
michael@0:       memcpy(temp, langtags.data() + pos, len);
michael@0:       temp[len] = '\0';
michael@0:       const LangTagLookup* entry = DoLangTagLookup(temp,
michael@0:                                                    kCLDLangTagsHintTable1,
michael@0:                                                    kCLDTable1Size);
michael@0:       if (entry == NULL) {
michael@0:         // Try second table with language code truncated at first hyphen
michael@0:         char* hyphen = strchr(temp, '-');
michael@0:         if (hyphen != NULL) {*hyphen = '\0';}
michael@0:         if (strlen(temp) <= 3) {        // Short enough to use
michael@0:           entry = DoLangTagLookup(temp,
michael@0:                                   kCLDLangTagsHintTable2,
michael@0:                                   kCLDTable2Size);
michael@0:         }
michael@0:       }
michael@0:       if (entry != NULL) {
michael@0:         // Hit in either table
michael@0:         MergeCLDLangPriorsMax(entry->onelangprior1, langpriors);
michael@0:         MergeCLDLangPriorsMax(entry->onelangprior2, langpriors);
michael@0:       }
michael@0:     }
michael@0:     pos = comma + 1;
michael@0:   }
michael@0: }
michael@0: 
michael@0: // Add hints to vector of langpriors
michael@0: // Input is string after HTTP header Content-Language:
michael@0: void SetCLDContentLangHint(const char* contentlang, CLDLangPriors* langpriors) {
michael@0:   string langtags = CopyOneQuotedString(contentlang, 0, strlen(contentlang));
michael@0:   SetCLDLangTagsHint(langtags, langpriors);
michael@0: }
michael@0: 
michael@0: // Add hints to vector of langpriors
michael@0: // Input is last element of hostname (no dot), e.g. from GetTLD()
michael@0: void SetCLDTLDHint(const char* tld, CLDLangPriors* langpriors) {
michael@0:   int len = strlen(tld);
michael@0:   if (len > 3) {return;}        // Ignore if more than three letters
michael@0:   char local_tld[4];
michael@0:   strncpy(local_tld, tld, 4);
michael@0:   local_tld[3] = '\0';          // Safety move
michael@0:   // Lowercase
michael@0:   for (int i = 0; i < len; ++i) {local_tld[i] |= 0x20;}
michael@0:   const TLDLookup* entry = DoTLDLookup(local_tld,
michael@0:                                        kCLDTLDHintTable,
michael@0:                                        kCLDTable3Size);
michael@0:   if (entry != NULL) {
michael@0:     // Table hit
michael@0:     MergeCLDLangPriorsBoost(entry->onelangprior1, langpriors);
michael@0:     MergeCLDLangPriorsBoost(entry->onelangprior2, langpriors);
michael@0:   }
michael@0: }
michael@0: 
michael@0: // Add hints to vector of langpriors
michael@0: // Input is from DetectEncoding()
michael@0: //
michael@0: // Encodings listed below imply one language, which is boosted with weight
michael@0: // kCLDPriorEncodingWeight. Any other encoding leaves langpriors untouched.
michael@0: void SetCLDEncodingHint(Encoding enc, CLDLangPriors* langpriors) {
michael@0:   Language implied;
michael@0:   switch (enc) {
michael@0:   case CHINESE_GB:
michael@0:   case GBK:
michael@0:   case GB18030:
michael@0:   case ISO_2022_CN:
michael@0:   case HZ_GB_2312:
michael@0:     implied = CHINESE;
michael@0:     break;
michael@0:   case CHINESE_BIG5:
michael@0:   case CHINESE_BIG5_CP950:
michael@0:   case BIG5_HKSCS:
michael@0:     implied = CHINESE_T;
michael@0:     break;
michael@0:   case JAPANESE_EUC_JP:
michael@0:   case JAPANESE_SHIFT_JIS:
michael@0:   case JAPANESE_CP932:
michael@0:   case JAPANESE_JIS:          // ISO-2022-JP
michael@0:     implied = JAPANESE;
michael@0:     break;
michael@0:   case KOREAN_EUC_KR:
michael@0:   case ISO_2022_KR:
michael@0:     implied = KOREAN;
michael@0:     break;
michael@0: 
michael@0:   default:
michael@0:     return;                   // Encoding says nothing about the language
michael@0:   }
michael@0: 
michael@0:   // Every listed encoding gets the same encoding-strength boost
michael@0:   MergeCLDLangPriorsBoost(
michael@0:       PackCLDPriorLangWeight(implied, kCLDPriorEncodingWeight), langpriors);
michael@0: }
michael@0: 
michael@0: // Add hints to vector of langpriors
michael@0: // Input is from random source
michael@0: //
michael@0: // Boosts lang in langpriors with weight kCLDPriorLanguageWeight.
michael@0: void SetCLDLanguageHint(Language lang, CLDLangPriors* langpriors) {
michael@0:   MergeCLDLangPriorsBoost(
michael@0:       PackCLDPriorLangWeight(lang, kCLDPriorLanguageWeight), langpriors);
michael@0: }
michael@0: 
michael@0: 
michael@0: // Make printable string of priors
michael@0: // Each of the langpriors->n entries is rendered as "<language code>.<weight> "
michael@0: // and the pieces are concatenated in order.
michael@0: string DumpCLDLangPriors(const CLDLangPriors* langpriors) {
michael@0:   string printable;
michael@0:   char piece[64];               // Scratch for one "code.weight " item
michael@0:   int idx = 0;
michael@0:   while (idx < langpriors->n) {
michael@0:     sprintf(piece, "%s.%d ",
michael@0:             LanguageCode(GetCLDPriorLang(langpriors->prior[idx])),
michael@0:             GetCLDPriorWeight(langpriors->prior[idx]));
michael@0:     printable += piece;
michael@0:     ++idx;
michael@0:   }
michael@0:   return printable;
michael@0: }
michael@0: 
michael@0: 
michael@0: 
michael@0: 
michael@0: // Look for
michael@0: //  <html lang="en">
michael@0: //  <doc xml:lang="en">
michael@0: //  <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en-US">
michael@0: //  <meta http-equiv="content-language" content="en-GB" />
michael@0: //  <meta name="language" content="Srpski">
michael@0: //  <meta name="DC.language" scheme="RFC1766" content="en">
michael@0: //  <SPAN id="msg1" class="info" lang='en'>
michael@0: //
michael@0: // Do not trigger on
michael@0: //  <!-- lang=french ...-->
michael@0: //  <font lang=postscript ...>
michael@0: //  <link href="index.fr.html" hreflang="fr-FR" xml:lang="fr-FR" />
michael@0: //  <META name="Author" lang="fr" content="Arnaud Le Hors">
michael@0: //
michael@0: // Stop fairly quickly on mismatched quotes
michael@0: //
michael@0: // Allowed language characters
michael@0: //  a-z A-Z -_ , space\t
michael@0: // Think about: GB2312, big5, shift-jis, euc-jp, ksc euc-kr
michael@0: //  zh-hans zh-TW cmn-Hani zh_cn.gb18030_CN zh-min-nan zh-yue
michael@0: //  de-x-mtfrom-en  zh-tw-x-mtfrom-en  (machine translation)
michael@0: // GB2312 => gb
michael@0: // Big5 => big
michael@0: // zh_CN.gb18030_C => zh-cn
michael@0: //
michael@0: // Remove duplicates and extra spaces as we go
michael@0: // Lowercase as we go.
michael@0: 
michael@0: // Get language tag hints from HTML body
michael@0: // Normalize: remove spaces and make lowercase comma list
michael@0: //
michael@0: // Walks the tags found in the first min(max_scan_bytes, utf8_body_len) bytes
michael@0: // of utf8_body. Within each tag, every attribute named lang or ...:lang
michael@0: // contributes its quoted value; inside a <meta> tag, content= contributes
michael@0: // too once an http-equiv/name attribute marking the tag as a language
michael@0: // declaration has been seen. Values already present in the result are not
michael@0: // appended again.
michael@0: // NOTE(review): lowercasing and the trailing comma presumably come from
michael@0: // CopyQuotedString(), which is defined elsewhere -- confirm.
michael@0: string GetLangTagsFromHtml(const char* utf8_body, int32 utf8_body_len,
michael@0:                            int32 max_scan_bytes) {
michael@0:   // Tag openings whose attributes are never inspected:
michael@0:   //  <!--...>  <font ...>  <script ...>  <link ...>  <img ...>  <a ...>
michael@0:   static const char* const kIgnoredTags[] = {
michael@0:     "!--", "font ", "script ", "link ", "img ", "a "
michael@0:   };
michael@0:   static const int kIgnoredTagCount =
michael@0:       sizeof(kIgnoredTags) / sizeof(kIgnoredTags[0]);
michael@0: 
michael@0:   // Never scan past the end of the body
michael@0:   const int32 scan_limit =
michael@0:       (max_scan_bytes > utf8_body_len) ? utf8_body_len : max_scan_bytes;
michael@0: 
michael@0:   string taglist;
michael@0:   int32 cursor = 0;
michael@0:   while (cursor < scan_limit) {
michael@0:     const int32 tag_open = FindTagStart(utf8_body, cursor, scan_limit);
michael@0:     if (tag_open < 0) {break;}
michael@0:     // FindTagEnd exits on < > &
michael@0:     const int32 tag_close = FindTagEnd(utf8_body, tag_open + 1, scan_limit);
michael@0:     if (tag_close < 0) {break;}
michael@0: 
michael@0:     const int32 tag_body = tag_open + 1;
michael@0:     cursor = tag_close + 1;     // Next round resumes just past this tag
michael@0: 
michael@0:     // Stop testing at the first ignored-tag match
michael@0:     bool ignored = false;
michael@0:     for (int t = 0; !ignored && (t < kIgnoredTagCount); ++t) {
michael@0:       ignored = FindAfter(utf8_body, tag_body, tag_close, kIgnoredTags[t]);
michael@0:     }
michael@0:     if (ignored) {continue;}
michael@0: 
michael@0:     // Remember <meta ...>
michael@0:     const bool in_meta = FindAfter(utf8_body, tag_body, tag_close, "meta ");
michael@0: 
michael@0:     // Visit each equal sign inside the tag, left to right
michael@0:     bool content_is_lang = false;
michael@0:     int32 attr_from = tag_body;
michael@0:     for (;;) {
michael@0:       // eq exits on < > &
michael@0:       const int32 eq = FindEqualSign(utf8_body, attr_from, tag_close);
michael@0:       if (eq < 0) {break;}
michael@0:       const int32 value_at = eq + 1;
michael@0: 
michael@0:       // Look inside a meta tag
michael@0:       // <meta ... http-equiv="content-language" ...>
michael@0:       // <meta ... name="language" ...>
michael@0:       // <meta ... name="dc.language" ...>
michael@0:       if (in_meta) {
michael@0:         const bool http_equiv_lang =
michael@0:             FindBefore(utf8_body, attr_from, eq, " http-equiv") &&
michael@0:             FindAfter(utf8_body, value_at, tag_close, "content-language ");
michael@0:         const bool name_lang =
michael@0:             !http_equiv_lang &&
michael@0:             FindBefore(utf8_body, attr_from, eq, " name") &&
michael@0:             (FindAfter(utf8_body, value_at, tag_close, "dc.language ") ||
michael@0:              FindAfter(utf8_body, value_at, tag_close, "language "));
michael@0:         if (http_equiv_lang || name_lang) {content_is_lang = true;}
michael@0:       }
michael@0: 
michael@0:       // Look inside any tag
michael@0:       // <meta ... content="lang-list" ...>
michael@0:       // <... lang="lang-list" ...>
michael@0:       // <... xml:lang="lang-list" ...>
michael@0:       const bool holds_lang_list =
michael@0:           (content_is_lang &&
michael@0:            FindBefore(utf8_body, attr_from, eq, " content")) ||
michael@0:           FindBefore(utf8_body, attr_from, eq, " lang") ||
michael@0:           FindBefore(utf8_body, attr_from, eq, ":lang");
michael@0:       if (holds_lang_list) {
michael@0:         const string found = CopyQuotedString(utf8_body, value_at, tag_close);
michael@0: 
michael@0:         // Append new lang tag(s) if not a duplicate
michael@0:         if (!found.empty() && (taglist.find(found) == string::npos)) {
michael@0:           taglist.append(found);
michael@0:         }
michael@0:       }
michael@0: 
michael@0:       attr_from = value_at;
michael@0:     }
michael@0:   }
michael@0: 
michael@0:   // Strip last comma
michael@0:   if (taglist.size() > 1) {
michael@0:     taglist.resize(taglist.size() - 1);
michael@0:   }
michael@0:   return taglist;
michael@0: }
michael@0: 
michael@0: }       // End namespace CLD2
michael@0: 
michael@0: //==============================================================================
michael@0: 
michael@0: