michael@0: // Copyright 2013 Google Inc. All Rights Reserved. michael@0: // michael@0: // Licensed under the Apache License, Version 2.0 (the "License"); michael@0: // you may not use this file except in compliance with the License. michael@0: // You may obtain a copy of the License at michael@0: // michael@0: // http://www.apache.org/licenses/LICENSE-2.0 michael@0: // michael@0: // Unless required by applicable law or agreed to in writing, software michael@0: // distributed under the License is distributed on an "AS IS" BASIS, michael@0: // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. michael@0: // See the License for the specific language governing permissions and michael@0: // limitations under the License. michael@0: michael@0: // michael@0: // Author: dsites@google.com (Dick Sites) michael@0: // michael@0: michael@0: #include "compact_lang_det_hint_code.h" michael@0: michael@0: #include // for abs() michael@0: #include // for sprintf() michael@0: #include // michael@0: #include "lang_script.h" michael@0: #include "port.h" michael@0: michael@0: using namespace std; michael@0: michael@0: namespace CLD2 { michael@0: michael@0: static const int kCLDPriorEncodingWeight = 4; // 100x more likely michael@0: static const int kCLDPriorLanguageWeight = 8; // 10000x more likely michael@0: michael@0: michael@0: // Tables to map lang="..." language code lists to actual languages. michael@0: // based on scraping and hand-edits, dsites June 2011 michael@0: michael@0: // n = f(string, &a) gives list of n<=4 language pairs: primary, secondary michael@0: michael@0: // For close pairs like ms/id, more weight on TLD and lang= michael@0: // Alternately, weaker boost but mark others of set as negative; michael@0: // makes "neither" an easier result. michael@0: // lang=en low weight 4 michael@0: // tld=lu boost lu maybe 4. 
but lang= alwyas overcomes tld and encoding michael@0: // (except maybe en) michael@0: michael@0: // TLD to separate, e.g., burundi from rwanda michael@0: michael@0: // Encoding lookup: OneLangProb array michael@0: // TLD lookup: tld OneLangProb pairs michael@0: michael@0: michael@0: typedef struct { michael@0: const char* const langtag; // Lowercased, hyphen only lookup key michael@0: const char* const langcode; // Canonical language codes; two if ambiguous michael@0: OneCLDLangPrior onelangprior1; michael@0: OneCLDLangPrior onelangprior2; michael@0: } LangTagLookup; michael@0: michael@0: typedef struct { michael@0: const char* const tld; // Lowercased, hyphen only lookup key michael@0: OneCLDLangPrior onelangprior1; michael@0: OneCLDLangPrior onelangprior2; michael@0: } TLDLookup; michael@0: michael@0: michael@0: #define W2 (2 << 10) // 3**2 = 10x more likely michael@0: #define W4 (4 << 10) // 3**4 = 100x more likely michael@0: #define W6 (6 << 10) // 3**6 = 1000x more likely michael@0: #define W8 (8 << 10) // 3**8 = 10K x more likely michael@0: #define W10 (10 << 10) // 3**10 = 100K x more likely michael@0: #define W12 (12 << 10) // 3**12 = 1M x more likely michael@0: michael@0: // TODO: more about ba hr sr sr-ME and sl michael@0: // Temporary state of affairs: michael@0: // BOSNIAN CROATIAN MONTENEGRIN SERBIAN detecting just CROATIAN SERBIAN michael@0: // Eventually, we want to do all four, but it requires a CLD change to handle michael@0: // up to six languages per quadgram. michael@0: michael@0: michael@0: // Close pairs boost one of pair, demote other. 
michael@0: // Statistically close pairs: michael@0: // INDONESIAN/MALAY difficult to distinguish -- extra word-based lookups used michael@0: // michael@0: // INDONESIAN MALAY coef=0.4698 Problematic w/o extra words michael@0: // TIBETAN DZONGKHA coef=0.4571 michael@0: // CZECH SLOVAK coef=0.4273 michael@0: // NORWEGIAN NORWEGIAN_N coef=0.4182 michael@0: // michael@0: // HINDI MARATHI coef=0.3795 michael@0: // ZULU XHOSA coef=0.3716 michael@0: // michael@0: // DANISH NORWEGIAN coef=0.3672 Usually OK michael@0: // BIHARI HINDI coef=0.3668 Usually OK michael@0: // ICELANDIC FAROESE coef=0.3519 Usually OK michael@0: michael@0: // michael@0: // Table to look up lang= tags longer than three characters michael@0: // Overrides table below, which is truncated at first hyphen michael@0: // In alphabetical order for binary search michael@0: static const int kCLDTable1Size = 213; michael@0: static const LangTagLookup kCLDLangTagsHintTable1[kCLDTable1Size] = { michael@0: {"abkhazian", "ab", ABKHAZIAN + W10, 0}, michael@0: {"afar", "aa", AFAR + W10, 0}, michael@0: {"afrikaans", "af", AFRIKAANS + W10, 0}, michael@0: {"akan", "ak", AKAN + W10, 0}, michael@0: {"albanian", "sq", ALBANIAN + W10, 0}, michael@0: {"am-am", "hy", ARMENIAN + W10, 0}, // 1:2 Armenian, not ambiguous michael@0: {"amharic", "am", AMHARIC + W10, 0}, michael@0: {"arabic", "ar", ARABIC + W10, 0}, michael@0: {"argentina", "es", SPANISH + W10, 0}, michael@0: {"armenian", "hy", ARMENIAN + W10, 0}, michael@0: {"assamese", "as", ASSAMESE + W10, 0}, michael@0: {"aymara", "ay", AYMARA + W10, 0}, michael@0: {"azerbaijani", "az", AZERBAIJANI + W10, 0}, michael@0: michael@0: {"bangla", "bn", BENGALI + W10, 0}, michael@0: {"bashkir", "ba", BASHKIR + W10, 0}, michael@0: {"basque", "eu", BASQUE + W10, 0}, michael@0: {"belarusian", "be", BELARUSIAN + W10, 0}, michael@0: {"bengali", "bn", BENGALI + W10, 0}, michael@0: {"bihari", "bh", BIHARI + W10, HINDI - W4}, michael@0: {"bislama", "bi", BISLAMA + W10, 0}, michael@0: 
{"bosnian", "bs", BOSNIAN + W10, 0}, // Bosnian => Bosnian michael@0: {"br-br", "pt", PORTUGUESE + W10, 0}, // 1:2 Portuguese, not ambiguous michael@0: {"br-fr", "br", BRETON + W10, 0}, // 1:2 Breton, not ambiguous michael@0: {"breton", "br", BRETON + W10, 0}, michael@0: {"bulgarian", "bg", BULGARIAN + W10, 0}, michael@0: {"burmese", "my", BURMESE + W10, 0}, // Myanmar michael@0: michael@0: {"catalan", "ca", CATALAN + W10, 0}, michael@0: {"cherokee", "chr", CHEROKEE + W10, 0}, michael@0: {"chichewa", "ny", NYANJA + W10, 0}, michael@0: michael@0: {"chinese", "zh", CHINESE + W10, 0}, michael@0: {"chinese-t", "zhT", CHINESE_T + W10, 0}, michael@0: {"chineset", "zhT", CHINESE_T + W10, 0}, michael@0: {"corsican", "co", CORSICAN + W10, 0}, michael@0: {"cpf-hat", "ht", HAITIAN_CREOLE + W10, 0}, // Creole, French-based michael@0: {"croatian", "hr", CROATIAN + W10, 0}, michael@0: {"czech", "cs", CZECH + W10, SLOVAK - W4}, michael@0: michael@0: {"danish", "da", DANISH + W10, NORWEGIAN - W4}, michael@0: {"deutsch", "de", GERMAN + W10, 0}, michael@0: {"dhivehi", "dv", DHIVEHI + W10, 0}, michael@0: {"dutch", "nl", DUTCH + W10, 0}, michael@0: {"dzongkha", "dz", DZONGKHA + W10, TIBETAN - W4}, michael@0: michael@0: {"ell-gr", "el", GREEK + W10, 0}, michael@0: {"english", "en", ENGLISH + W4, 0}, michael@0: {"esperanto", "eo", ESPERANTO + W10, 0}, michael@0: {"estonian", "et", ESTONIAN + W10, 0}, michael@0: {"euc-jp", "ja", JAPANESE + W10, 0}, // Japanese encoding michael@0: {"euc-kr", "ko", KOREAN + W10, 0}, // Korean encoding michael@0: michael@0: {"faroese", "fo", FAROESE + W10, ICELANDIC - W4}, michael@0: {"fijian", "fj", FIJIAN + W10, 0}, michael@0: {"finnish", "fi", FINNISH + W10, 0}, michael@0: {"fran", "fr", FRENCH + W10, 0}, // Truncated at non-ASCII michael@0: {"francais", "fr", FRENCH + W10, 0}, michael@0: {"french", "fr", FRENCH + W10, 0}, michael@0: {"frisian", "fy", FRISIAN + W10, 0}, michael@0: michael@0: {"ga-es", "gl", GALICIAN + W10, 0}, // 1:2 Galician, not 
ambiguous michael@0: {"galician", "gl", GALICIAN + W10, 0}, michael@0: {"ganda", "lg", GANDA + W10, 0}, michael@0: {"georgian", "ka", GEORGIAN + W10, 0}, michael@0: {"german", "de", GERMAN + W10, 0}, michael@0: {"greek", "el", GREEK + W10, 0}, michael@0: {"greenlandic", "kl", GREENLANDIC + W10, 0}, michael@0: {"guarani", "gn", GUARANI + W10, 0}, michael@0: {"gujarati", "gu", GUJARATI + W10, 0}, michael@0: michael@0: {"haitian_creole", "ht", HAITIAN_CREOLE + W10, 0}, michael@0: {"hausa", "ha", HAUSA + W10, 0}, michael@0: {"hawaiian", "haw", HAWAIIAN + W10, 0}, michael@0: {"hebrew", "iw", HEBREW + W10, 0}, michael@0: {"hindi", "hi", HINDI + W10, MARATHI - W4}, michael@0: {"hn-in", "hi", HINDI + W10, MARATHI - W4}, michael@0: {"hungarian", "hu", HUNGARIAN + W10, 0}, michael@0: michael@0: {"icelandic", "is", ICELANDIC + W10, FAROESE - W4}, michael@0: {"igbo", "ig", IGBO + W10, 0}, michael@0: {"indonesian", "id", INDONESIAN + W10, MALAY - W4}, michael@0: {"interlingua", "ia", INTERLINGUA + W10, 0}, michael@0: {"interlingue", "ie", INTERLINGUE + W10, 0}, michael@0: // 1:2 iu-Cans ik-Latn michael@0: {"inuktitut", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2 michael@0: {"inupiak", "ik,iu", INUPIAK + W10, INUKTITUT + W10}, // 1:2 michael@0: {"ir-ie", "ga", IRISH + W10, 0}, // Irish michael@0: {"irish", "ga", IRISH + W10, 0}, michael@0: {"italian", "it", ITALIAN + W10, 0}, michael@0: michael@0: {"ja-euc", "ja", JAPANESE + W10, 0}, // Japanese encoding michael@0: {"jan-jp", "ja", JAPANESE + W10, 0}, // Japanese encoding michael@0: {"japanese", "ja", JAPANESE + W10, 0}, michael@0: {"javanese", "jw", JAVANESE + W10, 0}, michael@0: michael@0: {"kannada", "kn", KANNADA + W10, 0}, michael@0: {"kashmiri", "ks", KASHMIRI + W10, 0}, michael@0: {"kazakh", "kk", KAZAKH + W10, 0}, michael@0: {"khasi", "kha", KHASI + W10, 0}, michael@0: {"khmer", "km", KHMER + W10, 0}, michael@0: {"kinyarwanda", "rw", KINYARWANDA + W10, 0}, michael@0: {"klingon", "tlh", X_KLINGON + W10, 0}, 
michael@0: {"korean", "ko", KOREAN + W10, 0}, michael@0: {"kurdish", "ku", KURDISH + W10, 0}, michael@0: {"kyrgyz", "ky", KYRGYZ + W10, 0}, michael@0: michael@0: {"laothian", "lo", LAOTHIAN + W10, 0}, michael@0: {"latin", "la", LATIN + W10, 0}, michael@0: {"latvian", "lv", LATVIAN + W10, 0}, michael@0: {"limbu", "sit", LIMBU + W10, 0}, michael@0: {"lingala", "ln", LINGALA + W10, 0}, michael@0: {"lithuanian", "lt", LITHUANIAN + W10, 0}, michael@0: {"luxembourgish", "lb", LUXEMBOURGISH + W10, 0}, michael@0: michael@0: {"macedonian", "mk", MACEDONIAN + W10, 0}, michael@0: {"malagasy", "mg", MALAGASY + W10, 0}, michael@0: {"malay", "ms", MALAY + W10, INDONESIAN - W4}, michael@0: {"malayalam", "ml", MALAYALAM + W10, 0}, michael@0: {"maltese", "mt", MALTESE + W10, 0}, michael@0: {"manx", "gv", MANX + W10, 0}, michael@0: {"maori", "mi", MAORI + W10, 0}, michael@0: {"marathi", "mr", MARATHI + W10, HINDI - W4}, michael@0: {"mauritian_creole", "mfe", MAURITIAN_CREOLE + W10, 0}, michael@0: {"moldavian", "mo", ROMANIAN + W10, 0}, michael@0: {"mongolian", "mn", MONGOLIAN + W10, 0}, michael@0: {"montenegrin", "sr-me", MONTENEGRIN + W10, 0}, michael@0: {"myanmar", "my", BURMESE + W10, 0}, // Myanmar michael@0: {"nauru", "na", NAURU + W10, 0}, michael@0: {"ndebele", "nr", NDEBELE + W10, 0}, michael@0: {"nepali", "ne", NEPALI + W10, 0}, michael@0: {"no-bok", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, // Bokmaal michael@0: {"no-bokmaal", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, michael@0: {"no-nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, // Bokmaal michael@0: {"no-no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, michael@0: {"no-nyn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, // Nynorsk michael@0: {"no-nynorsk", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, michael@0: {"norwegian", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, michael@0: {"norwegian_n", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, michael@0: {"nyanja", "ny", NYANJA + W10, 0}, michael@0: michael@0: {"occitan", "oc", OCCITAN + 
W10, 0}, michael@0: {"oriya", "or", ORIYA + W10, 0}, michael@0: {"oromo", "om", OROMO + W10, 0}, michael@0: {"parsi", "fa", PERSIAN + W10, 0}, michael@0: michael@0: {"pashto", "ps", PASHTO + W10, 0}, michael@0: {"pedi", "nso", PEDI + W10, 0}, michael@0: {"persian", "fa", PERSIAN + W10, 0}, michael@0: {"polish", "pl", POLISH + W10, 0}, michael@0: {"polska", "pl", POLISH + W10, 0}, michael@0: {"polski", "pl", POLISH + W10, 0}, michael@0: {"portugu", "pt", PORTUGUESE + W10, 0}, // Truncated at non-ASCII michael@0: {"portuguese", "pt", PORTUGUESE + W10, 0}, michael@0: {"punjabi", "pa", PUNJABI + W10, 0}, michael@0: michael@0: {"quechua", "qu", QUECHUA + W10, 0}, michael@0: michael@0: {"rhaeto_romance", "rm", RHAETO_ROMANCE + W10, 0}, michael@0: {"romanian", "ro", ROMANIAN + W10, 0}, michael@0: {"rundi", "rn", RUNDI + W10, 0}, michael@0: {"russian", "ru", RUSSIAN + W10, 0}, michael@0: michael@0: {"samoan", "sm", SAMOAN + W10, 0}, michael@0: {"sango", "sg", SANGO + W10, 0}, michael@0: {"sanskrit", "sa", SANSKRIT + W10, 0}, michael@0: {"scots", "sco", SCOTS + W10, ENGLISH - W4}, michael@0: {"scots_gaelic", "gd", SCOTS_GAELIC + W10, 0}, michael@0: {"serbian", "sr", SERBIAN + W10, 0}, michael@0: {"seselwa", "crs", SESELWA + W10, 0}, michael@0: {"sesotho", "st", SESOTHO + W10, 0}, michael@0: {"shift-jis", "ja", JAPANESE + W10, 0}, // Japanese encoding michael@0: {"shift-js", "ja", JAPANESE + W10, 0}, // Japanese encoding michael@0: {"shona", "sn", SHONA + W10, 0}, michael@0: {"si-lk", "si", SINHALESE + W10, 0}, // 1:2 Sri Lanka, not ambiguous michael@0: {"si-si", "sl", SLOVENIAN + W10, 0}, // 1:2 Slovenia, not ambiguous michael@0: {"si-sl", "sl", SLOVENIAN + W10, 0}, // 1:2 Slovenia, not ambiguous michael@0: {"sindhi", "sd", SINDHI + W10, 0}, michael@0: {"sinhalese", "si", SINHALESE + W10, 0}, michael@0: {"siswant", "ss", SISWANT + W10, 0}, michael@0: {"sit-np", "sit", LIMBU + W10, 0}, michael@0: {"slovak", "sk", SLOVAK + W10, CZECH - W4}, michael@0: {"slovenian", "sl", 
SLOVENIAN + W10, 0}, michael@0: {"somali", "so", SOMALI + W10, 0}, michael@0: {"spanish", "es", SPANISH + W10, 0}, michael@0: {"sr-me", "sr-me", MONTENEGRIN + W10, 0}, // Montenegrin => Montenegrin michael@0: {"sundanese", "su", SUNDANESE + W10, 0}, michael@0: {"suomi", "fi", FINNISH + W10, 0}, // Finnish michael@0: {"swahili", "sw", SWAHILI + W10, 0}, michael@0: {"swedish", "sv", SWEDISH + W10, 0}, michael@0: {"syriac", "syr", SYRIAC + W10, 0}, michael@0: michael@0: {"tagalog", "tl", TAGALOG + W10, 0}, michael@0: {"tajik", "tg", TAJIK + W10, 0}, michael@0: {"tamil", "ta", TAMIL + W10, 0}, michael@0: {"tatar", "tt", TATAR + W10, 0}, michael@0: {"tb-tb", "bo", TIBETAN + W10, DZONGKHA - W4}, // Tibet michael@0: {"tchinese", "zhT", CHINESE_T + W10, 0}, michael@0: {"telugu", "te", TELUGU + W10, 0}, michael@0: {"thai", "th", THAI + W10, 0}, michael@0: {"tibetan", "bo", TIBETAN + W10, DZONGKHA - W4}, michael@0: {"tigrinya", "ti", TIGRINYA + W10, 0}, michael@0: {"tonga", "to", TONGA + W10, 0}, michael@0: {"tsonga", "ts", TSONGA + W10, 0}, michael@0: {"tswana", "tn", TSWANA + W10, 0}, michael@0: {"tt-ru", "tt", TATAR + W10, 0}, michael@0: {"tur-tr", "tr", TURKISH + W10, 0}, michael@0: {"turkish", "tr", TURKISH + W10, 0}, michael@0: {"turkmen", "tk", TURKMEN + W10, 0}, michael@0: {"uighur", "ug", UIGHUR + W10, 0}, michael@0: {"ukrainian", "uk", UKRAINIAN + W10, 0}, michael@0: {"urdu", "ur", URDU + W10, 0}, michael@0: {"uzbek", "uz", UZBEK + W10, 0}, michael@0: michael@0: {"venda", "ve", VENDA + W10, 0}, michael@0: {"vietnam", "vi", VIETNAMESE + W10, 0}, michael@0: {"vietnamese", "vi", VIETNAMESE + W10, 0}, michael@0: {"volapuk", "vo", VOLAPUK + W10, 0}, michael@0: michael@0: {"welsh", "cy", WELSH + W10, 0}, michael@0: {"wolof", "wo", WOLOF + W10, 0}, michael@0: michael@0: {"xhosa", "xh", XHOSA + W10, ZULU - W4}, michael@0: michael@0: {"yiddish", "yi", YIDDISH + W10, 0}, michael@0: {"yoruba", "yo", YORUBA + W10, 0}, michael@0: michael@0: {"zh-classical", "zhT", CHINESE_T + 
W10, 0}, michael@0: {"zh-cn", "zh", CHINESE + W10, 0}, michael@0: {"zh-hans", "zh", CHINESE + W10, 0}, michael@0: {"zh-hant", "zhT", CHINESE_T + W10, 0}, michael@0: {"zh-hk", "zhT", CHINESE_T + W10, 0}, michael@0: {"zh-min-nan", "zhT", CHINESE_T + W10, 0}, // Min Nan => ChineseT michael@0: {"zh-sg", "zhT", CHINESE_T + W10, 0}, michael@0: {"zh-tw", "zhT", CHINESE_T + W10, 0}, michael@0: {"zh-yue", "zh", CHINESE + W10, 0}, // Yue (Cantonese) => Chinese michael@0: {"zhuang", "za", ZHUANG + W10, 0}, michael@0: {"zulu", "zu", ZULU + W10, XHOSA - W4}, michael@0: }; michael@0: michael@0: michael@0: michael@0: // Table to look up lang= tags of two/three characters after truncate at hyphen michael@0: // In alphabetical order for binary search michael@0: static const int kCLDTable2Size = 257; michael@0: static const LangTagLookup kCLDLangTagsHintTable2[kCLDTable2Size] = { michael@0: {"aa", "aa", AFAR + W10, 0}, michael@0: {"ab", "ab", ABKHAZIAN + W10, 0}, michael@0: {"af", "af", AFRIKAANS + W10, 0}, michael@0: {"ak", "ak", AKAN + W10, 0}, michael@0: {"al", "sq", ALBANIAN + W10, 0}, // Albania michael@0: {"am", "am,hy", AMHARIC + W10, ARMENIAN + W10}, // 1:2 Amharic Armenian michael@0: {"ar", "ar", ARABIC + W10, 0}, michael@0: {"ara", "ar", ARABIC + W10, 0}, michael@0: {"arm", "hy", ARMENIAN + W10, 0}, // Armenia michael@0: {"arz", "ar", ARABIC + W10, 0}, // Egyptian Arabic michael@0: {"as", "as", ASSAMESE + W10, 0}, michael@0: {"at", "de", GERMAN + W10, 0}, // Austria michael@0: {"au", "de", GERMAN + W10, 0}, // Austria michael@0: {"ay", "ay", AYMARA + W10, 0}, michael@0: {"az", "az", AZERBAIJANI + W10, 0}, michael@0: {"aze", "az", AZERBAIJANI + W10, 0}, michael@0: michael@0: {"ba", "ba,bs", BASHKIR + W10, BOSNIAN + W10}, // 1:2 Bashkir Bosnia michael@0: {"be", "be", BELARUSIAN + W10, 0}, michael@0: {"bel", "be", BELARUSIAN + W10, 0}, michael@0: {"bg", "bg", BULGARIAN + W10, 0}, michael@0: {"bh", "bh", BIHARI + W10, HINDI - W4}, michael@0: {"bi", "bi", BISLAMA + W10, 0}, 
michael@0: {"big", "zhT", CHINESE_T + W10, 0}, // Big5 encoding michael@0: {"bm", "ms", MALAY + W10, INDONESIAN - W4}, // Bahasa Malaysia michael@0: {"bn", "bn", BENGALI + W10, 0}, michael@0: {"bo", "bo", TIBETAN + W10, DZONGKHA - W4}, michael@0: // 1:2 Breton, Brazil country code, both Latn .br TLD enough for pt to win michael@0: {"br", "br,pt", BRETON + W10, PORTUGUESE + W8}, // 1:2 Breton, Brazil michael@0: {"bs", "bs", BOSNIAN + W10, 0}, // Bosnian => Bosnian michael@0: michael@0: {"ca", "ca", CATALAN + W10, 0}, michael@0: {"cat", "ca", CATALAN + W10, 0}, michael@0: {"ch", "de,fr", GERMAN + W10, FRENCH + W10}, // 1:2 Switzerland michael@0: {"chn", "zh", CHINESE + W10, 0}, michael@0: {"chr", "chr", CHEROKEE + W10, 0}, michael@0: {"ckb", "ku", KURDISH + W10, 0}, // Central Kurdish michael@0: {"cn", "zh,zhT", CHINESE + W6, CHINESE_T + W4}, // Ambiguous, so weaker. michael@0: // Offset by 2 so that TLD=tw or michael@0: // enc=big5 will put zhT ahead michael@0: {"co", "co", CORSICAN + W10, 0}, michael@0: {"cro", "hr", CROATIAN + W10, 0}, // Croatia michael@0: {"crs", "crs", SESELWA + W10, 0}, michael@0: {"cs", "cs", CZECH + W10, SLOVAK - W4}, michael@0: {"ct", "ca", CATALAN + W10, 0}, michael@0: {"cy", "cy", WELSH + W10, 0}, michael@0: {"cym", "cy", WELSH + W10, 0}, michael@0: {"cz", "cs", CZECH + W10, SLOVAK - W4}, michael@0: michael@0: {"da", "da", DANISH + W10, NORWEGIAN - W4}, michael@0: {"dan", "da", DANISH + W10, NORWEGIAN - W4}, michael@0: {"de", "de", GERMAN + W10, 0}, michael@0: {"deu", "de", GERMAN + W10, 0}, michael@0: {"div", "dv", DHIVEHI + W10, 0}, michael@0: {"dk", "da", DANISH + W10, NORWEGIAN - W4}, // Denmark michael@0: {"dut", "nl", DUTCH + W10, 0}, // Dutch michael@0: {"dv", "dv", DHIVEHI + W10, 0}, michael@0: {"dz", "dz", DZONGKHA + W10, TIBETAN - W4}, michael@0: michael@0: {"ee", "et", ESTONIAN + W10, 0}, // Estonia michael@0: {"eg", "ar", ARABIC + W10, 0}, // Egypt michael@0: {"el", "el", GREEK + W10, 0}, michael@0: {"en", "en", ENGLISH + W4, 
0}, michael@0: {"eng", "en", ENGLISH + W4, 0}, michael@0: {"eo", "eo", ESPERANTO + W10, 0}, michael@0: {"er", "ur", URDU + W10, 0}, // "Erdu" michael@0: {"es", "es", SPANISH + W10, 0}, michael@0: {"esp", "es", SPANISH + W10, 0}, michael@0: {"est", "et", ESTONIAN + W10, 0}, michael@0: {"et", "et", ESTONIAN + W10, 0}, michael@0: {"eu", "eu", BASQUE + W10, 0}, michael@0: michael@0: {"fa", "fa", PERSIAN + W10, 0}, michael@0: {"far", "fa", PERSIAN + W10, 0}, michael@0: {"fi", "fi", FINNISH + W10, 0}, michael@0: {"fil", "tl", TAGALOG + W10, 0}, // Philippines michael@0: {"fj", "fj", FIJIAN + W10, 0}, michael@0: {"fo", "fo", FAROESE + W10, ICELANDIC - W4}, michael@0: {"fr", "fr", FRENCH + W10, 0}, michael@0: {"fra", "fr", FRENCH + W10, 0}, michael@0: {"fre", "fr", FRENCH + W10, 0}, michael@0: {"fy", "fy", FRISIAN + W10, 0}, michael@0: michael@0: {"ga", "ga,gl", IRISH + W10, GALICIAN + W10}, // 1:2 Irish, Galician michael@0: {"gae", "gd,ga", SCOTS_GAELIC + W10, IRISH + W10}, // 1:2 Gaelic, either michael@0: {"gal", "gl", GALICIAN + W10, 0}, michael@0: {"gb", "zh", CHINESE + W10, 0}, // GB2312 encoding michael@0: {"gbk", "zh", CHINESE + W10, 0}, // GBK encoding michael@0: {"gd", "gd", SCOTS_GAELIC + W10, 0}, michael@0: {"ge", "ka", GEORGIAN + W10, 0}, // Georgia michael@0: {"geo", "ka", GEORGIAN + W10, 0}, michael@0: {"ger", "de", GERMAN + W10, 0}, michael@0: {"gl", "gl", GALICIAN + W10, 0}, // Also Greenland; hard to confuse michael@0: {"gn", "gn", GUARANI + W10, 0}, michael@0: {"gr", "el", GREEK + W10, 0}, // Greece michael@0: {"gu", "gu", GUJARATI + W10, 0}, michael@0: {"gv", "gv", MANX + W10, 0}, michael@0: michael@0: {"ha", "ha", HAUSA + W10, 0}, michael@0: {"hat", "ht", HAITIAN_CREOLE + W10, 0}, // Haiti michael@0: {"haw", "haw", HAWAIIAN + W10, 0}, michael@0: {"hb", "iw", HEBREW + W10, 0}, michael@0: {"he", "iw", HEBREW + W10, 0}, michael@0: {"heb", "iw", HEBREW + W10, 0}, michael@0: {"hi", "hi", HINDI + W10, MARATHI - W4}, michael@0: {"hk", "zhT", CHINESE_T + W10, 
0}, // Hong Kong michael@0: {"hr", "hr", CROATIAN + W10, 0}, michael@0: {"ht", "ht", HAITIAN_CREOLE + W10, 0}, michael@0: {"hu", "hu", HUNGARIAN + W10, 0}, michael@0: {"hun", "hu", HUNGARIAN + W10, 0}, michael@0: {"hy", "hy", ARMENIAN + W10, 0}, michael@0: michael@0: {"ia", "ia", INTERLINGUA + W10, 0}, michael@0: {"ice", "is", ICELANDIC + W10, FAROESE - W4}, // Iceland michael@0: {"id", "id", INDONESIAN + W10, MALAY - W4}, michael@0: {"ids", "id", INDONESIAN + W10, MALAY - W4}, michael@0: {"ie", "ie", INTERLINGUE + W10, 0}, michael@0: {"ig", "ig", IGBO + W10, 0}, michael@0: // 1:2 iu-Cans ik-Latn michael@0: {"ik", "ik,iu", INUPIAK + W10, INUKTITUT + W10}, // 1:2 michael@0: {"in", "id", INDONESIAN + W10, MALAY - W4}, michael@0: {"ind", "id", INDONESIAN + W10, MALAY - W4}, // Indonesia michael@0: {"inu", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2 michael@0: {"is", "is", ICELANDIC + W10, FAROESE - W4}, michael@0: {"it", "it", ITALIAN + W10, 0}, michael@0: {"ita", "it", ITALIAN + W10, 0}, michael@0: {"iu", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2 michael@0: {"iw", "iw", HEBREW + W10, 0}, michael@0: michael@0: {"ja", "ja", JAPANESE + W10, 0}, michael@0: {"jp", "ja", JAPANESE + W10, 0}, // Japan michael@0: {"jpn", "ja", JAPANESE + W10, 0}, michael@0: {"jv", "jw", JAVANESE + W10, 0}, michael@0: {"jw", "jw", JAVANESE + W10, 0}, michael@0: michael@0: {"ka", "ka", GEORGIAN + W10, 0}, michael@0: {"kc", "qu", QUECHUA + W10, 0}, // (K)Quechua michael@0: {"kg", "ky", KYRGYZ + W10, 0}, // Kyrgyzstan michael@0: {"kh", "km", KHMER + W10, 0}, // Country code Khmer (Cambodia) michael@0: {"kha", "kha", KHASI + W10, 0}, michael@0: {"kk", "kk", KAZAKH + W10, 0}, // Kazakh michael@0: {"kl", "kl", GREENLANDIC + W10, 0}, michael@0: {"km", "km", KHMER + W10, 0}, michael@0: {"kn", "kn", KANNADA + W10, 0}, michael@0: {"ko", "ko", KOREAN + W10, 0}, michael@0: {"kor", "ko", KOREAN + W10, 0}, michael@0: {"kr", "ko", KOREAN + W10, 0}, // Country code Korea michael@0: {"ks", "ks", 
KASHMIRI + W10, 0}, michael@0: {"ksc", "ko", KOREAN + W10, 0}, // KSC encoding michael@0: {"ku", "ku", KURDISH + W10, 0}, michael@0: {"ky", "ky", KYRGYZ + W10, 0}, michael@0: {"kz", "kk", KAZAKH + W10, 0}, // Kazakhstan michael@0: {"la", "la", LATIN + W10, 0}, michael@0: {"lao", "lo", LAOTHIAN + W10, 0}, // Laos michael@0: michael@0: {"lb", "lb", LUXEMBOURGISH + W10, 0}, michael@0: {"lg", "lg", GANDA + W10, 0}, michael@0: {"lit", "lt", LITHUANIAN + W10, 0}, michael@0: {"ln", "ln", LINGALA + W10, 0}, michael@0: {"lo", "lo", LAOTHIAN + W10, 0}, michael@0: {"lt", "lt", LITHUANIAN + W10, 0}, michael@0: {"ltu", "lt", LITHUANIAN + W10, 0}, michael@0: {"lv", "lv", LATVIAN + W10, 0}, michael@0: michael@0: {"mfe", "mfe", MAURITIAN_CREOLE + W10, 0}, michael@0: {"mg", "mg", MALAGASY + W10, 0}, michael@0: {"mi", "mi", MAORI + W10, 0}, michael@0: {"mk", "mk", MACEDONIAN + W10, 0}, michael@0: {"ml", "ml", MALAYALAM + W10, 0}, michael@0: {"mn", "mn", MONGOLIAN + W10, 0}, michael@0: {"mo", "mo", ROMANIAN + W10, 0}, michael@0: {"mon", "mn", MONGOLIAN + W10, 0}, // Mongolian michael@0: {"mr", "mr", MARATHI + W10, HINDI - W4}, michael@0: {"ms", "ms", MALAY + W10, INDONESIAN - W4}, michael@0: {"mt", "mt", MALTESE + W10, 0}, michael@0: {"mx", "es", SPANISH + W10, 0}, // Mexico michael@0: {"my", "my,ms", BURMESE + W10, MALAY + W10}, // Myanmar, Malaysia michael@0: michael@0: {"na", "na", NAURU + W10, 0}, michael@0: {"nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, michael@0: {"ne", "ne", NEPALI + W10, 0}, michael@0: {"nl", "nl", DUTCH + W10, 0}, michael@0: {"nn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, michael@0: {"no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, michael@0: {"nr", "nr", NDEBELE + W10, 0}, michael@0: {"nso", "nso", PEDI + W10, 0}, michael@0: {"ny", "ny", NYANJA + W10, 0}, michael@0: michael@0: {"oc", "oc", OCCITAN + W10, 0}, michael@0: {"om", "om", OROMO + W10, 0}, michael@0: {"or", "or", ORIYA + W10, 0}, michael@0: michael@0: {"pa", "pa,ps", PUNJABI + W10, PASHTO + W10}, 
// 1:2 pa-Guru ps-Arab michael@0: {"per", "fa", PERSIAN + W10, 0}, michael@0: {"ph", "tl", TAGALOG + W10, 0}, // Philippines michael@0: {"pk", "ur", URDU + W10, 0}, // Pakistan michael@0: {"pl", "pl", POLISH + W10, 0}, michael@0: {"pnb", "pa", PUNJABI + W10, 0}, // Western Punjabi michael@0: {"pol", "pl", POLISH + W10, 0}, michael@0: {"por", "pt", PORTUGUESE + W10, 0}, michael@0: {"ps", "ps", PASHTO + W10, 0}, michael@0: {"pt", "pt", PORTUGUESE + W10, 0}, michael@0: {"ptg", "pt", PORTUGUESE + W10, 0}, michael@0: {"qc", "fr", FRENCH + W10, 0}, // Quebec "country" code michael@0: {"qu", "qu", QUECHUA + W10, 0}, michael@0: michael@0: {"rm", "rm", RHAETO_ROMANCE + W10, 0}, michael@0: {"rn", "rn", RUNDI + W10, 0}, michael@0: {"ro", "ro", ROMANIAN + W10, 0}, michael@0: {"rs", "sr", SERBIAN + W10, 0}, // Serbia country code michael@0: {"ru", "ru", RUSSIAN + W10, 0}, michael@0: {"rus", "ru", RUSSIAN + W10, 0}, michael@0: {"rw", "rw", KINYARWANDA + W10, 0}, michael@0: michael@0: {"sa", "sa", SANSKRIT + W10, 0}, michael@0: {"sco", "sco", SCOTS + W10, ENGLISH - W4}, michael@0: {"sd", "sd", SINDHI + W10, 0}, michael@0: {"se", "sv", SWEDISH + W10, 0}, michael@0: {"sg", "sg", SANGO + W10, 0}, michael@0: {"si", "si,sl", SINHALESE + W10, SLOVENIAN + W10}, // 1:2 Sinhalese, Slovenia michael@0: {"sk", "sk", SLOVAK + W10, CZECH - W4}, michael@0: {"sl", "sl", SLOVENIAN + W10, 0}, michael@0: {"slo", "sl", SLOVENIAN + W10, 0}, michael@0: {"sm", "sm", SAMOAN + W10, 0}, michael@0: {"sn", "sn", SHONA + W10, 0}, michael@0: {"so", "so", SOMALI + W10, 0}, michael@0: {"sp", "es", SPANISH + W10, 0}, michael@0: {"sq", "sq", ALBANIAN + W10, 0}, michael@0: {"sr", "sr", SERBIAN + W10, 0}, michael@0: {"srb", "sr", SERBIAN + W10, 0}, michael@0: {"srl", "sr", SERBIAN + W10, 0}, // Serbian Latin michael@0: {"srp", "sr", SERBIAN + W10, 0}, michael@0: {"ss", "ss", SISWANT + W10, 0}, michael@0: {"st", "st", SESOTHO + W10, 0}, michael@0: {"su", "su", SUNDANESE + W10, 0}, michael@0: {"sv", "sv", SWEDISH + 
W10, 0}, michael@0: {"sve", "sv", SWEDISH + W10, 0}, michael@0: {"sw", "sw", SWAHILI + W10, 0}, michael@0: {"swe", "sv", SWEDISH + W10, 0}, michael@0: {"sy", "syr", SYRIAC + W10, 0}, michael@0: {"syr", "syr", SYRIAC + W10, 0}, michael@0: michael@0: {"ta", "ta", TAMIL + W10, 0}, michael@0: {"te", "te", TELUGU + W10, 0}, michael@0: {"tg", "tg", TAJIK + W10, 0}, michael@0: {"th", "th", THAI + W10, 0}, michael@0: {"ti", "ti,bo", TIGRINYA + W10, TIBETAN + W10}, // 1:2 Tigrinya, Tibet michael@0: {"tj", "tg", TAJIK + W10, 0}, // Tajikistan michael@0: {"tk", "tk", TURKMEN + W10, 0}, michael@0: {"tl", "tl", TAGALOG + W10, 0}, michael@0: {"tlh", "tlh", X_KLINGON + W10, 0}, michael@0: {"tn", "tn", TSWANA + W10, 0}, michael@0: {"to", "to", TONGA + W10, 0}, michael@0: {"tr", "tr", TURKISH + W10, 0}, michael@0: {"ts", "ts", TSONGA + W10, 0}, michael@0: {"tt", "tt", TATAR + W10, 0}, michael@0: {"tw", "ak,zhT", AKAN + W10, CHINESE_T + W10}, // 1:2 Twi => Akan, Taiwan michael@0: {"twi", "ak", AKAN + W10, 0}, // Twi => Akan michael@0: michael@0: {"ua", "uk", UKRAINIAN + W10, 0}, // Ukraine michael@0: {"ug", "ug", UIGHUR + W10, 0}, michael@0: {"uk", "uk", UKRAINIAN + W10, 0}, michael@0: {"ur", "ur", URDU + W10, 0}, michael@0: {"uz", "uz", UZBEK + W10, 0}, michael@0: michael@0: {"va", "ca", CATALAN + W10, 0}, // Valencia => Catalan michael@0: {"val", "ca", CATALAN + W10, 0}, // Valencia => Catalan michael@0: {"ve", "ve", VENDA + W10, 0}, michael@0: {"vi", "vi", VIETNAMESE + W10, 0}, michael@0: {"vie", "vi", VIETNAMESE + W10, 0}, michael@0: {"vn", "vi", VIETNAMESE + W10, 0}, michael@0: {"vo", "vo", VOLAPUK + W10, 0}, michael@0: michael@0: {"wo", "wo", WOLOF + W10, 0}, michael@0: michael@0: {"xh", "xh", XHOSA + W10, ZULU - W4}, michael@0: {"xho", "xh", XHOSA + W10, ZULU - W4}, michael@0: michael@0: {"yi", "yi", YIDDISH + W10, 0}, michael@0: {"yo", "yo", YORUBA + W10, 0}, michael@0: michael@0: {"za", "za", ZHUANG + W10, 0}, michael@0: {"zh", "zh", CHINESE + W10, 0}, michael@0: {"zht", 
"zhT", CHINESE_T + W10, 0}, michael@0: {"zu", "zu", ZULU + W10, XHOSA - W4}, michael@0: }; michael@0: michael@0: michael@0: // Possibly map to tl: michael@0: // -LangTags tl-Latn /7val.com/ ,bcl 2 Central Bicolano michael@0: // -LangTags tl-Latn /7val.com/ ,ceb 6 Cebuano michael@0: // -LangTags tl-Latn /7val.com/ ,war 1 Waray michael@0: michael@0: michael@0: michael@0: // Table to look up country TLD (no general TLD) michael@0: // In alphabetical order for binary search michael@0: static const int kCLDTable3Size = 181; michael@0: static const TLDLookup kCLDTLDHintTable[kCLDTable3Size] = { michael@0: {"ac", JAPANESE + W2, 0}, michael@0: {"ad", CATALAN + W4, 0}, michael@0: {"ae", ARABIC + W4, 0}, michael@0: {"af", PASHTO + W4, PERSIAN + W4}, michael@0: {"ag", GERMAN + W2, 0}, // meager michael@0: // {"ai", 0, 0}, // meager michael@0: {"al", ALBANIAN + W4, 0}, michael@0: {"am", ARMENIAN + W4, 0}, michael@0: {"an", DUTCH + W4, 0}, // meager michael@0: {"ao", PORTUGUESE + W4, 0}, michael@0: // {"aq", 0, 0}, // meager michael@0: {"ar", SPANISH + W4, 0}, michael@0: // {"as", 0, 0}, michael@0: {"at", GERMAN + W4, 0}, michael@0: {"au", ENGLISH + W2, 0}, michael@0: {"aw", DUTCH + W4, 0}, michael@0: {"ax", SWEDISH + W4, 0}, michael@0: {"az", AZERBAIJANI + W4, 0}, michael@0: michael@0: {"ba", BOSNIAN + W8, CROATIAN - W4}, michael@0: // {"bb", 0, 0}, michael@0: {"bd", BENGALI + W4, 0}, michael@0: {"be", DUTCH + W4, FRENCH + W4}, michael@0: {"bf", FRENCH + W4, 0}, michael@0: {"bg", BULGARIAN + W4, 0}, michael@0: {"bh", ARABIC + W4, 0}, michael@0: {"bi", RUNDI + W4, FRENCH + W4}, michael@0: {"bj", FRENCH + W4, 0}, michael@0: {"bm", ENGLISH + W2, 0}, michael@0: {"bn", MALAY + W4, INDONESIAN - W4}, michael@0: {"bo", SPANISH + W4, AYMARA + W2}, // and GUARANI QUECHUA michael@0: {"br", PORTUGUESE + W4, 0}, michael@0: // {"bs", 0, 0}, michael@0: {"bt", DZONGKHA + W10, TIBETAN - W10}, // Strong presumption of Dzongha michael@0: {"bw", TSWANA + W4, 0}, michael@0: {"by", BELARUSIAN + W4, 
0}, michael@0: // {"bz", 0, 0}, michael@0: michael@0: {"ca", FRENCH + W4, ENGLISH + W2}, michael@0: {"cat", CATALAN + W4, 0}, michael@0: {"cc", 0, 0}, michael@0: {"cd", FRENCH + W4, 0}, michael@0: {"cf", FRENCH + W4, 0}, michael@0: {"cg", FRENCH + W4, 0}, michael@0: {"ch", GERMAN + W4, FRENCH + W4}, michael@0: {"ci", FRENCH + W4, 0}, michael@0: // {"ck", 0, 0}, michael@0: {"cl", SPANISH + W4, 0}, michael@0: {"cm", FRENCH + W4, 0}, michael@0: {"cn", CHINESE + W4, 0}, michael@0: {"co", SPANISH + W4, 0}, michael@0: {"cr", SPANISH + W4, 0}, michael@0: {"cu", SPANISH + W4, 0}, michael@0: {"cv", PORTUGUESE + W4, 0}, michael@0: // {"cx", 0, 0}, michael@0: {"cy", GREEK + W4, TURKISH + W4}, michael@0: {"cz", CZECH + W4, SLOVAK - W4}, michael@0: michael@0: {"de", GERMAN + W4, 0}, michael@0: {"dj", 0, 0}, michael@0: {"dk", DANISH + W4, NORWEGIAN - W4}, michael@0: {"dm", 0, 0}, michael@0: {"do", SPANISH + W4, 0}, michael@0: {"dz", FRENCH + W4, ARABIC + W4}, michael@0: michael@0: {"ec", SPANISH + W4, 0}, michael@0: {"ee", ESTONIAN + W4, 0}, michael@0: {"eg", ARABIC + W4, 0}, michael@0: {"er", AFAR + W4, 0}, michael@0: {"es", SPANISH + W4, 0}, michael@0: {"et", AMHARIC + W4, AFAR + W4}, michael@0: michael@0: {"fi", FINNISH + W4, 0}, michael@0: {"fj", FIJIAN + W4, 0}, michael@0: // {"fk", 0, 0}, michael@0: // {"fm", 0, 0}, michael@0: {"fo", FAROESE + W4, ICELANDIC - W4}, michael@0: {"fr", FRENCH + W4, 0}, michael@0: michael@0: {"ga", FRENCH + W4, 0}, michael@0: {"gd", 0, 0}, michael@0: {"ge", GEORGIAN + W4, 0}, michael@0: {"gf", FRENCH + W4, 0}, michael@0: // {"gg", 0, 0}, michael@0: // {"gh", 0, 0}, michael@0: // {"gi", 0, 0}, michael@0: {"gl", GREENLANDIC + W4, DANISH + W4}, michael@0: // {"gm", 0, 0}, michael@0: {"gn", FRENCH + W4, 0}, michael@0: // {"gp", 0, 0}, michael@0: // {"gq", 0, 0}, michael@0: {"gr", GREEK + W4, 0}, michael@0: // {"gs", 0, 0}, michael@0: {"gt", SPANISH + W4, 0}, michael@0: // {"gu", 0, 0}, michael@0: // {"gy", 0, 0}, michael@0: michael@0: {"hk", 
CHINESE_T + W4, 0}, michael@0: // {"hm", 0, 0}, michael@0: {"hn", SPANISH + W4, 0}, michael@0: {"hr", CROATIAN + W8, BOSNIAN - W4}, michael@0: {"ht", HAITIAN_CREOLE + W4, FRENCH + W4}, michael@0: {"hu", HUNGARIAN + W4, 0}, michael@0: michael@0: {"id", INDONESIAN + W4, MALAY - W4}, michael@0: {"ie", IRISH + W4, 0}, michael@0: {"il", HEBREW + W4, 0}, michael@0: {"im", MANX + W4, 0}, michael@0: // {"in", 0, 0}, michael@0: // {"io", 0, 0}, michael@0: {"iq", ARABIC + W4, 0}, michael@0: {"ir", PERSIAN + W4, 0}, michael@0: {"is", ICELANDIC + W4, FAROESE - W4}, michael@0: {"it", ITALIAN + W4, 0}, michael@0: michael@0: // {"je", 0, 0}, michael@0: // {"jm", 0, 0}, michael@0: {"jo", ARABIC + W4, 0}, michael@0: {"jp", JAPANESE + W4, 0}, michael@0: michael@0: // {"ke", 0, 0}, michael@0: {"kg", KYRGYZ + W4, 0}, michael@0: {"kh", KHMER + W4, 0}, michael@0: // {"ki", 0, 0}, michael@0: {"km", FRENCH + W4, 0}, michael@0: // {"kn", 0, 0}, michael@0: {"kp", KOREAN + W4, 0}, michael@0: {"kr", KOREAN + W4, 0}, michael@0: {"kw", ARABIC + W4, 0}, michael@0: // {"ky", 0, 0}, michael@0: {"kz", KAZAKH + W4, 0}, michael@0: michael@0: {"la", LAOTHIAN + W4, 0}, michael@0: {"lb", ARABIC + W4, FRENCH + W4}, michael@0: // {"lc", 0, 0}, michael@0: {"li", GERMAN + W4, 0}, michael@0: {"lk", SINHALESE + W4, 0}, michael@0: // {"lr", 0, 0}, michael@0: {"ls", SESOTHO + W4, 0}, michael@0: {"lt", LITHUANIAN + W4, 0}, michael@0: {"lu", LUXEMBOURGISH + W4}, michael@0: {"lv", LATVIAN + W4, 0}, michael@0: {"ly", ARABIC + W4, 0}, michael@0: michael@0: {"ma", FRENCH + W4, 0}, michael@0: {"mc", FRENCH + W4, 0}, michael@0: {"md", ROMANIAN + W4, 0}, michael@0: {"me", MONTENEGRIN + W8, SERBIAN - W4}, michael@0: {"mg", FRENCH + W4, 0}, michael@0: {"mk", MACEDONIAN + W4, 0}, michael@0: {"ml", FRENCH + W4, 0}, michael@0: {"mm", BURMESE + W4, 0}, michael@0: {"mn", MONGOLIAN + W4, 0}, michael@0: {"mo", CHINESE_T + W4, PORTUGUESE + W4}, michael@0: // {"mp", 0, 0}, michael@0: {"mq", FRENCH + W4, 0}, michael@0: {"mr", 
FRENCH + W4, ARABIC + W4}, michael@0: // {"ms", 0, 0}, michael@0: {"mt", MALTESE + W4, 0}, michael@0: // {"mu", 0, 0}, michael@0: {"mv", DHIVEHI + W4, 0}, michael@0: // {"mw", 0, 0}, michael@0: {"mx", SPANISH + W4, 0}, michael@0: {"my", MALAY + W4, INDONESIAN - W4}, michael@0: {"mz", PORTUGUESE + W4, 0}, michael@0: michael@0: {"na", 0, 0}, // Namibia michael@0: {"nc", FRENCH + W4, 0}, michael@0: {"ne", FRENCH + W4, 0}, michael@0: {"nf", FRENCH + W4, 0}, michael@0: // {"ng", 0, 0}, michael@0: {"ni", SPANISH + W4, 0}, michael@0: {"nl", DUTCH + W4, 0}, michael@0: {"no", NORWEGIAN + W4, NORWEGIAN_N + W2}, michael@0: {"np", NEPALI + W4, 0}, michael@0: {"nr", NAURU + W4, 0}, michael@0: {"nu", SWEDISH + W4, 0}, michael@0: {"nz", MAORI + W4, ENGLISH + W2}, michael@0: michael@0: {"om", ARABIC + W4, 0}, michael@0: michael@0: {"pa", SPANISH + W4, 0}, michael@0: {"pe", SPANISH + W4, QUECHUA + W2}, // also AYMARA michael@0: {"pf", FRENCH + W4, 0}, michael@0: // {"pg", 0, 0}, michael@0: {"ph", TAGALOG + W4, 0}, michael@0: {"pk", URDU + W4, 0}, michael@0: {"pl", POLISH + W4, 0}, michael@0: // {"pn", 0, 0}, michael@0: {"pr", SPANISH + W4, 0}, michael@0: {"ps", ARABIC + W4, 0}, michael@0: {"pt", PORTUGUESE + W4, 0}, michael@0: {"py", SPANISH + W4, GUARANI + W2}, michael@0: michael@0: {"qa", ARABIC + W4, 0}, michael@0: michael@0: {"re", FRENCH + W4, 0}, michael@0: {"ro", ROMANIAN + W4, 0}, michael@0: {"rs", SERBIAN + W8, MONTENEGRIN - W4}, michael@0: {"ru", RUSSIAN + W4, 0}, michael@0: {"rw", KINYARWANDA + W4, FRENCH + W2}, michael@0: michael@0: {"sa", ARABIC + W4, 0}, michael@0: // {"sb", 0, 0}, michael@0: {"sc", SESELWA + W4, 0}, michael@0: {"sd", ARABIC + W4, 0}, michael@0: {"se", SWEDISH + W4, 0}, michael@0: // {"sg", 0, 0}, michael@0: // {"sh", 0, 0}, michael@0: {"si", SLOVENIAN + W4, 0}, michael@0: {"sk", SLOVAK + W4, CZECH - W4}, michael@0: // {"sl", 0, 0}, michael@0: {"sm", ITALIAN + W4, 0}, michael@0: {"sn", FRENCH + W4, 0}, michael@0: // {"sr", 0, 0}, michael@0: {"ss", 
ARABIC + W4, 0}, // Presumed South Sudan TLD. dsites 2011.07.07 michael@0: // {"st", 0, 0}, michael@0: {"su", RUSSIAN + W4, 0}, michael@0: {"sv", SPANISH + W4, 0}, michael@0: {"sy", ARABIC + W4, 0}, michael@0: // {"sz", 0, 0}, michael@0: michael@0: // {"tc", 0, 0}, michael@0: {"td", FRENCH + W4, 0}, michael@0: // {"tf", 0, 0}, michael@0: {"tg", FRENCH + W4, 0}, michael@0: {"th", THAI + W4, 0}, michael@0: // Tibet has no country code (see .cn) michael@0: {"tj", TAJIK + W4, 0}, michael@0: // {"tk", 0, 0}, michael@0: // {"tl", 0, 0}, michael@0: {"tm", TURKISH + W4, 0}, michael@0: {"tn", FRENCH + W4, ARABIC + W4}, michael@0: // {"to", 0, 0}, michael@0: {"tp", JAPANESE + W4, 0}, michael@0: {"tr", TURKISH + W4, 0}, michael@0: // {"tt", 0, 0}, michael@0: // {"tv", 0, 0}, michael@0: {"tw", CHINESE_T + W4, 0}, michael@0: {"tz", SWAHILI + W4, AKAN + W4}, michael@0: michael@0: {"ua", UKRAINIAN + W4, 0}, michael@0: {"ug", GANDA + W4, 0}, michael@0: {"uk", ENGLISH + W2, 0}, michael@0: {"us", ENGLISH + W2, 0}, michael@0: {"uy", SPANISH + W4, 0}, michael@0: {"uz", UZBEK + W4, 0}, michael@0: michael@0: {"va", ITALIAN + W4, LATIN + W2}, michael@0: // {"vc", 0, 0}, michael@0: {"ve", SPANISH + W4, 0}, michael@0: // {"vg", 0, 0}, michael@0: // {"vi", 0, 0}, michael@0: {"vn", VIETNAMESE + W4, 0}, michael@0: // {"vu", 0, 0}, michael@0: michael@0: {"wf", FRENCH + W4, 0}, michael@0: // {"ws", 0, 0}, michael@0: michael@0: {"ye", ARABIC + W4, 0}, michael@0: michael@0: {"za", AFRIKAANS + W4, 0}, michael@0: // {"zm", 0, 0}, michael@0: // {"zw", 0, 0}, michael@0: }; michael@0: michael@0: #undef W2 michael@0: #undef W4 michael@0: #undef W6 michael@0: #undef W8 michael@0: #undef W10 michael@0: #undef W12 michael@0: michael@0: michael@0: michael@0: michael@0: michael@0: inline void SetCLDPriorWeight(int w, OneCLDLangPrior* olp) { michael@0: *olp = (*olp & 0x3ff) + (w << 10); michael@0: } michael@0: inline void SetCLDPriorLang(Language lang, OneCLDLangPrior* olp) { michael@0: *olp = (*olp & 
~0x3ff) + lang; michael@0: } michael@0: michael@0: OneCLDLangPrior PackCLDPriorLangWeight(Language lang, int w) { michael@0: return (w << 10) + lang; michael@0: } michael@0: michael@0: inline int MaxInt(int a, int b) { michael@0: return (a >= b) ? a : b; michael@0: } michael@0: michael@0: // Merge in another language prior, taking max if already there michael@0: void MergeCLDLangPriorsMax(OneCLDLangPrior olp, CLDLangPriors* lps) { michael@0: if (olp == 0) {return;} michael@0: Language target_lang = GetCLDPriorLang(olp); michael@0: for (int i = 0; i < lps->n; ++i) { michael@0: if (GetCLDPriorLang(lps->prior[i]) == target_lang) { michael@0: int new_weight = MaxInt(GetCLDPriorWeight(lps->prior[i]), michael@0: GetCLDPriorWeight(olp)); michael@0: SetCLDPriorWeight(new_weight, &lps->prior[i]); michael@0: return; michael@0: } michael@0: } michael@0: // Not found; add it if room michael@0: if (lps->n >= kMaxOneCLDLangPrior) {return;} michael@0: lps->prior[lps->n++] = olp; michael@0: } michael@0: michael@0: // Merge in another language prior, boosting 10x if already there michael@0: void MergeCLDLangPriorsBoost(OneCLDLangPrior olp, CLDLangPriors* lps) { michael@0: if (olp == 0) {return;} michael@0: Language target_lang = GetCLDPriorLang(olp); michael@0: for (int i = 0; i < lps->n; ++i) { michael@0: if (GetCLDPriorLang(lps->prior[i]) == target_lang) { michael@0: int new_weight = GetCLDPriorWeight(lps->prior[i]) + 2; michael@0: SetCLDPriorWeight(new_weight, &lps->prior[i]); michael@0: return; michael@0: } michael@0: } michael@0: // Not found; add it if room michael@0: if (lps->n >= kMaxOneCLDLangPrior) {return;} michael@0: lps->prior[lps->n++] = olp; michael@0: } michael@0: michael@0: michael@0: // Trim language priors to no more than max_entries, keeping largest abs weights michael@0: void TrimCLDLangPriors(int max_entries, CLDLangPriors* lps) { michael@0: if (lps->n <= max_entries) {return;} michael@0: michael@0: // Insertion sort in-place by abs(weight) michael@0: for (int 
i = 0; i < lps->n; ++i) { michael@0: OneCLDLangPrior temp_olp = lps->prior[i]; michael@0: int w = abs(GetCLDPriorWeight(temp_olp)); michael@0: int kk = i; michael@0: for (; kk > 0; --kk) { michael@0: if (abs(GetCLDPriorWeight(lps->prior[kk - 1])) < w) { michael@0: // Move down and continue michael@0: lps->prior[kk] = lps->prior[kk - 1]; michael@0: } else { michael@0: // abs(weight[kk - 1]) >= w, time to stop michael@0: break; michael@0: } michael@0: } michael@0: lps->prior[kk] = temp_olp; michael@0: } michael@0: michael@0: lps->n = max_entries; michael@0: } michael@0: michael@0: int CountCommas(const string& langtags) { michael@0: int commas = 0; michael@0: for (int i = 0; i < static_cast(langtags.size()); ++i) { michael@0: if (langtags[i] == ',') {++commas;} michael@0: } michael@0: return commas; michael@0: } michael@0: michael@0: // Binary lookup on language tag michael@0: const LangTagLookup* DoLangTagLookup(const char* key, michael@0: const LangTagLookup* tbl, int tbl_size) { michael@0: // Key is always in range [lo..hi) michael@0: int lo = 0; michael@0: int hi = tbl_size; michael@0: while (lo < hi) { michael@0: int mid = (lo + hi) >> 1; michael@0: int comp = strcmp(tbl[mid].langtag, key); michael@0: if (comp < 0) { michael@0: lo = mid + 1; michael@0: } else if (comp > 0) { michael@0: hi = mid; michael@0: } else { michael@0: return &tbl[mid]; michael@0: } michael@0: } michael@0: return NULL; michael@0: } michael@0: michael@0: // Binary lookup on tld michael@0: const TLDLookup* DoTLDLookup(const char* key, michael@0: const TLDLookup* tbl, int tbl_size) { michael@0: // Key is always in range [lo..hi) michael@0: int lo = 0; michael@0: int hi = tbl_size; michael@0: while (lo < hi) { michael@0: int mid = (lo + hi) >> 1; michael@0: int comp = strcmp(tbl[mid].tld, key); michael@0: if (comp < 0) { michael@0: lo = mid + 1; michael@0: } else if (comp > 0) { michael@0: hi = mid; michael@0: } else { michael@0: return &tbl[mid]; michael@0: } michael@0: } michael@0: return 
NULL; michael@0: } michael@0: michael@0: michael@0: michael@0: // Trim language tag string to canonical form for each language michael@0: // Input is from GetLangTagsFromHtml(), already lowercased michael@0: string TrimCLDLangTagsHint(const string& langtags) { michael@0: string retval; michael@0: if (langtags.empty()) {return retval;} michael@0: int commas = CountCommas(langtags); michael@0: if (commas > 4) {return retval;} // Ignore if too many language tags michael@0: michael@0: char temp[20]; michael@0: int pos = 0; michael@0: while (pos < static_cast(langtags.size())) { michael@0: int comma = langtags.find(',', pos); michael@0: if (comma == string::npos) {comma = langtags.size();} // fake trailing comma michael@0: int len = comma - pos; michael@0: if (len <= 16) { michael@0: // Short enough to use michael@0: memcpy(temp, &langtags[pos], len); michael@0: temp[len] = '\0'; michael@0: const LangTagLookup* entry = DoLangTagLookup(temp, michael@0: kCLDLangTagsHintTable1, michael@0: kCLDTable1Size); michael@0: if (entry != NULL) { michael@0: // First table hit michael@0: retval.append(entry->langcode); // may be "code1,code2" michael@0: retval.append(1, ','); michael@0: } else { michael@0: // Try second table with language code truncated at first hyphen michael@0: char* hyphen = strchr(temp, '-'); michael@0: if (hyphen != NULL) {*hyphen = '\0';} michael@0: len = strlen(temp); michael@0: if (len <= 3) { // Short enough to use michael@0: entry = DoLangTagLookup(temp, michael@0: kCLDLangTagsHintTable2, michael@0: kCLDTable2Size); michael@0: if (entry != NULL) { michael@0: // Second table hit michael@0: retval.append(entry->langcode); // may be "code1,code2" michael@0: retval.append(1, ','); michael@0: } michael@0: } michael@0: } michael@0: } michael@0: pos = comma + 1; michael@0: } michael@0: michael@0: // Remove trainling comma, if any michael@0: if (!retval.empty()) {retval.resize(retval.size() - 1);} michael@0: return retval; michael@0: } michael@0: michael@0: 
//==============================================================================

// Little state machine to scan insides of language attribute quoted-string.
// Each language code is lowercased and copied to the output string. Underscore
// is mapped to minus. Space, tab, and comma are all mapped to comma, and
// multiple consecutive commas are removed.
// Each language code in the output list will be followed by a single comma.

// There are three states, and we start in state 1:
// State 0: After a letter.
//   Copy all letters/minus[0], copy comma[1]; all others copy comma and skip [2]
// State 1: Just after a comma.
//   Copy letter [0], Ignore subsequent commas[1]. minus and all others skip [2]
// State 2: Skipping.
//   All characters except comma skip and stay in [2]. comma goes to [1]

// The thing that is copied is kLangCodeRemap[c] when going to state 0,
// and always comma when going to state 1 or 2. The design depends on copying
// a comma at the *beginning* of skipping, and in state 2 never doing a copy.
// That comma terminates the language code being copied, so it is only emitted
// when skipping starts from state 0. In state 1 the previous code has already
// been terminated (or nothing has been copied yet), so skipping starts without
// a copy; otherwise the output would contain empty entries such as "en,,".

// We pack all this into 8 bits:
//    +--+---+---+
//    |78|654|321|
//    +--+---+---+
//
// Shift byte right by 3*state, giving [0] 321, [1] 654, [2] .78
// where . is always zero
// Of these 3 bits, low two are next state ss, high bit is copy bit C.
// If C=1 and ss == 0, copy kLangCodeRemap[c], else copy a comma

#define SKIP0 0       // no copy, next state 0 (not used by the actions below)
#define SKIP1 1       // no copy, next state 1
#define SKIP2 2       // no copy, next state 2
#define COPY0 4       // copy kLangCodeRemap[c]
#define COPY1 5       // copy ','
#define COPY2 6       // copy ','

// These combined actions pack three states into one byte.
// Ninth bit must be zero, so all state 2 values must be skips.
//               state[2]       state[1]       state[0]
#define LTR   ((SKIP2 << 6) + (COPY0 << 3) + COPY0)
#define MINUS ((SKIP2 << 6) + (SKIP2 << 3) + COPY0)
#define COMMA ((SKIP1 << 6) + (SKIP1 << 3) + COPY1)
#define Bad   ((SKIP2 << 6) + (SKIP2 << 3) + COPY2)

// Treat as letter: a-z, A-Z
// Treat as minus: 2D minus, 5F underscore
// Treat as comma: 09 tab, 20 space, 2C comma

// Action byte for each possible input byte, indexed by the byte value.
static const unsigned char kLangCodeAction[256] = {
  // 00..0F (09 tab)
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,COMMA,Bad,Bad,Bad,Bad,Bad,Bad,
  // 10..1F
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  // 20..2F (20 space, 2C comma, 2D minus)
  COMMA,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,COMMA,MINUS,Bad,Bad,
  // 30..3F
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,

  // 40..4F (A-O)
  Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR,  LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,
  // 50..5F (P-Z, 5F underscore)
  LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,  LTR,LTR,LTR,Bad,Bad,Bad,Bad,MINUS,
  // 60..6F (a-o)
  Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR,  LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,
  // 70..7F (p-z)
  LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,  LTR,LTR,LTR,Bad,Bad,Bad,Bad,Bad,

  // 80..FF
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,

  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
};

// This does lowercasing, maps underscore to minus, and maps tab/space to comma
michael@0: static const unsigned char kLangCodeRemap[256] = { michael@0: 0,0,0,0,0,0,0,0, 0,',',0,0,0,0,0,0, // 09 tab michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: ',',0,0,0,0,0,0,0, 0,0,0,0,',','-',0,0, // 20 space 2C comma 2D minus michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: michael@0: 0,'a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o', michael@0: 'p','q','r','s','t','u','v','w', 'x','y','z',0,0,0,0,'-', // 5F underscore michael@0: 0,'a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o', michael@0: 'p','q','r','s','t','u','v','w', 'x','y','z',0,0,0,0,0, michael@0: michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, michael@0: }; michael@0: michael@0: #undef LTR michael@0: #undef MINUS michael@0: #undef COMMA michael@0: #undef Bad michael@0: michael@0: #undef SKIP0 michael@0: #undef SKIP1 michael@0: #undef SKIP2 michael@0: #undef COPY0 michael@0: #undef COPY1 michael@0: #undef COPY2 michael@0: michael@0: michael@0: // Find opening '<' for HTML tag michael@0: // Note: this is all somewhat insensitive to mismatched quotes michael@0: int32 FindTagStart(const char* utf8_body, int32 pos, int32 max_pos) { michael@0: int i = pos; michael@0: // Advance i by 4 if none of the next 4 bytes are '<' michael@0: for (i = pos; i < (max_pos - 3); i += 4) { michael@0: // Fast check for any < michael@0: const char* p = &utf8_body[i]; michael@0: uint32 s0123 = UNALIGNED_LOAD32(p); michael@0: uint32 temp = s0123 ^ 0x3c3c3c3c; // <<<< michael@0: if (((temp - 0x01010101) & (~temp & 0x80808080)) != 0) { michael@0: // At least one byte is '<' michael@0: break; michael@0: } michael@0: } michael@0: // Continue, advancing i by 1 
michael@0: for (; i < max_pos; ++i) { michael@0: if (utf8_body[i] == '<') {return i;} michael@0: } michael@0: return -1; michael@0: } michael@0: michael@0: michael@0: // Find closing '>' for HTML tag. Also stop on < and & (simplistic parsing) michael@0: int32 FindTagEnd(const char* utf8_body, int32 pos, int32 max_pos) { michael@0: // Always outside quotes michael@0: for (int i = pos; i < max_pos; ++i) { michael@0: char c = utf8_body[i]; michael@0: if (c == '>') {return i;} michael@0: if (c == '<') {return i - 1;} michael@0: if (c == '&') {return i - 1;} michael@0: } michael@0: return -1; // nothing found michael@0: } michael@0: michael@0: // Find opening quote or apostrophe, skipping spaces michael@0: // Note: this is all somewhat insensitive to mismatched quotes michael@0: int32 FindQuoteStart(const char* utf8_body, int32 pos, int32 max_pos) { michael@0: for (int i = pos; i < max_pos; ++i) { michael@0: char c = utf8_body[i]; michael@0: if (c == '"') {return i;} michael@0: if (c == '\'') {return i;} michael@0: if (c != ' ') {return -1;} michael@0: } michael@0: return -1; michael@0: } michael@0: michael@0: // Find closing quot/apos. 
Also stop on = > < and & (simplistic parsing) michael@0: int32 FindQuoteEnd(const char* utf8_body, int32 pos, int32 max_pos) { michael@0: // Always outside quotes michael@0: for (int i = pos; i < max_pos; ++i) { michael@0: char c = utf8_body[i]; michael@0: if (c == '"') {return i;} michael@0: if (c == '\'') {return i;} michael@0: if (c == '>') {return i - 1;} michael@0: if (c == '=') {return i - 1;} michael@0: if (c == '<') {return i - 1;} michael@0: if (c == '&') {return i - 1;} michael@0: } michael@0: return -1; // nothing found michael@0: } michael@0: michael@0: int32 FindEqualSign(const char* utf8_body, int32 pos, int32 max_pos) { michael@0: // Outside quotes/apostrophes loop michael@0: for (int i = pos; i < max_pos; ++i) { michael@0: char c = utf8_body[i]; michael@0: if (c == '=') { // Found bare equal sign inside tag michael@0: return i; michael@0: } else if (c == '"') { michael@0: // Inside quotes loop michael@0: int j; michael@0: for (j = i + 1; j < max_pos; ++j) { michael@0: if (utf8_body[j] == '"') { michael@0: break; michael@0: } else if (utf8_body[j] == '\\') { michael@0: ++j; michael@0: } michael@0: } michael@0: i = j; michael@0: } else if (c == '\'') { michael@0: // Inside apostrophes loop michael@0: int j; michael@0: for (j = i + 1; j < max_pos; ++j) { michael@0: if (utf8_body[j] == '\'') { michael@0: break; michael@0: } else if (utf8_body[j] == '\\') { michael@0: ++j; michael@0: } michael@0: } michael@0: i = j; michael@0: } michael@0: michael@0: } michael@0: return -1; // nothing found michael@0: } michael@0: michael@0: // Scan backwards for case-insensitive string s in [min_pos..pos) michael@0: // Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f] michael@0: // Cheap lowercase. 
Control codes will masquerade as 20..3f michael@0: bool FindBefore(const char* utf8_body, michael@0: int32 min_pos, int32 pos, const char* s) { michael@0: int len = strlen(s); michael@0: if ((pos - min_pos) < len) {return false;} // Too small to fit s michael@0: michael@0: // Skip trailing spaces michael@0: int i = pos; michael@0: while ((i > (min_pos + len)) && (utf8_body[i - 1] == ' ')) {--i;} michael@0: i -= len; michael@0: if (i < min_pos) {return false;} // pos - min_pos < len, so s can't be found michael@0: michael@0: const char* p = &utf8_body[i]; michael@0: for (int j = 0; j < len; ++j) { michael@0: if ((p[j] | 0x20) != s[j]) {return false;} // Unequal byte michael@0: } michael@0: return true; // All bytes equal at i michael@0: } michael@0: michael@0: // Scan forwards for case-insensitive string s in [pos..max_pos) michael@0: // Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f] michael@0: // Cheap lowercase. Control codes will masquerade as 20..3f michael@0: // Allows but does not require quoted/apostrophe string michael@0: bool FindAfter(const char* utf8_body, michael@0: int32 pos, int32 max_pos, const char* s) { michael@0: int len = strlen(s); michael@0: if ((max_pos - pos) < len) {return false;} // Too small to fit s michael@0: michael@0: // Skip leading spaces, quote, apostrophe michael@0: int i = pos; michael@0: while (i < (max_pos - len)) { michael@0: unsigned char c = utf8_body[i]; michael@0: if ((c == ' ') || (c == '"') || (c == '\'')) {++i;} michael@0: else {break;} michael@0: } michael@0: michael@0: const char* p = &utf8_body[i]; michael@0: for (int j = 0; j < len; ++j) { michael@0: if ((p[j] | 0x20) != s[j]) {return false;} // Unequal byte michael@0: } michael@0: return true; // All bytes equal michael@0: } michael@0: michael@0: michael@0: michael@0: // Copy attribute value in [pos..max_pos) michael@0: // pos is just after an opening quote/apostrophe and max_pos is the ending one michael@0: // String must all be on a single line. 
michael@0: // Return slightly-normalized language list, empty or ending in comma michael@0: // Does lowercasing and removes excess punctuation/space michael@0: string CopyOneQuotedString(const char* utf8_body, michael@0: int32 pos, int32 max_pos) { michael@0: string s; michael@0: int state = 1; // Front is logically just after a comma michael@0: for (int i = pos; i < max_pos; ++i) { michael@0: unsigned char c = utf8_body[i]; michael@0: int e = kLangCodeAction[c] >> (3 * state); michael@0: state = e & 3; // Update to next state michael@0: if ((e & 4) != 0) { michael@0: // Copy a remapped byte if going to state 0, else copy a comma michael@0: if (state == 0) { michael@0: s.append(1, kLangCodeRemap[c]); michael@0: } else { michael@0: s.append(1, ','); michael@0: } michael@0: } michael@0: } michael@0: michael@0: // Add final comma if needed michael@0: if (state == 0) { michael@0: s.append(1, ','); michael@0: } michael@0: return s; michael@0: } michael@0: michael@0: // Find and copy attribute value: quoted string in [pos..max_pos) michael@0: // Return slightly-normalized language list, empty or ending in comma michael@0: string CopyQuotedString(const char* utf8_body, michael@0: int32 pos, int32 max_pos) { michael@0: int32 start_quote = FindQuoteStart(utf8_body, pos, max_pos); michael@0: if (start_quote < 0) {return string("");} michael@0: int32 end_quote = FindQuoteEnd(utf8_body, start_quote + 1, max_pos); michael@0: if (end_quote < 0) {return string("");} michael@0: michael@0: return CopyOneQuotedString(utf8_body, start_quote + 1, end_quote); michael@0: } michael@0: michael@0: // Add hints to vector of langpriors michael@0: // Input is from GetLangTagsFromHtml(), already lowercased michael@0: void SetCLDLangTagsHint(const string& langtags, CLDLangPriors* langpriors) { michael@0: if (langtags.empty()) {return;} michael@0: int commas = CountCommas(langtags); michael@0: if (commas > 4) {return;} // Ignore if too many language tags michael@0: michael@0: char temp[20]; 
michael@0: int pos = 0; michael@0: while (pos < static_cast(langtags.size())) { michael@0: int comma = langtags.find(',', pos); michael@0: if (comma == string::npos) {comma = langtags.size();} // fake trailing comma michael@0: int len = comma - pos; michael@0: if (len <= 16) { michael@0: // Short enough to use michael@0: memcpy(temp, &langtags[pos], len); michael@0: temp[len] = '\0'; michael@0: const LangTagLookup* entry = DoLangTagLookup(temp, michael@0: kCLDLangTagsHintTable1, michael@0: kCLDTable1Size); michael@0: if (entry != NULL) { michael@0: // First table hit michael@0: MergeCLDLangPriorsMax(entry->onelangprior1, langpriors); michael@0: MergeCLDLangPriorsMax(entry->onelangprior2, langpriors); michael@0: } else { michael@0: // Try second table with language code truncated at first hyphen michael@0: char* hyphen = strchr(temp, '-'); michael@0: if (hyphen != NULL) {*hyphen = '\0';} michael@0: len = strlen(temp); michael@0: if (len <= 3) { // Short enough to use michael@0: entry = DoLangTagLookup(temp, michael@0: kCLDLangTagsHintTable2, michael@0: kCLDTable2Size); michael@0: if (entry != NULL) { michael@0: // Second table hit michael@0: MergeCLDLangPriorsMax(entry->onelangprior1, langpriors); michael@0: MergeCLDLangPriorsMax(entry->onelangprior2, langpriors); michael@0: } michael@0: } michael@0: } michael@0: } michael@0: pos = comma + 1; michael@0: } michael@0: } michael@0: michael@0: // Add hints to vector of langpriors michael@0: // Input is string after HTTP header Content-Language: michael@0: void SetCLDContentLangHint(const char* contentlang, CLDLangPriors* langpriors) { michael@0: string langtags = CopyOneQuotedString(contentlang, 0, strlen(contentlang)); michael@0: SetCLDLangTagsHint(langtags, langpriors); michael@0: } michael@0: michael@0: // Add hints to vector of langpriors michael@0: // Input is last element of hostname (no dot), e.g. 
from GetTLD() michael@0: void SetCLDTLDHint(const char* tld, CLDLangPriors* langpriors) { michael@0: int len = strlen(tld); michael@0: if (len > 3) {return;} // Ignore if more than three letters michael@0: char local_tld[4]; michael@0: strncpy(local_tld, tld, 4); michael@0: local_tld[3] = '\0'; // Safety move michael@0: // Lowercase michael@0: for (int i = 0; i < len; ++i) {local_tld[i] |= 0x20;} michael@0: const TLDLookup* entry = DoTLDLookup(local_tld, michael@0: kCLDTLDHintTable, michael@0: kCLDTable3Size); michael@0: if (entry != NULL) { michael@0: // Table hit michael@0: MergeCLDLangPriorsBoost(entry->onelangprior1, langpriors); michael@0: MergeCLDLangPriorsBoost(entry->onelangprior2, langpriors); michael@0: } michael@0: } michael@0: michael@0: // Add hints to vector of langpriors michael@0: // Input is from DetectEncoding() michael@0: void SetCLDEncodingHint(Encoding enc, CLDLangPriors* langpriors) { michael@0: OneCLDLangPrior olp; michael@0: switch (enc) { michael@0: case CHINESE_GB: michael@0: case GBK: michael@0: case GB18030: michael@0: case ISO_2022_CN: michael@0: case HZ_GB_2312: michael@0: olp = PackCLDPriorLangWeight(CHINESE, kCLDPriorEncodingWeight); michael@0: MergeCLDLangPriorsBoost(olp, langpriors); michael@0: break; michael@0: case CHINESE_BIG5: michael@0: case CHINESE_BIG5_CP950: michael@0: case BIG5_HKSCS: michael@0: olp = PackCLDPriorLangWeight(CHINESE_T, kCLDPriorEncodingWeight); michael@0: MergeCLDLangPriorsBoost(olp, langpriors); michael@0: break; michael@0: case JAPANESE_EUC_JP: michael@0: case JAPANESE_SHIFT_JIS: michael@0: case JAPANESE_CP932: michael@0: case JAPANESE_JIS: // ISO-2022-JP michael@0: olp = PackCLDPriorLangWeight(JAPANESE, kCLDPriorEncodingWeight); michael@0: MergeCLDLangPriorsBoost(olp, langpriors); michael@0: break; michael@0: case KOREAN_EUC_KR: michael@0: case ISO_2022_KR: michael@0: olp = PackCLDPriorLangWeight(KOREAN, kCLDPriorEncodingWeight); michael@0: MergeCLDLangPriorsBoost(olp, langpriors); michael@0: break; 
michael@0: michael@0: default: michael@0: break; michael@0: } michael@0: } michael@0: michael@0: // Add hints to vector of langpriors michael@0: // Input is from random source michael@0: void SetCLDLanguageHint(Language lang, CLDLangPriors* langpriors) { michael@0: OneCLDLangPrior olp = PackCLDPriorLangWeight(lang, kCLDPriorLanguageWeight); michael@0: MergeCLDLangPriorsBoost(olp, langpriors); michael@0: } michael@0: michael@0: michael@0: // Make printable string of priors michael@0: string DumpCLDLangPriors(const CLDLangPriors* langpriors) { michael@0: string retval; michael@0: for (int i = 0; i < langpriors->n; ++i) { michael@0: char temp[64]; michael@0: sprintf(temp, "%s.%d ", michael@0: LanguageCode(GetCLDPriorLang(langpriors->prior[i])), michael@0: GetCLDPriorWeight(langpriors->prior[i])); michael@0: retval.append(temp); michael@0: } michael@0: return retval; michael@0: } michael@0: michael@0: michael@0: michael@0: michael@0: // Look for michael@0: // michael@0: // michael@0: // michael@0: // michael@0: // michael@0: // michael@0: // michael@0: // michael@0: // Do not trigger on michael@0: // michael@0: // michael@0: // michael@0: // michael@0: // michael@0: // Stop fairly quickly on mismatched quotes michael@0: // michael@0: // Allowed language characters michael@0: // a-z A-Z -_ , space\t michael@0: // Think about: GB2312, big5, shift-jis, euc-jp, ksc euc-kr michael@0: // zh-hans zh-TW cmn-Hani zh_cn.gb18030_CN zh-min-nan zh-yue michael@0: // de-x-mtfrom-en zh-tw-x-mtfrom-en (machine translation) michael@0: // GB2312 => gb michael@0: // Big5 => big michael@0: // zh_CN.gb18030_C => zh-cn michael@0: // michael@0: // Remove duplicates and extra spaces as we go michael@0: // Lowercase as we go. 
michael@0: michael@0: // Get language tag hints from HTML body michael@0: // Normalize: remove spaces and make lowercase comma list michael@0: michael@0: string GetLangTagsFromHtml(const char* utf8_body, int32 utf8_body_len, michael@0: int32 max_scan_bytes) { michael@0: string retval; michael@0: if (max_scan_bytes > utf8_body_len) { michael@0: max_scan_bytes = utf8_body_len; michael@0: } michael@0: michael@0: int32 k = 0; michael@0: while (k < max_scan_bytes) { michael@0: int32 start_tag = FindTagStart(utf8_body, k, max_scan_bytes); michael@0: if (start_tag < 0) {break;} michael@0: int32 end_tag = FindTagEnd(utf8_body, start_tag + 1, max_scan_bytes); michael@0: // FindTagEnd exits on < > & michael@0: if (end_tag < 0) {break;} michael@0: michael@0: // Skip