1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/browser/components/translation/cld2/internal/compact_lang_det_hint_code.cc Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1651 @@ 1.4 +// Copyright 2013 Google Inc. All Rights Reserved. 1.5 +// 1.6 +// Licensed under the Apache License, Version 2.0 (the "License"); 1.7 +// you may not use this file except in compliance with the License. 1.8 +// You may obtain a copy of the License at 1.9 +// 1.10 +// http://www.apache.org/licenses/LICENSE-2.0 1.11 +// 1.12 +// Unless required by applicable law or agreed to in writing, software 1.13 +// distributed under the License is distributed on an "AS IS" BASIS, 1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1.15 +// See the License for the specific language governing permissions and 1.16 +// limitations under the License. 1.17 + 1.18 +// 1.19 +// Author: dsites@google.com (Dick Sites) 1.20 +// 1.21 + 1.22 +#include "compact_lang_det_hint_code.h" 1.23 + 1.24 +#include <stdlib.h> // for abs() 1.25 +#include <stdio.h> // for sprintf() 1.26 +#include <string.h> // 1.27 +#include "lang_script.h" 1.28 +#include "port.h" 1.29 + 1.30 +using namespace std; 1.31 + 1.32 +namespace CLD2 { 1.33 + 1.34 +static const int kCLDPriorEncodingWeight = 4; // 100x more likely 1.35 +static const int kCLDPriorLanguageWeight = 8; // 10000x more likely 1.36 + 1.37 + 1.38 +// Tables to map lang="..." language code lists to actual languages. 1.39 +// based on scraping and hand-edits, dsites June 2011 1.40 + 1.41 +// n = f(string, &a) gives list of n<=4 language pairs: primary, secondary 1.42 + 1.43 +// For close pairs like ms/id, more weight on TLD and lang= 1.44 +// Alternately, weaker boost but mark others of set as negative; 1.45 +// makes "neither" an easier result. 1.46 +// lang=en low weight 4 1.47 +// tld=lu boost lu maaybe 4. 
but lang= alwyas overcomes tld and encoding 1.48 +// (except maybe en) 1.49 + 1.50 +// TLD to separate, e.g., burundi from rwanda 1.51 + 1.52 +// Encoding lookup: OneLangProb array 1.53 +// TLD lookup: tld OneLangProb pairs 1.54 + 1.55 + 1.56 +typedef struct { 1.57 + const char* const langtag; // Lowercased, hyphen only lookup key 1.58 + const char* const langcode; // Canonical language codes; two if ambiguous 1.59 + OneCLDLangPrior onelangprior1; 1.60 + OneCLDLangPrior onelangprior2; 1.61 +} LangTagLookup; 1.62 + 1.63 +typedef struct { 1.64 + const char* const tld; // Lowercased, hyphen only lookup key 1.65 + OneCLDLangPrior onelangprior1; 1.66 + OneCLDLangPrior onelangprior2; 1.67 +} TLDLookup; 1.68 + 1.69 + 1.70 +#define W2 (2 << 10) // 3**2 = 10x more likely 1.71 +#define W4 (4 << 10) // 3**4 = 100x more likely 1.72 +#define W6 (6 << 10) // 3**6 = 1000x more likely 1.73 +#define W8 (8 << 10) // 3**8 = 10K x more likely 1.74 +#define W10 (10 << 10) // 3**10 = 100K x more likely 1.75 +#define W12 (12 << 10) // 3**12 = 1M x more likely 1.76 + 1.77 +// TODO: more about ba hr sr sr-ME and sl 1.78 +// Temporary state of affairs: 1.79 +// BOSNIAN CROATIAN MONTENEGRIN SERBIAN detecting just CROATIAN SERBIAN 1.80 +// Eventually, we want to do all four, but it requires a CLD change to handle 1.81 +// up to six languages per quadgram. 1.82 + 1.83 + 1.84 +// Close pairs boost one of pair, demote other. 
1.85 +// Statistically close pairs: 1.86 +// INDONESIAN/MALAY difficult to distinguish -- extra word-based lookups used 1.87 +// 1.88 +// INDONESIAN MALAY coef=0.4698 Problematic w/o extra words 1.89 +// TIBETAN DZONGKHA coef=0.4571 1.90 +// CZECH SLOVAK coef=0.4273 1.91 +// NORWEGIAN NORWEGIAN_N coef=0.4182 1.92 +// 1.93 +// HINDI MARATHI coef=0.3795 1.94 +// ZULU XHOSA coef=0.3716 1.95 +// 1.96 +// DANISH NORWEGIAN coef=0.3672 Usually OK 1.97 +// BIHARI HINDI coef=0.3668 Usually OK 1.98 +// ICELANDIC FAROESE coef=0.3519 Usually OK 1.99 + 1.100 +// 1.101 +// Table to look up lang= tags longer than three characters 1.102 +// Overrides table below, which is truncated at first hyphen 1.103 +// In alphabetical order for binary search 1.104 +static const int kCLDTable1Size = 213; 1.105 +static const LangTagLookup kCLDLangTagsHintTable1[kCLDTable1Size] = { 1.106 + {"abkhazian", "ab", ABKHAZIAN + W10, 0}, 1.107 + {"afar", "aa", AFAR + W10, 0}, 1.108 + {"afrikaans", "af", AFRIKAANS + W10, 0}, 1.109 + {"akan", "ak", AKAN + W10, 0}, 1.110 + {"albanian", "sq", ALBANIAN + W10, 0}, 1.111 + {"am-am", "hy", ARMENIAN + W10, 0}, // 1:2 Armenian, not ambiguous 1.112 + {"amharic", "am", AMHARIC + W10, 0}, 1.113 + {"arabic", "ar", ARABIC + W10, 0}, 1.114 + {"argentina", "es", SPANISH + W10, 0}, 1.115 + {"armenian", "hy", ARMENIAN + W10, 0}, 1.116 + {"assamese", "as", ASSAMESE + W10, 0}, 1.117 + {"aymara", "ay", AYMARA + W10, 0}, 1.118 + {"azerbaijani", "az", AZERBAIJANI + W10, 0}, 1.119 + 1.120 + {"bangla", "bn", BENGALI + W10, 0}, 1.121 + {"bashkir", "ba", BASHKIR + W10, 0}, 1.122 + {"basque", "eu", BASQUE + W10, 0}, 1.123 + {"belarusian", "be", BELARUSIAN + W10, 0}, 1.124 + {"bengali", "bn", BENGALI + W10, 0}, 1.125 + {"bihari", "bh", BIHARI + W10, HINDI - W4}, 1.126 + {"bislama", "bi", BISLAMA + W10, 0}, 1.127 + {"bosnian", "bs", BOSNIAN + W10, 0}, // Bosnian => Bosnian 1.128 + {"br-br", "pt", PORTUGUESE + W10, 0}, // 1:2 Portuguese, not ambiguous 1.129 + {"br-fr", "br", BRETON + 
W10, 0}, // 1:2 Breton, not ambiguous 1.130 + {"breton", "br", BRETON + W10, 0}, 1.131 + {"bulgarian", "bg", BULGARIAN + W10, 0}, 1.132 + {"burmese", "my", BURMESE + W10, 0}, // Myanmar 1.133 + 1.134 + {"catalan", "ca", CATALAN + W10, 0}, 1.135 + {"cherokee", "chr", CHEROKEE + W10, 0}, 1.136 + {"chichewa", "ny", NYANJA + W10, 0}, 1.137 + 1.138 + {"chinese", "zh", CHINESE + W10, 0}, 1.139 + {"chinese-t", "zhT", CHINESE_T + W10, 0}, 1.140 + {"chineset", "zhT", CHINESE_T + W10, 0}, 1.141 + {"corsican", "co", CORSICAN + W10, 0}, 1.142 + {"cpf-hat", "ht", HAITIAN_CREOLE + W10, 0}, // Creole, French-based 1.143 + {"croatian", "hr", CROATIAN + W10, 0}, 1.144 + {"czech", "cs", CZECH + W10, SLOVAK - W4}, 1.145 + 1.146 + {"danish", "da", DANISH + W10, NORWEGIAN - W4}, 1.147 + {"deutsch", "de", GERMAN + W10, 0}, 1.148 + {"dhivehi", "dv", DHIVEHI + W10, 0}, 1.149 + {"dutch", "nl", DUTCH + W10, 0}, 1.150 + {"dzongkha", "dz", DZONGKHA + W10, TIBETAN - W4}, 1.151 + 1.152 + {"ell-gr", "el", GREEK + W10, 0}, 1.153 + {"english", "en", ENGLISH + W4, 0}, 1.154 + {"esperanto", "eo", ESPERANTO + W10, 0}, 1.155 + {"estonian", "et", ESTONIAN + W10, 0}, 1.156 + {"euc-jp", "ja", JAPANESE + W10, 0}, // Japanese encoding 1.157 + {"euc-kr", "ko", KOREAN + W10, 0}, // Korean encoding 1.158 + 1.159 + {"faroese", "fo", FAROESE + W10, ICELANDIC - W4}, 1.160 + {"fijian", "fj", FIJIAN + W10, 0}, 1.161 + {"finnish", "fi", FINNISH + W10, 0}, 1.162 + {"fran", "fr", FRENCH + W10, 0}, // Truncated at non-ASCII 1.163 + {"francais", "fr", FRENCH + W10, 0}, 1.164 + {"french", "fr", FRENCH + W10, 0}, 1.165 + {"frisian", "fy", FRISIAN + W10, 0}, 1.166 + 1.167 + {"ga-es", "gl", GALICIAN + W10, 0}, // 1:2 Galician, not ambiguous 1.168 + {"galician", "gl", GALICIAN + W10, 0}, 1.169 + {"ganda", "lg", GANDA + W10, 0}, 1.170 + {"georgian", "ka", GEORGIAN + W10, 0}, 1.171 + {"german", "de", GERMAN + W10, 0}, 1.172 + {"greek", "el", GREEK + W10, 0}, 1.173 + {"greenlandic", "kl", GREENLANDIC + W10, 0}, 1.174 + 
{"guarani", "gn", GUARANI + W10, 0}, 1.175 + {"gujarati", "gu", GUJARATI + W10, 0}, 1.176 + 1.177 + {"haitian_creole", "ht", HAITIAN_CREOLE + W10, 0}, 1.178 + {"hausa", "ha", HAUSA + W10, 0}, 1.179 + {"hawaiian", "haw", HAWAIIAN + W10, 0}, 1.180 + {"hebrew", "iw", HEBREW + W10, 0}, 1.181 + {"hindi", "hi", HINDI + W10, MARATHI - W4}, 1.182 + {"hn-in", "hi", HINDI + W10, MARATHI - W4}, 1.183 + {"hungarian", "hu", HUNGARIAN + W10, 0}, 1.184 + 1.185 + {"icelandic", "is", ICELANDIC + W10, FAROESE - W4}, 1.186 + {"igbo", "ig", IGBO + W10, 0}, 1.187 + {"indonesian", "id", INDONESIAN + W10, MALAY - W4}, 1.188 + {"interlingua", "ia", INTERLINGUA + W10, 0}, 1.189 + {"interlingue", "ie", INTERLINGUE + W10, 0}, 1.190 + // 1:2 iu-Cans ik-Latn 1.191 + {"inuktitut", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2 1.192 + {"inupiak", "ik,iu", INUPIAK + W10, INUKTITUT + W10}, // 1:2 1.193 + {"ir-ie", "ga", IRISH + W10, 0}, // Irish 1.194 + {"irish", "ga", IRISH + W10, 0}, 1.195 + {"italian", "it", ITALIAN + W10, 0}, 1.196 + 1.197 + {"ja-euc", "ja", JAPANESE + W10, 0}, // Japanese encoding 1.198 + {"jan-jp", "ja", JAPANESE + W10, 0}, // Japanese encoding 1.199 + {"japanese", "ja", JAPANESE + W10, 0}, 1.200 + {"javanese", "jw", JAVANESE + W10, 0}, 1.201 + 1.202 + {"kannada", "kn", KANNADA + W10, 0}, 1.203 + {"kashmiri", "ks", KASHMIRI + W10, 0}, 1.204 + {"kazakh", "kk", KAZAKH + W10, 0}, 1.205 + {"khasi", "kha", KHASI + W10, 0}, 1.206 + {"khmer", "km", KHMER + W10, 0}, 1.207 + {"kinyarwanda", "rw", KINYARWANDA + W10, 0}, 1.208 + {"klingon", "tlh", X_KLINGON + W10, 0}, 1.209 + {"korean", "ko", KOREAN + W10, 0}, 1.210 + {"kurdish", "ku", KURDISH + W10, 0}, 1.211 + {"kyrgyz", "ky", KYRGYZ + W10, 0}, 1.212 + 1.213 + {"laothian", "lo", LAOTHIAN + W10, 0}, 1.214 + {"latin", "la", LATIN + W10, 0}, 1.215 + {"latvian", "lv", LATVIAN + W10, 0}, 1.216 + {"limbu", "sit", LIMBU + W10, 0}, 1.217 + {"lingala", "ln", LINGALA + W10, 0}, 1.218 + {"lithuanian", "lt", LITHUANIAN + W10, 0}, 1.219 + 
{"luxembourgish", "lb", LUXEMBOURGISH + W10, 0}, 1.220 + 1.221 + {"macedonian", "mk", MACEDONIAN + W10, 0}, 1.222 + {"malagasy", "mg", MALAGASY + W10, 0}, 1.223 + {"malay", "ms", MALAY + W10, INDONESIAN - W4}, 1.224 + {"malayalam", "ml", MALAYALAM + W10, 0}, 1.225 + {"maltese", "mt", MALTESE + W10, 0}, 1.226 + {"manx", "gv", MANX + W10, 0}, 1.227 + {"maori", "mi", MAORI + W10, 0}, 1.228 + {"marathi", "mr", MARATHI + W10, HINDI - W4}, 1.229 + {"mauritian_creole", "mfe", MAURITIAN_CREOLE + W10, 0}, 1.230 + {"moldavian", "mo", ROMANIAN + W10, 0}, 1.231 + {"mongolian", "mn", MONGOLIAN + W10, 0}, 1.232 + {"montenegrin", "sr-me", MONTENEGRIN + W10, 0}, 1.233 + {"myanmar", "my", BURMESE + W10, 0}, // Myanmar 1.234 + {"nauru", "na", NAURU + W10, 0}, 1.235 + {"ndebele", "nr", NDEBELE + W10, 0}, 1.236 + {"nepali", "ne", NEPALI + W10, 0}, 1.237 + {"no-bok", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, // Bokmaal 1.238 + {"no-bokmaal", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, 1.239 + {"no-nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, // Bokmaal 1.240 + {"no-no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, 1.241 + {"no-nyn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, // Nynorsk 1.242 + {"no-nynorsk", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, 1.243 + {"norwegian", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, 1.244 + {"norwegian_n", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, 1.245 + {"nyanja", "ny", NYANJA + W10, 0}, 1.246 + 1.247 + {"occitan", "oc", OCCITAN + W10, 0}, 1.248 + {"oriya", "or", ORIYA + W10, 0}, 1.249 + {"oromo", "om", OROMO + W10, 0}, 1.250 + {"parsi", "fa", PERSIAN + W10, 0}, 1.251 + 1.252 + {"pashto", "ps", PASHTO + W10, 0}, 1.253 + {"pedi", "nso", PEDI + W10, 0}, 1.254 + {"persian", "fa", PERSIAN + W10, 0}, 1.255 + {"polish", "pl", POLISH + W10, 0}, 1.256 + {"polska", "pl", POLISH + W10, 0}, 1.257 + {"polski", "pl", POLISH + W10, 0}, 1.258 + {"portugu", "pt", PORTUGUESE + W10, 0}, // Truncated at non-ASCII 1.259 + {"portuguese", "pt", PORTUGUESE + W10, 0}, 1.260 + 
{"punjabi", "pa", PUNJABI + W10, 0}, 1.261 + 1.262 + {"quechua", "qu", QUECHUA + W10, 0}, 1.263 + 1.264 + {"rhaeto_romance", "rm", RHAETO_ROMANCE + W10, 0}, 1.265 + {"romanian", "ro", ROMANIAN + W10, 0}, 1.266 + {"rundi", "rn", RUNDI + W10, 0}, 1.267 + {"russian", "ru", RUSSIAN + W10, 0}, 1.268 + 1.269 + {"samoan", "sm", SAMOAN + W10, 0}, 1.270 + {"sango", "sg", SANGO + W10, 0}, 1.271 + {"sanskrit", "sa", SANSKRIT + W10, 0}, 1.272 + {"scots", "sco", SCOTS + W10, ENGLISH - W4}, 1.273 + {"scots_gaelic", "gd", SCOTS_GAELIC + W10, 0}, 1.274 + {"serbian", "sr", SERBIAN + W10, 0}, 1.275 + {"seselwa", "crs", SESELWA + W10, 0}, 1.276 + {"sesotho", "st", SESOTHO + W10, 0}, 1.277 + {"shift-jis", "ja", JAPANESE + W10, 0}, // Japanese encoding 1.278 + {"shift-js", "ja", JAPANESE + W10, 0}, // Japanese encoding 1.279 + {"shona", "sn", SHONA + W10, 0}, 1.280 + {"si-lk", "si", SINHALESE + W10, 0}, // 1:2 Sri Lanka, not ambiguous 1.281 + {"si-si", "sl", SLOVENIAN + W10, 0}, // 1:2 Slovenia, not ambiguous 1.282 + {"si-sl", "sl", SLOVENIAN + W10, 0}, // 1:2 Slovenia, not ambiguous 1.283 + {"sindhi", "sd", SINDHI + W10, 0}, 1.284 + {"sinhalese", "si", SINHALESE + W10, 0}, 1.285 + {"siswant", "ss", SISWANT + W10, 0}, 1.286 + {"sit-np", "sit", LIMBU + W10, 0}, 1.287 + {"slovak", "sk", SLOVAK + W10, CZECH - W4}, 1.288 + {"slovenian", "sl", SLOVENIAN + W10, 0}, 1.289 + {"somali", "so", SOMALI + W10, 0}, 1.290 + {"spanish", "es", SPANISH + W10, 0}, 1.291 + {"sr-me", "sr-me", MONTENEGRIN + W10, 0}, // Montenegrin => Montenegrin 1.292 + {"sundanese", "su", SUNDANESE + W10, 0}, 1.293 + {"suomi", "fi", FINNISH + W10, 0}, // Finnish 1.294 + {"swahili", "sw", SWAHILI + W10, 0}, 1.295 + {"swedish", "sv", SWEDISH + W10, 0}, 1.296 + {"syriac", "syr", SYRIAC + W10, 0}, 1.297 + 1.298 + {"tagalog", "tl", TAGALOG + W10, 0}, 1.299 + {"tajik", "tg", TAJIK + W10, 0}, 1.300 + {"tamil", "ta", TAMIL + W10, 0}, 1.301 + {"tatar", "tt", TATAR + W10, 0}, 1.302 + {"tb-tb", "bo", TIBETAN + W10, DZONGKHA - W4}, // 
Tibet 1.303 + {"tchinese", "zhT", CHINESE_T + W10, 0}, 1.304 + {"telugu", "te", TELUGU + W10, 0}, 1.305 + {"thai", "th", THAI + W10, 0}, 1.306 + {"tibetan", "bo", TIBETAN + W10, DZONGKHA - W4}, 1.307 + {"tigrinya", "ti", TIGRINYA + W10, 0}, 1.308 + {"tonga", "to", TONGA + W10, 0}, 1.309 + {"tsonga", "ts", TSONGA + W10, 0}, 1.310 + {"tswana", "tn", TSWANA + W10, 0}, 1.311 + {"tt-ru", "tt", TATAR + W10, 0}, 1.312 + {"tur-tr", "tr", TURKISH + W10, 0}, 1.313 + {"turkish", "tr", TURKISH + W10, 0}, 1.314 + {"turkmen", "tk", TURKMEN + W10, 0}, 1.315 + {"uighur", "ug", UIGHUR + W10, 0}, 1.316 + {"ukrainian", "uk", UKRAINIAN + W10, 0}, 1.317 + {"urdu", "ur", URDU + W10, 0}, 1.318 + {"uzbek", "uz", UZBEK + W10, 0}, 1.319 + 1.320 + {"venda", "ve", VENDA + W10, 0}, 1.321 + {"vietnam", "vi", VIETNAMESE + W10, 0}, 1.322 + {"vietnamese", "vi", VIETNAMESE + W10, 0}, 1.323 + {"volapuk", "vo", VOLAPUK + W10, 0}, 1.324 + 1.325 + {"welsh", "cy", WELSH + W10, 0}, 1.326 + {"wolof", "wo", WOLOF + W10, 0}, 1.327 + 1.328 + {"xhosa", "xh", XHOSA + W10, ZULU - W4}, 1.329 + 1.330 + {"yiddish", "yi", YIDDISH + W10, 0}, 1.331 + {"yoruba", "yo", YORUBA + W10, 0}, 1.332 + 1.333 + {"zh-classical", "zhT", CHINESE_T + W10, 0}, 1.334 + {"zh-cn", "zh", CHINESE + W10, 0}, 1.335 + {"zh-hans", "zh", CHINESE + W10, 0}, 1.336 + {"zh-hant", "zhT", CHINESE_T + W10, 0}, 1.337 + {"zh-hk", "zhT", CHINESE_T + W10, 0}, 1.338 + {"zh-min-nan", "zhT", CHINESE_T + W10, 0}, // Min Nan => ChineseT 1.339 + {"zh-sg", "zhT", CHINESE_T + W10, 0}, 1.340 + {"zh-tw", "zhT", CHINESE_T + W10, 0}, 1.341 + {"zh-yue", "zh", CHINESE + W10, 0}, // Yue (Cantonese) => Chinese 1.342 + {"zhuang", "za", ZHUANG + W10, 0}, 1.343 + {"zulu", "zu", ZULU + W10, XHOSA - W4}, 1.344 +}; 1.345 + 1.346 + 1.347 + 1.348 +// Table to look up lang= tags of two/three characters after truncate at hyphen 1.349 +// In alphabetical order for binary search 1.350 +static const int kCLDTable2Size = 257; 1.351 +static const LangTagLookup 
kCLDLangTagsHintTable2[kCLDTable2Size] = { 1.352 + {"aa", "aa", AFAR + W10, 0}, 1.353 + {"ab", "ab", ABKHAZIAN + W10, 0}, 1.354 + {"af", "af", AFRIKAANS + W10, 0}, 1.355 + {"ak", "ak", AKAN + W10, 0}, 1.356 + {"al", "sq", ALBANIAN + W10, 0}, // Albania 1.357 + {"am", "am,hy", AMHARIC + W10, ARMENIAN + W10}, // 1:2 Amharic Armenian 1.358 + {"ar", "ar", ARABIC + W10, 0}, 1.359 + {"ara", "ar", ARABIC + W10, 0}, 1.360 + {"arm", "hy", ARMENIAN + W10, 0}, // Armenia 1.361 + {"arz", "ar", ARABIC + W10, 0}, // Egyptian Arabic 1.362 + {"as", "as", ASSAMESE + W10, 0}, 1.363 + {"at", "de", GERMAN + W10, 0}, // Austria 1.364 + {"au", "de", GERMAN + W10, 0}, // Austria 1.365 + {"ay", "ay", AYMARA + W10, 0}, 1.366 + {"az", "az", AZERBAIJANI + W10, 0}, 1.367 + {"aze", "az", AZERBAIJANI + W10, 0}, 1.368 + 1.369 + {"ba", "ba,bs", BASHKIR + W10, BOSNIAN + W10}, // 1:2 Bashkir Bosnia 1.370 + {"be", "be", BELARUSIAN + W10, 0}, 1.371 + {"bel", "be", BELARUSIAN + W10, 0}, 1.372 + {"bg", "bg", BULGARIAN + W10, 0}, 1.373 + {"bh", "bh", BIHARI + W10, HINDI - W4}, 1.374 + {"bi", "bi", BISLAMA + W10, 0}, 1.375 + {"big", "zhT", CHINESE_T + W10, 0}, // Big5 encoding 1.376 + {"bm", "ms", MALAY + W10, INDONESIAN - W4}, // Bahasa Malaysia 1.377 + {"bn", "bn", BENGALI + W10, 0}, 1.378 + {"bo", "bo", TIBETAN + W10, DZONGKHA - W4}, 1.379 + // 1:2 Breton, Brazil country code, both Latn .br TLD enough for pt to win 1.380 + {"br", "br,pt", BRETON + W10, PORTUGUESE + W8}, // 1:2 Breton, Brazil 1.381 + {"bs", "bs", BOSNIAN + W10, 0}, // Bosnian => Bosnian 1.382 + 1.383 + {"ca", "ca", CATALAN + W10, 0}, 1.384 + {"cat", "ca", CATALAN + W10, 0}, 1.385 + {"ch", "de,fr", GERMAN + W10, FRENCH + W10}, // 1:2 Switzerland 1.386 + {"chn", "zh", CHINESE + W10, 0}, 1.387 + {"chr", "chr", CHEROKEE + W10, 0}, 1.388 + {"ckb", "ku", KURDISH + W10, 0}, // Central Kurdish 1.389 + {"cn", "zh,zhT", CHINESE + W6, CHINESE_T + W4}, // Ambiguous, so weaker. 
1.390 + // Offset by 2 so that TLD=tw or 1.391 + // enc=big5 will put zhT ahead 1.392 + {"co", "co", CORSICAN + W10, 0}, 1.393 + {"cro", "hr", CROATIAN + W10, 0}, // Croatia 1.394 + {"crs", "crs", SESELWA + W10, 0}, 1.395 + {"cs", "cs", CZECH + W10, SLOVAK - W4}, 1.396 + {"ct", "ca", CATALAN + W10, 0}, 1.397 + {"cy", "cy", WELSH + W10, 0}, 1.398 + {"cym", "cy", WELSH + W10, 0}, 1.399 + {"cz", "cs", CZECH + W10, SLOVAK - W4}, 1.400 + 1.401 + {"da", "da", DANISH + W10, NORWEGIAN - W4}, 1.402 + {"dan", "da", DANISH + W10, NORWEGIAN - W4}, 1.403 + {"de", "de", GERMAN + W10, 0}, 1.404 + {"deu", "de", GERMAN + W10, 0}, 1.405 + {"div", "dv", DHIVEHI + W10, 0}, 1.406 + {"dk", "da", DANISH + W10, NORWEGIAN - W4}, // Denmark 1.407 + {"dut", "nl", DUTCH + W10, 0}, // Dutch 1.408 + {"dv", "dv", DHIVEHI + W10, 0}, 1.409 + {"dz", "dz", DZONGKHA + W10, TIBETAN - W4}, 1.410 + 1.411 + {"ee", "et", ESTONIAN + W10, 0}, // Estonia 1.412 + {"eg", "ar", ARABIC + W10, 0}, // Egypt 1.413 + {"el", "el", GREEK + W10, 0}, 1.414 + {"en", "en", ENGLISH + W4, 0}, 1.415 + {"eng", "en", ENGLISH + W4, 0}, 1.416 + {"eo", "eo", ESPERANTO + W10, 0}, 1.417 + {"er", "ur", URDU + W10, 0}, // "Erdu" 1.418 + {"es", "es", SPANISH + W10, 0}, 1.419 + {"esp", "es", SPANISH + W10, 0}, 1.420 + {"est", "et", ESTONIAN + W10, 0}, 1.421 + {"et", "et", ESTONIAN + W10, 0}, 1.422 + {"eu", "eu", BASQUE + W10, 0}, 1.423 + 1.424 + {"fa", "fa", PERSIAN + W10, 0}, 1.425 + {"far", "fa", PERSIAN + W10, 0}, 1.426 + {"fi", "fi", FINNISH + W10, 0}, 1.427 + {"fil", "tl", TAGALOG + W10, 0}, // Philippines 1.428 + {"fj", "fj", FIJIAN + W10, 0}, 1.429 + {"fo", "fo", FAROESE + W10, ICELANDIC - W4}, 1.430 + {"fr", "fr", FRENCH + W10, 0}, 1.431 + {"fra", "fr", FRENCH + W10, 0}, 1.432 + {"fre", "fr", FRENCH + W10, 0}, 1.433 + {"fy", "fy", FRISIAN + W10, 0}, 1.434 + 1.435 + {"ga", "ga,gl", IRISH + W10, GALICIAN + W10}, // 1:2 Irish, Galician 1.436 + {"gae", "gd,ga", SCOTS_GAELIC + W10, IRISH + W10}, // 1:2 Gaelic, either 1.437 + {"gal", 
"gl", GALICIAN + W10, 0}, 1.438 + {"gb", "zh", CHINESE + W10, 0}, // GB2312 encoding 1.439 + {"gbk", "zh", CHINESE + W10, 0}, // GBK encoding 1.440 + {"gd", "gd", SCOTS_GAELIC + W10, 0}, 1.441 + {"ge", "ka", GEORGIAN + W10, 0}, // Georgia 1.442 + {"geo", "ka", GEORGIAN + W10, 0}, 1.443 + {"ger", "de", GERMAN + W10, 0}, 1.444 + {"gl", "gl", GALICIAN + W10, 0}, // Also Greenland; hard to confuse 1.445 + {"gn", "gn", GUARANI + W10, 0}, 1.446 + {"gr", "el", GREEK + W10, 0}, // Greece 1.447 + {"gu", "gu", GUJARATI + W10, 0}, 1.448 + {"gv", "gv", MANX + W10, 0}, 1.449 + 1.450 + {"ha", "ha", HAUSA + W10, 0}, 1.451 + {"hat", "ht", HAITIAN_CREOLE + W10, 0}, // Haiti 1.452 + {"haw", "haw", HAWAIIAN + W10, 0}, 1.453 + {"hb", "iw", HEBREW + W10, 0}, 1.454 + {"he", "iw", HEBREW + W10, 0}, 1.455 + {"heb", "iw", HEBREW + W10, 0}, 1.456 + {"hi", "hi", HINDI + W10, MARATHI - W4}, 1.457 + {"hk", "zhT", CHINESE_T + W10, 0}, // Hong Kong 1.458 + {"hr", "hr", CROATIAN + W10, 0}, 1.459 + {"ht", "ht", HAITIAN_CREOLE + W10, 0}, 1.460 + {"hu", "hu", HUNGARIAN + W10, 0}, 1.461 + {"hun", "hu", HUNGARIAN + W10, 0}, 1.462 + {"hy", "hy", ARMENIAN + W10, 0}, 1.463 + 1.464 + {"ia", "ia", INTERLINGUA + W10, 0}, 1.465 + {"ice", "is", ICELANDIC + W10, FAROESE - W4}, // Iceland 1.466 + {"id", "id", INDONESIAN + W10, MALAY - W4}, 1.467 + {"ids", "id", INDONESIAN + W10, MALAY - W4}, 1.468 + {"ie", "ie", INTERLINGUE + W10, 0}, 1.469 + {"ig", "ig", IGBO + W10, 0}, 1.470 + // 1:2 iu-Cans ik-Latn 1.471 + {"ik", "ik,iu", INUPIAK + W10, INUKTITUT + W10}, // 1:2 1.472 + {"in", "id", INDONESIAN + W10, MALAY - W4}, 1.473 + {"ind", "id", INDONESIAN + W10, MALAY - W4}, // Indonesia 1.474 + {"inu", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2 1.475 + {"is", "is", ICELANDIC + W10, FAROESE - W4}, 1.476 + {"it", "it", ITALIAN + W10, 0}, 1.477 + {"ita", "it", ITALIAN + W10, 0}, 1.478 + {"iu", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2 1.479 + {"iw", "iw", HEBREW + W10, 0}, 1.480 + 1.481 + {"ja", "ja", 
JAPANESE + W10, 0}, 1.482 + {"jp", "ja", JAPANESE + W10, 0}, // Japan 1.483 + {"jpn", "ja", JAPANESE + W10, 0}, 1.484 + {"jv", "jw", JAVANESE + W10, 0}, 1.485 + {"jw", "jw", JAVANESE + W10, 0}, 1.486 + 1.487 + {"ka", "ka", GEORGIAN + W10, 0}, 1.488 + {"kc", "qu", QUECHUA + W10, 0}, // (K)Quechua 1.489 + {"kg", "ky", KYRGYZ + W10, 0}, // Kyrgyzstan 1.490 + {"kh", "km", KHMER + W10, 0}, // Country code Khmer (Cambodia) 1.491 + {"kha", "kha", KHASI + W10, 0}, 1.492 + {"kk", "kk", KAZAKH + W10, 0}, // Kazakh 1.493 + {"kl", "kl", GREENLANDIC + W10, 0}, 1.494 + {"km", "km", KHMER + W10, 0}, 1.495 + {"kn", "kn", KANNADA + W10, 0}, 1.496 + {"ko", "ko", KOREAN + W10, 0}, 1.497 + {"kor", "ko", KOREAN + W10, 0}, 1.498 + {"kr", "ko", KOREAN + W10, 0}, // Country code Korea 1.499 + {"ks", "ks", KASHMIRI + W10, 0}, 1.500 + {"ksc", "ko", KOREAN + W10, 0}, // KSC encoding 1.501 + {"ku", "ku", KURDISH + W10, 0}, 1.502 + {"ky", "ky", KYRGYZ + W10, 0}, 1.503 + {"kz", "kk", KAZAKH + W10, 0}, // Kazakhstan 1.504 + {"la", "la", LATIN + W10, 0}, 1.505 + {"lao", "lo", LAOTHIAN + W10, 0}, // Laos 1.506 + 1.507 + {"lb", "lb", LUXEMBOURGISH + W10, 0}, 1.508 + {"lg", "lg", GANDA + W10, 0}, 1.509 + {"lit", "lt", LITHUANIAN + W10, 0}, 1.510 + {"ln", "ln", LINGALA + W10, 0}, 1.511 + {"lo", "lo", LAOTHIAN + W10, 0}, 1.512 + {"lt", "lt", LITHUANIAN + W10, 0}, 1.513 + {"ltu", "lt", LITHUANIAN + W10, 0}, 1.514 + {"lv", "lv", LATVIAN + W10, 0}, 1.515 + 1.516 + {"mfe", "mfe", MAURITIAN_CREOLE + W10, 0}, 1.517 + {"mg", "mg", MALAGASY + W10, 0}, 1.518 + {"mi", "mi", MAORI + W10, 0}, 1.519 + {"mk", "mk", MACEDONIAN + W10, 0}, 1.520 + {"ml", "ml", MALAYALAM + W10, 0}, 1.521 + {"mn", "mn", MONGOLIAN + W10, 0}, 1.522 + {"mo", "mo", ROMANIAN + W10, 0}, 1.523 + {"mon", "mn", MONGOLIAN + W10, 0}, // Mongolian 1.524 + {"mr", "mr", MARATHI + W10, HINDI - W4}, 1.525 + {"ms", "ms", MALAY + W10, INDONESIAN - W4}, 1.526 + {"mt", "mt", MALTESE + W10, 0}, 1.527 + {"mx", "es", SPANISH + W10, 0}, // Mexico 1.528 + 
{"my", "my,ms", BURMESE + W10, MALAY + W10}, // Myanmar, Malaysia 1.529 + 1.530 + {"na", "na", NAURU + W10, 0}, 1.531 + {"nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, 1.532 + {"ne", "ne", NEPALI + W10, 0}, 1.533 + {"nl", "nl", DUTCH + W10, 0}, 1.534 + {"nn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, 1.535 + {"no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, 1.536 + {"nr", "nr", NDEBELE + W10, 0}, 1.537 + {"nso", "nso", PEDI + W10, 0}, 1.538 + {"ny", "ny", NYANJA + W10, 0}, 1.539 + 1.540 + {"oc", "oc", OCCITAN + W10, 0}, 1.541 + {"om", "om", OROMO + W10, 0}, 1.542 + {"or", "or", ORIYA + W10, 0}, 1.543 + 1.544 + {"pa", "pa,ps", PUNJABI + W10, PASHTO + W10}, // 1:2 pa-Guru ps-Arab 1.545 + {"per", "fa", PERSIAN + W10, 0}, 1.546 + {"ph", "tl", TAGALOG + W10, 0}, // Philippines 1.547 + {"pk", "ur", URDU + W10, 0}, // Pakistan 1.548 + {"pl", "pl", POLISH + W10, 0}, 1.549 + {"pnb", "pa", PUNJABI + W10, 0}, // Western Punjabi 1.550 + {"pol", "pl", POLISH + W10, 0}, 1.551 + {"por", "pt", PORTUGUESE + W10, 0}, 1.552 + {"ps", "ps", PASHTO + W10, 0}, 1.553 + {"pt", "pt", PORTUGUESE + W10, 0}, 1.554 + {"ptg", "pt", PORTUGUESE + W10, 0}, 1.555 + {"qc", "fr", FRENCH + W10, 0}, // Quebec "country" code 1.556 + {"qu", "qu", QUECHUA + W10, 0}, 1.557 + 1.558 + {"rm", "rm", RHAETO_ROMANCE + W10, 0}, 1.559 + {"rn", "rn", RUNDI + W10, 0}, 1.560 + {"ro", "ro", ROMANIAN + W10, 0}, 1.561 + {"rs", "sr", SERBIAN + W10, 0}, // Serbia country code 1.562 + {"ru", "ru", RUSSIAN + W10, 0}, 1.563 + {"rus", "ru", RUSSIAN + W10, 0}, 1.564 + {"rw", "rw", KINYARWANDA + W10, 0}, 1.565 + 1.566 + {"sa", "sa", SANSKRIT + W10, 0}, 1.567 + {"sco", "sco", SCOTS + W10, ENGLISH - W4}, 1.568 + {"sd", "sd", SINDHI + W10, 0}, 1.569 + {"se", "sv", SWEDISH + W10, 0}, 1.570 + {"sg", "sg", SANGO + W10, 0}, 1.571 + {"si", "si,sl", SINHALESE + W10, SLOVENIAN + W10}, // 1:2 Sinhalese, Slovenia 1.572 + {"sk", "sk", SLOVAK + W10, CZECH - W4}, 1.573 + {"sl", "sl", SLOVENIAN + W10, 0}, 1.574 + {"slo", "sl", SLOVENIAN + W10,
0}, 1.575 + {"sm", "sm", SAMOAN + W10, 0}, 1.576 + {"sn", "sn", SHONA + W10, 0}, 1.577 + {"so", "so", SOMALI + W10, 0}, 1.578 + {"sp", "es", SPANISH + W10, 0}, 1.579 + {"sq", "sq", ALBANIAN + W10, 0}, 1.580 + {"sr", "sr", SERBIAN + W10, 0}, 1.581 + {"srb", "sr", SERBIAN + W10, 0}, 1.582 + {"srl", "sr", SERBIAN + W10, 0}, // Serbian Latin 1.583 + {"srp", "sr", SERBIAN + W10, 0}, 1.584 + {"ss", "ss", SISWANT + W10, 0}, 1.585 + {"st", "st", SESOTHO + W10, 0}, 1.586 + {"su", "su", SUNDANESE + W10, 0}, 1.587 + {"sv", "sv", SWEDISH + W10, 0}, 1.588 + {"sve", "sv", SWEDISH + W10, 0}, 1.589 + {"sw", "sw", SWAHILI + W10, 0}, 1.590 + {"swe", "sv", SWEDISH + W10, 0}, 1.591 + {"sy", "syr", SYRIAC + W10, 0}, 1.592 + {"syr", "syr", SYRIAC + W10, 0}, 1.593 + 1.594 + {"ta", "ta", TAMIL + W10, 0}, 1.595 + {"te", "te", TELUGU + W10, 0}, 1.596 + {"tg", "tg", TAJIK + W10, 0}, 1.597 + {"th", "th", THAI + W10, 0}, 1.598 + {"ti", "ti,bo", TIGRINYA + W10, TIBETAN + W10}, // 1:2 Tigrinya, Tibet 1.599 + {"tj", "tg", TAJIK + W10, 0}, // Tajikistan 1.600 + {"tk", "tk", TURKMEN + W10, 0}, 1.601 + {"tl", "tl", TAGALOG + W10, 0}, 1.602 + {"tlh", "tlh", X_KLINGON + W10, 0}, 1.603 + {"tn", "tn", TSWANA + W10, 0}, 1.604 + {"to", "to", TONGA + W10, 0}, 1.605 + {"tr", "tr", TURKISH + W10, 0}, 1.606 + {"ts", "ts", TSONGA + W10, 0}, 1.607 + {"tt", "tt", TATAR + W10, 0}, 1.608 + {"tw", "ak,zhT", AKAN + W10, CHINESE_T + W10}, // 1:2 Twi => Akan, Taiwan 1.609 + {"twi", "ak", AKAN + W10, 0}, // Twi => Akan 1.610 + 1.611 + {"ua", "uk", UKRAINIAN + W10, 0}, // Ukraine 1.612 + {"ug", "ug", UIGHUR + W10, 0}, 1.613 + {"uk", "uk", UKRAINIAN + W10, 0}, 1.614 + {"ur", "ur", URDU + W10, 0}, 1.615 + {"uz", "uz", UZBEK + W10, 0}, 1.616 + 1.617 + {"va", "ca", CATALAN + W10, 0}, // Valencia => Catalan 1.618 + {"val", "ca", CATALAN + W10, 0}, // Valencia => Catalan 1.619 + {"ve", "ve", VENDA + W10, 0}, 1.620 + {"vi", "vi", VIETNAMESE + W10, 0}, 1.621 + {"vie", "vi", VIETNAMESE + W10, 0}, 1.622 + {"vn", "vi", VIETNAMESE 
+ W10, 0}, 1.623 + {"vo", "vo", VOLAPUK + W10, 0}, 1.624 + 1.625 + {"wo", "wo", WOLOF + W10, 0}, 1.626 + 1.627 + {"xh", "xh", XHOSA + W10, ZULU - W4}, 1.628 + {"xho", "xh", XHOSA + W10, ZULU - W4}, 1.629 + 1.630 + {"yi", "yi", YIDDISH + W10, 0}, 1.631 + {"yo", "yo", YORUBA + W10, 0}, 1.632 + 1.633 + {"za", "za", ZHUANG + W10, 0}, 1.634 + {"zh", "zh", CHINESE + W10, 0}, 1.635 + {"zht", "zhT", CHINESE_T + W10, 0}, 1.636 + {"zu", "zu", ZULU + W10, XHOSA - W4}, 1.637 +}; 1.638 + 1.639 + 1.640 +// Possibly map to tl: 1.641 +// -LangTags tl-Latn /7val.com/ ,bcl 2 Central Bicolano 1.642 +// -LangTags tl-Latn /7val.com/ ,ceb 6 Cebuano 1.643 +// -LangTags tl-Latn /7val.com/ ,war 1 Waray 1.644 + 1.645 + 1.646 + 1.647 +// Table to look up country TLD (no general TLD) 1.648 +// In alphabetical order for binary search 1.649 +static const int kCLDTable3Size = 181; 1.650 +static const TLDLookup kCLDTLDHintTable[kCLDTable3Size] = { 1.651 + {"ac", JAPANESE + W2, 0}, 1.652 + {"ad", CATALAN + W4, 0}, 1.653 + {"ae", ARABIC + W4, 0}, 1.654 + {"af", PASHTO + W4, PERSIAN + W4}, 1.655 + {"ag", GERMAN + W2, 0}, // meager 1.656 + // {"ai", 0, 0}, // meager 1.657 + {"al", ALBANIAN + W4, 0}, 1.658 + {"am", ARMENIAN + W4, 0}, 1.659 + {"an", DUTCH + W4, 0}, // meager 1.660 + {"ao", PORTUGUESE + W4, 0}, 1.661 + // {"aq", 0, 0}, // meager 1.662 + {"ar", SPANISH + W4, 0}, 1.663 + // {"as", 0, 0}, 1.664 + {"at", GERMAN + W4, 0}, 1.665 + {"au", ENGLISH + W2, 0}, 1.666 + {"aw", DUTCH + W4, 0}, 1.667 + {"ax", SWEDISH + W4, 0}, 1.668 + {"az", AZERBAIJANI + W4, 0}, 1.669 + 1.670 + {"ba", BOSNIAN + W8, CROATIAN - W4}, 1.671 + // {"bb", 0, 0}, 1.672 + {"bd", BENGALI + W4, 0}, 1.673 + {"be", DUTCH + W4, FRENCH + W4}, 1.674 + {"bf", FRENCH + W4, 0}, 1.675 + {"bg", BULGARIAN + W4, 0}, 1.676 + {"bh", ARABIC + W4, 0}, 1.677 + {"bi", RUNDI + W4, FRENCH + W4}, 1.678 + {"bj", FRENCH + W4, 0}, 1.679 + {"bm", ENGLISH + W2, 0}, 1.680 + {"bn", MALAY + W4, INDONESIAN - W4}, 1.681 + {"bo", SPANISH + W4, AYMARA + W2}, 
// and GUARANI QUECHUA 1.682 + {"br", PORTUGUESE + W4, 0}, 1.683 + // {"bs", 0, 0}, 1.684 + {"bt", DZONGKHA + W10, TIBETAN - W10}, // Strong presumption of Dzongha 1.685 + {"bw", TSWANA + W4, 0}, 1.686 + {"by", BELARUSIAN + W4, 0}, 1.687 + // {"bz", 0, 0}, 1.688 + 1.689 + {"ca", FRENCH + W4, ENGLISH + W2}, 1.690 + {"cat", CATALAN + W4, 0}, 1.691 + {"cc", 0, 0}, 1.692 + {"cd", FRENCH + W4, 0}, 1.693 + {"cf", FRENCH + W4, 0}, 1.694 + {"cg", FRENCH + W4, 0}, 1.695 + {"ch", GERMAN + W4, FRENCH + W4}, 1.696 + {"ci", FRENCH + W4, 0}, 1.697 + // {"ck", 0, 0}, 1.698 + {"cl", SPANISH + W4, 0}, 1.699 + {"cm", FRENCH + W4, 0}, 1.700 + {"cn", CHINESE + W4, 0}, 1.701 + {"co", SPANISH + W4, 0}, 1.702 + {"cr", SPANISH + W4, 0}, 1.703 + {"cu", SPANISH + W4, 0}, 1.704 + {"cv", PORTUGUESE + W4, 0}, 1.705 + // {"cx", 0, 0}, 1.706 + {"cy", GREEK + W4, TURKISH + W4}, 1.707 + {"cz", CZECH + W4, SLOVAK - W4}, 1.708 + 1.709 + {"de", GERMAN + W4, 0}, 1.710 + {"dj", 0, 0}, 1.711 + {"dk", DANISH + W4, NORWEGIAN - W4}, 1.712 + {"dm", 0, 0}, 1.713 + {"do", SPANISH + W4, 0}, 1.714 + {"dz", FRENCH + W4, ARABIC + W4}, 1.715 + 1.716 + {"ec", SPANISH + W4, 0}, 1.717 + {"ee", ESTONIAN + W4, 0}, 1.718 + {"eg", ARABIC + W4, 0}, 1.719 + {"er", AFAR + W4, 0}, 1.720 + {"es", SPANISH + W4, 0}, 1.721 + {"et", AMHARIC + W4, AFAR + W4}, 1.722 + 1.723 + {"fi", FINNISH + W4, 0}, 1.724 + {"fj", FIJIAN + W4, 0}, 1.725 + // {"fk", 0, 0}, 1.726 + // {"fm", 0, 0}, 1.727 + {"fo", FAROESE + W4, ICELANDIC - W4}, 1.728 + {"fr", FRENCH + W4, 0}, 1.729 + 1.730 + {"ga", FRENCH + W4, 0}, 1.731 + {"gd", 0, 0}, 1.732 + {"ge", GEORGIAN + W4, 0}, 1.733 + {"gf", FRENCH + W4, 0}, 1.734 + // {"gg", 0, 0}, 1.735 + // {"gh", 0, 0}, 1.736 + // {"gi", 0, 0}, 1.737 + {"gl", GREENLANDIC + W4, DANISH + W4}, 1.738 + // {"gm", 0, 0}, 1.739 + {"gn", FRENCH + W4, 0}, 1.740 + // {"gp", 0, 0}, 1.741 + // {"gq", 0, 0}, 1.742 + {"gr", GREEK + W4, 0}, 1.743 + // {"gs", 0, 0}, 1.744 + {"gt", SPANISH + W4, 0}, 1.745 + // {"gu", 0, 0}, 1.746 + // 
{"gy", 0, 0}, 1.747 + 1.748 + {"hk", CHINESE_T + W4, 0}, 1.749 + // {"hm", 0, 0}, 1.750 + {"hn", SPANISH + W4, 0}, 1.751 + {"hr", CROATIAN + W8, BOSNIAN - W4}, 1.752 + {"ht", HAITIAN_CREOLE + W4, FRENCH + W4}, 1.753 + {"hu", HUNGARIAN + W4, 0}, 1.754 + 1.755 + {"id", INDONESIAN + W4, MALAY - W4}, 1.756 + {"ie", IRISH + W4, 0}, 1.757 + {"il", HEBREW + W4, 0}, 1.758 + {"im", MANX + W4, 0}, 1.759 + // {"in", 0, 0}, 1.760 + // {"io", 0, 0}, 1.761 + {"iq", ARABIC + W4, 0}, 1.762 + {"ir", PERSIAN + W4, 0}, 1.763 + {"is", ICELANDIC + W4, FAROESE - W4}, 1.764 + {"it", ITALIAN + W4, 0}, 1.765 + 1.766 + // {"je", 0, 0}, 1.767 + // {"jm", 0, 0}, 1.768 + {"jo", ARABIC + W4, 0}, 1.769 + {"jp", JAPANESE + W4, 0}, 1.770 + 1.771 + // {"ke", 0, 0}, 1.772 + {"kg", KYRGYZ + W4, 0}, 1.773 + {"kh", KHMER + W4, 0}, 1.774 + // {"ki", 0, 0}, 1.775 + {"km", FRENCH + W4, 0}, 1.776 + // {"kn", 0, 0}, 1.777 + {"kp", KOREAN + W4, 0}, 1.778 + {"kr", KOREAN + W4, 0}, 1.779 + {"kw", ARABIC + W4, 0}, 1.780 + // {"ky", 0, 0}, 1.781 + {"kz", KAZAKH + W4, 0}, 1.782 + 1.783 + {"la", LAOTHIAN + W4, 0}, 1.784 + {"lb", ARABIC + W4, FRENCH + W4}, 1.785 + // {"lc", 0, 0}, 1.786 + {"li", GERMAN + W4, 0}, 1.787 + {"lk", SINHALESE + W4, 0}, 1.788 + // {"lr", 0, 0}, 1.789 + {"ls", SESOTHO + W4, 0}, 1.790 + {"lt", LITHUANIAN + W4, 0}, 1.791 + {"lu", LUXEMBOURGISH + W4}, 1.792 + {"lv", LATVIAN + W4, 0}, 1.793 + {"ly", ARABIC + W4, 0}, 1.794 + 1.795 + {"ma", FRENCH + W4, 0}, 1.796 + {"mc", FRENCH + W4, 0}, 1.797 + {"md", ROMANIAN + W4, 0}, 1.798 + {"me", MONTENEGRIN + W8, SERBIAN - W4}, 1.799 + {"mg", FRENCH + W4, 0}, 1.800 + {"mk", MACEDONIAN + W4, 0}, 1.801 + {"ml", FRENCH + W4, 0}, 1.802 + {"mm", BURMESE + W4, 0}, 1.803 + {"mn", MONGOLIAN + W4, 0}, 1.804 + {"mo", CHINESE_T + W4, PORTUGUESE + W4}, 1.805 + // {"mp", 0, 0}, 1.806 + {"mq", FRENCH + W4, 0}, 1.807 + {"mr", FRENCH + W4, ARABIC + W4}, 1.808 + // {"ms", 0, 0}, 1.809 + {"mt", MALTESE + W4, 0}, 1.810 + // {"mu", 0, 0}, 1.811 + {"mv", DHIVEHI + W4, 0}, 
1.812 + // {"mw", 0, 0}, 1.813 + {"mx", SPANISH + W4, 0}, 1.814 + {"my", MALAY + W4, INDONESIAN - W4}, 1.815 + {"mz", PORTUGUESE + W4, 0}, 1.816 + 1.817 + {"na", 0, 0}, // Namibia 1.818 + {"nc", FRENCH + W4, 0}, 1.819 + {"ne", FRENCH + W4, 0}, 1.820 + {"nf", FRENCH + W4, 0}, 1.821 + // {"ng", 0, 0}, 1.822 + {"ni", SPANISH + W4, 0}, 1.823 + {"nl", DUTCH + W4, 0}, 1.824 + {"no", NORWEGIAN + W4, NORWEGIAN_N + W2}, 1.825 + {"np", NEPALI + W4, 0}, 1.826 + {"nr", NAURU + W4, 0}, 1.827 + {"nu", SWEDISH + W4, 0}, 1.828 + {"nz", MAORI + W4, ENGLISH + W2}, 1.829 + 1.830 + {"om", ARABIC + W4, 0}, 1.831 + 1.832 + {"pa", SPANISH + W4, 0}, 1.833 + {"pe", SPANISH + W4, QUECHUA + W2}, // also AYMARA 1.834 + {"pf", FRENCH + W4, 0}, 1.835 + // {"pg", 0, 0}, 1.836 + {"ph", TAGALOG + W4, 0}, 1.837 + {"pk", URDU + W4, 0}, 1.838 + {"pl", POLISH + W4, 0}, 1.839 + // {"pn", 0, 0}, 1.840 + {"pr", SPANISH + W4, 0}, 1.841 + {"ps", ARABIC + W4, 0}, 1.842 + {"pt", PORTUGUESE + W4, 0}, 1.843 + {"py", SPANISH + W4, GUARANI + W2}, 1.844 + 1.845 + {"qa", ARABIC + W4, 0}, 1.846 + 1.847 + {"re", FRENCH + W4, 0}, 1.848 + {"ro", ROMANIAN + W4, 0}, 1.849 + {"rs", SERBIAN + W8, MONTENEGRIN - W4}, 1.850 + {"ru", RUSSIAN + W4, 0}, 1.851 + {"rw", KINYARWANDA + W4, FRENCH + W2}, 1.852 + 1.853 + {"sa", ARABIC + W4, 0}, 1.854 + // {"sb", 0, 0}, 1.855 + {"sc", SESELWA + W4, 0}, 1.856 + {"sd", ARABIC + W4, 0}, 1.857 + {"se", SWEDISH + W4, 0}, 1.858 + // {"sg", 0, 0}, 1.859 + // {"sh", 0, 0}, 1.860 + {"si", SLOVENIAN + W4, 0}, 1.861 + {"sk", SLOVAK + W4, CZECH - W4}, 1.862 + // {"sl", 0, 0}, 1.863 + {"sm", ITALIAN + W4, 0}, 1.864 + {"sn", FRENCH + W4, 0}, 1.865 + // {"sr", 0, 0}, 1.866 + {"ss", ARABIC + W4, 0}, // Presumed South Sudan TLD. 
dsites 2011.07.07 1.867 + // {"st", 0, 0}, 1.868 + {"su", RUSSIAN + W4, 0}, 1.869 + {"sv", SPANISH + W4, 0}, 1.870 + {"sy", ARABIC + W4, 0}, 1.871 + // {"sz", 0, 0}, 1.872 + 1.873 + // {"tc", 0, 0}, 1.874 + {"td", FRENCH + W4, 0}, 1.875 + // {"tf", 0, 0}, 1.876 + {"tg", FRENCH + W4, 0}, 1.877 + {"th", THAI + W4, 0}, 1.878 + // Tibet has no country code (see .cn) 1.879 + {"tj", TAJIK + W4, 0}, 1.880 + // {"tk", 0, 0}, 1.881 + // {"tl", 0, 0}, 1.882 + {"tm", TURKISH + W4, 0}, 1.883 + {"tn", FRENCH + W4, ARABIC + W4}, 1.884 + // {"to", 0, 0}, 1.885 + {"tp", JAPANESE + W4, 0}, 1.886 + {"tr", TURKISH + W4, 0}, 1.887 + // {"tt", 0, 0}, 1.888 + // {"tv", 0, 0}, 1.889 + {"tw", CHINESE_T + W4, 0}, 1.890 + {"tz", SWAHILI + W4, AKAN + W4}, 1.891 + 1.892 + {"ua", UKRAINIAN + W4, 0}, 1.893 + {"ug", GANDA + W4, 0}, 1.894 + {"uk", ENGLISH + W2, 0}, 1.895 + {"us", ENGLISH + W2, 0}, 1.896 + {"uy", SPANISH + W4, 0}, 1.897 + {"uz", UZBEK + W4, 0}, 1.898 + 1.899 + {"va", ITALIAN + W4, LATIN + W2}, 1.900 + // {"vc", 0, 0}, 1.901 + {"ve", SPANISH + W4, 0}, 1.902 + // {"vg", 0, 0}, 1.903 + // {"vi", 0, 0}, 1.904 + {"vn", VIETNAMESE + W4, 0}, 1.905 + // {"vu", 0, 0}, 1.906 + 1.907 + {"wf", FRENCH + W4, 0}, 1.908 + // {"ws", 0, 0}, 1.909 + 1.910 + {"ye", ARABIC + W4, 0}, 1.911 + 1.912 + {"za", AFRIKAANS + W4, 0}, 1.913 + // {"zm", 0, 0}, 1.914 + // {"zw", 0, 0}, 1.915 +}; 1.916 + 1.917 +#undef W2 1.918 +#undef W4 1.919 +#undef W6 1.920 +#undef W8 1.921 +#undef W10 1.922 +#undef W12 1.923 + 1.924 + 1.925 + 1.926 + 1.927 + 1.928 +inline void SetCLDPriorWeight(int w, OneCLDLangPrior* olp) { 1.929 + *olp = (*olp & 0x3ff) + (w << 10); 1.930 +} 1.931 +inline void SetCLDPriorLang(Language lang, OneCLDLangPrior* olp) { 1.932 + *olp = (*olp & ~0x3ff) + lang; 1.933 +} 1.934 + 1.935 +OneCLDLangPrior PackCLDPriorLangWeight(Language lang, int w) { 1.936 + return (w << 10) + lang; 1.937 +} 1.938 + 1.939 +inline int MaxInt(int a, int b) { 1.940 + return (a >= b) ? 
a : b; 1.941 +} 1.942 + 1.943 +// Merge in another language prior, taking max if already there 1.944 +void MergeCLDLangPriorsMax(OneCLDLangPrior olp, CLDLangPriors* lps) { 1.945 + if (olp == 0) {return;} 1.946 + Language target_lang = GetCLDPriorLang(olp); 1.947 + for (int i = 0; i < lps->n; ++i) { 1.948 + if (GetCLDPriorLang(lps->prior[i]) == target_lang) { 1.949 + int new_weight = MaxInt(GetCLDPriorWeight(lps->prior[i]), 1.950 + GetCLDPriorWeight(olp)); 1.951 + SetCLDPriorWeight(new_weight, &lps->prior[i]); 1.952 + return; 1.953 + } 1.954 + } 1.955 + // Not found; add it if room 1.956 + if (lps->n >= kMaxOneCLDLangPrior) {return;} 1.957 + lps->prior[lps->n++] = olp; 1.958 +} 1.959 + 1.960 +// Merge in another language prior, boosting 10x if already there 1.961 +void MergeCLDLangPriorsBoost(OneCLDLangPrior olp, CLDLangPriors* lps) { 1.962 + if (olp == 0) {return;} 1.963 + Language target_lang = GetCLDPriorLang(olp); 1.964 + for (int i = 0; i < lps->n; ++i) { 1.965 + if (GetCLDPriorLang(lps->prior[i]) == target_lang) { 1.966 + int new_weight = GetCLDPriorWeight(lps->prior[i]) + 2; 1.967 + SetCLDPriorWeight(new_weight, &lps->prior[i]); 1.968 + return; 1.969 + } 1.970 + } 1.971 + // Not found; add it if room 1.972 + if (lps->n >= kMaxOneCLDLangPrior) {return;} 1.973 + lps->prior[lps->n++] = olp; 1.974 +} 1.975 + 1.976 + 1.977 +// Trim language priors to no more than max_entries, keeping largest abs weights 1.978 +void TrimCLDLangPriors(int max_entries, CLDLangPriors* lps) { 1.979 + if (lps->n <= max_entries) {return;} 1.980 + 1.981 + // Insertion sort in-place by abs(weight) 1.982 + for (int i = 0; i < lps->n; ++i) { 1.983 + OneCLDLangPrior temp_olp = lps->prior[i]; 1.984 + int w = abs(GetCLDPriorWeight(temp_olp)); 1.985 + int kk = i; 1.986 + for (; kk > 0; --kk) { 1.987 + if (abs(GetCLDPriorWeight(lps->prior[kk - 1])) < w) { 1.988 + // Move down and continue 1.989 + lps->prior[kk] = lps->prior[kk - 1]; 1.990 + } else { 1.991 + // abs(weight[kk - 1]) >= w, time to 
stop 1.992 + break; 1.993 + } 1.994 + } 1.995 + lps->prior[kk] = temp_olp; 1.996 + } 1.997 + 1.998 + lps->n = max_entries; 1.999 +} 1.1000 + 1.1001 +int CountCommas(const string& langtags) { 1.1002 + int commas = 0; 1.1003 + for (int i = 0; i < static_cast<int>(langtags.size()); ++i) { 1.1004 + if (langtags[i] == ',') {++commas;} 1.1005 + } 1.1006 + return commas; 1.1007 +} 1.1008 + 1.1009 +// Binary lookup on language tag 1.1010 +const LangTagLookup* DoLangTagLookup(const char* key, 1.1011 + const LangTagLookup* tbl, int tbl_size) { 1.1012 + // Key is always in range [lo..hi) 1.1013 + int lo = 0; 1.1014 + int hi = tbl_size; 1.1015 + while (lo < hi) { 1.1016 + int mid = (lo + hi) >> 1; 1.1017 + int comp = strcmp(tbl[mid].langtag, key); 1.1018 + if (comp < 0) { 1.1019 + lo = mid + 1; 1.1020 + } else if (comp > 0) { 1.1021 + hi = mid; 1.1022 + } else { 1.1023 + return &tbl[mid]; 1.1024 + } 1.1025 + } 1.1026 + return NULL; 1.1027 +} 1.1028 + 1.1029 +// Binary lookup on tld 1.1030 +const TLDLookup* DoTLDLookup(const char* key, 1.1031 + const TLDLookup* tbl, int tbl_size) { 1.1032 + // Key is always in range [lo..hi) 1.1033 + int lo = 0; 1.1034 + int hi = tbl_size; 1.1035 + while (lo < hi) { 1.1036 + int mid = (lo + hi) >> 1; 1.1037 + int comp = strcmp(tbl[mid].tld, key); 1.1038 + if (comp < 0) { 1.1039 + lo = mid + 1; 1.1040 + } else if (comp > 0) { 1.1041 + hi = mid; 1.1042 + } else { 1.1043 + return &tbl[mid]; 1.1044 + } 1.1045 + } 1.1046 + return NULL; 1.1047 +} 1.1048 + 1.1049 + 1.1050 + 1.1051 +// Trim language tag string to canonical form for each language 1.1052 +// Input is from GetLangTagsFromHtml(), already lowercased 1.1053 +string TrimCLDLangTagsHint(const string& langtags) { 1.1054 + string retval; 1.1055 + if (langtags.empty()) {return retval;} 1.1056 + int commas = CountCommas(langtags); 1.1057 + if (commas > 4) {return retval;} // Ignore if too many language tags 1.1058 + 1.1059 + char temp[20]; 1.1060 + int pos = 0; 1.1061 + while (pos < 
static_cast<int>(langtags.size())) { 1.1062 + int comma = langtags.find(',', pos); 1.1063 + if (comma == string::npos) {comma = langtags.size();} // fake trailing comma 1.1064 + int len = comma - pos; 1.1065 + if (len <= 16) { 1.1066 + // Short enough to use 1.1067 + memcpy(temp, &langtags[pos], len); 1.1068 + temp[len] = '\0'; 1.1069 + const LangTagLookup* entry = DoLangTagLookup(temp, 1.1070 + kCLDLangTagsHintTable1, 1.1071 + kCLDTable1Size); 1.1072 + if (entry != NULL) { 1.1073 + // First table hit 1.1074 + retval.append(entry->langcode); // may be "code1,code2" 1.1075 + retval.append(1, ','); 1.1076 + } else { 1.1077 + // Try second table with language code truncated at first hyphen 1.1078 + char* hyphen = strchr(temp, '-'); 1.1079 + if (hyphen != NULL) {*hyphen = '\0';} 1.1080 + len = strlen(temp); 1.1081 + if (len <= 3) { // Short enough to use 1.1082 + entry = DoLangTagLookup(temp, 1.1083 + kCLDLangTagsHintTable2, 1.1084 + kCLDTable2Size); 1.1085 + if (entry != NULL) { 1.1086 + // Second table hit 1.1087 + retval.append(entry->langcode); // may be "code1,code2" 1.1088 + retval.append(1, ','); 1.1089 + } 1.1090 + } 1.1091 + } 1.1092 + } 1.1093 + pos = comma + 1; 1.1094 + } 1.1095 + 1.1096 + // Remove trainling comma, if any 1.1097 + if (!retval.empty()) {retval.resize(retval.size() - 1);} 1.1098 + return retval; 1.1099 +} 1.1100 + 1.1101 + 1.1102 + 1.1103 +//============================================================================== 1.1104 + 1.1105 +// Little state machine to scan insides of language attribute quoted-string. 1.1106 +// Each language code is lowercased and copied to the output string. Underscore 1.1107 +// is mapped to minus. Space, tab, and comma are all mapped to comma, and 1.1108 +// multiple consecutive commas are removed. 1.1109 +// Each language code in the output list will be followed by a single comma. 1.1110 + 1.1111 +// There are three states, and we start in state 1: 1.1112 +// State 0: After a letter. 
//  Copy all letters/minus[0], copy comma[1]; all others copy comma and skip [2]
// State 1: Just after a comma.
//  Copy letter [0], Ignore subsequent commas[1]. minus and all others skip [2]
//  NOTE(review): the action macros below (MINUS, Bad) use COPY2, not SKIP2, for
//  state 1, so a minus or invalid byte seen right after a comma appends one
//  more comma before skipping. Confirm which of the two is intended.
// State 2: Skipping.
//  All characters except comma skip and stay in [2]. comma goes to [1]

// The thing that is copied is kLangCodeRemap[c] when going to state 0,
// and always comma when going to state 1 or 2. The design depends on copying
// a comma at the *beginning* of skipping, and in state 2 never doing a copy.

// We pack all this into 8 bits:
//    +--+---+---+
//    |78|654|321|
//    +--+---+---+
//
// Shift byte right by 3*state, giving [0] 321, [1] 654, [2] .78
// where . is always zero
// Of these 3 bits, low two are next state ss, high bit is copy bit C.
// If C=0, nothing is copied.
// If C=1 and ss == 0, copy kLangCodeRemap[c], else copy a comma

// Per-state action codes: value = (copy bit << 2) | next state.
// SKIPn: copy nothing, go to state n.  COPYn: copy one byte, go to state n.
// SKIP0 is defined for completeness; the combined actions below do not use it.
#define SKIP0 0
#define SKIP1 1
#define SKIP2 2
#define COPY0 4       // copy kLangCodeRemap[c]
#define COPY1 5       // copy ','
#define COPY2 6       // copy ','

// These combined actions pack three states into one byte.
// Ninth bit must be zero, so all state 2 values must be skips.
1.1142 +// state[2] state[1] state[0] 1.1143 +#define LTR ((SKIP2 << 6) + (COPY0 << 3) + COPY0) 1.1144 +#define MINUS ((SKIP2 << 6) + (COPY2 << 3) + COPY0) 1.1145 +#define COMMA ((SKIP1 << 6) + (SKIP1 << 3) + COPY1) 1.1146 +#define Bad ((SKIP2 << 6) + (COPY2 << 3) + COPY2) 1.1147 + 1.1148 +// Treat as letter: a-z, A-Z 1.1149 +// Treat as minus: 2D minus, 5F underscore 1.1150 +// Treat as comma: 09 tab, 20 space, 2C comma 1.1151 + 1.1152 +static const unsigned char kLangCodeAction[256] = { 1.1153 + Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,COMMA,Bad,Bad,Bad,Bad,Bad,Bad, 1.1154 + Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, 1.1155 + COMMA,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,COMMA,MINUS,Bad,Bad, 1.1156 + Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, 1.1157 + 1.1158 + Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR, 1.1159 + LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,Bad,Bad,Bad,Bad,MINUS, 1.1160 + Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR, 1.1161 + LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,Bad,Bad,Bad,Bad,Bad, 1.1162 + 1.1163 + Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, 1.1164 + Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, 1.1165 + Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, 1.1166 + Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, 1.1167 + 1.1168 + Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, 1.1169 + Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, 1.1170 + Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, 1.1171 + Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, 1.1172 +}; 1.1173 + 1.1174 +// This does lowercasing, maps underscore to minus, and maps tab/space to comma 1.1175 +static const unsigned char kLangCodeRemap[256] = { 1.1176 + 0,0,0,0,0,0,0,0, 0,',',0,0,0,0,0,0, // 09 tab 1.1177 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 
1.1178 + ',',0,0,0,0,0,0,0, 0,0,0,0,',','-',0,0, // 20 space 2C comma 2D minus 1.1179 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.1180 + 1.1181 + 0,'a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o', 1.1182 + 'p','q','r','s','t','u','v','w', 'x','y','z',0,0,0,0,'-', // 5F underscore 1.1183 + 0,'a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o', 1.1184 + 'p','q','r','s','t','u','v','w', 'x','y','z',0,0,0,0,0, 1.1185 + 1.1186 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.1187 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.1188 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.1189 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.1190 + 1.1191 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.1192 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.1193 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.1194 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1.1195 +}; 1.1196 + 1.1197 +#undef LTR 1.1198 +#undef MINUS 1.1199 +#undef COMMA 1.1200 +#undef Bad 1.1201 + 1.1202 +#undef SKIP0 1.1203 +#undef SKIP1 1.1204 +#undef SKIP2 1.1205 +#undef COPY0 1.1206 +#undef COPY1 1.1207 +#undef COPY2 1.1208 + 1.1209 + 1.1210 +// Find opening '<' for HTML tag 1.1211 +// Note: this is all somewhat insensitive to mismatched quotes 1.1212 +int32 FindTagStart(const char* utf8_body, int32 pos, int32 max_pos) { 1.1213 + int i = pos; 1.1214 + // Advance i by 4 if none of the next 4 bytes are '<' 1.1215 + for (i = pos; i < (max_pos - 3); i += 4) { 1.1216 + // Fast check for any < 1.1217 + const char* p = &utf8_body[i]; 1.1218 + uint32 s0123 = UNALIGNED_LOAD32(p); 1.1219 + uint32 temp = s0123 ^ 0x3c3c3c3c; // <<<< 1.1220 + if (((temp - 0x01010101) & (~temp & 0x80808080)) != 0) { 1.1221 + // At least one byte is '<' 1.1222 + break; 1.1223 + } 1.1224 + } 1.1225 + // Continue, advancing i by 1 1.1226 + for (; i < max_pos; ++i) { 1.1227 + if (utf8_body[i] == '<') {return i;} 1.1228 + } 1.1229 + return -1; 1.1230 +} 1.1231 + 1.1232 + 1.1233 +// Find closing '>' for HTML tag. 
Also stop on < and & (simplistic parsing) 1.1234 +int32 FindTagEnd(const char* utf8_body, int32 pos, int32 max_pos) { 1.1235 + // Always outside quotes 1.1236 + for (int i = pos; i < max_pos; ++i) { 1.1237 + char c = utf8_body[i]; 1.1238 + if (c == '>') {return i;} 1.1239 + if (c == '<') {return i - 1;} 1.1240 + if (c == '&') {return i - 1;} 1.1241 + } 1.1242 + return -1; // nothing found 1.1243 +} 1.1244 + 1.1245 +// Find opening quote or apostrophe, skipping spaces 1.1246 +// Note: this is all somewhat insensitive to mismatched quotes 1.1247 +int32 FindQuoteStart(const char* utf8_body, int32 pos, int32 max_pos) { 1.1248 + for (int i = pos; i < max_pos; ++i) { 1.1249 + char c = utf8_body[i]; 1.1250 + if (c == '"') {return i;} 1.1251 + if (c == '\'') {return i;} 1.1252 + if (c != ' ') {return -1;} 1.1253 + } 1.1254 + return -1; 1.1255 +} 1.1256 + 1.1257 +// Find closing quot/apos. Also stop on = > < and & (simplistic parsing) 1.1258 +int32 FindQuoteEnd(const char* utf8_body, int32 pos, int32 max_pos) { 1.1259 + // Always outside quotes 1.1260 + for (int i = pos; i < max_pos; ++i) { 1.1261 + char c = utf8_body[i]; 1.1262 + if (c == '"') {return i;} 1.1263 + if (c == '\'') {return i;} 1.1264 + if (c == '>') {return i - 1;} 1.1265 + if (c == '=') {return i - 1;} 1.1266 + if (c == '<') {return i - 1;} 1.1267 + if (c == '&') {return i - 1;} 1.1268 + } 1.1269 + return -1; // nothing found 1.1270 +} 1.1271 + 1.1272 +int32 FindEqualSign(const char* utf8_body, int32 pos, int32 max_pos) { 1.1273 + // Outside quotes/apostrophes loop 1.1274 + for (int i = pos; i < max_pos; ++i) { 1.1275 + char c = utf8_body[i]; 1.1276 + if (c == '=') { // Found bare equal sign inside tag 1.1277 + return i; 1.1278 + } else if (c == '"') { 1.1279 + // Inside quotes loop 1.1280 + int j; 1.1281 + for (j = i + 1; j < max_pos; ++j) { 1.1282 + if (utf8_body[j] == '"') { 1.1283 + break; 1.1284 + } else if (utf8_body[j] == '\\') { 1.1285 + ++j; 1.1286 + } 1.1287 + } 1.1288 + i = j; 1.1289 + } else if 
(c == '\'') { 1.1290 + // Inside apostrophes loop 1.1291 + int j; 1.1292 + for (j = i + 1; j < max_pos; ++j) { 1.1293 + if (utf8_body[j] == '\'') { 1.1294 + break; 1.1295 + } else if (utf8_body[j] == '\\') { 1.1296 + ++j; 1.1297 + } 1.1298 + } 1.1299 + i = j; 1.1300 + } 1.1301 + 1.1302 + } 1.1303 + return -1; // nothing found 1.1304 +} 1.1305 + 1.1306 +// Scan backwards for case-insensitive string s in [min_pos..pos) 1.1307 +// Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f] 1.1308 +// Cheap lowercase. Control codes will masquerade as 20..3f 1.1309 +bool FindBefore(const char* utf8_body, 1.1310 + int32 min_pos, int32 pos, const char* s) { 1.1311 + int len = strlen(s); 1.1312 + if ((pos - min_pos) < len) {return false;} // Too small to fit s 1.1313 + 1.1314 + // Skip trailing spaces 1.1315 + int i = pos; 1.1316 + while ((i > (min_pos + len)) && (utf8_body[i - 1] == ' ')) {--i;} 1.1317 + i -= len; 1.1318 + if (i < min_pos) {return false;} // pos - min_pos < len, so s can't be found 1.1319 + 1.1320 + const char* p = &utf8_body[i]; 1.1321 + for (int j = 0; j < len; ++j) { 1.1322 + if ((p[j] | 0x20) != s[j]) {return false;} // Unequal byte 1.1323 + } 1.1324 + return true; // All bytes equal at i 1.1325 +} 1.1326 + 1.1327 +// Scan forwards for case-insensitive string s in [pos..max_pos) 1.1328 +// Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f] 1.1329 +// Cheap lowercase. 
Control codes will masquerade as 20..3f 1.1330 +// Allows but does not require quoted/apostrophe string 1.1331 +bool FindAfter(const char* utf8_body, 1.1332 + int32 pos, int32 max_pos, const char* s) { 1.1333 + int len = strlen(s); 1.1334 + if ((max_pos - pos) < len) {return false;} // Too small to fit s 1.1335 + 1.1336 + // Skip leading spaces, quote, apostrophe 1.1337 + int i = pos; 1.1338 + while (i < (max_pos - len)) { 1.1339 + unsigned char c = utf8_body[i]; 1.1340 + if ((c == ' ') || (c == '"') || (c == '\'')) {++i;} 1.1341 + else {break;} 1.1342 + } 1.1343 + 1.1344 + const char* p = &utf8_body[i]; 1.1345 + for (int j = 0; j < len; ++j) { 1.1346 + if ((p[j] | 0x20) != s[j]) {return false;} // Unequal byte 1.1347 + } 1.1348 + return true; // All bytes equal 1.1349 +} 1.1350 + 1.1351 + 1.1352 + 1.1353 +// Copy attribute value in [pos..max_pos) 1.1354 +// pos is just after an opening quote/apostrophe and max_pos is the ending one 1.1355 +// String must all be on a single line. 1.1356 +// Return slightly-normalized language list, empty or ending in comma 1.1357 +// Does lowercasing and removes excess punctuation/space 1.1358 +string CopyOneQuotedString(const char* utf8_body, 1.1359 + int32 pos, int32 max_pos) { 1.1360 + string s; 1.1361 + int state = 1; // Front is logically just after a comma 1.1362 + for (int i = pos; i < max_pos; ++i) { 1.1363 + unsigned char c = utf8_body[i]; 1.1364 + int e = kLangCodeAction[c] >> (3 * state); 1.1365 + state = e & 3; // Update to next state 1.1366 + if ((e & 4) != 0) { 1.1367 + // Copy a remapped byte if going to state 0, else copy a comma 1.1368 + if (state == 0) { 1.1369 + s.append(1, kLangCodeRemap[c]); 1.1370 + } else { 1.1371 + s.append(1, ','); 1.1372 + } 1.1373 + } 1.1374 + } 1.1375 + 1.1376 + // Add final comma if needed 1.1377 + if (state == 0) { 1.1378 + s.append(1, ','); 1.1379 + } 1.1380 + return s; 1.1381 +} 1.1382 + 1.1383 +// Find and copy attribute value: quoted string in [pos..max_pos) 1.1384 +// Return 
slightly-normalized language list, empty or ending in comma 1.1385 +string CopyQuotedString(const char* utf8_body, 1.1386 + int32 pos, int32 max_pos) { 1.1387 + int32 start_quote = FindQuoteStart(utf8_body, pos, max_pos); 1.1388 + if (start_quote < 0) {return string("");} 1.1389 + int32 end_quote = FindQuoteEnd(utf8_body, start_quote + 1, max_pos); 1.1390 + if (end_quote < 0) {return string("");} 1.1391 + 1.1392 + return CopyOneQuotedString(utf8_body, start_quote + 1, end_quote); 1.1393 +} 1.1394 + 1.1395 +// Add hints to vector of langpriors 1.1396 +// Input is from GetLangTagsFromHtml(), already lowercased 1.1397 +void SetCLDLangTagsHint(const string& langtags, CLDLangPriors* langpriors) { 1.1398 + if (langtags.empty()) {return;} 1.1399 + int commas = CountCommas(langtags); 1.1400 + if (commas > 4) {return;} // Ignore if too many language tags 1.1401 + 1.1402 + char temp[20]; 1.1403 + int pos = 0; 1.1404 + while (pos < static_cast<int>(langtags.size())) { 1.1405 + int comma = langtags.find(',', pos); 1.1406 + if (comma == string::npos) {comma = langtags.size();} // fake trailing comma 1.1407 + int len = comma - pos; 1.1408 + if (len <= 16) { 1.1409 + // Short enough to use 1.1410 + memcpy(temp, &langtags[pos], len); 1.1411 + temp[len] = '\0'; 1.1412 + const LangTagLookup* entry = DoLangTagLookup(temp, 1.1413 + kCLDLangTagsHintTable1, 1.1414 + kCLDTable1Size); 1.1415 + if (entry != NULL) { 1.1416 + // First table hit 1.1417 + MergeCLDLangPriorsMax(entry->onelangprior1, langpriors); 1.1418 + MergeCLDLangPriorsMax(entry->onelangprior2, langpriors); 1.1419 + } else { 1.1420 + // Try second table with language code truncated at first hyphen 1.1421 + char* hyphen = strchr(temp, '-'); 1.1422 + if (hyphen != NULL) {*hyphen = '\0';} 1.1423 + len = strlen(temp); 1.1424 + if (len <= 3) { // Short enough to use 1.1425 + entry = DoLangTagLookup(temp, 1.1426 + kCLDLangTagsHintTable2, 1.1427 + kCLDTable2Size); 1.1428 + if (entry != NULL) { 1.1429 + // Second table hit 1.1430 + 
MergeCLDLangPriorsMax(entry->onelangprior1, langpriors); 1.1431 + MergeCLDLangPriorsMax(entry->onelangprior2, langpriors); 1.1432 + } 1.1433 + } 1.1434 + } 1.1435 + } 1.1436 + pos = comma + 1; 1.1437 + } 1.1438 +} 1.1439 + 1.1440 +// Add hints to vector of langpriors 1.1441 +// Input is string after HTTP header Content-Language: 1.1442 +void SetCLDContentLangHint(const char* contentlang, CLDLangPriors* langpriors) { 1.1443 + string langtags = CopyOneQuotedString(contentlang, 0, strlen(contentlang)); 1.1444 + SetCLDLangTagsHint(langtags, langpriors); 1.1445 +} 1.1446 + 1.1447 +// Add hints to vector of langpriors 1.1448 +// Input is last element of hostname (no dot), e.g. from GetTLD() 1.1449 +void SetCLDTLDHint(const char* tld, CLDLangPriors* langpriors) { 1.1450 + int len = strlen(tld); 1.1451 + if (len > 3) {return;} // Ignore if more than three letters 1.1452 + char local_tld[4]; 1.1453 + strncpy(local_tld, tld, 4); 1.1454 + local_tld[3] = '\0'; // Safety move 1.1455 + // Lowercase 1.1456 + for (int i = 0; i < len; ++i) {local_tld[i] |= 0x20;} 1.1457 + const TLDLookup* entry = DoTLDLookup(local_tld, 1.1458 + kCLDTLDHintTable, 1.1459 + kCLDTable3Size); 1.1460 + if (entry != NULL) { 1.1461 + // Table hit 1.1462 + MergeCLDLangPriorsBoost(entry->onelangprior1, langpriors); 1.1463 + MergeCLDLangPriorsBoost(entry->onelangprior2, langpriors); 1.1464 + } 1.1465 +} 1.1466 + 1.1467 +// Add hints to vector of langpriors 1.1468 +// Input is from DetectEncoding() 1.1469 +void SetCLDEncodingHint(Encoding enc, CLDLangPriors* langpriors) { 1.1470 + OneCLDLangPrior olp; 1.1471 + switch (enc) { 1.1472 + case CHINESE_GB: 1.1473 + case GBK: 1.1474 + case GB18030: 1.1475 + case ISO_2022_CN: 1.1476 + case HZ_GB_2312: 1.1477 + olp = PackCLDPriorLangWeight(CHINESE, kCLDPriorEncodingWeight); 1.1478 + MergeCLDLangPriorsBoost(olp, langpriors); 1.1479 + break; 1.1480 + case CHINESE_BIG5: 1.1481 + case CHINESE_BIG5_CP950: 1.1482 + case BIG5_HKSCS: 1.1483 + olp = 
PackCLDPriorLangWeight(CHINESE_T, kCLDPriorEncodingWeight); 1.1484 + MergeCLDLangPriorsBoost(olp, langpriors); 1.1485 + break; 1.1486 + case JAPANESE_EUC_JP: 1.1487 + case JAPANESE_SHIFT_JIS: 1.1488 + case JAPANESE_CP932: 1.1489 + case JAPANESE_JIS: // ISO-2022-JP 1.1490 + olp = PackCLDPriorLangWeight(JAPANESE, kCLDPriorEncodingWeight); 1.1491 + MergeCLDLangPriorsBoost(olp, langpriors); 1.1492 + break; 1.1493 + case KOREAN_EUC_KR: 1.1494 + case ISO_2022_KR: 1.1495 + olp = PackCLDPriorLangWeight(KOREAN, kCLDPriorEncodingWeight); 1.1496 + MergeCLDLangPriorsBoost(olp, langpriors); 1.1497 + break; 1.1498 + 1.1499 + default: 1.1500 + break; 1.1501 + } 1.1502 +} 1.1503 + 1.1504 +// Add hints to vector of langpriors 1.1505 +// Input is from random source 1.1506 +void SetCLDLanguageHint(Language lang, CLDLangPriors* langpriors) { 1.1507 + OneCLDLangPrior olp = PackCLDPriorLangWeight(lang, kCLDPriorLanguageWeight); 1.1508 + MergeCLDLangPriorsBoost(olp, langpriors); 1.1509 +} 1.1510 + 1.1511 + 1.1512 +// Make printable string of priors 1.1513 +string DumpCLDLangPriors(const CLDLangPriors* langpriors) { 1.1514 + string retval; 1.1515 + for (int i = 0; i < langpriors->n; ++i) { 1.1516 + char temp[64]; 1.1517 + sprintf(temp, "%s.%d ", 1.1518 + LanguageCode(GetCLDPriorLang(langpriors->prior[i])), 1.1519 + GetCLDPriorWeight(langpriors->prior[i])); 1.1520 + retval.append(temp); 1.1521 + } 1.1522 + return retval; 1.1523 +} 1.1524 + 1.1525 + 1.1526 + 1.1527 + 1.1528 +// Look for 1.1529 +// <html lang="en"> 1.1530 +// <doc xml:lang="en"> 1.1531 +// <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en-US"> 1.1532 +// <meta http-equiv="content-language" content="en-GB" /> 1.1533 +// <meta name="language" content="Srpski"> 1.1534 +// <meta name="DC.language" scheme="RFCOMMA766" content="en"> 1.1535 +// <SPAN id="msg1" class="info" lang='en'> 1.1536 +// 1.1537 +// Do not trigger on 1.1538 +// <!-- lang=french ...--> 1.1539 +// <font lang=postscript ...> 1.1540 +// <link 
href="index.fr.html" hreflang="fr-FR" xml:lang="fr-FR" /> 1.1541 +// <META name="Author" lang="fr" content="Arnaud Le Hors"> 1.1542 +// 1.1543 +// Stop fairly quickly on mismatched quotes 1.1544 +// 1.1545 +// Allowed language characters 1.1546 +// a-z A-Z -_ , space\t 1.1547 +// Think about: GB2312, big5, shift-jis, euc-jp, ksc euc-kr 1.1548 +// zh-hans zh-TW cmn-Hani zh_cn.gb18030_CN zh-min-nan zh-yue 1.1549 +// de-x-mtfrom-en zh-tw-x-mtfrom-en (machine translation) 1.1550 +// GB2312 => gb 1.1551 +// Big5 => big 1.1552 +// zh_CN.gb18030_C => zh-cn 1.1553 +// 1.1554 +// Remove duplicates and extra spaces as we go 1.1555 +// Lowercase as we go. 1.1556 + 1.1557 +// Get language tag hints from HTML body 1.1558 +// Normalize: remove spaces and make lowercase comma list 1.1559 + 1.1560 +string GetLangTagsFromHtml(const char* utf8_body, int32 utf8_body_len, 1.1561 + int32 max_scan_bytes) { 1.1562 + string retval; 1.1563 + if (max_scan_bytes > utf8_body_len) { 1.1564 + max_scan_bytes = utf8_body_len; 1.1565 + } 1.1566 + 1.1567 + int32 k = 0; 1.1568 + while (k < max_scan_bytes) { 1.1569 + int32 start_tag = FindTagStart(utf8_body, k, max_scan_bytes); 1.1570 + if (start_tag < 0) {break;} 1.1571 + int32 end_tag = FindTagEnd(utf8_body, start_tag + 1, max_scan_bytes); 1.1572 + // FindTagEnd exits on < > & 1.1573 + if (end_tag < 0) {break;} 1.1574 + 1.1575 + // Skip <!--...> 1.1576 + // Skip <font ...> 1.1577 + // Skip <script ...> 1.1578 + // Skip <link ...> 1.1579 + // Skip <img ...> 1.1580 + // Skip <a ...> 1.1581 + if (FindAfter(utf8_body, start_tag + 1, end_tag, "!--") || 1.1582 + FindAfter(utf8_body, start_tag + 1, end_tag, "font ") || 1.1583 + FindAfter(utf8_body, start_tag + 1, end_tag, "script ") || 1.1584 + FindAfter(utf8_body, start_tag + 1, end_tag, "link ") || 1.1585 + FindAfter(utf8_body, start_tag + 1, end_tag, "img ") || 1.1586 + FindAfter(utf8_body, start_tag + 1, end_tag, "a ")) { 1.1587 + k = end_tag + 1; 1.1588 + continue; 1.1589 + } 1.1590 + 1.1591 + // 
Remember <meta ...> 1.1592 + bool in_meta = false; 1.1593 + if (FindAfter(utf8_body, start_tag + 1, end_tag, "meta ")) { 1.1594 + in_meta = true; 1.1595 + } 1.1596 + 1.1597 + // Scan for each equal sign inside tag 1.1598 + bool content_is_lang = false; 1.1599 + int32 kk = start_tag + 1; 1.1600 + int32 equal_sign; 1.1601 + while ((equal_sign = FindEqualSign(utf8_body, kk, end_tag)) >= 0) { 1.1602 + // eq exits on < > & 1.1603 + 1.1604 + // Look inside a meta tag 1.1605 + // <meta ... http-equiv="content-language" ...> 1.1606 + // <meta ... name="language" ...> 1.1607 + // <meta ... name="dc.language" ...> 1.1608 + if (in_meta) { 1.1609 + if (FindBefore(utf8_body, kk, equal_sign, " http-equiv") && 1.1610 + FindAfter(utf8_body, equal_sign + 1, end_tag, 1.1611 + "content-language ")) { 1.1612 + content_is_lang = true; 1.1613 + } else if (FindBefore(utf8_body, kk, equal_sign, " name") && 1.1614 + (FindAfter(utf8_body, equal_sign + 1, end_tag, 1.1615 + "dc.language ") || 1.1616 + FindAfter(utf8_body, equal_sign + 1, end_tag, 1.1617 + "language "))) { 1.1618 + content_is_lang = true; 1.1619 + } 1.1620 + } 1.1621 + 1.1622 + // Look inside any tag 1.1623 + // <meta ... content="lang-list" ...> 1.1624 + // <... lang="lang-list" ...> 1.1625 + // <... 
xml:lang="lang-list" ...> 1.1626 + if ((content_is_lang && FindBefore(utf8_body, kk, equal_sign, 1.1627 + " content")) || 1.1628 + FindBefore(utf8_body, kk, equal_sign, " lang") || 1.1629 + FindBefore(utf8_body, kk, equal_sign, ":lang")) { 1.1630 + string temp = CopyQuotedString(utf8_body, equal_sign + 1, end_tag); 1.1631 + 1.1632 + // Append new lang tag(s) if not a duplicate 1.1633 + if (!temp.empty() && (retval.find(temp) == string::npos)) { 1.1634 + retval.append(temp); 1.1635 + } 1.1636 + } 1.1637 + 1.1638 + kk = equal_sign + 1; 1.1639 + } 1.1640 + k = end_tag + 1; 1.1641 + } 1.1642 + 1.1643 + // Strip last comma 1.1644 + if (retval.size() > 1) { 1.1645 + retval.erase(retval.size() - 1); 1.1646 + } 1.1647 + return retval; 1.1648 +} 1.1649 + 1.1650 +} // End namespace CLD2 1.1651 + 1.1652 +//============================================================================== 1.1653 + 1.1654 +