Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
1 // Copyright 2013 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 //
16 // Author: dsites@google.com (Dick Sites)
17 //
19 #include "compact_lang_det_hint_code.h"
21 #include <stdlib.h> // for abs()
22 #include <stdio.h> // for sprintf()
23 #include <string.h> //
24 #include "lang_script.h"
25 #include "port.h"
27 using namespace std;
29 namespace CLD2 {
// Prior weights, on the same scale as the W* macros in this file (each step is
// roughly a factor of 3, so 4 ~ 100x and 8 ~ 10000x).
// NOTE(review): judging by the names these apply to encoding-derived and
// language-tag-derived hints; their uses are outside this section -- confirm.
31 static const int kCLDPriorEncodingWeight = 4; // 100x more likely
32 static const int kCLDPriorLanguageWeight = 8; // 10000x more likely
35 // Tables to map lang="..." language code lists to actual languages.
36 // based on scraping and hand-edits, dsites June 2011
38 // n = f(string, &a) gives list of n<=4 language pairs: primary, secondary
40 // For close pairs like ms/id, more weight on TLD and lang=
41 // Alternately, weaker boost but mark others of set as negative;
42 // makes "neither" an easier result.
43 // lang=en low weight 4
44 // tld=lu boost lu maybe 4, but lang= always overcomes tld and encoding
45 // (except maybe en)
47 // TLD to separate, e.g., burundi from rwanda
49 // Encoding lookup: OneLangProb array
50 // TLD lookup: tld OneLangProb pairs
// One row of the lang= tag hint tables: a lookup key, the canonical language
// code(s) it stands for, and up to two packed language priors.
// Field order matters: the tables below use positional aggregate initializers.
53 typedef struct {
54 const char* const langtag; // Lowercased, hyphen only lookup key
55 const char* const langcode; // Canonical language codes; two if ambiguous
56 OneCLDLangPrior onelangprior1; // First prior: language plus W* weight
57 OneCLDLangPrior onelangprior2; // Second prior (boost or demotion), or 0 if none
58 } LangTagLookup;
// One row of the TLD hint table: a top-level-domain key and up to two packed
// language priors.
// Field order matters: the table below uses positional aggregate initializers.
60 typedef struct {
61 const char* const tld; // Lowercased TLD lookup key, e.g. "ch"
62 OneCLDLangPrior onelangprior1; // First prior: language plus W* weight
63 OneCLDLangPrior onelangprior2; // Second prior (boost or demotion), or 0 if none
64 } TLDLookup;
// Weight constants already shifted into the weight field of a packed prior.
// The low 10 bits of a prior hold the language (see the 0x3ff masks in the
// helper functions later in this file), so Wn == n << 10. Table entries add
// Wn to a language to boost it and subtract Wn to demote it.
// The "Nx more likely" figures are rounded: each weight step is a factor of 3.
67 #define W2 (2 << 10) // 3**2 = 10x more likely
68 #define W4 (4 << 10) // 3**4 = 100x more likely
69 #define W6 (6 << 10) // 3**6 = 1000x more likely
70 #define W8 (8 << 10) // 3**8 = 10K x more likely
71 #define W10 (10 << 10) // 3**10 = 100K x more likely
72 #define W12 (12 << 10) // 3**12 = 1M x more likely
74 // TODO: more about ba hr sr sr-ME and sl
75 // Temporary state of affairs:
76 // BOSNIAN CROATIAN MONTENEGRIN SERBIAN detecting just CROATIAN SERBIAN
77 // Eventually, we want to do all four, but it requires a CLD change to handle
78 // up to six languages per quadgram.
81 // Close pairs boost one of pair, demote other.
82 // Statistically close pairs:
83 // INDONESIAN/MALAY difficult to distinguish -- extra word-based lookups used
84 //
85 // INDONESIAN MALAY coef=0.4698 Problematic w/o extra words
86 // TIBETAN DZONGKHA coef=0.4571
87 // CZECH SLOVAK coef=0.4273
88 // NORWEGIAN NORWEGIAN_N coef=0.4182
89 //
90 // HINDI MARATHI coef=0.3795
91 // ZULU XHOSA coef=0.3716
92 //
93 // DANISH NORWEGIAN coef=0.3672 Usually OK
94 // BIHARI HINDI coef=0.3668 Usually OK
95 // ICELANDIC FAROESE coef=0.3519 Usually OK
97 //
98 // Table to look up lang= tags longer than three characters
99 // Overrides table below, which is truncated at first hyphen
100 // In alphabetical order for binary search
// Each row is {lookup key, canonical language code(s), prior 1, prior 2}.
// A prior is a language plus or minus a W* weight; 0 means "no prior" (the
// merge functions later in this file return immediately on a 0 prior).
// Close pairs boost one language and demote the other, e.g. CZECH + W10 with
// SLOVAK - W4. English is deliberately given only W4.
// kCLDTable1Size is the declared array length and matches the number of rows
// in the initializer; keep the two in step and keep the keys sorted.
101 static const int kCLDTable1Size = 213;
102 static const LangTagLookup kCLDLangTagsHintTable1[kCLDTable1Size] = {
103 {"abkhazian", "ab", ABKHAZIAN + W10, 0},
104 {"afar", "aa", AFAR + W10, 0},
105 {"afrikaans", "af", AFRIKAANS + W10, 0},
106 {"akan", "ak", AKAN + W10, 0},
107 {"albanian", "sq", ALBANIAN + W10, 0},
108 {"am-am", "hy", ARMENIAN + W10, 0}, // 1:2 Armenian, not ambiguous
109 {"amharic", "am", AMHARIC + W10, 0},
110 {"arabic", "ar", ARABIC + W10, 0},
111 {"argentina", "es", SPANISH + W10, 0},
112 {"armenian", "hy", ARMENIAN + W10, 0},
113 {"assamese", "as", ASSAMESE + W10, 0},
114 {"aymara", "ay", AYMARA + W10, 0},
115 {"azerbaijani", "az", AZERBAIJANI + W10, 0},
117 {"bangla", "bn", BENGALI + W10, 0},
118 {"bashkir", "ba", BASHKIR + W10, 0},
119 {"basque", "eu", BASQUE + W10, 0},
120 {"belarusian", "be", BELARUSIAN + W10, 0},
121 {"bengali", "bn", BENGALI + W10, 0},
122 {"bihari", "bh", BIHARI + W10, HINDI - W4},
123 {"bislama", "bi", BISLAMA + W10, 0},
124 {"bosnian", "bs", BOSNIAN + W10, 0}, // Bosnian => Bosnian
125 {"br-br", "pt", PORTUGUESE + W10, 0}, // 1:2 Portuguese, not ambiguous
126 {"br-fr", "br", BRETON + W10, 0}, // 1:2 Breton, not ambiguous
127 {"breton", "br", BRETON + W10, 0},
128 {"bulgarian", "bg", BULGARIAN + W10, 0},
129 {"burmese", "my", BURMESE + W10, 0}, // Myanmar
131 {"catalan", "ca", CATALAN + W10, 0},
132 {"cherokee", "chr", CHEROKEE + W10, 0},
133 {"chichewa", "ny", NYANJA + W10, 0},
135 {"chinese", "zh", CHINESE + W10, 0},
136 {"chinese-t", "zhT", CHINESE_T + W10, 0},
137 {"chineset", "zhT", CHINESE_T + W10, 0},
138 {"corsican", "co", CORSICAN + W10, 0},
139 {"cpf-hat", "ht", HAITIAN_CREOLE + W10, 0}, // Creole, French-based
140 {"croatian", "hr", CROATIAN + W10, 0},
141 {"czech", "cs", CZECH + W10, SLOVAK - W4},
143 {"danish", "da", DANISH + W10, NORWEGIAN - W4},
144 {"deutsch", "de", GERMAN + W10, 0},
145 {"dhivehi", "dv", DHIVEHI + W10, 0},
146 {"dutch", "nl", DUTCH + W10, 0},
147 {"dzongkha", "dz", DZONGKHA + W10, TIBETAN - W4},
149 {"ell-gr", "el", GREEK + W10, 0},
150 {"english", "en", ENGLISH + W4, 0},
151 {"esperanto", "eo", ESPERANTO + W10, 0},
152 {"estonian", "et", ESTONIAN + W10, 0},
153 {"euc-jp", "ja", JAPANESE + W10, 0}, // Japanese encoding
154 {"euc-kr", "ko", KOREAN + W10, 0}, // Korean encoding
156 {"faroese", "fo", FAROESE + W10, ICELANDIC - W4},
157 {"fijian", "fj", FIJIAN + W10, 0},
158 {"finnish", "fi", FINNISH + W10, 0},
159 {"fran", "fr", FRENCH + W10, 0}, // Truncated at non-ASCII
160 {"francais", "fr", FRENCH + W10, 0},
161 {"french", "fr", FRENCH + W10, 0},
162 {"frisian", "fy", FRISIAN + W10, 0},
164 {"ga-es", "gl", GALICIAN + W10, 0}, // 1:2 Galician, not ambiguous
165 {"galician", "gl", GALICIAN + W10, 0},
166 {"ganda", "lg", GANDA + W10, 0},
167 {"georgian", "ka", GEORGIAN + W10, 0},
168 {"german", "de", GERMAN + W10, 0},
169 {"greek", "el", GREEK + W10, 0},
170 {"greenlandic", "kl", GREENLANDIC + W10, 0},
171 {"guarani", "gn", GUARANI + W10, 0},
172 {"gujarati", "gu", GUJARATI + W10, 0},
174 {"haitian_creole", "ht", HAITIAN_CREOLE + W10, 0},
175 {"hausa", "ha", HAUSA + W10, 0},
176 {"hawaiian", "haw", HAWAIIAN + W10, 0},
177 {"hebrew", "iw", HEBREW + W10, 0},
178 {"hindi", "hi", HINDI + W10, MARATHI - W4},
179 {"hn-in", "hi", HINDI + W10, MARATHI - W4},
180 {"hungarian", "hu", HUNGARIAN + W10, 0},
182 {"icelandic", "is", ICELANDIC + W10, FAROESE - W4},
183 {"igbo", "ig", IGBO + W10, 0},
184 {"indonesian", "id", INDONESIAN + W10, MALAY - W4},
185 {"interlingua", "ia", INTERLINGUA + W10, 0},
186 {"interlingue", "ie", INTERLINGUE + W10, 0},
187 // 1:2 iu-Cans ik-Latn
188 {"inuktitut", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2
189 {"inupiak", "ik,iu", INUPIAK + W10, INUKTITUT + W10}, // 1:2
190 {"ir-ie", "ga", IRISH + W10, 0}, // Irish
191 {"irish", "ga", IRISH + W10, 0},
192 {"italian", "it", ITALIAN + W10, 0},
194 {"ja-euc", "ja", JAPANESE + W10, 0}, // Japanese encoding
195 {"jan-jp", "ja", JAPANESE + W10, 0}, // Japanese encoding
196 {"japanese", "ja", JAPANESE + W10, 0},
197 {"javanese", "jw", JAVANESE + W10, 0},
199 {"kannada", "kn", KANNADA + W10, 0},
200 {"kashmiri", "ks", KASHMIRI + W10, 0},
201 {"kazakh", "kk", KAZAKH + W10, 0},
202 {"khasi", "kha", KHASI + W10, 0},
203 {"khmer", "km", KHMER + W10, 0},
204 {"kinyarwanda", "rw", KINYARWANDA + W10, 0},
205 {"klingon", "tlh", X_KLINGON + W10, 0},
206 {"korean", "ko", KOREAN + W10, 0},
207 {"kurdish", "ku", KURDISH + W10, 0},
208 {"kyrgyz", "ky", KYRGYZ + W10, 0},
210 {"laothian", "lo", LAOTHIAN + W10, 0},
211 {"latin", "la", LATIN + W10, 0},
212 {"latvian", "lv", LATVIAN + W10, 0},
213 {"limbu", "sit", LIMBU + W10, 0},
214 {"lingala", "ln", LINGALA + W10, 0},
215 {"lithuanian", "lt", LITHUANIAN + W10, 0},
216 {"luxembourgish", "lb", LUXEMBOURGISH + W10, 0},
218 {"macedonian", "mk", MACEDONIAN + W10, 0},
219 {"malagasy", "mg", MALAGASY + W10, 0},
220 {"malay", "ms", MALAY + W10, INDONESIAN - W4},
221 {"malayalam", "ml", MALAYALAM + W10, 0},
222 {"maltese", "mt", MALTESE + W10, 0},
223 {"manx", "gv", MANX + W10, 0},
224 {"maori", "mi", MAORI + W10, 0},
225 {"marathi", "mr", MARATHI + W10, HINDI - W4},
226 {"mauritian_creole", "mfe", MAURITIAN_CREOLE + W10, 0},
227 {"moldavian", "mo", ROMANIAN + W10, 0},
228 {"mongolian", "mn", MONGOLIAN + W10, 0},
229 {"montenegrin", "sr-me", MONTENEGRIN + W10, 0},
230 {"myanmar", "my", BURMESE + W10, 0}, // Myanmar
231 {"nauru", "na", NAURU + W10, 0},
232 {"ndebele", "nr", NDEBELE + W10, 0},
233 {"nepali", "ne", NEPALI + W10, 0},
234 {"no-bok", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, // Bokmaal
235 {"no-bokmaal", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
236 {"no-nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, // Bokmaal
237 {"no-no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
238 {"no-nyn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, // Nynorsk
239 {"no-nynorsk", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},
240 {"norwegian", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
241 {"norwegian_n", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},
242 {"nyanja", "ny", NYANJA + W10, 0},
244 {"occitan", "oc", OCCITAN + W10, 0},
245 {"oriya", "or", ORIYA + W10, 0},
246 {"oromo", "om", OROMO + W10, 0},
247 {"parsi", "fa", PERSIAN + W10, 0},
249 {"pashto", "ps", PASHTO + W10, 0},
250 {"pedi", "nso", PEDI + W10, 0},
251 {"persian", "fa", PERSIAN + W10, 0},
252 {"polish", "pl", POLISH + W10, 0},
253 {"polska", "pl", POLISH + W10, 0},
254 {"polski", "pl", POLISH + W10, 0},
255 {"portugu", "pt", PORTUGUESE + W10, 0}, // Truncated at non-ASCII
256 {"portuguese", "pt", PORTUGUESE + W10, 0},
257 {"punjabi", "pa", PUNJABI + W10, 0},
259 {"quechua", "qu", QUECHUA + W10, 0},
261 {"rhaeto_romance", "rm", RHAETO_ROMANCE + W10, 0},
262 {"romanian", "ro", ROMANIAN + W10, 0},
263 {"rundi", "rn", RUNDI + W10, 0},
264 {"russian", "ru", RUSSIAN + W10, 0},
266 {"samoan", "sm", SAMOAN + W10, 0},
267 {"sango", "sg", SANGO + W10, 0},
268 {"sanskrit", "sa", SANSKRIT + W10, 0},
269 {"scots", "sco", SCOTS + W10, ENGLISH - W4},
270 {"scots_gaelic", "gd", SCOTS_GAELIC + W10, 0},
271 {"serbian", "sr", SERBIAN + W10, 0},
272 {"seselwa", "crs", SESELWA + W10, 0},
273 {"sesotho", "st", SESOTHO + W10, 0},
274 {"shift-jis", "ja", JAPANESE + W10, 0}, // Japanese encoding
275 {"shift-js", "ja", JAPANESE + W10, 0}, // Japanese encoding
276 {"shona", "sn", SHONA + W10, 0},
277 {"si-lk", "si", SINHALESE + W10, 0}, // 1:2 Sri Lanka, not ambiguous
278 {"si-si", "sl", SLOVENIAN + W10, 0}, // 1:2 Slovenia, not ambiguous
279 {"si-sl", "sl", SLOVENIAN + W10, 0}, // 1:2 Slovenia, not ambiguous
280 {"sindhi", "sd", SINDHI + W10, 0},
281 {"sinhalese", "si", SINHALESE + W10, 0},
282 {"siswant", "ss", SISWANT + W10, 0},
283 {"sit-np", "sit", LIMBU + W10, 0},
284 {"slovak", "sk", SLOVAK + W10, CZECH - W4},
285 {"slovenian", "sl", SLOVENIAN + W10, 0},
286 {"somali", "so", SOMALI + W10, 0},
287 {"spanish", "es", SPANISH + W10, 0},
288 {"sr-me", "sr-me", MONTENEGRIN + W10, 0}, // Montenegrin => Montenegrin
289 {"sundanese", "su", SUNDANESE + W10, 0},
290 {"suomi", "fi", FINNISH + W10, 0}, // Finnish
291 {"swahili", "sw", SWAHILI + W10, 0},
292 {"swedish", "sv", SWEDISH + W10, 0},
293 {"syriac", "syr", SYRIAC + W10, 0},
295 {"tagalog", "tl", TAGALOG + W10, 0},
296 {"tajik", "tg", TAJIK + W10, 0},
297 {"tamil", "ta", TAMIL + W10, 0},
298 {"tatar", "tt", TATAR + W10, 0},
299 {"tb-tb", "bo", TIBETAN + W10, DZONGKHA - W4}, // Tibet
300 {"tchinese", "zhT", CHINESE_T + W10, 0},
301 {"telugu", "te", TELUGU + W10, 0},
302 {"thai", "th", THAI + W10, 0},
303 {"tibetan", "bo", TIBETAN + W10, DZONGKHA - W4},
304 {"tigrinya", "ti", TIGRINYA + W10, 0},
305 {"tonga", "to", TONGA + W10, 0},
306 {"tsonga", "ts", TSONGA + W10, 0},
307 {"tswana", "tn", TSWANA + W10, 0},
308 {"tt-ru", "tt", TATAR + W10, 0},
309 {"tur-tr", "tr", TURKISH + W10, 0},
310 {"turkish", "tr", TURKISH + W10, 0},
311 {"turkmen", "tk", TURKMEN + W10, 0},
312 {"uighur", "ug", UIGHUR + W10, 0},
313 {"ukrainian", "uk", UKRAINIAN + W10, 0},
314 {"urdu", "ur", URDU + W10, 0},
315 {"uzbek", "uz", UZBEK + W10, 0},
317 {"venda", "ve", VENDA + W10, 0},
318 {"vietnam", "vi", VIETNAMESE + W10, 0},
319 {"vietnamese", "vi", VIETNAMESE + W10, 0},
320 {"volapuk", "vo", VOLAPUK + W10, 0},
322 {"welsh", "cy", WELSH + W10, 0},
323 {"wolof", "wo", WOLOF + W10, 0},
325 {"xhosa", "xh", XHOSA + W10, ZULU - W4},
327 {"yiddish", "yi", YIDDISH + W10, 0},
328 {"yoruba", "yo", YORUBA + W10, 0},
330 {"zh-classical", "zhT", CHINESE_T + W10, 0},
331 {"zh-cn", "zh", CHINESE + W10, 0},
332 {"zh-hans", "zh", CHINESE + W10, 0},
333 {"zh-hant", "zhT", CHINESE_T + W10, 0},
334 {"zh-hk", "zhT", CHINESE_T + W10, 0},
335 {"zh-min-nan", "zhT", CHINESE_T + W10, 0}, // Min Nan => ChineseT
336 {"zh-sg", "zhT", CHINESE_T + W10, 0},
337 {"zh-tw", "zhT", CHINESE_T + W10, 0},
338 {"zh-yue", "zh", CHINESE + W10, 0}, // Yue (Cantonese) => Chinese
339 {"zhuang", "za", ZHUANG + W10, 0},
340 {"zulu", "zu", ZULU + W10, XHOSA - W4},
341 };
345 // Table to look up lang= tags of two/three characters after truncate at hyphen
346 // In alphabetical order for binary search
// Each row is {lookup key, canonical language code(s), prior 1, prior 2}.
// Keys mix language codes, country codes and a few encoding names. Where one
// key can mean two languages ("1:2" rows) both are listed, usually at equal
// weight. A second prior of the form LANG - W4 demotes the close neighbor.
// kCLDTable2Size is the declared array length and matches the number of rows
// in the initializer; keep the two in step and keep the keys sorted.
347 static const int kCLDTable2Size = 257;
348 static const LangTagLookup kCLDLangTagsHintTable2[kCLDTable2Size] = {
349 {"aa", "aa", AFAR + W10, 0},
350 {"ab", "ab", ABKHAZIAN + W10, 0},
351 {"af", "af", AFRIKAANS + W10, 0},
352 {"ak", "ak", AKAN + W10, 0},
353 {"al", "sq", ALBANIAN + W10, 0}, // Albania
354 {"am", "am,hy", AMHARIC + W10, ARMENIAN + W10}, // 1:2 Amharic Armenian
355 {"ar", "ar", ARABIC + W10, 0},
356 {"ara", "ar", ARABIC + W10, 0},
357 {"arm", "hy", ARMENIAN + W10, 0}, // Armenia
358 {"arz", "ar", ARABIC + W10, 0}, // Egyptian Arabic
359 {"as", "as", ASSAMESE + W10, 0},
360 {"at", "de", GERMAN + W10, 0}, // Austria
361 {"au", "de", GERMAN + W10, 0}, // Austria. NOTE(review): ISO 3166 "au" is Australia; presumably a deliberate scraped mapping -- confirm
362 {"ay", "ay", AYMARA + W10, 0},
363 {"az", "az", AZERBAIJANI + W10, 0},
364 {"aze", "az", AZERBAIJANI + W10, 0},
366 {"ba", "ba,bs", BASHKIR + W10, BOSNIAN + W10}, // 1:2 Bashkir Bosnia
367 {"be", "be", BELARUSIAN + W10, 0},
368 {"bel", "be", BELARUSIAN + W10, 0},
369 {"bg", "bg", BULGARIAN + W10, 0},
370 {"bh", "bh", BIHARI + W10, HINDI - W4},
371 {"bi", "bi", BISLAMA + W10, 0},
372 {"big", "zhT", CHINESE_T + W10, 0}, // Big5 encoding
373 {"bm", "ms", MALAY + W10, INDONESIAN - W4}, // Bahasa Malaysia
374 {"bn", "bn", BENGALI + W10, 0},
375 {"bo", "bo", TIBETAN + W10, DZONGKHA - W4},
376 // 1:2 Breton, Brazil country code, both Latn .br TLD enough for pt to win
377 {"br", "br,pt", BRETON + W10, PORTUGUESE + W8}, // 1:2 Breton, Brazil
378 {"bs", "bs", BOSNIAN + W10, 0}, // Bosnian => Bosnian
380 {"ca", "ca", CATALAN + W10, 0},
381 {"cat", "ca", CATALAN + W10, 0},
382 {"ch", "de,fr", GERMAN + W10, FRENCH + W10}, // 1:2 Switzerland
383 {"chn", "zh", CHINESE + W10, 0},
384 {"chr", "chr", CHEROKEE + W10, 0},
385 {"ckb", "ku", KURDISH + W10, 0}, // Central Kurdish
386 {"cn", "zh,zhT", CHINESE + W6, CHINESE_T + W4}, // Ambiguous, so weaker.
387 // Offset by 2 so that TLD=tw or
388 // enc=big5 will put zhT ahead
389 {"co", "co", CORSICAN + W10, 0},
390 {"cro", "hr", CROATIAN + W10, 0}, // Croatia
391 {"crs", "crs", SESELWA + W10, 0},
392 {"cs", "cs", CZECH + W10, SLOVAK - W4},
393 {"ct", "ca", CATALAN + W10, 0},
394 {"cy", "cy", WELSH + W10, 0},
395 {"cym", "cy", WELSH + W10, 0},
396 {"cz", "cs", CZECH + W10, SLOVAK - W4},
398 {"da", "da", DANISH + W10, NORWEGIAN - W4},
399 {"dan", "da", DANISH + W10, NORWEGIAN - W4},
400 {"de", "de", GERMAN + W10, 0},
401 {"deu", "de", GERMAN + W10, 0},
402 {"div", "dv", DHIVEHI + W10, 0},
403 {"dk", "da", DANISH + W10, NORWEGIAN - W4}, // Denmark
404 {"dut", "nl", DUTCH + W10, 0}, // Dutch
405 {"dv", "dv", DHIVEHI + W10, 0},
406 {"dz", "dz", DZONGKHA + W10, TIBETAN - W4},
408 {"ee", "et", ESTONIAN + W10, 0}, // Estonia
409 {"eg", "ar", ARABIC + W10, 0}, // Egypt
410 {"el", "el", GREEK + W10, 0},
411 {"en", "en", ENGLISH + W4, 0},
412 {"eng", "en", ENGLISH + W4, 0},
413 {"eo", "eo", ESPERANTO + W10, 0},
414 {"er", "ur", URDU + W10, 0}, // "Erdu"
415 {"es", "es", SPANISH + W10, 0},
416 {"esp", "es", SPANISH + W10, 0},
417 {"est", "et", ESTONIAN + W10, 0},
418 {"et", "et", ESTONIAN + W10, 0},
419 {"eu", "eu", BASQUE + W10, 0},
421 {"fa", "fa", PERSIAN + W10, 0},
422 {"far", "fa", PERSIAN + W10, 0},
423 {"fi", "fi", FINNISH + W10, 0},
424 {"fil", "tl", TAGALOG + W10, 0}, // Philippines
425 {"fj", "fj", FIJIAN + W10, 0},
426 {"fo", "fo", FAROESE + W10, ICELANDIC - W4},
427 {"fr", "fr", FRENCH + W10, 0},
428 {"fra", "fr", FRENCH + W10, 0},
429 {"fre", "fr", FRENCH + W10, 0},
430 {"fy", "fy", FRISIAN + W10, 0},
432 {"ga", "ga,gl", IRISH + W10, GALICIAN + W10}, // 1:2 Irish, Galician
433 {"gae", "gd,ga", SCOTS_GAELIC + W10, IRISH + W10}, // 1:2 Gaelic, either
434 {"gal", "gl", GALICIAN + W10, 0},
435 {"gb", "zh", CHINESE + W10, 0}, // GB2312 encoding
436 {"gbk", "zh", CHINESE + W10, 0}, // GBK encoding
437 {"gd", "gd", SCOTS_GAELIC + W10, 0},
438 {"ge", "ka", GEORGIAN + W10, 0}, // Georgia
439 {"geo", "ka", GEORGIAN + W10, 0},
440 {"ger", "de", GERMAN + W10, 0},
441 {"gl", "gl", GALICIAN + W10, 0}, // Also Greenland; hard to confuse
442 {"gn", "gn", GUARANI + W10, 0},
443 {"gr", "el", GREEK + W10, 0}, // Greece
444 {"gu", "gu", GUJARATI + W10, 0},
445 {"gv", "gv", MANX + W10, 0},
447 {"ha", "ha", HAUSA + W10, 0},
448 {"hat", "ht", HAITIAN_CREOLE + W10, 0}, // Haiti
449 {"haw", "haw", HAWAIIAN + W10, 0},
450 {"hb", "iw", HEBREW + W10, 0},
451 {"he", "iw", HEBREW + W10, 0},
452 {"heb", "iw", HEBREW + W10, 0},
453 {"hi", "hi", HINDI + W10, MARATHI - W4},
454 {"hk", "zhT", CHINESE_T + W10, 0}, // Hong Kong
455 {"hr", "hr", CROATIAN + W10, 0},
456 {"ht", "ht", HAITIAN_CREOLE + W10, 0},
457 {"hu", "hu", HUNGARIAN + W10, 0},
458 {"hun", "hu", HUNGARIAN + W10, 0},
459 {"hy", "hy", ARMENIAN + W10, 0},
461 {"ia", "ia", INTERLINGUA + W10, 0},
462 {"ice", "is", ICELANDIC + W10, FAROESE - W4}, // Iceland
463 {"id", "id", INDONESIAN + W10, MALAY - W4},
464 {"ids", "id", INDONESIAN + W10, MALAY - W4},
465 {"ie", "ie", INTERLINGUE + W10, 0},
466 {"ig", "ig", IGBO + W10, 0},
467 // 1:2 iu-Cans ik-Latn
468 {"ik", "ik,iu", INUPIAK + W10, INUKTITUT + W10}, // 1:2
469 {"in", "id", INDONESIAN + W10, MALAY - W4},
470 {"ind", "id", INDONESIAN + W10, MALAY - W4}, // Indonesia
471 {"inu", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2
472 {"is", "is", ICELANDIC + W10, FAROESE - W4},
473 {"it", "it", ITALIAN + W10, 0},
474 {"ita", "it", ITALIAN + W10, 0},
475 {"iu", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2
476 {"iw", "iw", HEBREW + W10, 0},
478 {"ja", "ja", JAPANESE + W10, 0},
479 {"jp", "ja", JAPANESE + W10, 0}, // Japan
480 {"jpn", "ja", JAPANESE + W10, 0},
481 {"jv", "jw", JAVANESE + W10, 0},
482 {"jw", "jw", JAVANESE + W10, 0},
484 {"ka", "ka", GEORGIAN + W10, 0},
485 {"kc", "qu", QUECHUA + W10, 0}, // (K)Quechua
486 {"kg", "ky", KYRGYZ + W10, 0}, // Kyrgyzstan
487 {"kh", "km", KHMER + W10, 0}, // Country code Khmer (Cambodia)
488 {"kha", "kha", KHASI + W10, 0},
489 {"kk", "kk", KAZAKH + W10, 0}, // Kazakh
490 {"kl", "kl", GREENLANDIC + W10, 0},
491 {"km", "km", KHMER + W10, 0},
492 {"kn", "kn", KANNADA + W10, 0},
493 {"ko", "ko", KOREAN + W10, 0},
494 {"kor", "ko", KOREAN + W10, 0},
495 {"kr", "ko", KOREAN + W10, 0}, // Country code Korea
496 {"ks", "ks", KASHMIRI + W10, 0},
497 {"ksc", "ko", KOREAN + W10, 0}, // KSC encoding
498 {"ku", "ku", KURDISH + W10, 0},
499 {"ky", "ky", KYRGYZ + W10, 0},
500 {"kz", "kk", KAZAKH + W10, 0}, // Kazakhstan
501 {"la", "la", LATIN + W10, 0},
502 {"lao", "lo", LAOTHIAN + W10, 0}, // Laos
504 {"lb", "lb", LUXEMBOURGISH + W10, 0},
505 {"lg", "lg", GANDA + W10, 0},
506 {"lit", "lt", LITHUANIAN + W10, 0},
507 {"ln", "ln", LINGALA + W10, 0},
508 {"lo", "lo", LAOTHIAN + W10, 0},
509 {"lt", "lt", LITHUANIAN + W10, 0},
510 {"ltu", "lt", LITHUANIAN + W10, 0},
511 {"lv", "lv", LATVIAN + W10, 0},
513 {"mfe", "mfe", MAURITIAN_CREOLE + W10, 0},
514 {"mg", "mg", MALAGASY + W10, 0},
515 {"mi", "mi", MAORI + W10, 0},
516 {"mk", "mk", MACEDONIAN + W10, 0},
517 {"ml", "ml", MALAYALAM + W10, 0},
518 {"mn", "mn", MONGOLIAN + W10, 0},
519 {"mo", "mo", ROMANIAN + W10, 0},
520 {"mon", "mn", MONGOLIAN + W10, 0}, // Mongolian
521 {"mr", "mr", MARATHI + W10, HINDI - W4},
522 {"ms", "ms", MALAY + W10, INDONESIAN - W4},
523 {"mt", "mt", MALTESE + W10, 0},
524 {"mx", "es", SPANISH + W10, 0}, // Mexico
525 {"my", "my,ms", BURMESE + W10, MALAY + W10}, // Myanmar, Malaysia
527 {"na", "na", NAURU + W10, 0},
528 {"nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
529 {"ne", "ne", NEPALI + W10, 0},
530 {"nl", "nl", DUTCH + W10, 0},
531 {"nn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},
532 {"no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
533 {"nr", "nr", NDEBELE + W10, 0},
534 {"nso", "nso", PEDI + W10, 0},
535 {"ny", "ny", NYANJA + W10, 0},
537 {"oc", "oc", OCCITAN + W10, 0},
538 {"om", "om", OROMO + W10, 0},
539 {"or", "or", ORIYA + W10, 0},
541 {"pa", "pa,ps", PUNJABI + W10, PASHTO + W10}, // 1:2 pa-Guru ps-Arab
542 {"per", "fa", PERSIAN + W10, 0},
543 {"ph", "tl", TAGALOG + W10, 0}, // Philippines
544 {"pk", "ur", URDU + W10, 0}, // Pakistan
545 {"pl", "pl", POLISH + W10, 0},
546 {"pnb", "pa", PUNJABI + W10, 0}, // Western Punjabi
547 {"pol", "pl", POLISH + W10, 0},
548 {"por", "pt", PORTUGUESE + W10, 0},
549 {"ps", "ps", PASHTO + W10, 0},
550 {"pt", "pt", PORTUGUESE + W10, 0},
551 {"ptg", "pt", PORTUGUESE + W10, 0},
552 {"qc", "fr", FRENCH + W10, 0}, // Quebec "country" code
553 {"qu", "qu", QUECHUA + W10, 0},
555 {"rm", "rm", RHAETO_ROMANCE + W10, 0},
556 {"rn", "rn", RUNDI + W10, 0},
557 {"ro", "ro", ROMANIAN + W10, 0},
558 {"rs", "sr", SERBIAN + W10, 0}, // Serbia country code
559 {"ru", "ru", RUSSIAN + W10, 0},
560 {"rus", "ru", RUSSIAN + W10, 0},
561 {"rw", "rw", KINYARWANDA + W10, 0},
563 {"sa", "sa", SANSKRIT + W10, 0},
564 {"sco", "sco", SCOTS + W10, ENGLISH - W4},
565 {"sd", "sd", SINDHI + W10, 0},
566 {"se", "sv", SWEDISH + W10, 0},
567 {"sg", "sg", SANGO + W10, 0},
568 {"si", "si,sl", SINHALESE + W10, SLOVENIAN + W10}, // 1:2 Sinhalese, Slovenia
569 {"sk", "sk", SLOVAK + W10, CZECH - W4},
570 {"sl", "sl", SLOVENIAN + W10, 0},
571 {"slo", "sl", SLOVENIAN + W10, 0},
572 {"sm", "sm", SAMOAN + W10, 0},
573 {"sn", "sn", SHONA + W10, 0},
574 {"so", "so", SOMALI + W10, 0},
575 {"sp", "es", SPANISH + W10, 0},
576 {"sq", "sq", ALBANIAN + W10, 0},
577 {"sr", "sr", SERBIAN + W10, 0},
578 {"srb", "sr", SERBIAN + W10, 0},
579 {"srl", "sr", SERBIAN + W10, 0}, // Serbian Latin
580 {"srp", "sr", SERBIAN + W10, 0},
581 {"ss", "ss", SISWANT + W10, 0},
582 {"st", "st", SESOTHO + W10, 0},
583 {"su", "su", SUNDANESE + W10, 0},
584 {"sv", "sv", SWEDISH + W10, 0},
585 {"sve", "sv", SWEDISH + W10, 0},
586 {"sw", "sw", SWAHILI + W10, 0},
587 {"swe", "sv", SWEDISH + W10, 0},
588 {"sy", "syr", SYRIAC + W10, 0},
589 {"syr", "syr", SYRIAC + W10, 0},
591 {"ta", "ta", TAMIL + W10, 0},
592 {"te", "te", TELUGU + W10, 0},
593 {"tg", "tg", TAJIK + W10, 0},
594 {"th", "th", THAI + W10, 0},
595 {"ti", "ti,bo", TIGRINYA + W10, TIBETAN + W10}, // 1:2 Tigrinya, Tibet
596 {"tj", "tg", TAJIK + W10, 0}, // Tajikistan
597 {"tk", "tk", TURKMEN + W10, 0},
598 {"tl", "tl", TAGALOG + W10, 0},
599 {"tlh", "tlh", X_KLINGON + W10, 0},
600 {"tn", "tn", TSWANA + W10, 0},
601 {"to", "to", TONGA + W10, 0},
602 {"tr", "tr", TURKISH + W10, 0},
603 {"ts", "ts", TSONGA + W10, 0},
604 {"tt", "tt", TATAR + W10, 0},
605 {"tw", "ak,zhT", AKAN + W10, CHINESE_T + W10}, // 1:2 Twi => Akan, Taiwan
606 {"twi", "ak", AKAN + W10, 0}, // Twi => Akan
608 {"ua", "uk", UKRAINIAN + W10, 0}, // Ukraine
609 {"ug", "ug", UIGHUR + W10, 0},
610 {"uk", "uk", UKRAINIAN + W10, 0},
611 {"ur", "ur", URDU + W10, 0},
612 {"uz", "uz", UZBEK + W10, 0},
614 {"va", "ca", CATALAN + W10, 0}, // Valencia => Catalan
615 {"val", "ca", CATALAN + W10, 0}, // Valencia => Catalan
616 {"ve", "ve", VENDA + W10, 0},
617 {"vi", "vi", VIETNAMESE + W10, 0},
618 {"vie", "vi", VIETNAMESE + W10, 0},
619 {"vn", "vi", VIETNAMESE + W10, 0},
620 {"vo", "vo", VOLAPUK + W10, 0},
622 {"wo", "wo", WOLOF + W10, 0},
624 {"xh", "xh", XHOSA + W10, ZULU - W4},
625 {"xho", "xh", XHOSA + W10, ZULU - W4},
627 {"yi", "yi", YIDDISH + W10, 0},
628 {"yo", "yo", YORUBA + W10, 0},
630 {"za", "za", ZHUANG + W10, 0},
631 {"zh", "zh", CHINESE + W10, 0},
632 {"zht", "zhT", CHINESE_T + W10, 0},
633 {"zu", "zu", ZULU + W10, XHOSA - W4},
634 };
637 // Possibly map to tl:
638 // -LangTags tl-Latn /7val.com/ ,bcl 2 Central Bicolano
639 // -LangTags tl-Latn /7val.com/ ,ceb 6 Cebuano
640 // -LangTags tl-Latn /7val.com/ ,war 1 Waray
644 // Table to look up country TLD (no general TLD)
645 // In alphabetical order for binary search
// Each row is {tld, prior 1, prior 2}. TLD hints are weaker than lang= hints:
// most rows use W4 or W2, and W8 is used only for the Bosnian / Croatian /
// Montenegrin / Serbian group (plus W10 for .bt). Rows with {.., 0, 0} carry
// no hint; commented-out rows are TLDs that are not in the table at all.
// kCLDTable3Size is the declared array length and matches the number of live
// rows in the initializer; keep the two in step and keep the keys sorted.
646 static const int kCLDTable3Size = 181;
647 static const TLDLookup kCLDTLDHintTable[kCLDTable3Size] = {
648 {"ac", JAPANESE + W2, 0},
649 {"ad", CATALAN + W4, 0},
650 {"ae", ARABIC + W4, 0},
651 {"af", PASHTO + W4, PERSIAN + W4},
652 {"ag", GERMAN + W2, 0}, // meager
653 // {"ai", 0, 0}, // meager
654 {"al", ALBANIAN + W4, 0},
655 {"am", ARMENIAN + W4, 0},
656 {"an", DUTCH + W4, 0}, // meager
657 {"ao", PORTUGUESE + W4, 0},
658 // {"aq", 0, 0}, // meager
659 {"ar", SPANISH + W4, 0},
660 // {"as", 0, 0},
661 {"at", GERMAN + W4, 0},
662 {"au", ENGLISH + W2, 0},
663 {"aw", DUTCH + W4, 0},
664 {"ax", SWEDISH + W4, 0},
665 {"az", AZERBAIJANI + W4, 0},
667 {"ba", BOSNIAN + W8, CROATIAN - W4},
668 // {"bb", 0, 0},
669 {"bd", BENGALI + W4, 0},
670 {"be", DUTCH + W4, FRENCH + W4},
671 {"bf", FRENCH + W4, 0},
672 {"bg", BULGARIAN + W4, 0},
673 {"bh", ARABIC + W4, 0},
674 {"bi", RUNDI + W4, FRENCH + W4},
675 {"bj", FRENCH + W4, 0},
676 {"bm", ENGLISH + W2, 0},
677 {"bn", MALAY + W4, INDONESIAN - W4},
678 {"bo", SPANISH + W4, AYMARA + W2}, // and GUARANI QUECHUA
679 {"br", PORTUGUESE + W4, 0},
680 // {"bs", 0, 0},
681 {"bt", DZONGKHA + W10, TIBETAN - W10}, // Strong presumption of Dzongkha
682 {"bw", TSWANA + W4, 0},
683 {"by", BELARUSIAN + W4, 0},
684 // {"bz", 0, 0},
686 {"ca", FRENCH + W4, ENGLISH + W2},
687 {"cat", CATALAN + W4, 0},
688 {"cc", 0, 0},
689 {"cd", FRENCH + W4, 0},
690 {"cf", FRENCH + W4, 0},
691 {"cg", FRENCH + W4, 0},
692 {"ch", GERMAN + W4, FRENCH + W4},
693 {"ci", FRENCH + W4, 0},
694 // {"ck", 0, 0},
695 {"cl", SPANISH + W4, 0},
696 {"cm", FRENCH + W4, 0},
697 {"cn", CHINESE + W4, 0},
698 {"co", SPANISH + W4, 0},
699 {"cr", SPANISH + W4, 0},
700 {"cu", SPANISH + W4, 0},
701 {"cv", PORTUGUESE + W4, 0},
702 // {"cx", 0, 0},
703 {"cy", GREEK + W4, TURKISH + W4},
704 {"cz", CZECH + W4, SLOVAK - W4},
706 {"de", GERMAN + W4, 0},
707 {"dj", 0, 0},
708 {"dk", DANISH + W4, NORWEGIAN - W4},
709 {"dm", 0, 0},
710 {"do", SPANISH + W4, 0},
711 {"dz", FRENCH + W4, ARABIC + W4},
713 {"ec", SPANISH + W4, 0},
714 {"ee", ESTONIAN + W4, 0},
715 {"eg", ARABIC + W4, 0},
716 {"er", AFAR + W4, 0},
717 {"es", SPANISH + W4, 0},
718 {"et", AMHARIC + W4, AFAR + W4},
720 {"fi", FINNISH + W4, 0},
721 {"fj", FIJIAN + W4, 0},
722 // {"fk", 0, 0},
723 // {"fm", 0, 0},
724 {"fo", FAROESE + W4, ICELANDIC - W4},
725 {"fr", FRENCH + W4, 0},
727 {"ga", FRENCH + W4, 0},
728 {"gd", 0, 0},
729 {"ge", GEORGIAN + W4, 0},
730 {"gf", FRENCH + W4, 0},
731 // {"gg", 0, 0},
732 // {"gh", 0, 0},
733 // {"gi", 0, 0},
734 {"gl", GREENLANDIC + W4, DANISH + W4},
735 // {"gm", 0, 0},
736 {"gn", FRENCH + W4, 0},
737 // {"gp", 0, 0},
738 // {"gq", 0, 0},
739 {"gr", GREEK + W4, 0},
740 // {"gs", 0, 0},
741 {"gt", SPANISH + W4, 0},
742 // {"gu", 0, 0},
743 // {"gy", 0, 0},
745 {"hk", CHINESE_T + W4, 0},
746 // {"hm", 0, 0},
747 {"hn", SPANISH + W4, 0},
748 {"hr", CROATIAN + W8, BOSNIAN - W4},
749 {"ht", HAITIAN_CREOLE + W4, FRENCH + W4},
750 {"hu", HUNGARIAN + W4, 0},
752 {"id", INDONESIAN + W4, MALAY - W4},
753 {"ie", IRISH + W4, 0},
754 {"il", HEBREW + W4, 0},
755 {"im", MANX + W4, 0},
756 // {"in", 0, 0},
757 // {"io", 0, 0},
758 {"iq", ARABIC + W4, 0},
759 {"ir", PERSIAN + W4, 0},
760 {"is", ICELANDIC + W4, FAROESE - W4},
761 {"it", ITALIAN + W4, 0},
763 // {"je", 0, 0},
764 // {"jm", 0, 0},
765 {"jo", ARABIC + W4, 0},
766 {"jp", JAPANESE + W4, 0},
768 // {"ke", 0, 0},
769 {"kg", KYRGYZ + W4, 0},
770 {"kh", KHMER + W4, 0},
771 // {"ki", 0, 0},
772 {"km", FRENCH + W4, 0},
773 // {"kn", 0, 0},
774 {"kp", KOREAN + W4, 0},
775 {"kr", KOREAN + W4, 0},
776 {"kw", ARABIC + W4, 0},
777 // {"ky", 0, 0},
778 {"kz", KAZAKH + W4, 0},
780 {"la", LAOTHIAN + W4, 0},
781 {"lb", ARABIC + W4, FRENCH + W4},
782 // {"lc", 0, 0},
783 {"li", GERMAN + W4, 0},
784 {"lk", SINHALESE + W4, 0},
785 // {"lr", 0, 0},
786 {"ls", SESOTHO + W4, 0},
787 {"lt", LITHUANIAN + W4, 0},
788 {"lu", LUXEMBOURGISH + W4}, // Second prior omitted here; it is implicitly zero-initialized
789 {"lv", LATVIAN + W4, 0},
790 {"ly", ARABIC + W4, 0},
792 {"ma", FRENCH + W4, 0},
793 {"mc", FRENCH + W4, 0},
794 {"md", ROMANIAN + W4, 0},
795 {"me", MONTENEGRIN + W8, SERBIAN - W4},
796 {"mg", FRENCH + W4, 0},
797 {"mk", MACEDONIAN + W4, 0},
798 {"ml", FRENCH + W4, 0},
799 {"mm", BURMESE + W4, 0},
800 {"mn", MONGOLIAN + W4, 0},
801 {"mo", CHINESE_T + W4, PORTUGUESE + W4},
802 // {"mp", 0, 0},
803 {"mq", FRENCH + W4, 0},
804 {"mr", FRENCH + W4, ARABIC + W4},
805 // {"ms", 0, 0},
806 {"mt", MALTESE + W4, 0},
807 // {"mu", 0, 0},
808 {"mv", DHIVEHI + W4, 0},
809 // {"mw", 0, 0},
810 {"mx", SPANISH + W4, 0},
811 {"my", MALAY + W4, INDONESIAN - W4},
812 {"mz", PORTUGUESE + W4, 0},
814 {"na", 0, 0}, // Namibia
815 {"nc", FRENCH + W4, 0},
816 {"ne", FRENCH + W4, 0},
817 {"nf", FRENCH + W4, 0},
818 // {"ng", 0, 0},
819 {"ni", SPANISH + W4, 0},
820 {"nl", DUTCH + W4, 0},
821 {"no", NORWEGIAN + W4, NORWEGIAN_N + W2},
822 {"np", NEPALI + W4, 0},
823 {"nr", NAURU + W4, 0},
824 {"nu", SWEDISH + W4, 0},
825 {"nz", MAORI + W4, ENGLISH + W2},
827 {"om", ARABIC + W4, 0},
829 {"pa", SPANISH + W4, 0},
830 {"pe", SPANISH + W4, QUECHUA + W2}, // also AYMARA
831 {"pf", FRENCH + W4, 0},
832 // {"pg", 0, 0},
833 {"ph", TAGALOG + W4, 0},
834 {"pk", URDU + W4, 0},
835 {"pl", POLISH + W4, 0},
836 // {"pn", 0, 0},
837 {"pr", SPANISH + W4, 0},
838 {"ps", ARABIC + W4, 0},
839 {"pt", PORTUGUESE + W4, 0},
840 {"py", SPANISH + W4, GUARANI + W2},
842 {"qa", ARABIC + W4, 0},
844 {"re", FRENCH + W4, 0},
845 {"ro", ROMANIAN + W4, 0},
846 {"rs", SERBIAN + W8, MONTENEGRIN - W4},
847 {"ru", RUSSIAN + W4, 0},
848 {"rw", KINYARWANDA + W4, FRENCH + W2},
850 {"sa", ARABIC + W4, 0},
851 // {"sb", 0, 0},
852 {"sc", SESELWA + W4, 0},
853 {"sd", ARABIC + W4, 0},
854 {"se", SWEDISH + W4, 0},
855 // {"sg", 0, 0},
856 // {"sh", 0, 0},
857 {"si", SLOVENIAN + W4, 0},
858 {"sk", SLOVAK + W4, CZECH - W4},
859 // {"sl", 0, 0},
860 {"sm", ITALIAN + W4, 0},
861 {"sn", FRENCH + W4, 0},
862 // {"sr", 0, 0},
863 {"ss", ARABIC + W4, 0}, // Presumed South Sudan TLD. dsites 2011.07.07
864 // {"st", 0, 0},
865 {"su", RUSSIAN + W4, 0},
866 {"sv", SPANISH + W4, 0},
867 {"sy", ARABIC + W4, 0},
868 // {"sz", 0, 0},
870 // {"tc", 0, 0},
871 {"td", FRENCH + W4, 0},
872 // {"tf", 0, 0},
873 {"tg", FRENCH + W4, 0},
874 {"th", THAI + W4, 0},
875 // Tibet has no country code (see .cn)
876 {"tj", TAJIK + W4, 0},
877 // {"tk", 0, 0},
878 // {"tl", 0, 0},
879 {"tm", TURKISH + W4, 0},
880 {"tn", FRENCH + W4, ARABIC + W4},
881 // {"to", 0, 0},
882 {"tp", JAPANESE + W4, 0},
883 {"tr", TURKISH + W4, 0},
884 // {"tt", 0, 0},
885 // {"tv", 0, 0},
886 {"tw", CHINESE_T + W4, 0},
887 {"tz", SWAHILI + W4, AKAN + W4},
889 {"ua", UKRAINIAN + W4, 0},
890 {"ug", GANDA + W4, 0},
891 {"uk", ENGLISH + W2, 0},
892 {"us", ENGLISH + W2, 0},
893 {"uy", SPANISH + W4, 0},
894 {"uz", UZBEK + W4, 0},
896 {"va", ITALIAN + W4, LATIN + W2},
897 // {"vc", 0, 0},
898 {"ve", SPANISH + W4, 0},
899 // {"vg", 0, 0},
900 // {"vi", 0, 0},
901 {"vn", VIETNAMESE + W4, 0},
902 // {"vu", 0, 0},
904 {"wf", FRENCH + W4, 0},
905 // {"ws", 0, 0},
907 {"ye", ARABIC + W4, 0},
909 {"za", AFRIKAANS + W4, 0},
910 // {"zm", 0, 0},
911 // {"zw", 0, 0},
912 };
// Done with the hint tables: drop the W<n> weight shorthands used by the
// table entries above (for example "FRENCH + W4" or "CZECH - W4") so the
// short names are not visible to the rest of the file.
#undef W2
#undef W4
#undef W6
#undef W8
#undef W10
#undef W12
925 inline void SetCLDPriorWeight(int w, OneCLDLangPrior* olp) {
926 *olp = (*olp & 0x3ff) + (w << 10);
927 }
928 inline void SetCLDPriorLang(Language lang, OneCLDLangPrior* olp) {
929 *olp = (*olp & ~0x3ff) + lang;
930 }
932 OneCLDLangPrior PackCLDPriorLangWeight(Language lang, int w) {
933 return (w << 10) + lang;
934 }
// Return the larger of two ints (a when they are equal).
inline int MaxInt(int a, int b) {
  if (b > a) {return b;}
  return a;
}
940 // Merge in another language prior, taking max if already there
941 void MergeCLDLangPriorsMax(OneCLDLangPrior olp, CLDLangPriors* lps) {
942 if (olp == 0) {return;}
943 Language target_lang = GetCLDPriorLang(olp);
944 for (int i = 0; i < lps->n; ++i) {
945 if (GetCLDPriorLang(lps->prior[i]) == target_lang) {
946 int new_weight = MaxInt(GetCLDPriorWeight(lps->prior[i]),
947 GetCLDPriorWeight(olp));
948 SetCLDPriorWeight(new_weight, &lps->prior[i]);
949 return;
950 }
951 }
952 // Not found; add it if room
953 if (lps->n >= kMaxOneCLDLangPrior) {return;}
954 lps->prior[lps->n++] = olp;
955 }
957 // Merge in another language prior, boosting 10x if already there
958 void MergeCLDLangPriorsBoost(OneCLDLangPrior olp, CLDLangPriors* lps) {
959 if (olp == 0) {return;}
960 Language target_lang = GetCLDPriorLang(olp);
961 for (int i = 0; i < lps->n; ++i) {
962 if (GetCLDPriorLang(lps->prior[i]) == target_lang) {
963 int new_weight = GetCLDPriorWeight(lps->prior[i]) + 2;
964 SetCLDPriorWeight(new_weight, &lps->prior[i]);
965 return;
966 }
967 }
968 // Not found; add it if room
969 if (lps->n >= kMaxOneCLDLangPrior) {return;}
970 lps->prior[lps->n++] = olp;
971 }
974 // Trim language priors to no more than max_entries, keeping largest abs weights
975 void TrimCLDLangPriors(int max_entries, CLDLangPriors* lps) {
976 if (lps->n <= max_entries) {return;}
978 // Insertion sort in-place by abs(weight)
979 for (int i = 0; i < lps->n; ++i) {
980 OneCLDLangPrior temp_olp = lps->prior[i];
981 int w = abs(GetCLDPriorWeight(temp_olp));
982 int kk = i;
983 for (; kk > 0; --kk) {
984 if (abs(GetCLDPriorWeight(lps->prior[kk - 1])) < w) {
985 // Move down and continue
986 lps->prior[kk] = lps->prior[kk - 1];
987 } else {
988 // abs(weight[kk - 1]) >= w, time to stop
989 break;
990 }
991 }
992 lps->prior[kk] = temp_olp;
993 }
995 lps->n = max_entries;
996 }
// Return how many ',' bytes occur in langtags.
int CountCommas(const string& langtags) {
  int total = 0;
  for (string::const_iterator it = langtags.begin();
       it != langtags.end(); ++it) {
    total += (*it == ',') ? 1 : 0;
  }
  return total;
}
1006 // Binary lookup on language tag
1007 const LangTagLookup* DoLangTagLookup(const char* key,
1008 const LangTagLookup* tbl, int tbl_size) {
1009 // Key is always in range [lo..hi)
1010 int lo = 0;
1011 int hi = tbl_size;
1012 while (lo < hi) {
1013 int mid = (lo + hi) >> 1;
1014 int comp = strcmp(tbl[mid].langtag, key);
1015 if (comp < 0) {
1016 lo = mid + 1;
1017 } else if (comp > 0) {
1018 hi = mid;
1019 } else {
1020 return &tbl[mid];
1021 }
1022 }
1023 return NULL;
1024 }
1026 // Binary lookup on tld
1027 const TLDLookup* DoTLDLookup(const char* key,
1028 const TLDLookup* tbl, int tbl_size) {
1029 // Key is always in range [lo..hi)
1030 int lo = 0;
1031 int hi = tbl_size;
1032 while (lo < hi) {
1033 int mid = (lo + hi) >> 1;
1034 int comp = strcmp(tbl[mid].tld, key);
1035 if (comp < 0) {
1036 lo = mid + 1;
1037 } else if (comp > 0) {
1038 hi = mid;
1039 } else {
1040 return &tbl[mid];
1041 }
1042 }
1043 return NULL;
1044 }
1048 // Trim language tag string to canonical form for each language
1049 // Input is from GetLangTagsFromHtml(), already lowercased
1050 string TrimCLDLangTagsHint(const string& langtags) {
1051 string retval;
1052 if (langtags.empty()) {return retval;}
1053 int commas = CountCommas(langtags);
1054 if (commas > 4) {return retval;} // Ignore if too many language tags
1056 char temp[20];
1057 int pos = 0;
1058 while (pos < static_cast<int>(langtags.size())) {
1059 int comma = langtags.find(',', pos);
1060 if (comma == string::npos) {comma = langtags.size();} // fake trailing comma
1061 int len = comma - pos;
1062 if (len <= 16) {
1063 // Short enough to use
1064 memcpy(temp, &langtags[pos], len);
1065 temp[len] = '\0';
1066 const LangTagLookup* entry = DoLangTagLookup(temp,
1067 kCLDLangTagsHintTable1,
1068 kCLDTable1Size);
1069 if (entry != NULL) {
1070 // First table hit
1071 retval.append(entry->langcode); // may be "code1,code2"
1072 retval.append(1, ',');
1073 } else {
1074 // Try second table with language code truncated at first hyphen
1075 char* hyphen = strchr(temp, '-');
1076 if (hyphen != NULL) {*hyphen = '\0';}
1077 len = strlen(temp);
1078 if (len <= 3) { // Short enough to use
1079 entry = DoLangTagLookup(temp,
1080 kCLDLangTagsHintTable2,
1081 kCLDTable2Size);
1082 if (entry != NULL) {
1083 // Second table hit
1084 retval.append(entry->langcode); // may be "code1,code2"
1085 retval.append(1, ',');
1086 }
1087 }
1088 }
1089 }
1090 pos = comma + 1;
1091 }
1093 // Remove trainling comma, if any
1094 if (!retval.empty()) {retval.resize(retval.size() - 1);}
1095 return retval;
1096 }
1100 //==============================================================================
// Little state machine to scan insides of language attribute quoted-string.
// Each language code is lowercased and copied to the output string. Underscore
// is mapped to minus. Space, tab, and comma are all mapped to comma, and
// multiple consecutive commas are removed.
// Each language code in the output list will be followed by a single comma.

// There are three states, and we start in state 1:
// State 0: After a letter.
//   Copy all letters/minus[0], copy comma[1]; all others copy comma and skip [2]
// State 1: Just after a comma.
//   Copy letter [0], Ignore subsequent commas[1]. minus and all others skip [2]
// State 2: Skipping.
//   All characters except comma skip and stay in [2]. comma goes to [1]
// (The bracketed digit is the state entered next.)
//
// NOTE(review): in the MINUS and Bad macros below the state-1 action is
// COPY2, i.e. a comma IS copied when skipping starts directly after a comma.
// That differs from the "skip" wording for state 1 above and can put two
// commas in a row into the output -- confirm which behavior is intended.

// The thing that is copied is kLangCodeRemap[c] when going to state 0,
// and always comma when going to state 1 or 2. The design depends on copying
// a comma at the *beginning* of skipping, and in state 2 never doing a copy.

// We pack all this into 8 bits:
//    +--+---+---+
//    |78|654|321|
//    +--+---+---+
//
// Shift byte right by 3*state, giving [0] 321, [1] 654, [2] .78
// where . is always zero
// Of these 3 bits, low two are next state ss, high bit is copy bit C.
// If C=1 and ss == 0, copy kLangCodeRemap[c], else copy a comma

// Per-state action codes: the low two bits are the next state, and the 4 bit
// is the copy flag. SKIPn moves to state n without copying anything; COPYn
// moves to state n and copies one byte.
#define SKIP0 0
#define SKIP1 1
#define SKIP2 2
#define COPY0 4 // copy kLangCodeRemap[c]
#define COPY1 5 // copy ','
#define COPY2 6 // copy ','

// These combined actions pack three states into one byte.
// Ninth bit must be zero, so all state 2 values must be skips.
// (The state-2 field only has two bits in the byte, so it has no room for
// the copy flag.)
//              state[2]       state[1]       state[0]
#define LTR ((SKIP2 << 6) + (COPY0 << 3) + COPY0)
#define MINUS ((SKIP2 << 6) + (COPY2 << 3) + COPY0)
#define COMMA ((SKIP1 << 6) + (SKIP1 << 3) + COPY1)
#define Bad ((SKIP2 << 6) + (COPY2 << 3) + COPY2)

// Action byte for each possible input byte value, 16 entries per row
// (row k covers byte values k*16 .. k*16+15).
// Treat as letter: a-z, A-Z
// Treat as minus: 2D minus, 5F underscore
// Treat as comma: 09 tab, 20 space, 2C comma
// Everything else, including digits and all bytes 80..FF, is Bad.
static const unsigned char kLangCodeAction[256] = {
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,COMMA,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  COMMA,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,COMMA,MINUS,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,

  Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,
  LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,Bad,Bad,Bad,Bad,MINUS,
  Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,
  LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,Bad,Bad,Bad,Bad,Bad,

  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,

  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
};

// This does lowercasing, maps underscore to minus, and maps tab/space to comma
// Indexed by input byte value, same row layout as kLangCodeAction. Bytes that
// are never copied map to 0.
static const unsigned char kLangCodeRemap[256] = {
  0,0,0,0,0,0,0,0, 0,',',0,0,0,0,0,0, // 09 tab
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  ',',0,0,0,0,0,0,0, 0,0,0,0,',','-',0,0, // 20 space 2C comma 2D minus
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

  0,'a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o',
  'p','q','r','s','t','u','v','w', 'x','y','z',0,0,0,0,'-', // 5F underscore
  0,'a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o',
  'p','q','r','s','t','u','v','w', 'x','y','z',0,0,0,0,0,

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};

// The action macros are only needed to build the table above.
#undef LTR
#undef MINUS
#undef COMMA
#undef Bad

#undef SKIP0
#undef SKIP1
#undef SKIP2
#undef COPY0
#undef COPY1
#undef COPY2
1207 // Find opening '<' for HTML tag
1208 // Note: this is all somewhat insensitive to mismatched quotes
1209 int32 FindTagStart(const char* utf8_body, int32 pos, int32 max_pos) {
1210 int i = pos;
1211 // Advance i by 4 if none of the next 4 bytes are '<'
1212 for (i = pos; i < (max_pos - 3); i += 4) {
1213 // Fast check for any <
1214 const char* p = &utf8_body[i];
1215 uint32 s0123 = UNALIGNED_LOAD32(p);
1216 uint32 temp = s0123 ^ 0x3c3c3c3c; // <<<<
1217 if (((temp - 0x01010101) & (~temp & 0x80808080)) != 0) {
1218 // At least one byte is '<'
1219 break;
1220 }
1221 }
1222 // Continue, advancing i by 1
1223 for (; i < max_pos; ++i) {
1224 if (utf8_body[i] == '<') {return i;}
1225 }
1226 return -1;
1227 }
1230 // Find closing '>' for HTML tag. Also stop on < and & (simplistic parsing)
1231 int32 FindTagEnd(const char* utf8_body, int32 pos, int32 max_pos) {
1232 // Always outside quotes
1233 for (int i = pos; i < max_pos; ++i) {
1234 char c = utf8_body[i];
1235 if (c == '>') {return i;}
1236 if (c == '<') {return i - 1;}
1237 if (c == '&') {return i - 1;}
1238 }
1239 return -1; // nothing found
1240 }
1242 // Find opening quote or apostrophe, skipping spaces
1243 // Note: this is all somewhat insensitive to mismatched quotes
1244 int32 FindQuoteStart(const char* utf8_body, int32 pos, int32 max_pos) {
1245 for (int i = pos; i < max_pos; ++i) {
1246 char c = utf8_body[i];
1247 if (c == '"') {return i;}
1248 if (c == '\'') {return i;}
1249 if (c != ' ') {return -1;}
1250 }
1251 return -1;
1252 }
1254 // Find closing quot/apos. Also stop on = > < and & (simplistic parsing)
1255 int32 FindQuoteEnd(const char* utf8_body, int32 pos, int32 max_pos) {
1256 // Always outside quotes
1257 for (int i = pos; i < max_pos; ++i) {
1258 char c = utf8_body[i];
1259 if (c == '"') {return i;}
1260 if (c == '\'') {return i;}
1261 if (c == '>') {return i - 1;}
1262 if (c == '=') {return i - 1;}
1263 if (c == '<') {return i - 1;}
1264 if (c == '&') {return i - 1;}
1265 }
1266 return -1; // nothing found
1267 }
1269 int32 FindEqualSign(const char* utf8_body, int32 pos, int32 max_pos) {
1270 // Outside quotes/apostrophes loop
1271 for (int i = pos; i < max_pos; ++i) {
1272 char c = utf8_body[i];
1273 if (c == '=') { // Found bare equal sign inside tag
1274 return i;
1275 } else if (c == '"') {
1276 // Inside quotes loop
1277 int j;
1278 for (j = i + 1; j < max_pos; ++j) {
1279 if (utf8_body[j] == '"') {
1280 break;
1281 } else if (utf8_body[j] == '\\') {
1282 ++j;
1283 }
1284 }
1285 i = j;
1286 } else if (c == '\'') {
1287 // Inside apostrophes loop
1288 int j;
1289 for (j = i + 1; j < max_pos; ++j) {
1290 if (utf8_body[j] == '\'') {
1291 break;
1292 } else if (utf8_body[j] == '\\') {
1293 ++j;
1294 }
1295 }
1296 i = j;
1297 }
1299 }
1300 return -1; // nothing found
1301 }
1303 // Scan backwards for case-insensitive string s in [min_pos..pos)
1304 // Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f]
1305 // Cheap lowercase. Control codes will masquerade as 20..3f
1306 bool FindBefore(const char* utf8_body,
1307 int32 min_pos, int32 pos, const char* s) {
1308 int len = strlen(s);
1309 if ((pos - min_pos) < len) {return false;} // Too small to fit s
1311 // Skip trailing spaces
1312 int i = pos;
1313 while ((i > (min_pos + len)) && (utf8_body[i - 1] == ' ')) {--i;}
1314 i -= len;
1315 if (i < min_pos) {return false;} // pos - min_pos < len, so s can't be found
1317 const char* p = &utf8_body[i];
1318 for (int j = 0; j < len; ++j) {
1319 if ((p[j] | 0x20) != s[j]) {return false;} // Unequal byte
1320 }
1321 return true; // All bytes equal at i
1322 }
1324 // Scan forwards for case-insensitive string s in [pos..max_pos)
1325 // Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f]
1326 // Cheap lowercase. Control codes will masquerade as 20..3f
1327 // Allows but does not require quoted/apostrophe string
1328 bool FindAfter(const char* utf8_body,
1329 int32 pos, int32 max_pos, const char* s) {
1330 int len = strlen(s);
1331 if ((max_pos - pos) < len) {return false;} // Too small to fit s
1333 // Skip leading spaces, quote, apostrophe
1334 int i = pos;
1335 while (i < (max_pos - len)) {
1336 unsigned char c = utf8_body[i];
1337 if ((c == ' ') || (c == '"') || (c == '\'')) {++i;}
1338 else {break;}
1339 }
1341 const char* p = &utf8_body[i];
1342 for (int j = 0; j < len; ++j) {
1343 if ((p[j] | 0x20) != s[j]) {return false;} // Unequal byte
1344 }
1345 return true; // All bytes equal
1346 }
1350 // Copy attribute value in [pos..max_pos)
1351 // pos is just after an opening quote/apostrophe and max_pos is the ending one
1352 // String must all be on a single line.
1353 // Return slightly-normalized language list, empty or ending in comma
1354 // Does lowercasing and removes excess punctuation/space
1355 string CopyOneQuotedString(const char* utf8_body,
1356 int32 pos, int32 max_pos) {
1357 string s;
1358 int state = 1; // Front is logically just after a comma
1359 for (int i = pos; i < max_pos; ++i) {
1360 unsigned char c = utf8_body[i];
1361 int e = kLangCodeAction[c] >> (3 * state);
1362 state = e & 3; // Update to next state
1363 if ((e & 4) != 0) {
1364 // Copy a remapped byte if going to state 0, else copy a comma
1365 if (state == 0) {
1366 s.append(1, kLangCodeRemap[c]);
1367 } else {
1368 s.append(1, ',');
1369 }
1370 }
1371 }
1373 // Add final comma if needed
1374 if (state == 0) {
1375 s.append(1, ',');
1376 }
1377 return s;
1378 }
1380 // Find and copy attribute value: quoted string in [pos..max_pos)
1381 // Return slightly-normalized language list, empty or ending in comma
1382 string CopyQuotedString(const char* utf8_body,
1383 int32 pos, int32 max_pos) {
1384 int32 start_quote = FindQuoteStart(utf8_body, pos, max_pos);
1385 if (start_quote < 0) {return string("");}
1386 int32 end_quote = FindQuoteEnd(utf8_body, start_quote + 1, max_pos);
1387 if (end_quote < 0) {return string("");}
1389 return CopyOneQuotedString(utf8_body, start_quote + 1, end_quote);
1390 }
1392 // Add hints to vector of langpriors
1393 // Input is from GetLangTagsFromHtml(), already lowercased
1394 void SetCLDLangTagsHint(const string& langtags, CLDLangPriors* langpriors) {
1395 if (langtags.empty()) {return;}
1396 int commas = CountCommas(langtags);
1397 if (commas > 4) {return;} // Ignore if too many language tags
1399 char temp[20];
1400 int pos = 0;
1401 while (pos < static_cast<int>(langtags.size())) {
1402 int comma = langtags.find(',', pos);
1403 if (comma == string::npos) {comma = langtags.size();} // fake trailing comma
1404 int len = comma - pos;
1405 if (len <= 16) {
1406 // Short enough to use
1407 memcpy(temp, &langtags[pos], len);
1408 temp[len] = '\0';
1409 const LangTagLookup* entry = DoLangTagLookup(temp,
1410 kCLDLangTagsHintTable1,
1411 kCLDTable1Size);
1412 if (entry != NULL) {
1413 // First table hit
1414 MergeCLDLangPriorsMax(entry->onelangprior1, langpriors);
1415 MergeCLDLangPriorsMax(entry->onelangprior2, langpriors);
1416 } else {
1417 // Try second table with language code truncated at first hyphen
1418 char* hyphen = strchr(temp, '-');
1419 if (hyphen != NULL) {*hyphen = '\0';}
1420 len = strlen(temp);
1421 if (len <= 3) { // Short enough to use
1422 entry = DoLangTagLookup(temp,
1423 kCLDLangTagsHintTable2,
1424 kCLDTable2Size);
1425 if (entry != NULL) {
1426 // Second table hit
1427 MergeCLDLangPriorsMax(entry->onelangprior1, langpriors);
1428 MergeCLDLangPriorsMax(entry->onelangprior2, langpriors);
1429 }
1430 }
1431 }
1432 }
1433 pos = comma + 1;
1434 }
1435 }
1437 // Add hints to vector of langpriors
1438 // Input is string after HTTP header Content-Language:
1439 void SetCLDContentLangHint(const char* contentlang, CLDLangPriors* langpriors) {
1440 string langtags = CopyOneQuotedString(contentlang, 0, strlen(contentlang));
1441 SetCLDLangTagsHint(langtags, langpriors);
1442 }
1444 // Add hints to vector of langpriors
1445 // Input is last element of hostname (no dot), e.g. from GetTLD()
1446 void SetCLDTLDHint(const char* tld, CLDLangPriors* langpriors) {
1447 int len = strlen(tld);
1448 if (len > 3) {return;} // Ignore if more than three letters
1449 char local_tld[4];
1450 strncpy(local_tld, tld, 4);
1451 local_tld[3] = '\0'; // Safety move
1452 // Lowercase
1453 for (int i = 0; i < len; ++i) {local_tld[i] |= 0x20;}
1454 const TLDLookup* entry = DoTLDLookup(local_tld,
1455 kCLDTLDHintTable,
1456 kCLDTable3Size);
1457 if (entry != NULL) {
1458 // Table hit
1459 MergeCLDLangPriorsBoost(entry->onelangprior1, langpriors);
1460 MergeCLDLangPriorsBoost(entry->onelangprior2, langpriors);
1461 }
1462 }
1464 // Add hints to vector of langpriors
1465 // Input is from DetectEncoding()
1466 void SetCLDEncodingHint(Encoding enc, CLDLangPriors* langpriors) {
1467 OneCLDLangPrior olp;
1468 switch (enc) {
1469 case CHINESE_GB:
1470 case GBK:
1471 case GB18030:
1472 case ISO_2022_CN:
1473 case HZ_GB_2312:
1474 olp = PackCLDPriorLangWeight(CHINESE, kCLDPriorEncodingWeight);
1475 MergeCLDLangPriorsBoost(olp, langpriors);
1476 break;
1477 case CHINESE_BIG5:
1478 case CHINESE_BIG5_CP950:
1479 case BIG5_HKSCS:
1480 olp = PackCLDPriorLangWeight(CHINESE_T, kCLDPriorEncodingWeight);
1481 MergeCLDLangPriorsBoost(olp, langpriors);
1482 break;
1483 case JAPANESE_EUC_JP:
1484 case JAPANESE_SHIFT_JIS:
1485 case JAPANESE_CP932:
1486 case JAPANESE_JIS: // ISO-2022-JP
1487 olp = PackCLDPriorLangWeight(JAPANESE, kCLDPriorEncodingWeight);
1488 MergeCLDLangPriorsBoost(olp, langpriors);
1489 break;
1490 case KOREAN_EUC_KR:
1491 case ISO_2022_KR:
1492 olp = PackCLDPriorLangWeight(KOREAN, kCLDPriorEncodingWeight);
1493 MergeCLDLangPriorsBoost(olp, langpriors);
1494 break;
1496 default:
1497 break;
1498 }
1499 }
1501 // Add hints to vector of langpriors
1502 // Input is from random source
1503 void SetCLDLanguageHint(Language lang, CLDLangPriors* langpriors) {
1504 OneCLDLangPrior olp = PackCLDPriorLangWeight(lang, kCLDPriorLanguageWeight);
1505 MergeCLDLangPriorsBoost(olp, langpriors);
1506 }
1509 // Make printable string of priors
1510 string DumpCLDLangPriors(const CLDLangPriors* langpriors) {
1511 string retval;
1512 for (int i = 0; i < langpriors->n; ++i) {
1513 char temp[64];
1514 sprintf(temp, "%s.%d ",
1515 LanguageCode(GetCLDPriorLang(langpriors->prior[i])),
1516 GetCLDPriorWeight(langpriors->prior[i]));
1517 retval.append(temp);
1518 }
1519 return retval;
1520 }
1525 // Look for
1526 // <html lang="en">
1527 // <doc xml:lang="en">
1528 // <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en-US">
1529 // <meta http-equiv="content-language" content="en-GB" />
1530 // <meta name="language" content="Srpski">
1531 // <meta name="DC.language" scheme="RFCOMMA766" content="en">
1532 // <SPAN id="msg1" class="info" lang='en'>
1533 //
1534 // Do not trigger on
1535 // <!-- lang=french ...-->
1536 // <font lang=postscript ...>
1537 // <link href="index.fr.html" hreflang="fr-FR" xml:lang="fr-FR" />
1538 // <META name="Author" lang="fr" content="Arnaud Le Hors">
1539 //
1540 // Stop fairly quickly on mismatched quotes
1541 //
1542 // Allowed language characters
1543 // a-z A-Z -_ , space\t
1544 // Think about: GB2312, big5, shift-jis, euc-jp, ksc euc-kr
1545 // zh-hans zh-TW cmn-Hani zh_cn.gb18030_CN zh-min-nan zh-yue
1546 // de-x-mtfrom-en zh-tw-x-mtfrom-en (machine translation)
1547 // GB2312 => gb
1548 // Big5 => big
1549 // zh_CN.gb18030_C => zh-cn
1550 //
1551 // Remove duplicates and extra spaces as we go
1552 // Lowercase as we go.
1554 // Get language tag hints from HTML body
1555 // Normalize: remove spaces and make lowercase comma list
1557 string GetLangTagsFromHtml(const char* utf8_body, int32 utf8_body_len,
1558 int32 max_scan_bytes) {
1559 string retval;
1560 if (max_scan_bytes > utf8_body_len) {
1561 max_scan_bytes = utf8_body_len;
1562 }
1564 int32 k = 0;
1565 while (k < max_scan_bytes) {
1566 int32 start_tag = FindTagStart(utf8_body, k, max_scan_bytes);
1567 if (start_tag < 0) {break;}
1568 int32 end_tag = FindTagEnd(utf8_body, start_tag + 1, max_scan_bytes);
1569 // FindTagEnd exits on < > &
1570 if (end_tag < 0) {break;}
1572 // Skip <!--...>
1573 // Skip <font ...>
1574 // Skip <script ...>
1575 // Skip <link ...>
1576 // Skip <img ...>
1577 // Skip <a ...>
1578 if (FindAfter(utf8_body, start_tag + 1, end_tag, "!--") ||
1579 FindAfter(utf8_body, start_tag + 1, end_tag, "font ") ||
1580 FindAfter(utf8_body, start_tag + 1, end_tag, "script ") ||
1581 FindAfter(utf8_body, start_tag + 1, end_tag, "link ") ||
1582 FindAfter(utf8_body, start_tag + 1, end_tag, "img ") ||
1583 FindAfter(utf8_body, start_tag + 1, end_tag, "a ")) {
1584 k = end_tag + 1;
1585 continue;
1586 }
1588 // Remember <meta ...>
1589 bool in_meta = false;
1590 if (FindAfter(utf8_body, start_tag + 1, end_tag, "meta ")) {
1591 in_meta = true;
1592 }
1594 // Scan for each equal sign inside tag
1595 bool content_is_lang = false;
1596 int32 kk = start_tag + 1;
1597 int32 equal_sign;
1598 while ((equal_sign = FindEqualSign(utf8_body, kk, end_tag)) >= 0) {
1599 // eq exits on < > &
1601 // Look inside a meta tag
1602 // <meta ... http-equiv="content-language" ...>
1603 // <meta ... name="language" ...>
1604 // <meta ... name="dc.language" ...>
1605 if (in_meta) {
1606 if (FindBefore(utf8_body, kk, equal_sign, " http-equiv") &&
1607 FindAfter(utf8_body, equal_sign + 1, end_tag,
1608 "content-language ")) {
1609 content_is_lang = true;
1610 } else if (FindBefore(utf8_body, kk, equal_sign, " name") &&
1611 (FindAfter(utf8_body, equal_sign + 1, end_tag,
1612 "dc.language ") ||
1613 FindAfter(utf8_body, equal_sign + 1, end_tag,
1614 "language "))) {
1615 content_is_lang = true;
1616 }
1617 }
1619 // Look inside any tag
1620 // <meta ... content="lang-list" ...>
1621 // <... lang="lang-list" ...>
1622 // <... xml:lang="lang-list" ...>
1623 if ((content_is_lang && FindBefore(utf8_body, kk, equal_sign,
1624 " content")) ||
1625 FindBefore(utf8_body, kk, equal_sign, " lang") ||
1626 FindBefore(utf8_body, kk, equal_sign, ":lang")) {
1627 string temp = CopyQuotedString(utf8_body, equal_sign + 1, end_tag);
1629 // Append new lang tag(s) if not a duplicate
1630 if (!temp.empty() && (retval.find(temp) == string::npos)) {
1631 retval.append(temp);
1632 }
1633 }
1635 kk = equal_sign + 1;
1636 }
1637 k = end_tag + 1;
1638 }
1640 // Strip last comma
1641 if (retval.size() > 1) {
1642 retval.erase(retval.size() - 1);
1643 }
1644 return retval;
1645 }
1647 } // End namespace CLD2
1649 //==============================================================================