|
1 // Copyright 2013 Google Inc. All Rights Reserved. |
|
2 // |
|
3 // Licensed under the Apache License, Version 2.0 (the "License"); |
|
4 // you may not use this file except in compliance with the License. |
|
5 // You may obtain a copy of the License at |
|
6 // |
|
7 // http://www.apache.org/licenses/LICENSE-2.0 |
|
8 // |
|
9 // Unless required by applicable law or agreed to in writing, software |
|
10 // distributed under the License is distributed on an "AS IS" BASIS, |
|
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
12 // See the License for the specific language governing permissions and |
|
13 // limitations under the License. |
|
14 |
|
15 // |
|
16 // Author: dsites@google.com (Dick Sites) |
|
17 // |
|
18 |
|
19 #include "compact_lang_det_hint_code.h" |
|
20 |
|
21 #include <stdlib.h> // for abs() |
|
22 #include <stdio.h> // for sprintf() |
|
23 #include <string.h> // |
|
24 #include "lang_script.h" |
|
25 #include "port.h" |
|
26 |
|
27 using namespace std; |
|
28 |
|
29 namespace CLD2 { |
|
30 |
|
// Prior weights. Going by the names, one is for an encoding-based prior and
// the other for a language-based prior.
//   4 => about 100x more likely
//   8 => about 10000x more likely
static const int kCLDPriorEncodingWeight = 4;
static const int kCLDPriorLanguageWeight = 8;
|
33 |
|
34 |
|
35 // Tables to map lang="..." language code lists to actual languages. |
|
36 // based on scraping and hand-edits, dsites June 2011 |
|
37 |
|
// n = f(string, &a) gives list of n<=4 language pairs: primary, secondary

// For close pairs like ms/id, more weight on TLD and lang=
// Alternately, weaker boost but mark others of set as negative;
// makes "neither" an easier result.
// lang=en low weight 4
// tld=lu boost lu maybe 4, but lang= always overcomes tld and encoding
// (except maybe en)
|
46 |
|
47 // TLD to separate, e.g., burundi from rwanda |
|
48 |
|
49 // Encoding lookup: OneLangProb array |
|
50 // TLD lookup: tld OneLangProb pairs |
|
51 |
|
52 |
|
53 typedef struct { |
|
54 const char* const langtag; // Lowercased, hyphen only lookup key |
|
55 const char* const langcode; // Canonical language codes; two if ambiguous |
|
56 OneCLDLangPrior onelangprior1; |
|
57 OneCLDLangPrior onelangprior2; |
|
58 } LangTagLookup; |
|
59 |
|
60 typedef struct { |
|
61 const char* const tld; // Lowercased, hyphen only lookup key |
|
62 OneCLDLangPrior onelangprior1; |
|
63 OneCLDLangPrior onelangprior2; |
|
64 } TLDLookup; |
|
65 |
|
66 |
|
// Hint weights, pre-shifted left by 10 bits so that a table entry can be
// written as LANGUAGE + Wn (boost) or LANGUAGE - Wn (demote).
// Each weight step counts as roughly a factor of 3 in likelihood.
#define CLD_HINT_WEIGHT(steps) ((steps) << 10)

#define W2  CLD_HINT_WEIGHT(2)    // 3**2  = ~10x more likely
#define W4  CLD_HINT_WEIGHT(4)    // 3**4  = ~100x more likely
#define W6  CLD_HINT_WEIGHT(6)    // 3**6  = ~1000x more likely
#define W8  CLD_HINT_WEIGHT(8)    // 3**8  = ~10K x more likely
#define W10 CLD_HINT_WEIGHT(10)   // 3**10 = ~100K x more likely
#define W12 CLD_HINT_WEIGHT(12)   // 3**12 = ~1M x more likely
|
73 |
|
74 // TODO: more about ba hr sr sr-ME and sl |
|
75 // Temporary state of affairs: |
|
76 // BOSNIAN CROATIAN MONTENEGRIN SERBIAN detecting just CROATIAN SERBIAN |
|
77 // Eventually, we want to do all four, but it requires a CLD change to handle |
|
78 // up to six languages per quadgram. |
|
79 |
|
80 |
|
81 // Close pairs boost one of pair, demote other. |
|
82 // Statistically close pairs: |
|
83 // INDONESIAN/MALAY difficult to distinguish -- extra word-based lookups used |
|
84 // |
|
85 // INDONESIAN MALAY coef=0.4698 Problematic w/o extra words |
|
86 // TIBETAN DZONGKHA coef=0.4571 |
|
87 // CZECH SLOVAK coef=0.4273 |
|
88 // NORWEGIAN NORWEGIAN_N coef=0.4182 |
|
89 // |
|
90 // HINDI MARATHI coef=0.3795 |
|
91 // ZULU XHOSA coef=0.3716 |
|
92 // |
|
93 // DANISH NORWEGIAN coef=0.3672 Usually OK |
|
94 // BIHARI HINDI coef=0.3668 Usually OK |
|
95 // ICELANDIC FAROESE coef=0.3519 Usually OK |
|
96 |
|
97 // |
|
98 // Table to look up lang= tags longer than three characters |
|
99 // Overrides table below, which is truncated at first hyphen |
|
100 // In alphabetical order for binary search |
|
101 static const int kCLDTable1Size = 213; |
|
102 static const LangTagLookup kCLDLangTagsHintTable1[kCLDTable1Size] = { |
|
103 {"abkhazian", "ab", ABKHAZIAN + W10, 0}, |
|
104 {"afar", "aa", AFAR + W10, 0}, |
|
105 {"afrikaans", "af", AFRIKAANS + W10, 0}, |
|
106 {"akan", "ak", AKAN + W10, 0}, |
|
107 {"albanian", "sq", ALBANIAN + W10, 0}, |
|
108 {"am-am", "hy", ARMENIAN + W10, 0}, // 1:2 Armenian, not ambiguous |
|
109 {"amharic", "am", AMHARIC + W10, 0}, |
|
110 {"arabic", "ar", ARABIC + W10, 0}, |
|
111 {"argentina", "es", SPANISH + W10, 0}, |
|
112 {"armenian", "hy", ARMENIAN + W10, 0}, |
|
113 {"assamese", "as", ASSAMESE + W10, 0}, |
|
114 {"aymara", "ay", AYMARA + W10, 0}, |
|
115 {"azerbaijani", "az", AZERBAIJANI + W10, 0}, |
|
116 |
|
117 {"bangla", "bn", BENGALI + W10, 0}, |
|
118 {"bashkir", "ba", BASHKIR + W10, 0}, |
|
119 {"basque", "eu", BASQUE + W10, 0}, |
|
120 {"belarusian", "be", BELARUSIAN + W10, 0}, |
|
121 {"bengali", "bn", BENGALI + W10, 0}, |
|
122 {"bihari", "bh", BIHARI + W10, HINDI - W4}, |
|
123 {"bislama", "bi", BISLAMA + W10, 0}, |
|
124 {"bosnian", "bs", BOSNIAN + W10, 0}, // Bosnian => Bosnian |
|
125 {"br-br", "pt", PORTUGUESE + W10, 0}, // 1:2 Portuguese, not ambiguous |
|
126 {"br-fr", "br", BRETON + W10, 0}, // 1:2 Breton, not ambiguous |
|
127 {"breton", "br", BRETON + W10, 0}, |
|
128 {"bulgarian", "bg", BULGARIAN + W10, 0}, |
|
129 {"burmese", "my", BURMESE + W10, 0}, // Myanmar |
|
130 |
|
131 {"catalan", "ca", CATALAN + W10, 0}, |
|
132 {"cherokee", "chr", CHEROKEE + W10, 0}, |
|
133 {"chichewa", "ny", NYANJA + W10, 0}, |
|
134 |
|
135 {"chinese", "zh", CHINESE + W10, 0}, |
|
136 {"chinese-t", "zhT", CHINESE_T + W10, 0}, |
|
137 {"chineset", "zhT", CHINESE_T + W10, 0}, |
|
138 {"corsican", "co", CORSICAN + W10, 0}, |
|
139 {"cpf-hat", "ht", HAITIAN_CREOLE + W10, 0}, // Creole, French-based |
|
140 {"croatian", "hr", CROATIAN + W10, 0}, |
|
141 {"czech", "cs", CZECH + W10, SLOVAK - W4}, |
|
142 |
|
143 {"danish", "da", DANISH + W10, NORWEGIAN - W4}, |
|
144 {"deutsch", "de", GERMAN + W10, 0}, |
|
145 {"dhivehi", "dv", DHIVEHI + W10, 0}, |
|
146 {"dutch", "nl", DUTCH + W10, 0}, |
|
147 {"dzongkha", "dz", DZONGKHA + W10, TIBETAN - W4}, |
|
148 |
|
149 {"ell-gr", "el", GREEK + W10, 0}, |
|
150 {"english", "en", ENGLISH + W4, 0}, |
|
151 {"esperanto", "eo", ESPERANTO + W10, 0}, |
|
152 {"estonian", "et", ESTONIAN + W10, 0}, |
|
153 {"euc-jp", "ja", JAPANESE + W10, 0}, // Japanese encoding |
|
154 {"euc-kr", "ko", KOREAN + W10, 0}, // Korean encoding |
|
155 |
|
156 {"faroese", "fo", FAROESE + W10, ICELANDIC - W4}, |
|
157 {"fijian", "fj", FIJIAN + W10, 0}, |
|
158 {"finnish", "fi", FINNISH + W10, 0}, |
|
159 {"fran", "fr", FRENCH + W10, 0}, // Truncated at non-ASCII |
|
160 {"francais", "fr", FRENCH + W10, 0}, |
|
161 {"french", "fr", FRENCH + W10, 0}, |
|
162 {"frisian", "fy", FRISIAN + W10, 0}, |
|
163 |
|
164 {"ga-es", "gl", GALICIAN + W10, 0}, // 1:2 Galician, not ambiguous |
|
165 {"galician", "gl", GALICIAN + W10, 0}, |
|
166 {"ganda", "lg", GANDA + W10, 0}, |
|
167 {"georgian", "ka", GEORGIAN + W10, 0}, |
|
168 {"german", "de", GERMAN + W10, 0}, |
|
169 {"greek", "el", GREEK + W10, 0}, |
|
170 {"greenlandic", "kl", GREENLANDIC + W10, 0}, |
|
171 {"guarani", "gn", GUARANI + W10, 0}, |
|
172 {"gujarati", "gu", GUJARATI + W10, 0}, |
|
173 |
|
174 {"haitian_creole", "ht", HAITIAN_CREOLE + W10, 0}, |
|
175 {"hausa", "ha", HAUSA + W10, 0}, |
|
176 {"hawaiian", "haw", HAWAIIAN + W10, 0}, |
|
177 {"hebrew", "iw", HEBREW + W10, 0}, |
|
178 {"hindi", "hi", HINDI + W10, MARATHI - W4}, |
|
179 {"hn-in", "hi", HINDI + W10, MARATHI - W4}, |
|
180 {"hungarian", "hu", HUNGARIAN + W10, 0}, |
|
181 |
|
182 {"icelandic", "is", ICELANDIC + W10, FAROESE - W4}, |
|
183 {"igbo", "ig", IGBO + W10, 0}, |
|
184 {"indonesian", "id", INDONESIAN + W10, MALAY - W4}, |
|
185 {"interlingua", "ia", INTERLINGUA + W10, 0}, |
|
186 {"interlingue", "ie", INTERLINGUE + W10, 0}, |
|
187 // 1:2 iu-Cans ik-Latn |
|
188 {"inuktitut", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2 |
|
189 {"inupiak", "ik,iu", INUPIAK + W10, INUKTITUT + W10}, // 1:2 |
|
190 {"ir-ie", "ga", IRISH + W10, 0}, // Irish |
|
191 {"irish", "ga", IRISH + W10, 0}, |
|
192 {"italian", "it", ITALIAN + W10, 0}, |
|
193 |
|
194 {"ja-euc", "ja", JAPANESE + W10, 0}, // Japanese encoding |
|
195 {"jan-jp", "ja", JAPANESE + W10, 0}, // Japanese encoding |
|
196 {"japanese", "ja", JAPANESE + W10, 0}, |
|
197 {"javanese", "jw", JAVANESE + W10, 0}, |
|
198 |
|
199 {"kannada", "kn", KANNADA + W10, 0}, |
|
200 {"kashmiri", "ks", KASHMIRI + W10, 0}, |
|
201 {"kazakh", "kk", KAZAKH + W10, 0}, |
|
202 {"khasi", "kha", KHASI + W10, 0}, |
|
203 {"khmer", "km", KHMER + W10, 0}, |
|
204 {"kinyarwanda", "rw", KINYARWANDA + W10, 0}, |
|
205 {"klingon", "tlh", X_KLINGON + W10, 0}, |
|
206 {"korean", "ko", KOREAN + W10, 0}, |
|
207 {"kurdish", "ku", KURDISH + W10, 0}, |
|
208 {"kyrgyz", "ky", KYRGYZ + W10, 0}, |
|
209 |
|
210 {"laothian", "lo", LAOTHIAN + W10, 0}, |
|
211 {"latin", "la", LATIN + W10, 0}, |
|
212 {"latvian", "lv", LATVIAN + W10, 0}, |
|
213 {"limbu", "sit", LIMBU + W10, 0}, |
|
214 {"lingala", "ln", LINGALA + W10, 0}, |
|
215 {"lithuanian", "lt", LITHUANIAN + W10, 0}, |
|
216 {"luxembourgish", "lb", LUXEMBOURGISH + W10, 0}, |
|
217 |
|
218 {"macedonian", "mk", MACEDONIAN + W10, 0}, |
|
219 {"malagasy", "mg", MALAGASY + W10, 0}, |
|
220 {"malay", "ms", MALAY + W10, INDONESIAN - W4}, |
|
221 {"malayalam", "ml", MALAYALAM + W10, 0}, |
|
222 {"maltese", "mt", MALTESE + W10, 0}, |
|
223 {"manx", "gv", MANX + W10, 0}, |
|
224 {"maori", "mi", MAORI + W10, 0}, |
|
225 {"marathi", "mr", MARATHI + W10, HINDI - W4}, |
|
226 {"mauritian_creole", "mfe", MAURITIAN_CREOLE + W10, 0}, |
|
227 {"moldavian", "mo", ROMANIAN + W10, 0}, |
|
228 {"mongolian", "mn", MONGOLIAN + W10, 0}, |
|
229 {"montenegrin", "sr-me", MONTENEGRIN + W10, 0}, |
|
230 {"myanmar", "my", BURMESE + W10, 0}, // Myanmar |
|
231 {"nauru", "na", NAURU + W10, 0}, |
|
232 {"ndebele", "nr", NDEBELE + W10, 0}, |
|
233 {"nepali", "ne", NEPALI + W10, 0}, |
|
234 {"no-bok", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, // Bokmaal |
|
235 {"no-bokmaal", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, |
|
236 {"no-nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, // Bokmaal |
|
237 {"no-no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, |
|
238 {"no-nyn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, // Nynorsk |
|
239 {"no-nynorsk", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, |
|
240 {"norwegian", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, |
|
241 {"norwegian_n", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, |
|
242 {"nyanja", "ny", NYANJA + W10, 0}, |
|
243 |
|
244 {"occitan", "oc", OCCITAN + W10, 0}, |
|
245 {"oriya", "or", ORIYA + W10, 0}, |
|
246 {"oromo", "om", OROMO + W10, 0}, |
|
247 {"parsi", "fa", PERSIAN + W10, 0}, |
|
248 |
|
249 {"pashto", "ps", PASHTO + W10, 0}, |
|
250 {"pedi", "nso", PEDI + W10, 0}, |
|
251 {"persian", "fa", PERSIAN + W10, 0}, |
|
252 {"polish", "pl", POLISH + W10, 0}, |
|
253 {"polska", "pl", POLISH + W10, 0}, |
|
254 {"polski", "pl", POLISH + W10, 0}, |
|
255 {"portugu", "pt", PORTUGUESE + W10, 0}, // Truncated at non-ASCII |
|
256 {"portuguese", "pt", PORTUGUESE + W10, 0}, |
|
257 {"punjabi", "pa", PUNJABI + W10, 0}, |
|
258 |
|
259 {"quechua", "qu", QUECHUA + W10, 0}, |
|
260 |
|
261 {"rhaeto_romance", "rm", RHAETO_ROMANCE + W10, 0}, |
|
262 {"romanian", "ro", ROMANIAN + W10, 0}, |
|
263 {"rundi", "rn", RUNDI + W10, 0}, |
|
264 {"russian", "ru", RUSSIAN + W10, 0}, |
|
265 |
|
266 {"samoan", "sm", SAMOAN + W10, 0}, |
|
267 {"sango", "sg", SANGO + W10, 0}, |
|
268 {"sanskrit", "sa", SANSKRIT + W10, 0}, |
|
269 {"scots", "sco", SCOTS + W10, ENGLISH - W4}, |
|
270 {"scots_gaelic", "gd", SCOTS_GAELIC + W10, 0}, |
|
271 {"serbian", "sr", SERBIAN + W10, 0}, |
|
272 {"seselwa", "crs", SESELWA + W10, 0}, |
|
273 {"sesotho", "st", SESOTHO + W10, 0}, |
|
274 {"shift-jis", "ja", JAPANESE + W10, 0}, // Japanese encoding |
|
275 {"shift-js", "ja", JAPANESE + W10, 0}, // Japanese encoding |
|
276 {"shona", "sn", SHONA + W10, 0}, |
|
277 {"si-lk", "si", SINHALESE + W10, 0}, // 1:2 Sri Lanka, not ambiguous |
|
278 {"si-si", "sl", SLOVENIAN + W10, 0}, // 1:2 Slovenia, not ambiguous |
|
279 {"si-sl", "sl", SLOVENIAN + W10, 0}, // 1:2 Slovenia, not ambiguous |
|
280 {"sindhi", "sd", SINDHI + W10, 0}, |
|
281 {"sinhalese", "si", SINHALESE + W10, 0}, |
|
282 {"siswant", "ss", SISWANT + W10, 0}, |
|
283 {"sit-np", "sit", LIMBU + W10, 0}, |
|
284 {"slovak", "sk", SLOVAK + W10, CZECH - W4}, |
|
285 {"slovenian", "sl", SLOVENIAN + W10, 0}, |
|
286 {"somali", "so", SOMALI + W10, 0}, |
|
287 {"spanish", "es", SPANISH + W10, 0}, |
|
288 {"sr-me", "sr-me", MONTENEGRIN + W10, 0}, // Montenegrin => Montenegrin |
|
289 {"sundanese", "su", SUNDANESE + W10, 0}, |
|
290 {"suomi", "fi", FINNISH + W10, 0}, // Finnish |
|
291 {"swahili", "sw", SWAHILI + W10, 0}, |
|
292 {"swedish", "sv", SWEDISH + W10, 0}, |
|
293 {"syriac", "syr", SYRIAC + W10, 0}, |
|
294 |
|
295 {"tagalog", "tl", TAGALOG + W10, 0}, |
|
296 {"tajik", "tg", TAJIK + W10, 0}, |
|
297 {"tamil", "ta", TAMIL + W10, 0}, |
|
298 {"tatar", "tt", TATAR + W10, 0}, |
|
299 {"tb-tb", "bo", TIBETAN + W10, DZONGKHA - W4}, // Tibet |
|
300 {"tchinese", "zhT", CHINESE_T + W10, 0}, |
|
301 {"telugu", "te", TELUGU + W10, 0}, |
|
302 {"thai", "th", THAI + W10, 0}, |
|
303 {"tibetan", "bo", TIBETAN + W10, DZONGKHA - W4}, |
|
304 {"tigrinya", "ti", TIGRINYA + W10, 0}, |
|
305 {"tonga", "to", TONGA + W10, 0}, |
|
306 {"tsonga", "ts", TSONGA + W10, 0}, |
|
307 {"tswana", "tn", TSWANA + W10, 0}, |
|
308 {"tt-ru", "tt", TATAR + W10, 0}, |
|
309 {"tur-tr", "tr", TURKISH + W10, 0}, |
|
310 {"turkish", "tr", TURKISH + W10, 0}, |
|
311 {"turkmen", "tk", TURKMEN + W10, 0}, |
|
312 {"uighur", "ug", UIGHUR + W10, 0}, |
|
313 {"ukrainian", "uk", UKRAINIAN + W10, 0}, |
|
314 {"urdu", "ur", URDU + W10, 0}, |
|
315 {"uzbek", "uz", UZBEK + W10, 0}, |
|
316 |
|
317 {"venda", "ve", VENDA + W10, 0}, |
|
318 {"vietnam", "vi", VIETNAMESE + W10, 0}, |
|
319 {"vietnamese", "vi", VIETNAMESE + W10, 0}, |
|
320 {"volapuk", "vo", VOLAPUK + W10, 0}, |
|
321 |
|
322 {"welsh", "cy", WELSH + W10, 0}, |
|
323 {"wolof", "wo", WOLOF + W10, 0}, |
|
324 |
|
325 {"xhosa", "xh", XHOSA + W10, ZULU - W4}, |
|
326 |
|
327 {"yiddish", "yi", YIDDISH + W10, 0}, |
|
328 {"yoruba", "yo", YORUBA + W10, 0}, |
|
329 |
|
330 {"zh-classical", "zhT", CHINESE_T + W10, 0}, |
|
331 {"zh-cn", "zh", CHINESE + W10, 0}, |
|
332 {"zh-hans", "zh", CHINESE + W10, 0}, |
|
333 {"zh-hant", "zhT", CHINESE_T + W10, 0}, |
|
334 {"zh-hk", "zhT", CHINESE_T + W10, 0}, |
|
335 {"zh-min-nan", "zhT", CHINESE_T + W10, 0}, // Min Nan => ChineseT |
|
336 {"zh-sg", "zhT", CHINESE_T + W10, 0}, |
|
337 {"zh-tw", "zhT", CHINESE_T + W10, 0}, |
|
338 {"zh-yue", "zh", CHINESE + W10, 0}, // Yue (Cantonese) => Chinese |
|
339 {"zhuang", "za", ZHUANG + W10, 0}, |
|
340 {"zulu", "zu", ZULU + W10, XHOSA - W4}, |
|
341 }; |
|
342 |
|
343 |
|
344 |
|
345 // Table to look up lang= tags of two/three characters after truncate at hyphen |
|
346 // In alphabetical order for binary search |
|
347 static const int kCLDTable2Size = 257; |
|
348 static const LangTagLookup kCLDLangTagsHintTable2[kCLDTable2Size] = { |
|
349 {"aa", "aa", AFAR + W10, 0}, |
|
350 {"ab", "ab", ABKHAZIAN + W10, 0}, |
|
351 {"af", "af", AFRIKAANS + W10, 0}, |
|
352 {"ak", "ak", AKAN + W10, 0}, |
|
353 {"al", "sq", ALBANIAN + W10, 0}, // Albania |
|
354 {"am", "am,hy", AMHARIC + W10, ARMENIAN + W10}, // 1:2 Amharic Armenian |
|
355 {"ar", "ar", ARABIC + W10, 0}, |
|
356 {"ara", "ar", ARABIC + W10, 0}, |
|
357 {"arm", "hy", ARMENIAN + W10, 0}, // Armenia |
|
358 {"arz", "ar", ARABIC + W10, 0}, // Egyptian Arabic |
|
359 {"as", "as", ASSAMESE + W10, 0}, |
|
360 {"at", "de", GERMAN + W10, 0}, // Austria |
|
361 {"au", "de", GERMAN + W10, 0}, // Austria |
|
362 {"ay", "ay", AYMARA + W10, 0}, |
|
363 {"az", "az", AZERBAIJANI + W10, 0}, |
|
364 {"aze", "az", AZERBAIJANI + W10, 0}, |
|
365 |
|
366 {"ba", "ba,bs", BASHKIR + W10, BOSNIAN + W10}, // 1:2 Bashkir Bosnia |
|
367 {"be", "be", BELARUSIAN + W10, 0}, |
|
368 {"bel", "be", BELARUSIAN + W10, 0}, |
|
369 {"bg", "bg", BULGARIAN + W10, 0}, |
|
370 {"bh", "bh", BIHARI + W10, HINDI - W4}, |
|
371 {"bi", "bi", BISLAMA + W10, 0}, |
|
372 {"big", "zhT", CHINESE_T + W10, 0}, // Big5 encoding |
|
373 {"bm", "ms", MALAY + W10, INDONESIAN - W4}, // Bahasa Malaysia |
|
374 {"bn", "bn", BENGALI + W10, 0}, |
|
375 {"bo", "bo", TIBETAN + W10, DZONGKHA - W4}, |
|
376 // 1:2 Breton, Brazil country code, both Latn .br TLD enough for pt to win |
|
377 {"br", "br,pt", BRETON + W10, PORTUGUESE + W8}, // 1:2 Breton, Brazil |
|
378 {"bs", "bs", BOSNIAN + W10, 0}, // Bosnian => Bosnian |
|
379 |
|
380 {"ca", "ca", CATALAN + W10, 0}, |
|
381 {"cat", "ca", CATALAN + W10, 0}, |
|
382 {"ch", "de,fr", GERMAN + W10, FRENCH + W10}, // 1:2 Switzerland |
|
383 {"chn", "zh", CHINESE + W10, 0}, |
|
384 {"chr", "chr", CHEROKEE + W10, 0}, |
|
385 {"ckb", "ku", KURDISH + W10, 0}, // Central Kurdish |
|
386 {"cn", "zh,zhT", CHINESE + W6, CHINESE_T + W4}, // Ambiguous, so weaker. |
|
387 // Offset by 2 so that TLD=tw or |
|
388 // enc=big5 will put zhT ahead |
|
389 {"co", "co", CORSICAN + W10, 0}, |
|
390 {"cro", "hr", CROATIAN + W10, 0}, // Croatia |
|
391 {"crs", "crs", SESELWA + W10, 0}, |
|
392 {"cs", "cs", CZECH + W10, SLOVAK - W4}, |
|
393 {"ct", "ca", CATALAN + W10, 0}, |
|
394 {"cy", "cy", WELSH + W10, 0}, |
|
395 {"cym", "cy", WELSH + W10, 0}, |
|
396 {"cz", "cs", CZECH + W10, SLOVAK - W4}, |
|
397 |
|
398 {"da", "da", DANISH + W10, NORWEGIAN - W4}, |
|
399 {"dan", "da", DANISH + W10, NORWEGIAN - W4}, |
|
400 {"de", "de", GERMAN + W10, 0}, |
|
401 {"deu", "de", GERMAN + W10, 0}, |
|
402 {"div", "dv", DHIVEHI + W10, 0}, |
|
403 {"dk", "da", DANISH + W10, NORWEGIAN - W4}, // Denmark |
|
404 {"dut", "nl", DUTCH + W10, 0}, // Dutch |
|
405 {"dv", "dv", DHIVEHI + W10, 0}, |
|
406 {"dz", "dz", DZONGKHA + W10, TIBETAN - W4}, |
|
407 |
|
408 {"ee", "et", ESTONIAN + W10, 0}, // Estonia |
|
409 {"eg", "ar", ARABIC + W10, 0}, // Egypt |
|
410 {"el", "el", GREEK + W10, 0}, |
|
411 {"en", "en", ENGLISH + W4, 0}, |
|
412 {"eng", "en", ENGLISH + W4, 0}, |
|
413 {"eo", "eo", ESPERANTO + W10, 0}, |
|
414 {"er", "ur", URDU + W10, 0}, // "Erdu" |
|
415 {"es", "es", SPANISH + W10, 0}, |
|
416 {"esp", "es", SPANISH + W10, 0}, |
|
417 {"est", "et", ESTONIAN + W10, 0}, |
|
418 {"et", "et", ESTONIAN + W10, 0}, |
|
419 {"eu", "eu", BASQUE + W10, 0}, |
|
420 |
|
421 {"fa", "fa", PERSIAN + W10, 0}, |
|
422 {"far", "fa", PERSIAN + W10, 0}, |
|
423 {"fi", "fi", FINNISH + W10, 0}, |
|
424 {"fil", "tl", TAGALOG + W10, 0}, // Philippines |
|
425 {"fj", "fj", FIJIAN + W10, 0}, |
|
426 {"fo", "fo", FAROESE + W10, ICELANDIC - W4}, |
|
427 {"fr", "fr", FRENCH + W10, 0}, |
|
428 {"fra", "fr", FRENCH + W10, 0}, |
|
429 {"fre", "fr", FRENCH + W10, 0}, |
|
430 {"fy", "fy", FRISIAN + W10, 0}, |
|
431 |
|
432 {"ga", "ga,gl", IRISH + W10, GALICIAN + W10}, // 1:2 Irish, Galician |
|
433 {"gae", "gd,ga", SCOTS_GAELIC + W10, IRISH + W10}, // 1:2 Gaelic, either |
|
434 {"gal", "gl", GALICIAN + W10, 0}, |
|
435 {"gb", "zh", CHINESE + W10, 0}, // GB2312 encoding |
|
436 {"gbk", "zh", CHINESE + W10, 0}, // GBK encoding |
|
437 {"gd", "gd", SCOTS_GAELIC + W10, 0}, |
|
438 {"ge", "ka", GEORGIAN + W10, 0}, // Georgia |
|
439 {"geo", "ka", GEORGIAN + W10, 0}, |
|
440 {"ger", "de", GERMAN + W10, 0}, |
|
441 {"gl", "gl", GALICIAN + W10, 0}, // Also Greenland; hard to confuse |
|
442 {"gn", "gn", GUARANI + W10, 0}, |
|
443 {"gr", "el", GREEK + W10, 0}, // Greece |
|
444 {"gu", "gu", GUJARATI + W10, 0}, |
|
445 {"gv", "gv", MANX + W10, 0}, |
|
446 |
|
447 {"ha", "ha", HAUSA + W10, 0}, |
|
448 {"hat", "ht", HAITIAN_CREOLE + W10, 0}, // Haiti |
|
449 {"haw", "haw", HAWAIIAN + W10, 0}, |
|
450 {"hb", "iw", HEBREW + W10, 0}, |
|
451 {"he", "iw", HEBREW + W10, 0}, |
|
452 {"heb", "iw", HEBREW + W10, 0}, |
|
453 {"hi", "hi", HINDI + W10, MARATHI - W4}, |
|
454 {"hk", "zhT", CHINESE_T + W10, 0}, // Hong Kong |
|
455 {"hr", "hr", CROATIAN + W10, 0}, |
|
456 {"ht", "ht", HAITIAN_CREOLE + W10, 0}, |
|
457 {"hu", "hu", HUNGARIAN + W10, 0}, |
|
458 {"hun", "hu", HUNGARIAN + W10, 0}, |
|
459 {"hy", "hy", ARMENIAN + W10, 0}, |
|
460 |
|
461 {"ia", "ia", INTERLINGUA + W10, 0}, |
|
462 {"ice", "is", ICELANDIC + W10, FAROESE - W4}, // Iceland |
|
463 {"id", "id", INDONESIAN + W10, MALAY - W4}, |
|
464 {"ids", "id", INDONESIAN + W10, MALAY - W4}, |
|
465 {"ie", "ie", INTERLINGUE + W10, 0}, |
|
466 {"ig", "ig", IGBO + W10, 0}, |
|
467 // 1:2 iu-Cans ik-Latn |
|
468 {"ik", "ik,iu", INUPIAK + W10, INUKTITUT + W10}, // 1:2 |
|
469 {"in", "id", INDONESIAN + W10, MALAY - W4}, |
|
470 {"ind", "id", INDONESIAN + W10, MALAY - W4}, // Indonesia |
|
471 {"inu", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2 |
|
472 {"is", "is", ICELANDIC + W10, FAROESE - W4}, |
|
473 {"it", "it", ITALIAN + W10, 0}, |
|
474 {"ita", "it", ITALIAN + W10, 0}, |
|
475 {"iu", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2 |
|
476 {"iw", "iw", HEBREW + W10, 0}, |
|
477 |
|
478 {"ja", "ja", JAPANESE + W10, 0}, |
|
479 {"jp", "ja", JAPANESE + W10, 0}, // Japan |
|
480 {"jpn", "ja", JAPANESE + W10, 0}, |
|
481 {"jv", "jw", JAVANESE + W10, 0}, |
|
482 {"jw", "jw", JAVANESE + W10, 0}, |
|
483 |
|
484 {"ka", "ka", GEORGIAN + W10, 0}, |
|
485 {"kc", "qu", QUECHUA + W10, 0}, // (K)Quechua |
|
486 {"kg", "ky", KYRGYZ + W10, 0}, // Kyrgyzstan |
|
487 {"kh", "km", KHMER + W10, 0}, // Country code Khmer (Cambodia) |
|
488 {"kha", "kha", KHASI + W10, 0}, |
|
489 {"kk", "kk", KAZAKH + W10, 0}, // Kazakh |
|
490 {"kl", "kl", GREENLANDIC + W10, 0}, |
|
491 {"km", "km", KHMER + W10, 0}, |
|
492 {"kn", "kn", KANNADA + W10, 0}, |
|
493 {"ko", "ko", KOREAN + W10, 0}, |
|
494 {"kor", "ko", KOREAN + W10, 0}, |
|
495 {"kr", "ko", KOREAN + W10, 0}, // Country code Korea |
|
496 {"ks", "ks", KASHMIRI + W10, 0}, |
|
497 {"ksc", "ko", KOREAN + W10, 0}, // KSC encoding |
|
498 {"ku", "ku", KURDISH + W10, 0}, |
|
499 {"ky", "ky", KYRGYZ + W10, 0}, |
|
500 {"kz", "kk", KAZAKH + W10, 0}, // Kazakhstan |
|
501 {"la", "la", LATIN + W10, 0}, |
|
502 {"lao", "lo", LAOTHIAN + W10, 0}, // Laos |
|
503 |
|
504 {"lb", "lb", LUXEMBOURGISH + W10, 0}, |
|
505 {"lg", "lg", GANDA + W10, 0}, |
|
506 {"lit", "lt", LITHUANIAN + W10, 0}, |
|
507 {"ln", "ln", LINGALA + W10, 0}, |
|
508 {"lo", "lo", LAOTHIAN + W10, 0}, |
|
509 {"lt", "lt", LITHUANIAN + W10, 0}, |
|
510 {"ltu", "lt", LITHUANIAN + W10, 0}, |
|
511 {"lv", "lv", LATVIAN + W10, 0}, |
|
512 |
|
513 {"mfe", "mfe", MAURITIAN_CREOLE + W10, 0}, |
|
514 {"mg", "mg", MALAGASY + W10, 0}, |
|
515 {"mi", "mi", MAORI + W10, 0}, |
|
516 {"mk", "mk", MACEDONIAN + W10, 0}, |
|
517 {"ml", "ml", MALAYALAM + W10, 0}, |
|
518 {"mn", "mn", MONGOLIAN + W10, 0}, |
|
519 {"mo", "mo", ROMANIAN + W10, 0}, |
|
520 {"mon", "mn", MONGOLIAN + W10, 0}, // Mongolian |
|
521 {"mr", "mr", MARATHI + W10, HINDI - W4}, |
|
522 {"ms", "ms", MALAY + W10, INDONESIAN - W4}, |
|
523 {"mt", "mt", MALTESE + W10, 0}, |
|
524 {"mx", "es", SPANISH + W10, 0}, // Mexico |
|
525 {"my", "my,ms", BURMESE + W10, MALAY + W10}, // Myanmar, Malaysia |
|
526 |
|
527 {"na", "na", NAURU + W10, 0}, |
|
528 {"nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, |
|
529 {"ne", "ne", NEPALI + W10, 0}, |
|
530 {"nl", "nl", DUTCH + W10, 0}, |
|
531 {"nn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, |
|
532 {"no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, |
|
533 {"nr", "nr", NDEBELE + W10, 0}, |
|
534 {"nso", "nso", PEDI + W10, 0}, |
|
535 {"ny", "ny", NYANJA + W10, 0}, |
|
536 |
|
537 {"oc", "oc", OCCITAN + W10, 0}, |
|
538 {"om", "om", OROMO + W10, 0}, |
|
539 {"or", "or", ORIYA + W10, 0}, |
|
540 |
|
541 {"pa", "pa,ps", PUNJABI + W10, PASHTO + W10}, // 1:2 pa-Guru ps-Arab |
|
542 {"per", "fa", PERSIAN + W10, 0}, |
|
543 {"ph", "tl", TAGALOG + W10, 0}, // Philippines |
|
544 {"pk", "ur", URDU + W10, 0}, // Pakistan |
|
545 {"pl", "pl", POLISH + W10, 0}, |
|
546 {"pnb", "pa", PUNJABI + W10, 0}, // Western Punjabi |
|
547 {"pol", "pl", POLISH + W10, 0}, |
|
548 {"por", "pt", PORTUGUESE + W10, 0}, |
|
549 {"ps", "ps", PASHTO + W10, 0}, |
|
550 {"pt", "pt", PORTUGUESE + W10, 0}, |
|
551 {"ptg", "pt", PORTUGUESE + W10, 0}, |
|
552 {"qc", "fr", FRENCH + W10, 0}, // Quebec "country" code |
|
553 {"qu", "qu", QUECHUA + W10, 0}, |
|
554 |
|
555 {"rm", "rm", RHAETO_ROMANCE + W10, 0}, |
|
556 {"rn", "rn", RUNDI + W10, 0}, |
|
557 {"ro", "ro", ROMANIAN + W10, 0}, |
|
558 {"rs", "sr", SERBIAN + W10, 0}, // Serbia country code |
|
559 {"ru", "ru", RUSSIAN + W10, 0}, |
|
560 {"rus", "ru", RUSSIAN + W10, 0}, |
|
561 {"rw", "rw", KINYARWANDA + W10, 0}, |
|
562 |
|
563 {"sa", "sa", SANSKRIT + W10, 0}, |
|
564 {"sco", "sco", SCOTS + W10, ENGLISH - W4}, |
|
565 {"sd", "sd", SINDHI + W10, 0}, |
|
566 {"se", "sv", SWEDISH + W10, 0}, |
|
567 {"sg", "sg", SANGO + W10, 0}, |
|
568 {"si", "si,sl", SINHALESE + W10, SLOVENIAN + W10}, // 1:2 Sinhalese, Slovinia |
|
569 {"sk", "sk", SLOVAK + W10, CZECH - W4}, |
|
570 {"sl", "sl", SLOVENIAN + W10, 0}, |
|
571 {"slo", "sl", SLOVENIAN + W10, 0}, |
|
572 {"sm", "sm", SAMOAN + W10, 0}, |
|
573 {"sn", "sn", SHONA + W10, 0}, |
|
574 {"so", "so", SOMALI + W10, 0}, |
|
575 {"sp", "es", SPANISH + W10, 0}, |
|
576 {"sq", "sq", ALBANIAN + W10, 0}, |
|
577 {"sr", "sr", SERBIAN + W10, 0}, |
|
578 {"srb", "sr", SERBIAN + W10, 0}, |
|
579 {"srl", "sr", SERBIAN + W10, 0}, // Serbian Latin |
|
580 {"srp", "sr", SERBIAN + W10, 0}, |
|
581 {"ss", "ss", SISWANT + W10, 0}, |
|
582 {"st", "st", SESOTHO + W10, 0}, |
|
583 {"su", "su", SUNDANESE + W10, 0}, |
|
584 {"sv", "sv", SWEDISH + W10, 0}, |
|
585 {"sve", "sv", SWEDISH + W10, 0}, |
|
586 {"sw", "sw", SWAHILI + W10, 0}, |
|
587 {"swe", "sv", SWEDISH + W10, 0}, |
|
588 {"sy", "syr", SYRIAC + W10, 0}, |
|
589 {"syr", "syr", SYRIAC + W10, 0}, |
|
590 |
|
591 {"ta", "ta", TAMIL + W10, 0}, |
|
592 {"te", "te", TELUGU + W10, 0}, |
|
593 {"tg", "tg", TAJIK + W10, 0}, |
|
594 {"th", "th", THAI + W10, 0}, |
|
595 {"ti", "ti,bo", TIGRINYA + W10, TIBETAN + W10}, // 1:2 Tigrinya, Tibet |
|
596 {"tj", "tg", TAJIK + W10, 0}, // Tajikistan |
|
597 {"tk", "tk", TURKMEN + W10, 0}, |
|
598 {"tl", "tl", TAGALOG + W10, 0}, |
|
599 {"tlh", "tlh", X_KLINGON + W10, 0}, |
|
600 {"tn", "tn", TSWANA + W10, 0}, |
|
601 {"to", "to", TONGA + W10, 0}, |
|
602 {"tr", "tr", TURKISH + W10, 0}, |
|
603 {"ts", "ts", TSONGA + W10, 0}, |
|
604 {"tt", "tt", TATAR + W10, 0}, |
|
605 {"tw", "ak,zhT", AKAN + W10, CHINESE_T + W10}, // 1:2 Twi => Akan, Taiwan |
|
606 {"twi", "ak", AKAN + W10, 0}, // Twi => Akan |
|
607 |
|
608 {"ua", "uk", UKRAINIAN + W10, 0}, // Ukraine |
|
609 {"ug", "ug", UIGHUR + W10, 0}, |
|
610 {"uk", "uk", UKRAINIAN + W10, 0}, |
|
611 {"ur", "ur", URDU + W10, 0}, |
|
612 {"uz", "uz", UZBEK + W10, 0}, |
|
613 |
|
614 {"va", "ca", CATALAN + W10, 0}, // Valencia => Catalan |
|
615 {"val", "ca", CATALAN + W10, 0}, // Valencia => Catalan |
|
616 {"ve", "ve", VENDA + W10, 0}, |
|
617 {"vi", "vi", VIETNAMESE + W10, 0}, |
|
618 {"vie", "vi", VIETNAMESE + W10, 0}, |
|
619 {"vn", "vi", VIETNAMESE + W10, 0}, |
|
620 {"vo", "vo", VOLAPUK + W10, 0}, |
|
621 |
|
622 {"wo", "wo", WOLOF + W10, 0}, |
|
623 |
|
624 {"xh", "xh", XHOSA + W10, ZULU - W4}, |
|
625 {"xho", "xh", XHOSA + W10, ZULU - W4}, |
|
626 |
|
627 {"yi", "yi", YIDDISH + W10, 0}, |
|
628 {"yo", "yo", YORUBA + W10, 0}, |
|
629 |
|
630 {"za", "za", ZHUANG + W10, 0}, |
|
631 {"zh", "zh", CHINESE + W10, 0}, |
|
632 {"zht", "zhT", CHINESE_T + W10, 0}, |
|
633 {"zu", "zu", ZULU + W10, XHOSA - W4}, |
|
634 }; |
|
635 |
|
636 |
|
637 // Possibly map to tl: |
|
638 // -LangTags tl-Latn /7val.com/ ,bcl 2 Central Bicolano |
|
639 // -LangTags tl-Latn /7val.com/ ,ceb 6 Cebuano |
|
640 // -LangTags tl-Latn /7val.com/ ,war 1 Waray |
|
641 |
|
642 |
|
643 |
|
644 // Table to look up country TLD (no general TLD) |
|
645 // In alphabetical order for binary search |
|
646 static const int kCLDTable3Size = 181; |
|
647 static const TLDLookup kCLDTLDHintTable[kCLDTable3Size] = { |
|
648 {"ac", JAPANESE + W2, 0}, |
|
649 {"ad", CATALAN + W4, 0}, |
|
650 {"ae", ARABIC + W4, 0}, |
|
651 {"af", PASHTO + W4, PERSIAN + W4}, |
|
652 {"ag", GERMAN + W2, 0}, // meager |
|
653 // {"ai", 0, 0}, // meager |
|
654 {"al", ALBANIAN + W4, 0}, |
|
655 {"am", ARMENIAN + W4, 0}, |
|
656 {"an", DUTCH + W4, 0}, // meager |
|
657 {"ao", PORTUGUESE + W4, 0}, |
|
658 // {"aq", 0, 0}, // meager |
|
659 {"ar", SPANISH + W4, 0}, |
|
660 // {"as", 0, 0}, |
|
661 {"at", GERMAN + W4, 0}, |
|
662 {"au", ENGLISH + W2, 0}, |
|
663 {"aw", DUTCH + W4, 0}, |
|
664 {"ax", SWEDISH + W4, 0}, |
|
665 {"az", AZERBAIJANI + W4, 0}, |
|
666 |
|
667 {"ba", BOSNIAN + W8, CROATIAN - W4}, |
|
668 // {"bb", 0, 0}, |
|
669 {"bd", BENGALI + W4, 0}, |
|
670 {"be", DUTCH + W4, FRENCH + W4}, |
|
671 {"bf", FRENCH + W4, 0}, |
|
672 {"bg", BULGARIAN + W4, 0}, |
|
673 {"bh", ARABIC + W4, 0}, |
|
674 {"bi", RUNDI + W4, FRENCH + W4}, |
|
675 {"bj", FRENCH + W4, 0}, |
|
676 {"bm", ENGLISH + W2, 0}, |
|
677 {"bn", MALAY + W4, INDONESIAN - W4}, |
|
678 {"bo", SPANISH + W4, AYMARA + W2}, // and GUARANI QUECHUA |
|
679 {"br", PORTUGUESE + W4, 0}, |
|
680 // {"bs", 0, 0}, |
|
681 {"bt", DZONGKHA + W10, TIBETAN - W10}, // Strong presumption of Dzongha |
|
682 {"bw", TSWANA + W4, 0}, |
|
683 {"by", BELARUSIAN + W4, 0}, |
|
684 // {"bz", 0, 0}, |
|
685 |
|
686 {"ca", FRENCH + W4, ENGLISH + W2}, |
|
687 {"cat", CATALAN + W4, 0}, |
|
688 {"cc", 0, 0}, |
|
689 {"cd", FRENCH + W4, 0}, |
|
690 {"cf", FRENCH + W4, 0}, |
|
691 {"cg", FRENCH + W4, 0}, |
|
692 {"ch", GERMAN + W4, FRENCH + W4}, |
|
693 {"ci", FRENCH + W4, 0}, |
|
694 // {"ck", 0, 0}, |
|
695 {"cl", SPANISH + W4, 0}, |
|
696 {"cm", FRENCH + W4, 0}, |
|
697 {"cn", CHINESE + W4, 0}, |
|
698 {"co", SPANISH + W4, 0}, |
|
699 {"cr", SPANISH + W4, 0}, |
|
700 {"cu", SPANISH + W4, 0}, |
|
701 {"cv", PORTUGUESE + W4, 0}, |
|
702 // {"cx", 0, 0}, |
|
703 {"cy", GREEK + W4, TURKISH + W4}, |
|
704 {"cz", CZECH + W4, SLOVAK - W4}, |
|
705 |
|
706 {"de", GERMAN + W4, 0}, |
|
707 {"dj", 0, 0}, |
|
708 {"dk", DANISH + W4, NORWEGIAN - W4}, |
|
709 {"dm", 0, 0}, |
|
710 {"do", SPANISH + W4, 0}, |
|
711 {"dz", FRENCH + W4, ARABIC + W4}, |
|
712 |
|
713 {"ec", SPANISH + W4, 0}, |
|
714 {"ee", ESTONIAN + W4, 0}, |
|
715 {"eg", ARABIC + W4, 0}, |
|
716 {"er", AFAR + W4, 0}, |
|
717 {"es", SPANISH + W4, 0}, |
|
718 {"et", AMHARIC + W4, AFAR + W4}, |
|
719 |
|
720 {"fi", FINNISH + W4, 0}, |
|
721 {"fj", FIJIAN + W4, 0}, |
|
722 // {"fk", 0, 0}, |
|
723 // {"fm", 0, 0}, |
|
724 {"fo", FAROESE + W4, ICELANDIC - W4}, |
|
725 {"fr", FRENCH + W4, 0}, |
|
726 |
|
727 {"ga", FRENCH + W4, 0}, |
|
728 {"gd", 0, 0}, |
|
729 {"ge", GEORGIAN + W4, 0}, |
|
730 {"gf", FRENCH + W4, 0}, |
|
731 // {"gg", 0, 0}, |
|
732 // {"gh", 0, 0}, |
|
733 // {"gi", 0, 0}, |
|
734 {"gl", GREENLANDIC + W4, DANISH + W4}, |
|
735 // {"gm", 0, 0}, |
|
736 {"gn", FRENCH + W4, 0}, |
|
737 // {"gp", 0, 0}, |
|
738 // {"gq", 0, 0}, |
|
739 {"gr", GREEK + W4, 0}, |
|
740 // {"gs", 0, 0}, |
|
741 {"gt", SPANISH + W4, 0}, |
|
742 // {"gu", 0, 0}, |
|
743 // {"gy", 0, 0}, |
|
744 |
|
745 {"hk", CHINESE_T + W4, 0}, |
|
746 // {"hm", 0, 0}, |
|
747 {"hn", SPANISH + W4, 0}, |
|
748 {"hr", CROATIAN + W8, BOSNIAN - W4}, |
|
749 {"ht", HAITIAN_CREOLE + W4, FRENCH + W4}, |
|
750 {"hu", HUNGARIAN + W4, 0}, |
|
751 |
|
752 {"id", INDONESIAN + W4, MALAY - W4}, |
|
753 {"ie", IRISH + W4, 0}, |
|
754 {"il", HEBREW + W4, 0}, |
|
755 {"im", MANX + W4, 0}, |
|
756 // {"in", 0, 0}, |
|
757 // {"io", 0, 0}, |
|
758 {"iq", ARABIC + W4, 0}, |
|
759 {"ir", PERSIAN + W4, 0}, |
|
760 {"is", ICELANDIC + W4, FAROESE - W4}, |
|
761 {"it", ITALIAN + W4, 0}, |
|
762 |
|
763 // {"je", 0, 0}, |
|
764 // {"jm", 0, 0}, |
|
765 {"jo", ARABIC + W4, 0}, |
|
766 {"jp", JAPANESE + W4, 0}, |
|
767 |
|
768 // {"ke", 0, 0}, |
|
769 {"kg", KYRGYZ + W4, 0}, |
|
770 {"kh", KHMER + W4, 0}, |
|
771 // {"ki", 0, 0}, |
|
772 {"km", FRENCH + W4, 0}, |
|
773 // {"kn", 0, 0}, |
|
774 {"kp", KOREAN + W4, 0}, |
|
775 {"kr", KOREAN + W4, 0}, |
|
776 {"kw", ARABIC + W4, 0}, |
|
777 // {"ky", 0, 0}, |
|
778 {"kz", KAZAKH + W4, 0}, |
|
779 |
|
780 {"la", LAOTHIAN + W4, 0}, |
|
781 {"lb", ARABIC + W4, FRENCH + W4}, |
|
782 // {"lc", 0, 0}, |
|
783 {"li", GERMAN + W4, 0}, |
|
784 {"lk", SINHALESE + W4, 0}, |
|
785 // {"lr", 0, 0}, |
|
786 {"ls", SESOTHO + W4, 0}, |
|
787 {"lt", LITHUANIAN + W4, 0}, |
|
788 {"lu", LUXEMBOURGISH + W4}, |
|
789 {"lv", LATVIAN + W4, 0}, |
|
790 {"ly", ARABIC + W4, 0}, |
|
791 |
|
792 {"ma", FRENCH + W4, 0}, |
|
793 {"mc", FRENCH + W4, 0}, |
|
794 {"md", ROMANIAN + W4, 0}, |
|
795 {"me", MONTENEGRIN + W8, SERBIAN - W4}, |
|
796 {"mg", FRENCH + W4, 0}, |
|
797 {"mk", MACEDONIAN + W4, 0}, |
|
798 {"ml", FRENCH + W4, 0}, |
|
799 {"mm", BURMESE + W4, 0}, |
|
800 {"mn", MONGOLIAN + W4, 0}, |
|
801 {"mo", CHINESE_T + W4, PORTUGUESE + W4}, |
|
802 // {"mp", 0, 0}, |
|
803 {"mq", FRENCH + W4, 0}, |
|
804 {"mr", FRENCH + W4, ARABIC + W4}, |
|
805 // {"ms", 0, 0}, |
|
806 {"mt", MALTESE + W4, 0}, |
|
807 // {"mu", 0, 0}, |
|
808 {"mv", DHIVEHI + W4, 0}, |
|
809 // {"mw", 0, 0}, |
|
810 {"mx", SPANISH + W4, 0}, |
|
811 {"my", MALAY + W4, INDONESIAN - W4}, |
|
812 {"mz", PORTUGUESE + W4, 0}, |
|
813 |
|
814 {"na", 0, 0}, // Namibia |
|
815 {"nc", FRENCH + W4, 0}, |
|
816 {"ne", FRENCH + W4, 0}, |
|
817 {"nf", FRENCH + W4, 0}, |
|
818 // {"ng", 0, 0}, |
|
819 {"ni", SPANISH + W4, 0}, |
|
820 {"nl", DUTCH + W4, 0}, |
|
821 {"no", NORWEGIAN + W4, NORWEGIAN_N + W2}, |
|
822 {"np", NEPALI + W4, 0}, |
|
823 {"nr", NAURU + W4, 0}, |
|
824 {"nu", SWEDISH + W4, 0}, |
|
825 {"nz", MAORI + W4, ENGLISH + W2}, |
|
826 |
|
827 {"om", ARABIC + W4, 0}, |
|
828 |
|
829 {"pa", SPANISH + W4, 0}, |
|
830 {"pe", SPANISH + W4, QUECHUA + W2}, // also AYMARA |
|
831 {"pf", FRENCH + W4, 0}, |
|
832 // {"pg", 0, 0}, |
|
833 {"ph", TAGALOG + W4, 0}, |
|
834 {"pk", URDU + W4, 0}, |
|
835 {"pl", POLISH + W4, 0}, |
|
836 // {"pn", 0, 0}, |
|
837 {"pr", SPANISH + W4, 0}, |
|
838 {"ps", ARABIC + W4, 0}, |
|
839 {"pt", PORTUGUESE + W4, 0}, |
|
840 {"py", SPANISH + W4, GUARANI + W2}, |
|
841 |
|
842 {"qa", ARABIC + W4, 0}, |
|
843 |
|
844 {"re", FRENCH + W4, 0}, |
|
845 {"ro", ROMANIAN + W4, 0}, |
|
846 {"rs", SERBIAN + W8, MONTENEGRIN - W4}, |
|
847 {"ru", RUSSIAN + W4, 0}, |
|
848 {"rw", KINYARWANDA + W4, FRENCH + W2}, |
|
849 |
|
850 {"sa", ARABIC + W4, 0}, |
|
851 // {"sb", 0, 0}, |
|
852 {"sc", SESELWA + W4, 0}, |
|
853 {"sd", ARABIC + W4, 0}, |
|
854 {"se", SWEDISH + W4, 0}, |
|
855 // {"sg", 0, 0}, |
|
856 // {"sh", 0, 0}, |
|
857 {"si", SLOVENIAN + W4, 0}, |
|
858 {"sk", SLOVAK + W4, CZECH - W4}, |
|
859 // {"sl", 0, 0}, |
|
860 {"sm", ITALIAN + W4, 0}, |
|
861 {"sn", FRENCH + W4, 0}, |
|
862 // {"sr", 0, 0}, |
|
863 {"ss", ARABIC + W4, 0}, // Presumed South Sudan TLD. dsites 2011.07.07 |
|
864 // {"st", 0, 0}, |
|
865 {"su", RUSSIAN + W4, 0}, |
|
866 {"sv", SPANISH + W4, 0}, |
|
867 {"sy", ARABIC + W4, 0}, |
|
868 // {"sz", 0, 0}, |
|
869 |
|
870 // {"tc", 0, 0}, |
|
871 {"td", FRENCH + W4, 0}, |
|
872 // {"tf", 0, 0}, |
|
873 {"tg", FRENCH + W4, 0}, |
|
874 {"th", THAI + W4, 0}, |
|
875 // Tibet has no country code (see .cn) |
|
876 {"tj", TAJIK + W4, 0}, |
|
877 // {"tk", 0, 0}, |
|
878 // {"tl", 0, 0}, |
|
879 {"tm", TURKISH + W4, 0}, |
|
880 {"tn", FRENCH + W4, ARABIC + W4}, |
|
881 // {"to", 0, 0}, |
|
882 {"tp", JAPANESE + W4, 0}, |
|
883 {"tr", TURKISH + W4, 0}, |
|
884 // {"tt", 0, 0}, |
|
885 // {"tv", 0, 0}, |
|
886 {"tw", CHINESE_T + W4, 0}, |
|
887 {"tz", SWAHILI + W4, AKAN + W4}, |
|
888 |
|
889 {"ua", UKRAINIAN + W4, 0}, |
|
890 {"ug", GANDA + W4, 0}, |
|
891 {"uk", ENGLISH + W2, 0}, |
|
892 {"us", ENGLISH + W2, 0}, |
|
893 {"uy", SPANISH + W4, 0}, |
|
894 {"uz", UZBEK + W4, 0}, |
|
895 |
|
896 {"va", ITALIAN + W4, LATIN + W2}, |
|
897 // {"vc", 0, 0}, |
|
898 {"ve", SPANISH + W4, 0}, |
|
899 // {"vg", 0, 0}, |
|
900 // {"vi", 0, 0}, |
|
901 {"vn", VIETNAMESE + W4, 0}, |
|
902 // {"vu", 0, 0}, |
|
903 |
|
904 {"wf", FRENCH + W4, 0}, |
|
905 // {"ws", 0, 0}, |
|
906 |
|
907 {"ye", ARABIC + W4, 0}, |
|
908 |
|
909 {"za", AFRIKAANS + W4, 0}, |
|
910 // {"zm", 0, 0}, |
|
911 // {"zw", 0, 0}, |
|
912 }; |
|
913 |
|
914 #undef W2 |
|
915 #undef W4 |
|
916 #undef W6 |
|
917 #undef W8 |
|
918 #undef W10 |
|
919 #undef W12 |
|
920 |
|
921 |
|
922 |
|
923 |
|
924 |
|
925 inline void SetCLDPriorWeight(int w, OneCLDLangPrior* olp) { |
|
926 *olp = (*olp & 0x3ff) + (w << 10); |
|
927 } |
|
928 inline void SetCLDPriorLang(Language lang, OneCLDLangPrior* olp) { |
|
929 *olp = (*olp & ~0x3ff) + lang; |
|
930 } |
|
931 |
|
932 OneCLDLangPrior PackCLDPriorLangWeight(Language lang, int w) { |
|
933 return (w << 10) + lang; |
|
934 } |
|
935 |
|
// Return the larger of a and b; a is returned when they are equal.
inline int MaxInt(int a, int b) {
  if (b > a) {return b;}
  return a;
}
|
939 |
|
940 // Merge in another language prior, taking max if already there |
|
941 void MergeCLDLangPriorsMax(OneCLDLangPrior olp, CLDLangPriors* lps) { |
|
942 if (olp == 0) {return;} |
|
943 Language target_lang = GetCLDPriorLang(olp); |
|
944 for (int i = 0; i < lps->n; ++i) { |
|
945 if (GetCLDPriorLang(lps->prior[i]) == target_lang) { |
|
946 int new_weight = MaxInt(GetCLDPriorWeight(lps->prior[i]), |
|
947 GetCLDPriorWeight(olp)); |
|
948 SetCLDPriorWeight(new_weight, &lps->prior[i]); |
|
949 return; |
|
950 } |
|
951 } |
|
952 // Not found; add it if room |
|
953 if (lps->n >= kMaxOneCLDLangPrior) {return;} |
|
954 lps->prior[lps->n++] = olp; |
|
955 } |
|
956 |
|
957 // Merge in another language prior, boosting 10x if already there |
|
958 void MergeCLDLangPriorsBoost(OneCLDLangPrior olp, CLDLangPriors* lps) { |
|
959 if (olp == 0) {return;} |
|
960 Language target_lang = GetCLDPriorLang(olp); |
|
961 for (int i = 0; i < lps->n; ++i) { |
|
962 if (GetCLDPriorLang(lps->prior[i]) == target_lang) { |
|
963 int new_weight = GetCLDPriorWeight(lps->prior[i]) + 2; |
|
964 SetCLDPriorWeight(new_weight, &lps->prior[i]); |
|
965 return; |
|
966 } |
|
967 } |
|
968 // Not found; add it if room |
|
969 if (lps->n >= kMaxOneCLDLangPrior) {return;} |
|
970 lps->prior[lps->n++] = olp; |
|
971 } |
|
972 |
|
973 |
|
974 // Trim language priors to no more than max_entries, keeping largest abs weights |
|
975 void TrimCLDLangPriors(int max_entries, CLDLangPriors* lps) { |
|
976 if (lps->n <= max_entries) {return;} |
|
977 |
|
978 // Insertion sort in-place by abs(weight) |
|
979 for (int i = 0; i < lps->n; ++i) { |
|
980 OneCLDLangPrior temp_olp = lps->prior[i]; |
|
981 int w = abs(GetCLDPriorWeight(temp_olp)); |
|
982 int kk = i; |
|
983 for (; kk > 0; --kk) { |
|
984 if (abs(GetCLDPriorWeight(lps->prior[kk - 1])) < w) { |
|
985 // Move down and continue |
|
986 lps->prior[kk] = lps->prior[kk - 1]; |
|
987 } else { |
|
988 // abs(weight[kk - 1]) >= w, time to stop |
|
989 break; |
|
990 } |
|
991 } |
|
992 lps->prior[kk] = temp_olp; |
|
993 } |
|
994 |
|
995 lps->n = max_entries; |
|
996 } |
|
997 |
|
// Return how many ',' bytes langtags contains.
int CountCommas(const std::string& langtags) {
  int total = 0;
  for (std::string::const_iterator it = langtags.begin();
       it != langtags.end(); ++it) {
    if (*it == ',') {total += 1;}
  }
  return total;
}
|
1005 |
|
1006 // Binary lookup on language tag |
|
1007 const LangTagLookup* DoLangTagLookup(const char* key, |
|
1008 const LangTagLookup* tbl, int tbl_size) { |
|
1009 // Key is always in range [lo..hi) |
|
1010 int lo = 0; |
|
1011 int hi = tbl_size; |
|
1012 while (lo < hi) { |
|
1013 int mid = (lo + hi) >> 1; |
|
1014 int comp = strcmp(tbl[mid].langtag, key); |
|
1015 if (comp < 0) { |
|
1016 lo = mid + 1; |
|
1017 } else if (comp > 0) { |
|
1018 hi = mid; |
|
1019 } else { |
|
1020 return &tbl[mid]; |
|
1021 } |
|
1022 } |
|
1023 return NULL; |
|
1024 } |
|
1025 |
|
1026 // Binary lookup on tld |
|
1027 const TLDLookup* DoTLDLookup(const char* key, |
|
1028 const TLDLookup* tbl, int tbl_size) { |
|
1029 // Key is always in range [lo..hi) |
|
1030 int lo = 0; |
|
1031 int hi = tbl_size; |
|
1032 while (lo < hi) { |
|
1033 int mid = (lo + hi) >> 1; |
|
1034 int comp = strcmp(tbl[mid].tld, key); |
|
1035 if (comp < 0) { |
|
1036 lo = mid + 1; |
|
1037 } else if (comp > 0) { |
|
1038 hi = mid; |
|
1039 } else { |
|
1040 return &tbl[mid]; |
|
1041 } |
|
1042 } |
|
1043 return NULL; |
|
1044 } |
|
1045 |
|
1046 |
|
1047 |
|
1048 // Trim language tag string to canonical form for each language |
|
1049 // Input is from GetLangTagsFromHtml(), already lowercased |
|
1050 string TrimCLDLangTagsHint(const string& langtags) { |
|
1051 string retval; |
|
1052 if (langtags.empty()) {return retval;} |
|
1053 int commas = CountCommas(langtags); |
|
1054 if (commas > 4) {return retval;} // Ignore if too many language tags |
|
1055 |
|
1056 char temp[20]; |
|
1057 int pos = 0; |
|
1058 while (pos < static_cast<int>(langtags.size())) { |
|
1059 int comma = langtags.find(',', pos); |
|
1060 if (comma == string::npos) {comma = langtags.size();} // fake trailing comma |
|
1061 int len = comma - pos; |
|
1062 if (len <= 16) { |
|
1063 // Short enough to use |
|
1064 memcpy(temp, &langtags[pos], len); |
|
1065 temp[len] = '\0'; |
|
1066 const LangTagLookup* entry = DoLangTagLookup(temp, |
|
1067 kCLDLangTagsHintTable1, |
|
1068 kCLDTable1Size); |
|
1069 if (entry != NULL) { |
|
1070 // First table hit |
|
1071 retval.append(entry->langcode); // may be "code1,code2" |
|
1072 retval.append(1, ','); |
|
1073 } else { |
|
1074 // Try second table with language code truncated at first hyphen |
|
1075 char* hyphen = strchr(temp, '-'); |
|
1076 if (hyphen != NULL) {*hyphen = '\0';} |
|
1077 len = strlen(temp); |
|
1078 if (len <= 3) { // Short enough to use |
|
1079 entry = DoLangTagLookup(temp, |
|
1080 kCLDLangTagsHintTable2, |
|
1081 kCLDTable2Size); |
|
1082 if (entry != NULL) { |
|
1083 // Second table hit |
|
1084 retval.append(entry->langcode); // may be "code1,code2" |
|
1085 retval.append(1, ','); |
|
1086 } |
|
1087 } |
|
1088 } |
|
1089 } |
|
1090 pos = comma + 1; |
|
1091 } |
|
1092 |
|
1093 // Remove trainling comma, if any |
|
1094 if (!retval.empty()) {retval.resize(retval.size() - 1);} |
|
1095 return retval; |
|
1096 } |
|
1097 |
|
1098 |
|
1099 |
|
1100 //============================================================================== |
|
1101 |
|
1102 // Little state machine to scan insides of language attribute quoted-string. |
|
1103 // Each language code is lowercased and copied to the output string. Underscore |
|
1104 // is mapped to minus. Space, tab, and comma are all mapped to comma, and |
|
1105 // multiple consecutive commas are removed. |
|
1106 // Each language code in the output list will be followed by a single comma. |
|
1107 |
|
1108 // There are three states, and we start in state 1: |
|
1109 // State 0: After a letter. |
|
1110 // Copy all letters/minus[0], copy comma[1]; all others copy comma and skip [2] |
|
1111 // State 1: Just after a comma. |
|
//   Copy letter [0], Ignore subsequent commas[1]. minus and all others copy comma and skip [2]
|
1113 // State 2: Skipping. |
|
1114 // All characters except comma skip and stay in [2]. comma goes to [1] |
|
1115 |
|
1116 // The thing that is copied is kLangCodeRemap[c] when going to state 0, |
|
1117 // and always comma when going to state 1 or 2. The design depends on copying |
|
1118 // a comma at the *beginning* of skipping, and in state 2 never doing a copy. |
|
1119 |
|
1120 // We pack all this into 8 bits: |
|
1121 // +--+---+---+ |
|
1122 // |78|654|321| |
|
1123 // +--+---+---+ |
|
1124 // |
|
1125 // Shift byte right by 3*state, giving [0] 321, [1] 654, [2] .78 |
|
1126 // where . is always zero |
|
1127 // Of these 3 bits, low two are next state ss, high bit is copy bit C. |
|
1128 // If C=1 and ss == 0, copy kLangCodeRemap[c], else copy a comma |
|
1129 |
|
// Three-bit action codes for one state: the low two bits are the next state
// and the high bit (value 4) is the copy flag.
#define SKIP0 0
#define SKIP1 1
#define SKIP2 2
#define COPY0 4       // copy kLangCodeRemap[c], go to state 0
#define COPY1 5       // copy ',', go to state 1
#define COPY2 6       // copy ',', go to state 2

// One byte per character class, holding the action for each of the three
// states. The state 2 action sits in the top two bits of the byte, so it has
// no room for a copy flag and must always be one of the SKIPn codes.
//              state[2]        state[1]        state[0]
#define LTR   ((SKIP2 << 6) | (COPY0 << 3) | COPY0)
#define MINUS ((SKIP2 << 6) | (COPY2 << 3) | COPY0)
#define COMMA ((SKIP1 << 6) | (SKIP1 << 3) | COPY1)
#define Bad   ((SKIP2 << 6) | (COPY2 << 3) | COPY2)

// Character classes, indexed by byte value:
//   LTR    a-z, A-Z
//   MINUS  2D minus, 5F underscore
//   COMMA  09 tab, 20 space, 2C comma
//   Bad    everything else
static const unsigned char kLangCodeAction[256] = {
  // 00..0F (09 tab)
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,COMMA,Bad,Bad,Bad,Bad,Bad,Bad,
  // 10..1F
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  // 20..2F (20 space, 2C comma, 2D minus)
  COMMA,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,COMMA,MINUS,Bad,Bad,
  // 30..3F
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,

  // 40..4F (A-O)
  Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR,  LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,
  // 50..5F (P-Z, 5F underscore)
  LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,  LTR,LTR,LTR,Bad,Bad,Bad,Bad,MINUS,
  // 60..6F (a-o)
  Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR,  LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,
  // 70..7F (p-z)
  LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,  LTR,LTR,LTR,Bad,Bad,Bad,Bad,Bad,

  // 80..BF
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,

  // C0..FF
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
};

// Output byte for each input byte: letters become lowercase, underscore
// becomes minus, tab and space become comma. Bytes in no class map to 0.
static const unsigned char kLangCodeRemap[256] = {
  // 00..0F (09 tab)
  0,0,0,0,0,0,0,0,  0,',',0,0,0,0,0,0,
  // 10..1F
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
  // 20..2F (20 space, 2C comma, 2D minus)
  ',',0,0,0,0,0,0,0,  0,0,0,0,',','-',0,0,
  // 30..3F
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,

  // 40..4F (A-O)
  0,'a','b','c','d','e','f','g',  'h','i','j','k','l','m','n','o',
  // 50..5F (P-Z, 5F underscore)
  'p','q','r','s','t','u','v','w',  'x','y','z',0,0,0,0,'-',
  // 60..6F (a-o)
  0,'a','b','c','d','e','f','g',  'h','i','j','k','l','m','n','o',
  // 70..7F (p-z)
  'p','q','r','s','t','u','v','w',  'x','y','z',0,0,0,0,0,

  // 80..BF
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,

  // C0..FF
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
};

// The class and action names are only needed to spell out the tables
#undef LTR
#undef MINUS
#undef COMMA
#undef Bad

#undef SKIP0
#undef SKIP1
#undef SKIP2
#undef COPY0
#undef COPY1
#undef COPY2
|
1205 |
|
1206 |
|
1207 // Find opening '<' for HTML tag |
|
1208 // Note: this is all somewhat insensitive to mismatched quotes |
|
1209 int32 FindTagStart(const char* utf8_body, int32 pos, int32 max_pos) { |
|
1210 int i = pos; |
|
1211 // Advance i by 4 if none of the next 4 bytes are '<' |
|
1212 for (i = pos; i < (max_pos - 3); i += 4) { |
|
1213 // Fast check for any < |
|
1214 const char* p = &utf8_body[i]; |
|
1215 uint32 s0123 = UNALIGNED_LOAD32(p); |
|
1216 uint32 temp = s0123 ^ 0x3c3c3c3c; // <<<< |
|
1217 if (((temp - 0x01010101) & (~temp & 0x80808080)) != 0) { |
|
1218 // At least one byte is '<' |
|
1219 break; |
|
1220 } |
|
1221 } |
|
1222 // Continue, advancing i by 1 |
|
1223 for (; i < max_pos; ++i) { |
|
1224 if (utf8_body[i] == '<') {return i;} |
|
1225 } |
|
1226 return -1; |
|
1227 } |
|
1228 |
|
1229 |
|
1230 // Find closing '>' for HTML tag. Also stop on < and & (simplistic parsing) |
|
1231 int32 FindTagEnd(const char* utf8_body, int32 pos, int32 max_pos) { |
|
1232 // Always outside quotes |
|
1233 for (int i = pos; i < max_pos; ++i) { |
|
1234 char c = utf8_body[i]; |
|
1235 if (c == '>') {return i;} |
|
1236 if (c == '<') {return i - 1;} |
|
1237 if (c == '&') {return i - 1;} |
|
1238 } |
|
1239 return -1; // nothing found |
|
1240 } |
|
1241 |
|
1242 // Find opening quote or apostrophe, skipping spaces |
|
1243 // Note: this is all somewhat insensitive to mismatched quotes |
|
1244 int32 FindQuoteStart(const char* utf8_body, int32 pos, int32 max_pos) { |
|
1245 for (int i = pos; i < max_pos; ++i) { |
|
1246 char c = utf8_body[i]; |
|
1247 if (c == '"') {return i;} |
|
1248 if (c == '\'') {return i;} |
|
1249 if (c != ' ') {return -1;} |
|
1250 } |
|
1251 return -1; |
|
1252 } |
|
1253 |
|
1254 // Find closing quot/apos. Also stop on = > < and & (simplistic parsing) |
|
1255 int32 FindQuoteEnd(const char* utf8_body, int32 pos, int32 max_pos) { |
|
1256 // Always outside quotes |
|
1257 for (int i = pos; i < max_pos; ++i) { |
|
1258 char c = utf8_body[i]; |
|
1259 if (c == '"') {return i;} |
|
1260 if (c == '\'') {return i;} |
|
1261 if (c == '>') {return i - 1;} |
|
1262 if (c == '=') {return i - 1;} |
|
1263 if (c == '<') {return i - 1;} |
|
1264 if (c == '&') {return i - 1;} |
|
1265 } |
|
1266 return -1; // nothing found |
|
1267 } |
|
1268 |
|
1269 int32 FindEqualSign(const char* utf8_body, int32 pos, int32 max_pos) { |
|
1270 // Outside quotes/apostrophes loop |
|
1271 for (int i = pos; i < max_pos; ++i) { |
|
1272 char c = utf8_body[i]; |
|
1273 if (c == '=') { // Found bare equal sign inside tag |
|
1274 return i; |
|
1275 } else if (c == '"') { |
|
1276 // Inside quotes loop |
|
1277 int j; |
|
1278 for (j = i + 1; j < max_pos; ++j) { |
|
1279 if (utf8_body[j] == '"') { |
|
1280 break; |
|
1281 } else if (utf8_body[j] == '\\') { |
|
1282 ++j; |
|
1283 } |
|
1284 } |
|
1285 i = j; |
|
1286 } else if (c == '\'') { |
|
1287 // Inside apostrophes loop |
|
1288 int j; |
|
1289 for (j = i + 1; j < max_pos; ++j) { |
|
1290 if (utf8_body[j] == '\'') { |
|
1291 break; |
|
1292 } else if (utf8_body[j] == '\\') { |
|
1293 ++j; |
|
1294 } |
|
1295 } |
|
1296 i = j; |
|
1297 } |
|
1298 |
|
1299 } |
|
1300 return -1; // nothing found |
|
1301 } |
|
1302 |
|
1303 // Scan backwards for case-insensitive string s in [min_pos..pos) |
|
1304 // Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f] |
|
1305 // Cheap lowercase. Control codes will masquerade as 20..3f |
|
1306 bool FindBefore(const char* utf8_body, |
|
1307 int32 min_pos, int32 pos, const char* s) { |
|
1308 int len = strlen(s); |
|
1309 if ((pos - min_pos) < len) {return false;} // Too small to fit s |
|
1310 |
|
1311 // Skip trailing spaces |
|
1312 int i = pos; |
|
1313 while ((i > (min_pos + len)) && (utf8_body[i - 1] == ' ')) {--i;} |
|
1314 i -= len; |
|
1315 if (i < min_pos) {return false;} // pos - min_pos < len, so s can't be found |
|
1316 |
|
1317 const char* p = &utf8_body[i]; |
|
1318 for (int j = 0; j < len; ++j) { |
|
1319 if ((p[j] | 0x20) != s[j]) {return false;} // Unequal byte |
|
1320 } |
|
1321 return true; // All bytes equal at i |
|
1322 } |
|
1323 |
|
1324 // Scan forwards for case-insensitive string s in [pos..max_pos) |
|
1325 // Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f] |
|
1326 // Cheap lowercase. Control codes will masquerade as 20..3f |
|
1327 // Allows but does not require quoted/apostrophe string |
|
1328 bool FindAfter(const char* utf8_body, |
|
1329 int32 pos, int32 max_pos, const char* s) { |
|
1330 int len = strlen(s); |
|
1331 if ((max_pos - pos) < len) {return false;} // Too small to fit s |
|
1332 |
|
1333 // Skip leading spaces, quote, apostrophe |
|
1334 int i = pos; |
|
1335 while (i < (max_pos - len)) { |
|
1336 unsigned char c = utf8_body[i]; |
|
1337 if ((c == ' ') || (c == '"') || (c == '\'')) {++i;} |
|
1338 else {break;} |
|
1339 } |
|
1340 |
|
1341 const char* p = &utf8_body[i]; |
|
1342 for (int j = 0; j < len; ++j) { |
|
1343 if ((p[j] | 0x20) != s[j]) {return false;} // Unequal byte |
|
1344 } |
|
1345 return true; // All bytes equal |
|
1346 } |
|
1347 |
|
1348 |
|
1349 |
|
1350 // Copy attribute value in [pos..max_pos) |
|
1351 // pos is just after an opening quote/apostrophe and max_pos is the ending one |
|
1352 // String must all be on a single line. |
|
1353 // Return slightly-normalized language list, empty or ending in comma |
|
1354 // Does lowercasing and removes excess punctuation/space |
|
1355 string CopyOneQuotedString(const char* utf8_body, |
|
1356 int32 pos, int32 max_pos) { |
|
1357 string s; |
|
1358 int state = 1; // Front is logically just after a comma |
|
1359 for (int i = pos; i < max_pos; ++i) { |
|
1360 unsigned char c = utf8_body[i]; |
|
1361 int e = kLangCodeAction[c] >> (3 * state); |
|
1362 state = e & 3; // Update to next state |
|
1363 if ((e & 4) != 0) { |
|
1364 // Copy a remapped byte if going to state 0, else copy a comma |
|
1365 if (state == 0) { |
|
1366 s.append(1, kLangCodeRemap[c]); |
|
1367 } else { |
|
1368 s.append(1, ','); |
|
1369 } |
|
1370 } |
|
1371 } |
|
1372 |
|
1373 // Add final comma if needed |
|
1374 if (state == 0) { |
|
1375 s.append(1, ','); |
|
1376 } |
|
1377 return s; |
|
1378 } |
|
1379 |
|
1380 // Find and copy attribute value: quoted string in [pos..max_pos) |
|
1381 // Return slightly-normalized language list, empty or ending in comma |
|
1382 string CopyQuotedString(const char* utf8_body, |
|
1383 int32 pos, int32 max_pos) { |
|
1384 int32 start_quote = FindQuoteStart(utf8_body, pos, max_pos); |
|
1385 if (start_quote < 0) {return string("");} |
|
1386 int32 end_quote = FindQuoteEnd(utf8_body, start_quote + 1, max_pos); |
|
1387 if (end_quote < 0) {return string("");} |
|
1388 |
|
1389 return CopyOneQuotedString(utf8_body, start_quote + 1, end_quote); |
|
1390 } |
|
1391 |
|
1392 // Add hints to vector of langpriors |
|
1393 // Input is from GetLangTagsFromHtml(), already lowercased |
|
1394 void SetCLDLangTagsHint(const string& langtags, CLDLangPriors* langpriors) { |
|
1395 if (langtags.empty()) {return;} |
|
1396 int commas = CountCommas(langtags); |
|
1397 if (commas > 4) {return;} // Ignore if too many language tags |
|
1398 |
|
1399 char temp[20]; |
|
1400 int pos = 0; |
|
1401 while (pos < static_cast<int>(langtags.size())) { |
|
1402 int comma = langtags.find(',', pos); |
|
1403 if (comma == string::npos) {comma = langtags.size();} // fake trailing comma |
|
1404 int len = comma - pos; |
|
1405 if (len <= 16) { |
|
1406 // Short enough to use |
|
1407 memcpy(temp, &langtags[pos], len); |
|
1408 temp[len] = '\0'; |
|
1409 const LangTagLookup* entry = DoLangTagLookup(temp, |
|
1410 kCLDLangTagsHintTable1, |
|
1411 kCLDTable1Size); |
|
1412 if (entry != NULL) { |
|
1413 // First table hit |
|
1414 MergeCLDLangPriorsMax(entry->onelangprior1, langpriors); |
|
1415 MergeCLDLangPriorsMax(entry->onelangprior2, langpriors); |
|
1416 } else { |
|
1417 // Try second table with language code truncated at first hyphen |
|
1418 char* hyphen = strchr(temp, '-'); |
|
1419 if (hyphen != NULL) {*hyphen = '\0';} |
|
1420 len = strlen(temp); |
|
1421 if (len <= 3) { // Short enough to use |
|
1422 entry = DoLangTagLookup(temp, |
|
1423 kCLDLangTagsHintTable2, |
|
1424 kCLDTable2Size); |
|
1425 if (entry != NULL) { |
|
1426 // Second table hit |
|
1427 MergeCLDLangPriorsMax(entry->onelangprior1, langpriors); |
|
1428 MergeCLDLangPriorsMax(entry->onelangprior2, langpriors); |
|
1429 } |
|
1430 } |
|
1431 } |
|
1432 } |
|
1433 pos = comma + 1; |
|
1434 } |
|
1435 } |
|
1436 |
|
1437 // Add hints to vector of langpriors |
|
1438 // Input is string after HTTP header Content-Language: |
|
1439 void SetCLDContentLangHint(const char* contentlang, CLDLangPriors* langpriors) { |
|
1440 string langtags = CopyOneQuotedString(contentlang, 0, strlen(contentlang)); |
|
1441 SetCLDLangTagsHint(langtags, langpriors); |
|
1442 } |
|
1443 |
|
1444 // Add hints to vector of langpriors |
|
1445 // Input is last element of hostname (no dot), e.g. from GetTLD() |
|
1446 void SetCLDTLDHint(const char* tld, CLDLangPriors* langpriors) { |
|
1447 int len = strlen(tld); |
|
1448 if (len > 3) {return;} // Ignore if more than three letters |
|
1449 char local_tld[4]; |
|
1450 strncpy(local_tld, tld, 4); |
|
1451 local_tld[3] = '\0'; // Safety move |
|
1452 // Lowercase |
|
1453 for (int i = 0; i < len; ++i) {local_tld[i] |= 0x20;} |
|
1454 const TLDLookup* entry = DoTLDLookup(local_tld, |
|
1455 kCLDTLDHintTable, |
|
1456 kCLDTable3Size); |
|
1457 if (entry != NULL) { |
|
1458 // Table hit |
|
1459 MergeCLDLangPriorsBoost(entry->onelangprior1, langpriors); |
|
1460 MergeCLDLangPriorsBoost(entry->onelangprior2, langpriors); |
|
1461 } |
|
1462 } |
|
1463 |
|
1464 // Add hints to vector of langpriors |
|
1465 // Input is from DetectEncoding() |
|
1466 void SetCLDEncodingHint(Encoding enc, CLDLangPriors* langpriors) { |
|
1467 OneCLDLangPrior olp; |
|
1468 switch (enc) { |
|
1469 case CHINESE_GB: |
|
1470 case GBK: |
|
1471 case GB18030: |
|
1472 case ISO_2022_CN: |
|
1473 case HZ_GB_2312: |
|
1474 olp = PackCLDPriorLangWeight(CHINESE, kCLDPriorEncodingWeight); |
|
1475 MergeCLDLangPriorsBoost(olp, langpriors); |
|
1476 break; |
|
1477 case CHINESE_BIG5: |
|
1478 case CHINESE_BIG5_CP950: |
|
1479 case BIG5_HKSCS: |
|
1480 olp = PackCLDPriorLangWeight(CHINESE_T, kCLDPriorEncodingWeight); |
|
1481 MergeCLDLangPriorsBoost(olp, langpriors); |
|
1482 break; |
|
1483 case JAPANESE_EUC_JP: |
|
1484 case JAPANESE_SHIFT_JIS: |
|
1485 case JAPANESE_CP932: |
|
1486 case JAPANESE_JIS: // ISO-2022-JP |
|
1487 olp = PackCLDPriorLangWeight(JAPANESE, kCLDPriorEncodingWeight); |
|
1488 MergeCLDLangPriorsBoost(olp, langpriors); |
|
1489 break; |
|
1490 case KOREAN_EUC_KR: |
|
1491 case ISO_2022_KR: |
|
1492 olp = PackCLDPriorLangWeight(KOREAN, kCLDPriorEncodingWeight); |
|
1493 MergeCLDLangPriorsBoost(olp, langpriors); |
|
1494 break; |
|
1495 |
|
1496 default: |
|
1497 break; |
|
1498 } |
|
1499 } |
|
1500 |
|
1501 // Add hints to vector of langpriors |
|
1502 // Input is from random source |
|
1503 void SetCLDLanguageHint(Language lang, CLDLangPriors* langpriors) { |
|
1504 OneCLDLangPrior olp = PackCLDPriorLangWeight(lang, kCLDPriorLanguageWeight); |
|
1505 MergeCLDLangPriorsBoost(olp, langpriors); |
|
1506 } |
|
1507 |
|
1508 |
|
1509 // Make printable string of priors |
|
1510 string DumpCLDLangPriors(const CLDLangPriors* langpriors) { |
|
1511 string retval; |
|
1512 for (int i = 0; i < langpriors->n; ++i) { |
|
1513 char temp[64]; |
|
1514 sprintf(temp, "%s.%d ", |
|
1515 LanguageCode(GetCLDPriorLang(langpriors->prior[i])), |
|
1516 GetCLDPriorWeight(langpriors->prior[i])); |
|
1517 retval.append(temp); |
|
1518 } |
|
1519 return retval; |
|
1520 } |
|
1521 |
|
1522 |
|
1523 |
|
1524 |
|
1525 // Look for |
|
1526 // <html lang="en"> |
|
1527 // <doc xml:lang="en"> |
|
1528 // <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en-US"> |
|
1529 // <meta http-equiv="content-language" content="en-GB" /> |
|
1530 // <meta name="language" content="Srpski"> |
|
//  <meta name="DC.language" scheme="RFC1766" content="en">
|
1532 // <SPAN id="msg1" class="info" lang='en'> |
|
1533 // |
|
1534 // Do not trigger on |
|
1535 // <!-- lang=french ...--> |
|
1536 // <font lang=postscript ...> |
|
1537 // <link href="index.fr.html" hreflang="fr-FR" xml:lang="fr-FR" /> |
|
1538 // <META name="Author" lang="fr" content="Arnaud Le Hors"> |
|
1539 // |
|
1540 // Stop fairly quickly on mismatched quotes |
|
1541 // |
|
1542 // Allowed language characters |
|
1543 // a-z A-Z -_ , space\t |
|
1544 // Think about: GB2312, big5, shift-jis, euc-jp, ksc euc-kr |
|
1545 // zh-hans zh-TW cmn-Hani zh_cn.gb18030_CN zh-min-nan zh-yue |
|
1546 // de-x-mtfrom-en zh-tw-x-mtfrom-en (machine translation) |
|
1547 // GB2312 => gb |
|
1548 // Big5 => big |
|
1549 // zh_CN.gb18030_C => zh-cn |
|
1550 // |
|
1551 // Remove duplicates and extra spaces as we go |
|
1552 // Lowercase as we go. |
|
1553 |
|
1554 // Get language tag hints from HTML body |
|
1555 // Normalize: remove spaces and make lowercase comma list |
|
1556 |
|
1557 string GetLangTagsFromHtml(const char* utf8_body, int32 utf8_body_len, |
|
1558 int32 max_scan_bytes) { |
|
1559 string retval; |
|
1560 if (max_scan_bytes > utf8_body_len) { |
|
1561 max_scan_bytes = utf8_body_len; |
|
1562 } |
|
1563 |
|
1564 int32 k = 0; |
|
1565 while (k < max_scan_bytes) { |
|
1566 int32 start_tag = FindTagStart(utf8_body, k, max_scan_bytes); |
|
1567 if (start_tag < 0) {break;} |
|
1568 int32 end_tag = FindTagEnd(utf8_body, start_tag + 1, max_scan_bytes); |
|
1569 // FindTagEnd exits on < > & |
|
1570 if (end_tag < 0) {break;} |
|
1571 |
|
1572 // Skip <!--...> |
|
1573 // Skip <font ...> |
|
1574 // Skip <script ...> |
|
1575 // Skip <link ...> |
|
1576 // Skip <img ...> |
|
1577 // Skip <a ...> |
|
1578 if (FindAfter(utf8_body, start_tag + 1, end_tag, "!--") || |
|
1579 FindAfter(utf8_body, start_tag + 1, end_tag, "font ") || |
|
1580 FindAfter(utf8_body, start_tag + 1, end_tag, "script ") || |
|
1581 FindAfter(utf8_body, start_tag + 1, end_tag, "link ") || |
|
1582 FindAfter(utf8_body, start_tag + 1, end_tag, "img ") || |
|
1583 FindAfter(utf8_body, start_tag + 1, end_tag, "a ")) { |
|
1584 k = end_tag + 1; |
|
1585 continue; |
|
1586 } |
|
1587 |
|
1588 // Remember <meta ...> |
|
1589 bool in_meta = false; |
|
1590 if (FindAfter(utf8_body, start_tag + 1, end_tag, "meta ")) { |
|
1591 in_meta = true; |
|
1592 } |
|
1593 |
|
1594 // Scan for each equal sign inside tag |
|
1595 bool content_is_lang = false; |
|
1596 int32 kk = start_tag + 1; |
|
1597 int32 equal_sign; |
|
1598 while ((equal_sign = FindEqualSign(utf8_body, kk, end_tag)) >= 0) { |
|
1599 // eq exits on < > & |
|
1600 |
|
1601 // Look inside a meta tag |
|
1602 // <meta ... http-equiv="content-language" ...> |
|
1603 // <meta ... name="language" ...> |
|
1604 // <meta ... name="dc.language" ...> |
|
1605 if (in_meta) { |
|
1606 if (FindBefore(utf8_body, kk, equal_sign, " http-equiv") && |
|
1607 FindAfter(utf8_body, equal_sign + 1, end_tag, |
|
1608 "content-language ")) { |
|
1609 content_is_lang = true; |
|
1610 } else if (FindBefore(utf8_body, kk, equal_sign, " name") && |
|
1611 (FindAfter(utf8_body, equal_sign + 1, end_tag, |
|
1612 "dc.language ") || |
|
1613 FindAfter(utf8_body, equal_sign + 1, end_tag, |
|
1614 "language "))) { |
|
1615 content_is_lang = true; |
|
1616 } |
|
1617 } |
|
1618 |
|
1619 // Look inside any tag |
|
1620 // <meta ... content="lang-list" ...> |
|
1621 // <... lang="lang-list" ...> |
|
1622 // <... xml:lang="lang-list" ...> |
|
1623 if ((content_is_lang && FindBefore(utf8_body, kk, equal_sign, |
|
1624 " content")) || |
|
1625 FindBefore(utf8_body, kk, equal_sign, " lang") || |
|
1626 FindBefore(utf8_body, kk, equal_sign, ":lang")) { |
|
1627 string temp = CopyQuotedString(utf8_body, equal_sign + 1, end_tag); |
|
1628 |
|
1629 // Append new lang tag(s) if not a duplicate |
|
1630 if (!temp.empty() && (retval.find(temp) == string::npos)) { |
|
1631 retval.append(temp); |
|
1632 } |
|
1633 } |
|
1634 |
|
1635 kk = equal_sign + 1; |
|
1636 } |
|
1637 k = end_tag + 1; |
|
1638 } |
|
1639 |
|
1640 // Strip last comma |
|
1641 if (retval.size() > 1) { |
|
1642 retval.erase(retval.size() - 1); |
|
1643 } |
|
1644 return retval; |
|
1645 } |
|
1646 |
|
1647 } // End namespace CLD2 |
|
1648 |
|
1649 //============================================================================== |
|
1650 |
|
1651 |