browser/components/translation/cld2/internal/compact_lang_det_hint_code.cc

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 // Copyright 2013 Google Inc. All Rights Reserved.
michael@0 2 //
michael@0 3 // Licensed under the Apache License, Version 2.0 (the "License");
michael@0 4 // you may not use this file except in compliance with the License.
michael@0 5 // You may obtain a copy of the License at
michael@0 6 //
michael@0 7 // http://www.apache.org/licenses/LICENSE-2.0
michael@0 8 //
michael@0 9 // Unless required by applicable law or agreed to in writing, software
michael@0 10 // distributed under the License is distributed on an "AS IS" BASIS,
michael@0 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
michael@0 12 // See the License for the specific language governing permissions and
michael@0 13 // limitations under the License.
michael@0 14
michael@0 15 //
michael@0 16 // Author: dsites@google.com (Dick Sites)
michael@0 17 //
michael@0 18
michael@0 19 #include "compact_lang_det_hint_code.h"
michael@0 20
michael@0 21 #include <stdlib.h> // for abs()
michael@0 22 #include <stdio.h> // for sprintf()
michael@0 23 #include <string.h> //
michael@0 24 #include "lang_script.h"
michael@0 25 #include "port.h"
michael@0 26
michael@0 27 using namespace std;
michael@0 28
michael@0 29 namespace CLD2 {
michael@0 30
michael@0 31 static const int kCLDPriorEncodingWeight = 4; // Encoding-prior weight: ~100x more likely (presumably the same scale as W4 below; usage not visible here)
michael@0 32 static const int kCLDPriorLanguageWeight = 8; // Language-prior weight: ~10000x more likely (presumably the same scale as W8 below; usage not visible here)
michael@0 33
michael@0 34
michael@0 35 // Tables to map lang="..." language code lists to actual languages.
michael@0 36 // based on scraping and hand-edits, dsites June 2011
michael@0 37
michael@0 38 // n = f(string, &a) gives list of n<=4 language pairs: primary, secondary
michael@0 39
michael@0 40 // For close pairs like ms/id, more weight on TLD and lang=
michael@0 41 // Alternately, weaker boost but mark others of set as negative;
michael@0 42 // makes "neither" an easier result.
michael@0 43 // lang=en low weight 4
michael@0 44 // tld=lu boost lu maybe 4. but lang= always overcomes tld and encoding
michael@0 45 // (except maybe en)
michael@0 46
michael@0 47 // TLD to separate, e.g., burundi from rwanda
michael@0 48
michael@0 49 // Encoding lookup: OneLangProb array
michael@0 50 // TLD lookup: tld OneLangProb pairs
michael@0 51
michael@0 52
michael@0 53 typedef struct { // One row of a lang= tag hint table (kCLDLangTagsHintTable1 / kCLDLangTagsHintTable2)
michael@0 54 const char* const langtag; // Lowercased, hyphen only lookup key
michael@0 55 const char* const langcode; // Canonical language code(s), comma-separated; two if ambiguous (e.g. "am,hy")
michael@0 56 OneCLDLangPrior onelangprior1; // First prior: rows store a language enum value plus a W* weight
michael@0 57 OneCLDLangPrior onelangprior2; // Second prior: language +/- W* weight; rows use 0 here when only one language is hinted
michael@0 58 } LangTagLookup; // Rows are brace-initialized positionally, so member order must not change
michael@0 59
michael@0 60 typedef struct { // One row of the country-TLD hint table (kCLDTLDHintTable)
michael@0 61 const char* const tld; // Lowercased, hyphen only lookup key
michael@0 62 OneCLDLangPrior onelangprior1; // First prior: rows store a language enum value plus a W* weight, or 0
michael@0 63 OneCLDLangPrior onelangprior2; // Second prior: language +/- W* weight; rows use 0 here when there is none
michael@0 64 } TLDLookup; // Rows are brace-initialized positionally, so member order must not change
michael@0 65
michael@0 66
michael@0 67 #define W2 (2 << 10) // 3**2 = 9, roughly 10x more likely; weight n is encoded as n << 10
michael@0 68 #define W4 (4 << 10) // 3**4 = 81, roughly 100x more likely
michael@0 69 #define W6 (6 << 10) // 3**6 = 729, roughly 1000x more likely
michael@0 70 #define W8 (8 << 10) // 3**8 = 6561, roughly 10K x more likely
michael@0 71 #define W10 (10 << 10) // 3**10 = 59049, roughly 100K x more likely
michael@0 72 #define W12 (12 << 10) // 3**12 = 531441, roughly 1M x more likely
michael@0 73
michael@0 74 // TODO: more about ba hr sr sr-ME and sl
michael@0 75 // Temporary state of affairs:
michael@0 76 // BOSNIAN CROATIAN MONTENEGRIN SERBIAN detecting just CROATIAN SERBIAN
michael@0 77 // Eventually, we want to do all four, but it requires a CLD change to handle
michael@0 78 // up to six languages per quadgram.
michael@0 79
michael@0 80
michael@0 81 // Close pairs boost one of pair, demote other.
michael@0 82 // Statistically close pairs:
michael@0 83 // INDONESIAN/MALAY difficult to distinguish -- extra word-based lookups used
michael@0 84 //
michael@0 85 // INDONESIAN MALAY coef=0.4698 Problematic w/o extra words
michael@0 86 // TIBETAN DZONGKHA coef=0.4571
michael@0 87 // CZECH SLOVAK coef=0.4273
michael@0 88 // NORWEGIAN NORWEGIAN_N coef=0.4182
michael@0 89 //
michael@0 90 // HINDI MARATHI coef=0.3795
michael@0 91 // ZULU XHOSA coef=0.3716
michael@0 92 //
michael@0 93 // DANISH NORWEGIAN coef=0.3672 Usually OK
michael@0 94 // BIHARI HINDI coef=0.3668 Usually OK
michael@0 95 // ICELANDIC FAROESE coef=0.3519 Usually OK
michael@0 96
michael@0 97 //
michael@0 98 // Table to look up lang= tags longer than three characters
michael@0 99 // Overrides table below, which is truncated at first hyphen
michael@0 100 // In alphabetical order for binary search
michael@0 101 static const int kCLDTable1Size = 213; // Declared row count of kCLDLangTagsHintTable1; keep in step with the initializer below
michael@0 102 static const LangTagLookup kCLDLangTagsHintTable1[kCLDTable1Size] = { // Rows: {langtag, langcode, onelangprior1, onelangprior2}, sorted by langtag
michael@0 103 {"abkhazian", "ab", ABKHAZIAN + W10, 0},
michael@0 104 {"afar", "aa", AFAR + W10, 0},
michael@0 105 {"afrikaans", "af", AFRIKAANS + W10, 0},
michael@0 106 {"akan", "ak", AKAN + W10, 0},
michael@0 107 {"albanian", "sq", ALBANIAN + W10, 0},
michael@0 108 {"am-am", "hy", ARMENIAN + W10, 0}, // 1:2 Armenian, not ambiguous
michael@0 109 {"amharic", "am", AMHARIC + W10, 0},
michael@0 110 {"arabic", "ar", ARABIC + W10, 0},
michael@0 111 {"argentina", "es", SPANISH + W10, 0}, // Country name => Spanish
michael@0 112 {"armenian", "hy", ARMENIAN + W10, 0},
michael@0 113 {"assamese", "as", ASSAMESE + W10, 0},
michael@0 114 {"aymara", "ay", AYMARA + W10, 0},
michael@0 115 {"azerbaijani", "az", AZERBAIJANI + W10, 0},
michael@0 116
michael@0 117 {"bangla", "bn", BENGALI + W10, 0}, // Bangla => Bengali
michael@0 118 {"bashkir", "ba", BASHKIR + W10, 0},
michael@0 119 {"basque", "eu", BASQUE + W10, 0},
michael@0 120 {"belarusian", "be", BELARUSIAN + W10, 0},
michael@0 121 {"bengali", "bn", BENGALI + W10, 0},
michael@0 122 {"bihari", "bh", BIHARI + W10, HINDI - W4},
michael@0 123 {"bislama", "bi", BISLAMA + W10, 0},
michael@0 124 {"bosnian", "bs", BOSNIAN + W10, 0}, // Bosnian => Bosnian
michael@0 125 {"br-br", "pt", PORTUGUESE + W10, 0}, // 1:2 Portuguese, not ambiguous
michael@0 126 {"br-fr", "br", BRETON + W10, 0}, // 1:2 Breton, not ambiguous
michael@0 127 {"breton", "br", BRETON + W10, 0},
michael@0 128 {"bulgarian", "bg", BULGARIAN + W10, 0},
michael@0 129 {"burmese", "my", BURMESE + W10, 0}, // Myanmar
michael@0 130
michael@0 131 {"catalan", "ca", CATALAN + W10, 0},
michael@0 132 {"cherokee", "chr", CHEROKEE + W10, 0},
michael@0 133 {"chichewa", "ny", NYANJA + W10, 0}, // Chichewa => Nyanja
michael@0 134
michael@0 135 {"chinese", "zh", CHINESE + W10, 0},
michael@0 136 {"chinese-t", "zhT", CHINESE_T + W10, 0},
michael@0 137 {"chineset", "zhT", CHINESE_T + W10, 0},
michael@0 138 {"corsican", "co", CORSICAN + W10, 0},
michael@0 139 {"cpf-hat", "ht", HAITIAN_CREOLE + W10, 0}, // Creole, French-based
michael@0 140 {"croatian", "hr", CROATIAN + W10, 0},
michael@0 141 {"czech", "cs", CZECH + W10, SLOVAK - W4},
michael@0 142
michael@0 143 {"danish", "da", DANISH + W10, NORWEGIAN - W4},
michael@0 144 {"deutsch", "de", GERMAN + W10, 0}, // Deutsch => German
michael@0 145 {"dhivehi", "dv", DHIVEHI + W10, 0},
michael@0 146 {"dutch", "nl", DUTCH + W10, 0},
michael@0 147 {"dzongkha", "dz", DZONGKHA + W10, TIBETAN - W4},
michael@0 148
michael@0 149 {"ell-gr", "el", GREEK + W10, 0},
michael@0 150 {"english", "en", ENGLISH + W4, 0},
michael@0 151 {"esperanto", "eo", ESPERANTO + W10, 0},
michael@0 152 {"estonian", "et", ESTONIAN + W10, 0},
michael@0 153 {"euc-jp", "ja", JAPANESE + W10, 0}, // Japanese encoding
michael@0 154 {"euc-kr", "ko", KOREAN + W10, 0}, // Korean encoding
michael@0 155
michael@0 156 {"faroese", "fo", FAROESE + W10, ICELANDIC - W4},
michael@0 157 {"fijian", "fj", FIJIAN + W10, 0},
michael@0 158 {"finnish", "fi", FINNISH + W10, 0},
michael@0 159 {"fran", "fr", FRENCH + W10, 0}, // Truncated at non-ASCII
michael@0 160 {"francais", "fr", FRENCH + W10, 0},
michael@0 161 {"french", "fr", FRENCH + W10, 0},
michael@0 162 {"frisian", "fy", FRISIAN + W10, 0},
michael@0 163
michael@0 164 {"ga-es", "gl", GALICIAN + W10, 0}, // 1:2 Galician, not ambiguous
michael@0 165 {"galician", "gl", GALICIAN + W10, 0},
michael@0 166 {"ganda", "lg", GANDA + W10, 0},
michael@0 167 {"georgian", "ka", GEORGIAN + W10, 0},
michael@0 168 {"german", "de", GERMAN + W10, 0},
michael@0 169 {"greek", "el", GREEK + W10, 0},
michael@0 170 {"greenlandic", "kl", GREENLANDIC + W10, 0},
michael@0 171 {"guarani", "gn", GUARANI + W10, 0},
michael@0 172 {"gujarati", "gu", GUJARATI + W10, 0},
michael@0 173
michael@0 174 {"haitian_creole", "ht", HAITIAN_CREOLE + W10, 0},
michael@0 175 {"hausa", "ha", HAUSA + W10, 0},
michael@0 176 {"hawaiian", "haw", HAWAIIAN + W10, 0},
michael@0 177 {"hebrew", "iw", HEBREW + W10, 0},
michael@0 178 {"hindi", "hi", HINDI + W10, MARATHI - W4},
michael@0 179 {"hn-in", "hi", HINDI + W10, MARATHI - W4},
michael@0 180 {"hungarian", "hu", HUNGARIAN + W10, 0},
michael@0 181
michael@0 182 {"icelandic", "is", ICELANDIC + W10, FAROESE - W4},
michael@0 183 {"igbo", "ig", IGBO + W10, 0},
michael@0 184 {"indonesian", "id", INDONESIAN + W10, MALAY - W4},
michael@0 185 {"interlingua", "ia", INTERLINGUA + W10, 0},
michael@0 186 {"interlingue", "ie", INTERLINGUE + W10, 0},
michael@0 187 // 1:2 iu-Cans ik-Latn
michael@0 188 {"inuktitut", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2
michael@0 189 {"inupiak", "ik,iu", INUPIAK + W10, INUKTITUT + W10}, // 1:2
michael@0 190 {"ir-ie", "ga", IRISH + W10, 0}, // Irish
michael@0 191 {"irish", "ga", IRISH + W10, 0},
michael@0 192 {"italian", "it", ITALIAN + W10, 0},
michael@0 193
michael@0 194 {"ja-euc", "ja", JAPANESE + W10, 0}, // Japanese encoding
michael@0 195 {"jan-jp", "ja", JAPANESE + W10, 0}, // Japanese encoding
michael@0 196 {"japanese", "ja", JAPANESE + W10, 0},
michael@0 197 {"javanese", "jw", JAVANESE + W10, 0},
michael@0 198
michael@0 199 {"kannada", "kn", KANNADA + W10, 0},
michael@0 200 {"kashmiri", "ks", KASHMIRI + W10, 0},
michael@0 201 {"kazakh", "kk", KAZAKH + W10, 0},
michael@0 202 {"khasi", "kha", KHASI + W10, 0},
michael@0 203 {"khmer", "km", KHMER + W10, 0},
michael@0 204 {"kinyarwanda", "rw", KINYARWANDA + W10, 0},
michael@0 205 {"klingon", "tlh", X_KLINGON + W10, 0},
michael@0 206 {"korean", "ko", KOREAN + W10, 0},
michael@0 207 {"kurdish", "ku", KURDISH + W10, 0},
michael@0 208 {"kyrgyz", "ky", KYRGYZ + W10, 0},
michael@0 209
michael@0 210 {"laothian", "lo", LAOTHIAN + W10, 0},
michael@0 211 {"latin", "la", LATIN + W10, 0},
michael@0 212 {"latvian", "lv", LATVIAN + W10, 0},
michael@0 213 {"limbu", "sit", LIMBU + W10, 0},
michael@0 214 {"lingala", "ln", LINGALA + W10, 0},
michael@0 215 {"lithuanian", "lt", LITHUANIAN + W10, 0},
michael@0 216 {"luxembourgish", "lb", LUXEMBOURGISH + W10, 0},
michael@0 217
michael@0 218 {"macedonian", "mk", MACEDONIAN + W10, 0},
michael@0 219 {"malagasy", "mg", MALAGASY + W10, 0},
michael@0 220 {"malay", "ms", MALAY + W10, INDONESIAN - W4},
michael@0 221 {"malayalam", "ml", MALAYALAM + W10, 0},
michael@0 222 {"maltese", "mt", MALTESE + W10, 0},
michael@0 223 {"manx", "gv", MANX + W10, 0},
michael@0 224 {"maori", "mi", MAORI + W10, 0},
michael@0 225 {"marathi", "mr", MARATHI + W10, HINDI - W4},
michael@0 226 {"mauritian_creole", "mfe", MAURITIAN_CREOLE + W10, 0},
michael@0 227 {"moldavian", "mo", ROMANIAN + W10, 0}, // Moldavian => Romanian
michael@0 228 {"mongolian", "mn", MONGOLIAN + W10, 0},
michael@0 229 {"montenegrin", "sr-me", MONTENEGRIN + W10, 0},
michael@0 230 {"myanmar", "my", BURMESE + W10, 0}, // Myanmar
michael@0 231 {"nauru", "na", NAURU + W10, 0},
michael@0 232 {"ndebele", "nr", NDEBELE + W10, 0},
michael@0 233 {"nepali", "ne", NEPALI + W10, 0},
michael@0 234 {"no-bok", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, // Bokmaal
michael@0 235 {"no-bokmaal", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
michael@0 236 {"no-nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, // Bokmaal
michael@0 237 {"no-no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
michael@0 238 {"no-nyn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, // Nynorsk
michael@0 239 {"no-nynorsk", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},
michael@0 240 {"norwegian", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
michael@0 241 {"norwegian_n", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},
michael@0 242 {"nyanja", "ny", NYANJA + W10, 0},
michael@0 243
michael@0 244 {"occitan", "oc", OCCITAN + W10, 0},
michael@0 245 {"oriya", "or", ORIYA + W10, 0},
michael@0 246 {"oromo", "om", OROMO + W10, 0},
michael@0 247 {"parsi", "fa", PERSIAN + W10, 0}, // Parsi => Persian
michael@0 248
michael@0 249 {"pashto", "ps", PASHTO + W10, 0},
michael@0 250 {"pedi", "nso", PEDI + W10, 0},
michael@0 251 {"persian", "fa", PERSIAN + W10, 0},
michael@0 252 {"polish", "pl", POLISH + W10, 0},
michael@0 253 {"polska", "pl", POLISH + W10, 0},
michael@0 254 {"polski", "pl", POLISH + W10, 0},
michael@0 255 {"portugu", "pt", PORTUGUESE + W10, 0}, // Truncated at non-ASCII
michael@0 256 {"portuguese", "pt", PORTUGUESE + W10, 0},
michael@0 257 {"punjabi", "pa", PUNJABI + W10, 0},
michael@0 258
michael@0 259 {"quechua", "qu", QUECHUA + W10, 0},
michael@0 260
michael@0 261 {"rhaeto_romance", "rm", RHAETO_ROMANCE + W10, 0},
michael@0 262 {"romanian", "ro", ROMANIAN + W10, 0},
michael@0 263 {"rundi", "rn", RUNDI + W10, 0},
michael@0 264 {"russian", "ru", RUSSIAN + W10, 0},
michael@0 265
michael@0 266 {"samoan", "sm", SAMOAN + W10, 0},
michael@0 267 {"sango", "sg", SANGO + W10, 0},
michael@0 268 {"sanskrit", "sa", SANSKRIT + W10, 0},
michael@0 269 {"scots", "sco", SCOTS + W10, ENGLISH - W4},
michael@0 270 {"scots_gaelic", "gd", SCOTS_GAELIC + W10, 0},
michael@0 271 {"serbian", "sr", SERBIAN + W10, 0},
michael@0 272 {"seselwa", "crs", SESELWA + W10, 0},
michael@0 273 {"sesotho", "st", SESOTHO + W10, 0},
michael@0 274 {"shift-jis", "ja", JAPANESE + W10, 0}, // Japanese encoding
michael@0 275 {"shift-js", "ja", JAPANESE + W10, 0}, // Japanese encoding
michael@0 276 {"shona", "sn", SHONA + W10, 0},
michael@0 277 {"si-lk", "si", SINHALESE + W10, 0}, // 1:2 Sri Lanka, not ambiguous
michael@0 278 {"si-si", "sl", SLOVENIAN + W10, 0}, // 1:2 Slovenia, not ambiguous
michael@0 279 {"si-sl", "sl", SLOVENIAN + W10, 0}, // 1:2 Slovenia, not ambiguous
michael@0 280 {"sindhi", "sd", SINDHI + W10, 0},
michael@0 281 {"sinhalese", "si", SINHALESE + W10, 0},
michael@0 282 {"siswant", "ss", SISWANT + W10, 0},
michael@0 283 {"sit-np", "sit", LIMBU + W10, 0},
michael@0 284 {"slovak", "sk", SLOVAK + W10, CZECH - W4},
michael@0 285 {"slovenian", "sl", SLOVENIAN + W10, 0},
michael@0 286 {"somali", "so", SOMALI + W10, 0},
michael@0 287 {"spanish", "es", SPANISH + W10, 0},
michael@0 288 {"sr-me", "sr-me", MONTENEGRIN + W10, 0}, // Montenegrin => Montenegrin
michael@0 289 {"sundanese", "su", SUNDANESE + W10, 0},
michael@0 290 {"suomi", "fi", FINNISH + W10, 0}, // Finnish
michael@0 291 {"swahili", "sw", SWAHILI + W10, 0},
michael@0 292 {"swedish", "sv", SWEDISH + W10, 0},
michael@0 293 {"syriac", "syr", SYRIAC + W10, 0},
michael@0 294
michael@0 295 {"tagalog", "tl", TAGALOG + W10, 0},
michael@0 296 {"tajik", "tg", TAJIK + W10, 0},
michael@0 297 {"tamil", "ta", TAMIL + W10, 0},
michael@0 298 {"tatar", "tt", TATAR + W10, 0},
michael@0 299 {"tb-tb", "bo", TIBETAN + W10, DZONGKHA - W4}, // Tibet
michael@0 300 {"tchinese", "zhT", CHINESE_T + W10, 0},
michael@0 301 {"telugu", "te", TELUGU + W10, 0},
michael@0 302 {"thai", "th", THAI + W10, 0},
michael@0 303 {"tibetan", "bo", TIBETAN + W10, DZONGKHA - W4},
michael@0 304 {"tigrinya", "ti", TIGRINYA + W10, 0},
michael@0 305 {"tonga", "to", TONGA + W10, 0},
michael@0 306 {"tsonga", "ts", TSONGA + W10, 0},
michael@0 307 {"tswana", "tn", TSWANA + W10, 0},
michael@0 308 {"tt-ru", "tt", TATAR + W10, 0},
michael@0 309 {"tur-tr", "tr", TURKISH + W10, 0},
michael@0 310 {"turkish", "tr", TURKISH + W10, 0},
michael@0 311 {"turkmen", "tk", TURKMEN + W10, 0},
michael@0 312 {"uighur", "ug", UIGHUR + W10, 0},
michael@0 313 {"ukrainian", "uk", UKRAINIAN + W10, 0},
michael@0 314 {"urdu", "ur", URDU + W10, 0},
michael@0 315 {"uzbek", "uz", UZBEK + W10, 0},
michael@0 316
michael@0 317 {"venda", "ve", VENDA + W10, 0},
michael@0 318 {"vietnam", "vi", VIETNAMESE + W10, 0}, // Country name => Vietnamese
michael@0 319 {"vietnamese", "vi", VIETNAMESE + W10, 0},
michael@0 320 {"volapuk", "vo", VOLAPUK + W10, 0},
michael@0 321
michael@0 322 {"welsh", "cy", WELSH + W10, 0},
michael@0 323 {"wolof", "wo", WOLOF + W10, 0},
michael@0 324
michael@0 325 {"xhosa", "xh", XHOSA + W10, ZULU - W4},
michael@0 326
michael@0 327 {"yiddish", "yi", YIDDISH + W10, 0},
michael@0 328 {"yoruba", "yo", YORUBA + W10, 0},
michael@0 329
michael@0 330 {"zh-classical", "zhT", CHINESE_T + W10, 0},
michael@0 331 {"zh-cn", "zh", CHINESE + W10, 0},
michael@0 332 {"zh-hans", "zh", CHINESE + W10, 0},
michael@0 333 {"zh-hant", "zhT", CHINESE_T + W10, 0},
michael@0 334 {"zh-hk", "zhT", CHINESE_T + W10, 0},
michael@0 335 {"zh-min-nan", "zhT", CHINESE_T + W10, 0}, // Min Nan => ChineseT
michael@0 336 {"zh-sg", "zhT", CHINESE_T + W10, 0},
michael@0 337 {"zh-tw", "zhT", CHINESE_T + W10, 0},
michael@0 338 {"zh-yue", "zh", CHINESE + W10, 0}, // Yue (Cantonese) => Chinese
michael@0 339 {"zhuang", "za", ZHUANG + W10, 0},
michael@0 340 {"zulu", "zu", ZULU + W10, XHOSA - W4},
michael@0 341 };
michael@0 342
michael@0 343
michael@0 344
michael@0 345 // Table to look up lang= tags of two/three characters after truncate at hyphen
michael@0 346 // In alphabetical order for binary search
michael@0 347 static const int kCLDTable2Size = 257; // Declared row count of kCLDLangTagsHintTable2; keep in step with the initializer below
michael@0 348 static const LangTagLookup kCLDLangTagsHintTable2[kCLDTable2Size] = { // Rows: {langtag, langcode, onelangprior1, onelangprior2}, sorted by langtag
michael@0 349 {"aa", "aa", AFAR + W10, 0},
michael@0 350 {"ab", "ab", ABKHAZIAN + W10, 0},
michael@0 351 {"af", "af", AFRIKAANS + W10, 0},
michael@0 352 {"ak", "ak", AKAN + W10, 0},
michael@0 353 {"al", "sq", ALBANIAN + W10, 0}, // Albania
michael@0 354 {"am", "am,hy", AMHARIC + W10, ARMENIAN + W10}, // 1:2 Amharic Armenian
michael@0 355 {"ar", "ar", ARABIC + W10, 0},
michael@0 356 {"ara", "ar", ARABIC + W10, 0},
michael@0 357 {"arm", "hy", ARMENIAN + W10, 0}, // Armenia
michael@0 358 {"arz", "ar", ARABIC + W10, 0}, // Egyptian Arabic
michael@0 359 {"as", "as", ASSAMESE + W10, 0},
michael@0 360 {"at", "de", GERMAN + W10, 0}, // Austria
michael@0 361 {"au", "de", GERMAN + W10, 0}, // Austria -- NOTE(review): ISO 3166 "au" is Australia; presumably a common mis-tag for Austria, confirm
michael@0 362 {"ay", "ay", AYMARA + W10, 0},
michael@0 363 {"az", "az", AZERBAIJANI + W10, 0},
michael@0 364 {"aze", "az", AZERBAIJANI + W10, 0},
michael@0 365
michael@0 366 {"ba", "ba,bs", BASHKIR + W10, BOSNIAN + W10}, // 1:2 Bashkir Bosnia
michael@0 367 {"be", "be", BELARUSIAN + W10, 0},
michael@0 368 {"bel", "be", BELARUSIAN + W10, 0},
michael@0 369 {"bg", "bg", BULGARIAN + W10, 0},
michael@0 370 {"bh", "bh", BIHARI + W10, HINDI - W4},
michael@0 371 {"bi", "bi", BISLAMA + W10, 0},
michael@0 372 {"big", "zhT", CHINESE_T + W10, 0}, // Big5 encoding
michael@0 373 {"bm", "ms", MALAY + W10, INDONESIAN - W4}, // Bahasa Malaysia
michael@0 374 {"bn", "bn", BENGALI + W10, 0},
michael@0 375 {"bo", "bo", TIBETAN + W10, DZONGKHA - W4},
michael@0 376 // 1:2 Breton, Brazil country code, both Latn .br TLD enough for pt to win
michael@0 377 {"br", "br,pt", BRETON + W10, PORTUGUESE + W8}, // 1:2 Breton, Brazil
michael@0 378 {"bs", "bs", BOSNIAN + W10, 0}, // Bosnian => Bosnian
michael@0 379
michael@0 380 {"ca", "ca", CATALAN + W10, 0},
michael@0 381 {"cat", "ca", CATALAN + W10, 0},
michael@0 382 {"ch", "de,fr", GERMAN + W10, FRENCH + W10}, // 1:2 Switzerland
michael@0 383 {"chn", "zh", CHINESE + W10, 0},
michael@0 384 {"chr", "chr", CHEROKEE + W10, 0},
michael@0 385 {"ckb", "ku", KURDISH + W10, 0}, // Central Kurdish
michael@0 386 {"cn", "zh,zhT", CHINESE + W6, CHINESE_T + W4}, // Ambiguous, so weaker.
michael@0 387 // Offset by 2 so that TLD=tw or
michael@0 388 // enc=big5 will put zhT ahead
michael@0 389 {"co", "co", CORSICAN + W10, 0},
michael@0 390 {"cro", "hr", CROATIAN + W10, 0}, // Croatia
michael@0 391 {"crs", "crs", SESELWA + W10, 0},
michael@0 392 {"cs", "cs", CZECH + W10, SLOVAK - W4},
michael@0 393 {"ct", "ca", CATALAN + W10, 0},
michael@0 394 {"cy", "cy", WELSH + W10, 0},
michael@0 395 {"cym", "cy", WELSH + W10, 0},
michael@0 396 {"cz", "cs", CZECH + W10, SLOVAK - W4},
michael@0 397
michael@0 398 {"da", "da", DANISH + W10, NORWEGIAN - W4},
michael@0 399 {"dan", "da", DANISH + W10, NORWEGIAN - W4},
michael@0 400 {"de", "de", GERMAN + W10, 0},
michael@0 401 {"deu", "de", GERMAN + W10, 0},
michael@0 402 {"div", "dv", DHIVEHI + W10, 0},
michael@0 403 {"dk", "da", DANISH + W10, NORWEGIAN - W4}, // Denmark
michael@0 404 {"dut", "nl", DUTCH + W10, 0}, // Dutch
michael@0 405 {"dv", "dv", DHIVEHI + W10, 0},
michael@0 406 {"dz", "dz", DZONGKHA + W10, TIBETAN - W4},
michael@0 407
michael@0 408 {"ee", "et", ESTONIAN + W10, 0}, // Estonia
michael@0 409 {"eg", "ar", ARABIC + W10, 0}, // Egypt
michael@0 410 {"el", "el", GREEK + W10, 0},
michael@0 411 {"en", "en", ENGLISH + W4, 0},
michael@0 412 {"eng", "en", ENGLISH + W4, 0},
michael@0 413 {"eo", "eo", ESPERANTO + W10, 0},
michael@0 414 {"er", "ur", URDU + W10, 0}, // "Erdu"
michael@0 415 {"es", "es", SPANISH + W10, 0},
michael@0 416 {"esp", "es", SPANISH + W10, 0},
michael@0 417 {"est", "et", ESTONIAN + W10, 0},
michael@0 418 {"et", "et", ESTONIAN + W10, 0},
michael@0 419 {"eu", "eu", BASQUE + W10, 0},
michael@0 420
michael@0 421 {"fa", "fa", PERSIAN + W10, 0},
michael@0 422 {"far", "fa", PERSIAN + W10, 0},
michael@0 423 {"fi", "fi", FINNISH + W10, 0},
michael@0 424 {"fil", "tl", TAGALOG + W10, 0}, // Philippines
michael@0 425 {"fj", "fj", FIJIAN + W10, 0},
michael@0 426 {"fo", "fo", FAROESE + W10, ICELANDIC - W4},
michael@0 427 {"fr", "fr", FRENCH + W10, 0},
michael@0 428 {"fra", "fr", FRENCH + W10, 0},
michael@0 429 {"fre", "fr", FRENCH + W10, 0},
michael@0 430 {"fy", "fy", FRISIAN + W10, 0},
michael@0 431
michael@0 432 {"ga", "ga,gl", IRISH + W10, GALICIAN + W10}, // 1:2 Irish, Galician
michael@0 433 {"gae", "gd,ga", SCOTS_GAELIC + W10, IRISH + W10}, // 1:2 Gaelic, either
michael@0 434 {"gal", "gl", GALICIAN + W10, 0},
michael@0 435 {"gb", "zh", CHINESE + W10, 0}, // GB2312 encoding
michael@0 436 {"gbk", "zh", CHINESE + W10, 0}, // GBK encoding
michael@0 437 {"gd", "gd", SCOTS_GAELIC + W10, 0},
michael@0 438 {"ge", "ka", GEORGIAN + W10, 0}, // Georgia
michael@0 439 {"geo", "ka", GEORGIAN + W10, 0},
michael@0 440 {"ger", "de", GERMAN + W10, 0},
michael@0 441 {"gl", "gl", GALICIAN + W10, 0}, // Also Greenland; hard to confuse
michael@0 442 {"gn", "gn", GUARANI + W10, 0},
michael@0 443 {"gr", "el", GREEK + W10, 0}, // Greece
michael@0 444 {"gu", "gu", GUJARATI + W10, 0},
michael@0 445 {"gv", "gv", MANX + W10, 0},
michael@0 446
michael@0 447 {"ha", "ha", HAUSA + W10, 0},
michael@0 448 {"hat", "ht", HAITIAN_CREOLE + W10, 0}, // Haiti
michael@0 449 {"haw", "haw", HAWAIIAN + W10, 0},
michael@0 450 {"hb", "iw", HEBREW + W10, 0},
michael@0 451 {"he", "iw", HEBREW + W10, 0},
michael@0 452 {"heb", "iw", HEBREW + W10, 0},
michael@0 453 {"hi", "hi", HINDI + W10, MARATHI - W4},
michael@0 454 {"hk", "zhT", CHINESE_T + W10, 0}, // Hong Kong
michael@0 455 {"hr", "hr", CROATIAN + W10, 0},
michael@0 456 {"ht", "ht", HAITIAN_CREOLE + W10, 0},
michael@0 457 {"hu", "hu", HUNGARIAN + W10, 0},
michael@0 458 {"hun", "hu", HUNGARIAN + W10, 0},
michael@0 459 {"hy", "hy", ARMENIAN + W10, 0},
michael@0 460
michael@0 461 {"ia", "ia", INTERLINGUA + W10, 0},
michael@0 462 {"ice", "is", ICELANDIC + W10, FAROESE - W4}, // Iceland
michael@0 463 {"id", "id", INDONESIAN + W10, MALAY - W4},
michael@0 464 {"ids", "id", INDONESIAN + W10, MALAY - W4},
michael@0 465 {"ie", "ie", INTERLINGUE + W10, 0},
michael@0 466 {"ig", "ig", IGBO + W10, 0},
michael@0 467 // 1:2 iu-Cans ik-Latn
michael@0 468 {"ik", "ik,iu", INUPIAK + W10, INUKTITUT + W10}, // 1:2
michael@0 469 {"in", "id", INDONESIAN + W10, MALAY - W4},
michael@0 470 {"ind", "id", INDONESIAN + W10, MALAY - W4}, // Indonesia
michael@0 471 {"inu", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2
michael@0 472 {"is", "is", ICELANDIC + W10, FAROESE - W4},
michael@0 473 {"it", "it", ITALIAN + W10, 0},
michael@0 474 {"ita", "it", ITALIAN + W10, 0},
michael@0 475 {"iu", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2
michael@0 476 {"iw", "iw", HEBREW + W10, 0},
michael@0 477
michael@0 478 {"ja", "ja", JAPANESE + W10, 0},
michael@0 479 {"jp", "ja", JAPANESE + W10, 0}, // Japan
michael@0 480 {"jpn", "ja", JAPANESE + W10, 0},
michael@0 481 {"jv", "jw", JAVANESE + W10, 0},
michael@0 482 {"jw", "jw", JAVANESE + W10, 0},
michael@0 483
michael@0 484 {"ka", "ka", GEORGIAN + W10, 0},
michael@0 485 {"kc", "qu", QUECHUA + W10, 0}, // (K)Quechua
michael@0 486 {"kg", "ky", KYRGYZ + W10, 0}, // Kyrgyzstan
michael@0 487 {"kh", "km", KHMER + W10, 0}, // Country code Khmer (Cambodia)
michael@0 488 {"kha", "kha", KHASI + W10, 0},
michael@0 489 {"kk", "kk", KAZAKH + W10, 0}, // Kazakh
michael@0 490 {"kl", "kl", GREENLANDIC + W10, 0},
michael@0 491 {"km", "km", KHMER + W10, 0},
michael@0 492 {"kn", "kn", KANNADA + W10, 0},
michael@0 493 {"ko", "ko", KOREAN + W10, 0},
michael@0 494 {"kor", "ko", KOREAN + W10, 0},
michael@0 495 {"kr", "ko", KOREAN + W10, 0}, // Country code Korea
michael@0 496 {"ks", "ks", KASHMIRI + W10, 0},
michael@0 497 {"ksc", "ko", KOREAN + W10, 0}, // KSC encoding
michael@0 498 {"ku", "ku", KURDISH + W10, 0},
michael@0 499 {"ky", "ky", KYRGYZ + W10, 0},
michael@0 500 {"kz", "kk", KAZAKH + W10, 0}, // Kazakhstan
michael@0 501 {"la", "la", LATIN + W10, 0},
michael@0 502 {"lao", "lo", LAOTHIAN + W10, 0}, // Laos
michael@0 503
michael@0 504 {"lb", "lb", LUXEMBOURGISH + W10, 0},
michael@0 505 {"lg", "lg", GANDA + W10, 0},
michael@0 506 {"lit", "lt", LITHUANIAN + W10, 0},
michael@0 507 {"ln", "ln", LINGALA + W10, 0},
michael@0 508 {"lo", "lo", LAOTHIAN + W10, 0},
michael@0 509 {"lt", "lt", LITHUANIAN + W10, 0},
michael@0 510 {"ltu", "lt", LITHUANIAN + W10, 0},
michael@0 511 {"lv", "lv", LATVIAN + W10, 0},
michael@0 512
michael@0 513 {"mfe", "mfe", MAURITIAN_CREOLE + W10, 0},
michael@0 514 {"mg", "mg", MALAGASY + W10, 0},
michael@0 515 {"mi", "mi", MAORI + W10, 0},
michael@0 516 {"mk", "mk", MACEDONIAN + W10, 0},
michael@0 517 {"ml", "ml", MALAYALAM + W10, 0},
michael@0 518 {"mn", "mn", MONGOLIAN + W10, 0},
michael@0 519 {"mo", "mo", ROMANIAN + W10, 0}, // Moldavian code => Romanian
michael@0 520 {"mon", "mn", MONGOLIAN + W10, 0}, // Mongolian
michael@0 521 {"mr", "mr", MARATHI + W10, HINDI - W4},
michael@0 522 {"ms", "ms", MALAY + W10, INDONESIAN - W4},
michael@0 523 {"mt", "mt", MALTESE + W10, 0},
michael@0 524 {"mx", "es", SPANISH + W10, 0}, // Mexico
michael@0 525 {"my", "my,ms", BURMESE + W10, MALAY + W10}, // Myanmar, Malaysia
michael@0 526
michael@0 527 {"na", "na", NAURU + W10, 0},
michael@0 528 {"nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
michael@0 529 {"ne", "ne", NEPALI + W10, 0},
michael@0 530 {"nl", "nl", DUTCH + W10, 0},
michael@0 531 {"nn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},
michael@0 532 {"no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
michael@0 533 {"nr", "nr", NDEBELE + W10, 0},
michael@0 534 {"nso", "nso", PEDI + W10, 0},
michael@0 535 {"ny", "ny", NYANJA + W10, 0},
michael@0 536
michael@0 537 {"oc", "oc", OCCITAN + W10, 0},
michael@0 538 {"om", "om", OROMO + W10, 0},
michael@0 539 {"or", "or", ORIYA + W10, 0},
michael@0 540
michael@0 541 {"pa", "pa,ps", PUNJABI + W10, PASHTO + W10}, // 1:2 pa-Guru ps-Arab
michael@0 542 {"per", "fa", PERSIAN + W10, 0},
michael@0 543 {"ph", "tl", TAGALOG + W10, 0}, // Philippines
michael@0 544 {"pk", "ur", URDU + W10, 0}, // Pakistan
michael@0 545 {"pl", "pl", POLISH + W10, 0},
michael@0 546 {"pnb", "pa", PUNJABI + W10, 0}, // Western Punjabi
michael@0 547 {"pol", "pl", POLISH + W10, 0},
michael@0 548 {"por", "pt", PORTUGUESE + W10, 0},
michael@0 549 {"ps", "ps", PASHTO + W10, 0},
michael@0 550 {"pt", "pt", PORTUGUESE + W10, 0},
michael@0 551 {"ptg", "pt", PORTUGUESE + W10, 0},
michael@0 552 {"qc", "fr", FRENCH + W10, 0}, // Quebec "country" code
michael@0 553 {"qu", "qu", QUECHUA + W10, 0},
michael@0 554
michael@0 555 {"rm", "rm", RHAETO_ROMANCE + W10, 0},
michael@0 556 {"rn", "rn", RUNDI + W10, 0},
michael@0 557 {"ro", "ro", ROMANIAN + W10, 0},
michael@0 558 {"rs", "sr", SERBIAN + W10, 0}, // Serbia country code
michael@0 559 {"ru", "ru", RUSSIAN + W10, 0},
michael@0 560 {"rus", "ru", RUSSIAN + W10, 0},
michael@0 561 {"rw", "rw", KINYARWANDA + W10, 0},
michael@0 562
michael@0 563 {"sa", "sa", SANSKRIT + W10, 0},
michael@0 564 {"sco", "sco", SCOTS + W10, ENGLISH - W4},
michael@0 565 {"sd", "sd", SINDHI + W10, 0},
michael@0 566 {"se", "sv", SWEDISH + W10, 0},
michael@0 567 {"sg", "sg", SANGO + W10, 0},
michael@0 568 {"si", "si,sl", SINHALESE + W10, SLOVENIAN + W10}, // 1:2 Sinhalese, Slovenia
michael@0 569 {"sk", "sk", SLOVAK + W10, CZECH - W4},
michael@0 570 {"sl", "sl", SLOVENIAN + W10, 0},
michael@0 571 {"slo", "sl", SLOVENIAN + W10, 0},
michael@0 572 {"sm", "sm", SAMOAN + W10, 0},
michael@0 573 {"sn", "sn", SHONA + W10, 0},
michael@0 574 {"so", "so", SOMALI + W10, 0},
michael@0 575 {"sp", "es", SPANISH + W10, 0},
michael@0 576 {"sq", "sq", ALBANIAN + W10, 0},
michael@0 577 {"sr", "sr", SERBIAN + W10, 0},
michael@0 578 {"srb", "sr", SERBIAN + W10, 0},
michael@0 579 {"srl", "sr", SERBIAN + W10, 0}, // Serbian Latin
michael@0 580 {"srp", "sr", SERBIAN + W10, 0},
michael@0 581 {"ss", "ss", SISWANT + W10, 0},
michael@0 582 {"st", "st", SESOTHO + W10, 0},
michael@0 583 {"su", "su", SUNDANESE + W10, 0},
michael@0 584 {"sv", "sv", SWEDISH + W10, 0},
michael@0 585 {"sve", "sv", SWEDISH + W10, 0},
michael@0 586 {"sw", "sw", SWAHILI + W10, 0},
michael@0 587 {"swe", "sv", SWEDISH + W10, 0},
michael@0 588 {"sy", "syr", SYRIAC + W10, 0},
michael@0 589 {"syr", "syr", SYRIAC + W10, 0},
michael@0 590
michael@0 591 {"ta", "ta", TAMIL + W10, 0},
michael@0 592 {"te", "te", TELUGU + W10, 0},
michael@0 593 {"tg", "tg", TAJIK + W10, 0},
michael@0 594 {"th", "th", THAI + W10, 0},
michael@0 595 {"ti", "ti,bo", TIGRINYA + W10, TIBETAN + W10}, // 1:2 Tigrinya, Tibet
michael@0 596 {"tj", "tg", TAJIK + W10, 0}, // Tajikistan
michael@0 597 {"tk", "tk", TURKMEN + W10, 0},
michael@0 598 {"tl", "tl", TAGALOG + W10, 0},
michael@0 599 {"tlh", "tlh", X_KLINGON + W10, 0},
michael@0 600 {"tn", "tn", TSWANA + W10, 0},
michael@0 601 {"to", "to", TONGA + W10, 0},
michael@0 602 {"tr", "tr", TURKISH + W10, 0},
michael@0 603 {"ts", "ts", TSONGA + W10, 0},
michael@0 604 {"tt", "tt", TATAR + W10, 0},
michael@0 605 {"tw", "ak,zhT", AKAN + W10, CHINESE_T + W10}, // 1:2 Twi => Akan, Taiwan
michael@0 606 {"twi", "ak", AKAN + W10, 0}, // Twi => Akan
michael@0 607
michael@0 608 {"ua", "uk", UKRAINIAN + W10, 0}, // Ukraine
michael@0 609 {"ug", "ug", UIGHUR + W10, 0},
michael@0 610 {"uk", "uk", UKRAINIAN + W10, 0},
michael@0 611 {"ur", "ur", URDU + W10, 0},
michael@0 612 {"uz", "uz", UZBEK + W10, 0},
michael@0 613
michael@0 614 {"va", "ca", CATALAN + W10, 0}, // Valencia => Catalan
michael@0 615 {"val", "ca", CATALAN + W10, 0}, // Valencia => Catalan
michael@0 616 {"ve", "ve", VENDA + W10, 0},
michael@0 617 {"vi", "vi", VIETNAMESE + W10, 0},
michael@0 618 {"vie", "vi", VIETNAMESE + W10, 0},
michael@0 619 {"vn", "vi", VIETNAMESE + W10, 0},
michael@0 620 {"vo", "vo", VOLAPUK + W10, 0},
michael@0 621
michael@0 622 {"wo", "wo", WOLOF + W10, 0},
michael@0 623
michael@0 624 {"xh", "xh", XHOSA + W10, ZULU - W4},
michael@0 625 {"xho", "xh", XHOSA + W10, ZULU - W4},
michael@0 626
michael@0 627 {"yi", "yi", YIDDISH + W10, 0},
michael@0 628 {"yo", "yo", YORUBA + W10, 0},
michael@0 629
michael@0 630 {"za", "za", ZHUANG + W10, 0},
michael@0 631 {"zh", "zh", CHINESE + W10, 0},
michael@0 632 {"zht", "zhT", CHINESE_T + W10, 0},
michael@0 633 {"zu", "zu", ZULU + W10, XHOSA - W4},
michael@0 634 };
michael@0 635
michael@0 636
michael@0 637 // Possibly map to tl:
michael@0 638 // -LangTags tl-Latn /7val.com/ ,bcl 2 Central Bicolano
michael@0 639 // -LangTags tl-Latn /7val.com/ ,ceb 6 Cebuano
michael@0 640 // -LangTags tl-Latn /7val.com/ ,war 1 Waray
michael@0 641
michael@0 642
michael@0 643
michael@0 644 // Table to look up country TLD (no general TLD)
michael@0 645 // In alphabetical order for binary search
michael@0 646 static const int kCLDTable3Size = 181;
michael@0 647 static const TLDLookup kCLDTLDHintTable[kCLDTable3Size] = {
michael@0 648 {"ac", JAPANESE + W2, 0},
michael@0 649 {"ad", CATALAN + W4, 0},
michael@0 650 {"ae", ARABIC + W4, 0},
michael@0 651 {"af", PASHTO + W4, PERSIAN + W4},
michael@0 652 {"ag", GERMAN + W2, 0}, // meager
michael@0 653 // {"ai", 0, 0}, // meager
michael@0 654 {"al", ALBANIAN + W4, 0},
michael@0 655 {"am", ARMENIAN + W4, 0},
michael@0 656 {"an", DUTCH + W4, 0}, // meager
michael@0 657 {"ao", PORTUGUESE + W4, 0},
michael@0 658 // {"aq", 0, 0}, // meager
michael@0 659 {"ar", SPANISH + W4, 0},
michael@0 660 // {"as", 0, 0},
michael@0 661 {"at", GERMAN + W4, 0},
michael@0 662 {"au", ENGLISH + W2, 0},
michael@0 663 {"aw", DUTCH + W4, 0},
michael@0 664 {"ax", SWEDISH + W4, 0},
michael@0 665 {"az", AZERBAIJANI + W4, 0},
michael@0 666
michael@0 667 {"ba", BOSNIAN + W8, CROATIAN - W4},
michael@0 668 // {"bb", 0, 0},
michael@0 669 {"bd", BENGALI + W4, 0},
michael@0 670 {"be", DUTCH + W4, FRENCH + W4},
michael@0 671 {"bf", FRENCH + W4, 0},
michael@0 672 {"bg", BULGARIAN + W4, 0},
michael@0 673 {"bh", ARABIC + W4, 0},
michael@0 674 {"bi", RUNDI + W4, FRENCH + W4},
michael@0 675 {"bj", FRENCH + W4, 0},
michael@0 676 {"bm", ENGLISH + W2, 0},
michael@0 677 {"bn", MALAY + W4, INDONESIAN - W4},
michael@0 678 {"bo", SPANISH + W4, AYMARA + W2}, // and GUARANI QUECHUA
michael@0 679 {"br", PORTUGUESE + W4, 0},
michael@0 680 // {"bs", 0, 0},
michael@0 681 {"bt", DZONGKHA + W10, TIBETAN - W10}, // Strong presumption of Dzongha
michael@0 682 {"bw", TSWANA + W4, 0},
michael@0 683 {"by", BELARUSIAN + W4, 0},
michael@0 684 // {"bz", 0, 0},
michael@0 685
michael@0 686 {"ca", FRENCH + W4, ENGLISH + W2},
michael@0 687 {"cat", CATALAN + W4, 0},
michael@0 688 {"cc", 0, 0},
michael@0 689 {"cd", FRENCH + W4, 0},
michael@0 690 {"cf", FRENCH + W4, 0},
michael@0 691 {"cg", FRENCH + W4, 0},
michael@0 692 {"ch", GERMAN + W4, FRENCH + W4},
michael@0 693 {"ci", FRENCH + W4, 0},
michael@0 694 // {"ck", 0, 0},
michael@0 695 {"cl", SPANISH + W4, 0},
michael@0 696 {"cm", FRENCH + W4, 0},
michael@0 697 {"cn", CHINESE + W4, 0},
michael@0 698 {"co", SPANISH + W4, 0},
michael@0 699 {"cr", SPANISH + W4, 0},
michael@0 700 {"cu", SPANISH + W4, 0},
michael@0 701 {"cv", PORTUGUESE + W4, 0},
michael@0 702 // {"cx", 0, 0},
michael@0 703 {"cy", GREEK + W4, TURKISH + W4},
michael@0 704 {"cz", CZECH + W4, SLOVAK - W4},
michael@0 705
michael@0 706 {"de", GERMAN + W4, 0},
michael@0 707 {"dj", 0, 0},
michael@0 708 {"dk", DANISH + W4, NORWEGIAN - W4},
michael@0 709 {"dm", 0, 0},
michael@0 710 {"do", SPANISH + W4, 0},
michael@0 711 {"dz", FRENCH + W4, ARABIC + W4},
michael@0 712
michael@0 713 {"ec", SPANISH + W4, 0},
michael@0 714 {"ee", ESTONIAN + W4, 0},
michael@0 715 {"eg", ARABIC + W4, 0},
michael@0 716 {"er", AFAR + W4, 0},
michael@0 717 {"es", SPANISH + W4, 0},
michael@0 718 {"et", AMHARIC + W4, AFAR + W4},
michael@0 719
michael@0 720 {"fi", FINNISH + W4, 0},
michael@0 721 {"fj", FIJIAN + W4, 0},
michael@0 722 // {"fk", 0, 0},
michael@0 723 // {"fm", 0, 0},
michael@0 724 {"fo", FAROESE + W4, ICELANDIC - W4},
michael@0 725 {"fr", FRENCH + W4, 0},
michael@0 726
michael@0 727 {"ga", FRENCH + W4, 0},
michael@0 728 {"gd", 0, 0},
michael@0 729 {"ge", GEORGIAN + W4, 0},
michael@0 730 {"gf", FRENCH + W4, 0},
michael@0 731 // {"gg", 0, 0},
michael@0 732 // {"gh", 0, 0},
michael@0 733 // {"gi", 0, 0},
michael@0 734 {"gl", GREENLANDIC + W4, DANISH + W4},
michael@0 735 // {"gm", 0, 0},
michael@0 736 {"gn", FRENCH + W4, 0},
michael@0 737 // {"gp", 0, 0},
michael@0 738 // {"gq", 0, 0},
michael@0 739 {"gr", GREEK + W4, 0},
michael@0 740 // {"gs", 0, 0},
michael@0 741 {"gt", SPANISH + W4, 0},
michael@0 742 // {"gu", 0, 0},
michael@0 743 // {"gy", 0, 0},
michael@0 744
michael@0 745 {"hk", CHINESE_T + W4, 0},
michael@0 746 // {"hm", 0, 0},
michael@0 747 {"hn", SPANISH + W4, 0},
michael@0 748 {"hr", CROATIAN + W8, BOSNIAN - W4},
michael@0 749 {"ht", HAITIAN_CREOLE + W4, FRENCH + W4},
michael@0 750 {"hu", HUNGARIAN + W4, 0},
michael@0 751
michael@0 752 {"id", INDONESIAN + W4, MALAY - W4},
michael@0 753 {"ie", IRISH + W4, 0},
michael@0 754 {"il", HEBREW + W4, 0},
michael@0 755 {"im", MANX + W4, 0},
michael@0 756 // {"in", 0, 0},
michael@0 757 // {"io", 0, 0},
michael@0 758 {"iq", ARABIC + W4, 0},
michael@0 759 {"ir", PERSIAN + W4, 0},
michael@0 760 {"is", ICELANDIC + W4, FAROESE - W4},
michael@0 761 {"it", ITALIAN + W4, 0},
michael@0 762
michael@0 763 // {"je", 0, 0},
michael@0 764 // {"jm", 0, 0},
michael@0 765 {"jo", ARABIC + W4, 0},
michael@0 766 {"jp", JAPANESE + W4, 0},
michael@0 767
michael@0 768 // {"ke", 0, 0},
michael@0 769 {"kg", KYRGYZ + W4, 0},
michael@0 770 {"kh", KHMER + W4, 0},
michael@0 771 // {"ki", 0, 0},
michael@0 772 {"km", FRENCH + W4, 0},
michael@0 773 // {"kn", 0, 0},
michael@0 774 {"kp", KOREAN + W4, 0},
michael@0 775 {"kr", KOREAN + W4, 0},
michael@0 776 {"kw", ARABIC + W4, 0},
michael@0 777 // {"ky", 0, 0},
michael@0 778 {"kz", KAZAKH + W4, 0},
michael@0 779
michael@0 780 {"la", LAOTHIAN + W4, 0},
michael@0 781 {"lb", ARABIC + W4, FRENCH + W4},
michael@0 782 // {"lc", 0, 0},
michael@0 783 {"li", GERMAN + W4, 0},
michael@0 784 {"lk", SINHALESE + W4, 0},
michael@0 785 // {"lr", 0, 0},
michael@0 786 {"ls", SESOTHO + W4, 0},
michael@0 787 {"lt", LITHUANIAN + W4, 0},
michael@0 788 {"lu", LUXEMBOURGISH + W4},
michael@0 789 {"lv", LATVIAN + W4, 0},
michael@0 790 {"ly", ARABIC + W4, 0},
michael@0 791
michael@0 792 {"ma", FRENCH + W4, 0},
michael@0 793 {"mc", FRENCH + W4, 0},
michael@0 794 {"md", ROMANIAN + W4, 0},
michael@0 795 {"me", MONTENEGRIN + W8, SERBIAN - W4},
michael@0 796 {"mg", FRENCH + W4, 0},
michael@0 797 {"mk", MACEDONIAN + W4, 0},
michael@0 798 {"ml", FRENCH + W4, 0},
michael@0 799 {"mm", BURMESE + W4, 0},
michael@0 800 {"mn", MONGOLIAN + W4, 0},
michael@0 801 {"mo", CHINESE_T + W4, PORTUGUESE + W4},
michael@0 802 // {"mp", 0, 0},
michael@0 803 {"mq", FRENCH + W4, 0},
michael@0 804 {"mr", FRENCH + W4, ARABIC + W4},
michael@0 805 // {"ms", 0, 0},
michael@0 806 {"mt", MALTESE + W4, 0},
michael@0 807 // {"mu", 0, 0},
michael@0 808 {"mv", DHIVEHI + W4, 0},
michael@0 809 // {"mw", 0, 0},
michael@0 810 {"mx", SPANISH + W4, 0},
michael@0 811 {"my", MALAY + W4, INDONESIAN - W4},
michael@0 812 {"mz", PORTUGUESE + W4, 0},
michael@0 813
michael@0 814 {"na", 0, 0}, // Namibia
michael@0 815 {"nc", FRENCH + W4, 0},
michael@0 816 {"ne", FRENCH + W4, 0},
michael@0 817 {"nf", FRENCH + W4, 0},
michael@0 818 // {"ng", 0, 0},
michael@0 819 {"ni", SPANISH + W4, 0},
michael@0 820 {"nl", DUTCH + W4, 0},
michael@0 821 {"no", NORWEGIAN + W4, NORWEGIAN_N + W2},
michael@0 822 {"np", NEPALI + W4, 0},
michael@0 823 {"nr", NAURU + W4, 0},
michael@0 824 {"nu", SWEDISH + W4, 0},
michael@0 825 {"nz", MAORI + W4, ENGLISH + W2},
michael@0 826
michael@0 827 {"om", ARABIC + W4, 0},
michael@0 828
michael@0 829 {"pa", SPANISH + W4, 0},
michael@0 830 {"pe", SPANISH + W4, QUECHUA + W2}, // also AYMARA
michael@0 831 {"pf", FRENCH + W4, 0},
michael@0 832 // {"pg", 0, 0},
michael@0 833 {"ph", TAGALOG + W4, 0},
michael@0 834 {"pk", URDU + W4, 0},
michael@0 835 {"pl", POLISH + W4, 0},
michael@0 836 // {"pn", 0, 0},
michael@0 837 {"pr", SPANISH + W4, 0},
michael@0 838 {"ps", ARABIC + W4, 0},
michael@0 839 {"pt", PORTUGUESE + W4, 0},
michael@0 840 {"py", SPANISH + W4, GUARANI + W2},
michael@0 841
michael@0 842 {"qa", ARABIC + W4, 0},
michael@0 843
michael@0 844 {"re", FRENCH + W4, 0},
michael@0 845 {"ro", ROMANIAN + W4, 0},
michael@0 846 {"rs", SERBIAN + W8, MONTENEGRIN - W4},
michael@0 847 {"ru", RUSSIAN + W4, 0},
michael@0 848 {"rw", KINYARWANDA + W4, FRENCH + W2},
michael@0 849
michael@0 850 {"sa", ARABIC + W4, 0},
michael@0 851 // {"sb", 0, 0},
michael@0 852 {"sc", SESELWA + W4, 0},
michael@0 853 {"sd", ARABIC + W4, 0},
michael@0 854 {"se", SWEDISH + W4, 0},
michael@0 855 // {"sg", 0, 0},
michael@0 856 // {"sh", 0, 0},
michael@0 857 {"si", SLOVENIAN + W4, 0},
michael@0 858 {"sk", SLOVAK + W4, CZECH - W4},
michael@0 859 // {"sl", 0, 0},
michael@0 860 {"sm", ITALIAN + W4, 0},
michael@0 861 {"sn", FRENCH + W4, 0},
michael@0 862 // {"sr", 0, 0},
michael@0 863 {"ss", ARABIC + W4, 0}, // Presumed South Sudan TLD. dsites 2011.07.07
michael@0 864 // {"st", 0, 0},
michael@0 865 {"su", RUSSIAN + W4, 0},
michael@0 866 {"sv", SPANISH + W4, 0},
michael@0 867 {"sy", ARABIC + W4, 0},
michael@0 868 // {"sz", 0, 0},
michael@0 869
michael@0 870 // {"tc", 0, 0},
michael@0 871 {"td", FRENCH + W4, 0},
michael@0 872 // {"tf", 0, 0},
michael@0 873 {"tg", FRENCH + W4, 0},
michael@0 874 {"th", THAI + W4, 0},
michael@0 875 // Tibet has no country code (see .cn)
michael@0 876 {"tj", TAJIK + W4, 0},
michael@0 877 // {"tk", 0, 0},
michael@0 878 // {"tl", 0, 0},
michael@0 879 {"tm", TURKISH + W4, 0},
michael@0 880 {"tn", FRENCH + W4, ARABIC + W4},
michael@0 881 // {"to", 0, 0},
michael@0 882 {"tp", JAPANESE + W4, 0},
michael@0 883 {"tr", TURKISH + W4, 0},
michael@0 884 // {"tt", 0, 0},
michael@0 885 // {"tv", 0, 0},
michael@0 886 {"tw", CHINESE_T + W4, 0},
michael@0 887 {"tz", SWAHILI + W4, AKAN + W4},
michael@0 888
michael@0 889 {"ua", UKRAINIAN + W4, 0},
michael@0 890 {"ug", GANDA + W4, 0},
michael@0 891 {"uk", ENGLISH + W2, 0},
michael@0 892 {"us", ENGLISH + W2, 0},
michael@0 893 {"uy", SPANISH + W4, 0},
michael@0 894 {"uz", UZBEK + W4, 0},
michael@0 895
michael@0 896 {"va", ITALIAN + W4, LATIN + W2},
michael@0 897 // {"vc", 0, 0},
michael@0 898 {"ve", SPANISH + W4, 0},
michael@0 899 // {"vg", 0, 0},
michael@0 900 // {"vi", 0, 0},
michael@0 901 {"vn", VIETNAMESE + W4, 0},
michael@0 902 // {"vu", 0, 0},
michael@0 903
michael@0 904 {"wf", FRENCH + W4, 0},
michael@0 905 // {"ws", 0, 0},
michael@0 906
michael@0 907 {"ye", ARABIC + W4, 0},
michael@0 908
michael@0 909 {"za", AFRIKAANS + W4, 0},
michael@0 910 // {"zm", 0, 0},
michael@0 911 // {"zw", 0, 0},
michael@0 912 };
michael@0 913
// The Wn weight macros are used by the hint tables above; undefine them here
// so they do not stay visible in the rest of the file.
#undef W2
#undef W4
#undef W6
#undef W8
#undef W10
#undef W12
michael@0 920
michael@0 921
michael@0 922
michael@0 923
michael@0 924
michael@0 925 inline void SetCLDPriorWeight(int w, OneCLDLangPrior* olp) {
michael@0 926 *olp = (*olp & 0x3ff) + (w << 10);
michael@0 927 }
michael@0 928 inline void SetCLDPriorLang(Language lang, OneCLDLangPrior* olp) {
michael@0 929 *olp = (*olp & ~0x3ff) + lang;
michael@0 930 }
michael@0 931
michael@0 932 OneCLDLangPrior PackCLDPriorLangWeight(Language lang, int w) {
michael@0 933 return (w << 10) + lang;
michael@0 934 }
michael@0 935
// Return the larger of a and b (a when they are equal).
inline int MaxInt(int a, int b) {
  return (a >= b) ? a : b;
}
michael@0 939
michael@0 940 // Merge in another language prior, taking max if already there
michael@0 941 void MergeCLDLangPriorsMax(OneCLDLangPrior olp, CLDLangPriors* lps) {
michael@0 942 if (olp == 0) {return;}
michael@0 943 Language target_lang = GetCLDPriorLang(olp);
michael@0 944 for (int i = 0; i < lps->n; ++i) {
michael@0 945 if (GetCLDPriorLang(lps->prior[i]) == target_lang) {
michael@0 946 int new_weight = MaxInt(GetCLDPriorWeight(lps->prior[i]),
michael@0 947 GetCLDPriorWeight(olp));
michael@0 948 SetCLDPriorWeight(new_weight, &lps->prior[i]);
michael@0 949 return;
michael@0 950 }
michael@0 951 }
michael@0 952 // Not found; add it if room
michael@0 953 if (lps->n >= kMaxOneCLDLangPrior) {return;}
michael@0 954 lps->prior[lps->n++] = olp;
michael@0 955 }
michael@0 956
michael@0 957 // Merge in another language prior, boosting 10x if already there
michael@0 958 void MergeCLDLangPriorsBoost(OneCLDLangPrior olp, CLDLangPriors* lps) {
michael@0 959 if (olp == 0) {return;}
michael@0 960 Language target_lang = GetCLDPriorLang(olp);
michael@0 961 for (int i = 0; i < lps->n; ++i) {
michael@0 962 if (GetCLDPriorLang(lps->prior[i]) == target_lang) {
michael@0 963 int new_weight = GetCLDPriorWeight(lps->prior[i]) + 2;
michael@0 964 SetCLDPriorWeight(new_weight, &lps->prior[i]);
michael@0 965 return;
michael@0 966 }
michael@0 967 }
michael@0 968 // Not found; add it if room
michael@0 969 if (lps->n >= kMaxOneCLDLangPrior) {return;}
michael@0 970 lps->prior[lps->n++] = olp;
michael@0 971 }
michael@0 972
michael@0 973
michael@0 974 // Trim language priors to no more than max_entries, keeping largest abs weights
michael@0 975 void TrimCLDLangPriors(int max_entries, CLDLangPriors* lps) {
michael@0 976 if (lps->n <= max_entries) {return;}
michael@0 977
michael@0 978 // Insertion sort in-place by abs(weight)
michael@0 979 for (int i = 0; i < lps->n; ++i) {
michael@0 980 OneCLDLangPrior temp_olp = lps->prior[i];
michael@0 981 int w = abs(GetCLDPriorWeight(temp_olp));
michael@0 982 int kk = i;
michael@0 983 for (; kk > 0; --kk) {
michael@0 984 if (abs(GetCLDPriorWeight(lps->prior[kk - 1])) < w) {
michael@0 985 // Move down and continue
michael@0 986 lps->prior[kk] = lps->prior[kk - 1];
michael@0 987 } else {
michael@0 988 // abs(weight[kk - 1]) >= w, time to stop
michael@0 989 break;
michael@0 990 }
michael@0 991 }
michael@0 992 lps->prior[kk] = temp_olp;
michael@0 993 }
michael@0 994
michael@0 995 lps->n = max_entries;
michael@0 996 }
michael@0 997
// Return the number of ',' characters in langtags (0 for an empty string).
int CountCommas(const std::string& langtags) {
  int commas = 0;
  // Index with the string's own size type so the length is never narrowed
  for (std::string::size_type i = 0; i < langtags.size(); ++i) {
    if (langtags[i] == ',') {++commas;}
  }
  return commas;
}
michael@0 1005
michael@0 1006 // Binary lookup on language tag
michael@0 1007 const LangTagLookup* DoLangTagLookup(const char* key,
michael@0 1008 const LangTagLookup* tbl, int tbl_size) {
michael@0 1009 // Key is always in range [lo..hi)
michael@0 1010 int lo = 0;
michael@0 1011 int hi = tbl_size;
michael@0 1012 while (lo < hi) {
michael@0 1013 int mid = (lo + hi) >> 1;
michael@0 1014 int comp = strcmp(tbl[mid].langtag, key);
michael@0 1015 if (comp < 0) {
michael@0 1016 lo = mid + 1;
michael@0 1017 } else if (comp > 0) {
michael@0 1018 hi = mid;
michael@0 1019 } else {
michael@0 1020 return &tbl[mid];
michael@0 1021 }
michael@0 1022 }
michael@0 1023 return NULL;
michael@0 1024 }
michael@0 1025
michael@0 1026 // Binary lookup on tld
michael@0 1027 const TLDLookup* DoTLDLookup(const char* key,
michael@0 1028 const TLDLookup* tbl, int tbl_size) {
michael@0 1029 // Key is always in range [lo..hi)
michael@0 1030 int lo = 0;
michael@0 1031 int hi = tbl_size;
michael@0 1032 while (lo < hi) {
michael@0 1033 int mid = (lo + hi) >> 1;
michael@0 1034 int comp = strcmp(tbl[mid].tld, key);
michael@0 1035 if (comp < 0) {
michael@0 1036 lo = mid + 1;
michael@0 1037 } else if (comp > 0) {
michael@0 1038 hi = mid;
michael@0 1039 } else {
michael@0 1040 return &tbl[mid];
michael@0 1041 }
michael@0 1042 }
michael@0 1043 return NULL;
michael@0 1044 }
michael@0 1045
michael@0 1046
michael@0 1047
michael@0 1048 // Trim language tag string to canonical form for each language
michael@0 1049 // Input is from GetLangTagsFromHtml(), already lowercased
michael@0 1050 string TrimCLDLangTagsHint(const string& langtags) {
michael@0 1051 string retval;
michael@0 1052 if (langtags.empty()) {return retval;}
michael@0 1053 int commas = CountCommas(langtags);
michael@0 1054 if (commas > 4) {return retval;} // Ignore if too many language tags
michael@0 1055
michael@0 1056 char temp[20];
michael@0 1057 int pos = 0;
michael@0 1058 while (pos < static_cast<int>(langtags.size())) {
michael@0 1059 int comma = langtags.find(',', pos);
michael@0 1060 if (comma == string::npos) {comma = langtags.size();} // fake trailing comma
michael@0 1061 int len = comma - pos;
michael@0 1062 if (len <= 16) {
michael@0 1063 // Short enough to use
michael@0 1064 memcpy(temp, &langtags[pos], len);
michael@0 1065 temp[len] = '\0';
michael@0 1066 const LangTagLookup* entry = DoLangTagLookup(temp,
michael@0 1067 kCLDLangTagsHintTable1,
michael@0 1068 kCLDTable1Size);
michael@0 1069 if (entry != NULL) {
michael@0 1070 // First table hit
michael@0 1071 retval.append(entry->langcode); // may be "code1,code2"
michael@0 1072 retval.append(1, ',');
michael@0 1073 } else {
michael@0 1074 // Try second table with language code truncated at first hyphen
michael@0 1075 char* hyphen = strchr(temp, '-');
michael@0 1076 if (hyphen != NULL) {*hyphen = '\0';}
michael@0 1077 len = strlen(temp);
michael@0 1078 if (len <= 3) { // Short enough to use
michael@0 1079 entry = DoLangTagLookup(temp,
michael@0 1080 kCLDLangTagsHintTable2,
michael@0 1081 kCLDTable2Size);
michael@0 1082 if (entry != NULL) {
michael@0 1083 // Second table hit
michael@0 1084 retval.append(entry->langcode); // may be "code1,code2"
michael@0 1085 retval.append(1, ',');
michael@0 1086 }
michael@0 1087 }
michael@0 1088 }
michael@0 1089 }
michael@0 1090 pos = comma + 1;
michael@0 1091 }
michael@0 1092
michael@0 1093 // Remove trainling comma, if any
michael@0 1094 if (!retval.empty()) {retval.resize(retval.size() - 1);}
michael@0 1095 return retval;
michael@0 1096 }
michael@0 1097
michael@0 1098
michael@0 1099
//==============================================================================

// Little state machine to scan insides of language attribute quoted-string.
// Each language code is lowercased and copied to the output string. Underscore
// is mapped to minus. Space, tab, and comma are all mapped to comma, and
// multiple consecutive commas are removed.
// Each language code in the output list will be followed by a single comma.

// There are three states, and we start in state 1:
// State 0: After a letter.
//   Copy all letters/minus[0], copy comma[1]; all others copy comma and skip [2]
// State 1: Just after a comma.
//   Copy letter [0], Ignore subsequent commas[1]. minus and all others copy
//   comma and skip [2]
// State 2: Skipping.
//   All characters except comma skip and stay in [2]. comma goes to [1]

// The thing that is copied is kLangCodeRemap[c] when going to state 0,
// and always comma when going to state 1 or 2. The design depends on copying
// a comma at the *beginning* of skipping, and in state 2 never doing a copy.

// We pack all this into 8 bits:
//    +--+---+---+
//    |78|654|321|
//    +--+---+---+
//
// Shift byte right by 3*state, giving [0] 321, [1] 654, [2] .78
// where . is always zero
// Of these 3 bits, low two are next state ss, high bit is copy bit C.
// If C=1 and ss == 0, copy kLangCodeRemap[c], else copy a comma
michael@0 1129
// Per-state action codes (3 bits each): the low two bits are the next state,
// bit 2 (value 4) is the copy flag.
#define SKIP0 0
#define SKIP1 1
#define SKIP2 2
#define COPY0 4     // copy kLangCodeRemap[c]
#define COPY1 5     // copy ','
#define COPY2 6     // copy ','

// These combined actions pack three states into one byte.
// Ninth bit must be zero, so all state 2 values must be skips.
//              state[2]       state[1]       state[0]
#define LTR   ((SKIP2 << 6) + (COPY0 << 3) + COPY0)
#define MINUS ((SKIP2 << 6) + (COPY2 << 3) + COPY0)
#define COMMA ((SKIP1 << 6) + (SKIP1 << 3) + COPY1)
#define Bad   ((SKIP2 << 6) + (COPY2 << 3) + COPY2)
michael@0 1144
michael@0 1145 // Treat as letter: a-z, A-Z
michael@0 1146 // Treat as minus: 2D minus, 5F underscore
michael@0 1147 // Treat as comma: 09 tab, 20 space, 2C comma
michael@0 1148
michael@0 1149 static const unsigned char kLangCodeAction[256] = {
michael@0 1150 Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,COMMA,Bad,Bad,Bad,Bad,Bad,Bad,
michael@0 1151 Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
michael@0 1152 COMMA,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,COMMA,MINUS,Bad,Bad,
michael@0 1153 Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
michael@0 1154
michael@0 1155 Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,
michael@0 1156 LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,Bad,Bad,Bad,Bad,MINUS,
michael@0 1157 Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,
michael@0 1158 LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,Bad,Bad,Bad,Bad,Bad,
michael@0 1159
michael@0 1160 Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
michael@0 1161 Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
michael@0 1162 Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
michael@0 1163 Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
michael@0 1164
michael@0 1165 Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
michael@0 1166 Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
michael@0 1167 Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
michael@0 1168 Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
michael@0 1169 };
michael@0 1170
// Replacement byte for every input byte value, indexed by the byte.
// This does lowercasing, maps underscore to minus, and maps tab/space to comma.
// Letters map to their lowercase form, minus and comma map to themselves, and
// every other byte maps to 0.
static const unsigned char kLangCodeRemap[256] = {
  0,0,0,0,0,0,0,0,  0,',',0,0,0,0,0,0,          // 09 tab
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
  ',',0,0,0,0,0,0,0,  0,0,0,0,',','-',0,0,      // 20 space 2C comma 2D minus
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,

  0,'a','b','c','d','e','f','g',  'h','i','j','k','l','m','n','o',
  'p','q','r','s','t','u','v','w',  'x','y','z',0,0,0,0,'-',  // 5F underscore
  0,'a','b','c','d','e','f','g',  'h','i','j','k','l','m','n','o',
  'p','q','r','s','t','u','v','w',  'x','y','z',0,0,0,0,0,

  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,

  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
};
michael@0 1193
// The action macros are used to build kLangCodeAction above; undefine them
// here so they do not stay visible in the rest of the file.
#undef LTR
#undef MINUS
#undef COMMA
#undef Bad

#undef SKIP0
#undef SKIP1
#undef SKIP2
#undef COPY0
#undef COPY1
#undef COPY2
michael@0 1205
michael@0 1206
michael@0 1207 // Find opening '<' for HTML tag
michael@0 1208 // Note: this is all somewhat insensitive to mismatched quotes
michael@0 1209 int32 FindTagStart(const char* utf8_body, int32 pos, int32 max_pos) {
michael@0 1210 int i = pos;
michael@0 1211 // Advance i by 4 if none of the next 4 bytes are '<'
michael@0 1212 for (i = pos; i < (max_pos - 3); i += 4) {
michael@0 1213 // Fast check for any <
michael@0 1214 const char* p = &utf8_body[i];
michael@0 1215 uint32 s0123 = UNALIGNED_LOAD32(p);
michael@0 1216 uint32 temp = s0123 ^ 0x3c3c3c3c; // <<<<
michael@0 1217 if (((temp - 0x01010101) & (~temp & 0x80808080)) != 0) {
michael@0 1218 // At least one byte is '<'
michael@0 1219 break;
michael@0 1220 }
michael@0 1221 }
michael@0 1222 // Continue, advancing i by 1
michael@0 1223 for (; i < max_pos; ++i) {
michael@0 1224 if (utf8_body[i] == '<') {return i;}
michael@0 1225 }
michael@0 1226 return -1;
michael@0 1227 }
michael@0 1228
michael@0 1229
michael@0 1230 // Find closing '>' for HTML tag. Also stop on < and & (simplistic parsing)
michael@0 1231 int32 FindTagEnd(const char* utf8_body, int32 pos, int32 max_pos) {
michael@0 1232 // Always outside quotes
michael@0 1233 for (int i = pos; i < max_pos; ++i) {
michael@0 1234 char c = utf8_body[i];
michael@0 1235 if (c == '>') {return i;}
michael@0 1236 if (c == '<') {return i - 1;}
michael@0 1237 if (c == '&') {return i - 1;}
michael@0 1238 }
michael@0 1239 return -1; // nothing found
michael@0 1240 }
michael@0 1241
michael@0 1242 // Find opening quote or apostrophe, skipping spaces
michael@0 1243 // Note: this is all somewhat insensitive to mismatched quotes
michael@0 1244 int32 FindQuoteStart(const char* utf8_body, int32 pos, int32 max_pos) {
michael@0 1245 for (int i = pos; i < max_pos; ++i) {
michael@0 1246 char c = utf8_body[i];
michael@0 1247 if (c == '"') {return i;}
michael@0 1248 if (c == '\'') {return i;}
michael@0 1249 if (c != ' ') {return -1;}
michael@0 1250 }
michael@0 1251 return -1;
michael@0 1252 }
michael@0 1253
michael@0 1254 // Find closing quot/apos. Also stop on = > < and & (simplistic parsing)
michael@0 1255 int32 FindQuoteEnd(const char* utf8_body, int32 pos, int32 max_pos) {
michael@0 1256 // Always outside quotes
michael@0 1257 for (int i = pos; i < max_pos; ++i) {
michael@0 1258 char c = utf8_body[i];
michael@0 1259 if (c == '"') {return i;}
michael@0 1260 if (c == '\'') {return i;}
michael@0 1261 if (c == '>') {return i - 1;}
michael@0 1262 if (c == '=') {return i - 1;}
michael@0 1263 if (c == '<') {return i - 1;}
michael@0 1264 if (c == '&') {return i - 1;}
michael@0 1265 }
michael@0 1266 return -1; // nothing found
michael@0 1267 }
michael@0 1268
michael@0 1269 int32 FindEqualSign(const char* utf8_body, int32 pos, int32 max_pos) {
michael@0 1270 // Outside quotes/apostrophes loop
michael@0 1271 for (int i = pos; i < max_pos; ++i) {
michael@0 1272 char c = utf8_body[i];
michael@0 1273 if (c == '=') { // Found bare equal sign inside tag
michael@0 1274 return i;
michael@0 1275 } else if (c == '"') {
michael@0 1276 // Inside quotes loop
michael@0 1277 int j;
michael@0 1278 for (j = i + 1; j < max_pos; ++j) {
michael@0 1279 if (utf8_body[j] == '"') {
michael@0 1280 break;
michael@0 1281 } else if (utf8_body[j] == '\\') {
michael@0 1282 ++j;
michael@0 1283 }
michael@0 1284 }
michael@0 1285 i = j;
michael@0 1286 } else if (c == '\'') {
michael@0 1287 // Inside apostrophes loop
michael@0 1288 int j;
michael@0 1289 for (j = i + 1; j < max_pos; ++j) {
michael@0 1290 if (utf8_body[j] == '\'') {
michael@0 1291 break;
michael@0 1292 } else if (utf8_body[j] == '\\') {
michael@0 1293 ++j;
michael@0 1294 }
michael@0 1295 }
michael@0 1296 i = j;
michael@0 1297 }
michael@0 1298
michael@0 1299 }
michael@0 1300 return -1; // nothing found
michael@0 1301 }
michael@0 1302
michael@0 1303 // Scan backwards for case-insensitive string s in [min_pos..pos)
michael@0 1304 // Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f]
michael@0 1305 // Cheap lowercase. Control codes will masquerade as 20..3f
michael@0 1306 bool FindBefore(const char* utf8_body,
michael@0 1307 int32 min_pos, int32 pos, const char* s) {
michael@0 1308 int len = strlen(s);
michael@0 1309 if ((pos - min_pos) < len) {return false;} // Too small to fit s
michael@0 1310
michael@0 1311 // Skip trailing spaces
michael@0 1312 int i = pos;
michael@0 1313 while ((i > (min_pos + len)) && (utf8_body[i - 1] == ' ')) {--i;}
michael@0 1314 i -= len;
michael@0 1315 if (i < min_pos) {return false;} // pos - min_pos < len, so s can't be found
michael@0 1316
michael@0 1317 const char* p = &utf8_body[i];
michael@0 1318 for (int j = 0; j < len; ++j) {
michael@0 1319 if ((p[j] | 0x20) != s[j]) {return false;} // Unequal byte
michael@0 1320 }
michael@0 1321 return true; // All bytes equal at i
michael@0 1322 }
michael@0 1323
michael@0 1324 // Scan forwards for case-insensitive string s in [pos..max_pos)
michael@0 1325 // Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f]
michael@0 1326 // Cheap lowercase. Control codes will masquerade as 20..3f
michael@0 1327 // Allows but does not require quoted/apostrophe string
michael@0 1328 bool FindAfter(const char* utf8_body,
michael@0 1329 int32 pos, int32 max_pos, const char* s) {
michael@0 1330 int len = strlen(s);
michael@0 1331 if ((max_pos - pos) < len) {return false;} // Too small to fit s
michael@0 1332
michael@0 1333 // Skip leading spaces, quote, apostrophe
michael@0 1334 int i = pos;
michael@0 1335 while (i < (max_pos - len)) {
michael@0 1336 unsigned char c = utf8_body[i];
michael@0 1337 if ((c == ' ') || (c == '"') || (c == '\'')) {++i;}
michael@0 1338 else {break;}
michael@0 1339 }
michael@0 1340
michael@0 1341 const char* p = &utf8_body[i];
michael@0 1342 for (int j = 0; j < len; ++j) {
michael@0 1343 if ((p[j] | 0x20) != s[j]) {return false;} // Unequal byte
michael@0 1344 }
michael@0 1345 return true; // All bytes equal
michael@0 1346 }
michael@0 1347
michael@0 1348
michael@0 1349
michael@0 1350 // Copy attribute value in [pos..max_pos)
michael@0 1351 // pos is just after an opening quote/apostrophe and max_pos is the ending one
michael@0 1352 // String must all be on a single line.
michael@0 1353 // Return slightly-normalized language list, empty or ending in comma
michael@0 1354 // Does lowercasing and removes excess punctuation/space
michael@0 1355 string CopyOneQuotedString(const char* utf8_body,
michael@0 1356 int32 pos, int32 max_pos) {
michael@0 1357 string s;
michael@0 1358 int state = 1; // Front is logically just after a comma
michael@0 1359 for (int i = pos; i < max_pos; ++i) {
michael@0 1360 unsigned char c = utf8_body[i];
michael@0 1361 int e = kLangCodeAction[c] >> (3 * state);
michael@0 1362 state = e & 3; // Update to next state
michael@0 1363 if ((e & 4) != 0) {
michael@0 1364 // Copy a remapped byte if going to state 0, else copy a comma
michael@0 1365 if (state == 0) {
michael@0 1366 s.append(1, kLangCodeRemap[c]);
michael@0 1367 } else {
michael@0 1368 s.append(1, ',');
michael@0 1369 }
michael@0 1370 }
michael@0 1371 }
michael@0 1372
michael@0 1373 // Add final comma if needed
michael@0 1374 if (state == 0) {
michael@0 1375 s.append(1, ',');
michael@0 1376 }
michael@0 1377 return s;
michael@0 1378 }
michael@0 1379
michael@0 1380 // Find and copy attribute value: quoted string in [pos..max_pos)
michael@0 1381 // Return slightly-normalized language list, empty or ending in comma
michael@0 1382 string CopyQuotedString(const char* utf8_body,
michael@0 1383 int32 pos, int32 max_pos) {
michael@0 1384 int32 start_quote = FindQuoteStart(utf8_body, pos, max_pos);
michael@0 1385 if (start_quote < 0) {return string("");}
michael@0 1386 int32 end_quote = FindQuoteEnd(utf8_body, start_quote + 1, max_pos);
michael@0 1387 if (end_quote < 0) {return string("");}
michael@0 1388
michael@0 1389 return CopyOneQuotedString(utf8_body, start_quote + 1, end_quote);
michael@0 1390 }
michael@0 1391
michael@0 1392 // Add hints to vector of langpriors
michael@0 1393 // Input is from GetLangTagsFromHtml(), already lowercased
michael@0 1394 void SetCLDLangTagsHint(const string& langtags, CLDLangPriors* langpriors) {
michael@0 1395 if (langtags.empty()) {return;}
michael@0 1396 int commas = CountCommas(langtags);
michael@0 1397 if (commas > 4) {return;} // Ignore if too many language tags
michael@0 1398
michael@0 1399 char temp[20];
michael@0 1400 int pos = 0;
michael@0 1401 while (pos < static_cast<int>(langtags.size())) {
michael@0 1402 int comma = langtags.find(',', pos);
michael@0 1403 if (comma == string::npos) {comma = langtags.size();} // fake trailing comma
michael@0 1404 int len = comma - pos;
michael@0 1405 if (len <= 16) {
michael@0 1406 // Short enough to use
michael@0 1407 memcpy(temp, &langtags[pos], len);
michael@0 1408 temp[len] = '\0';
michael@0 1409 const LangTagLookup* entry = DoLangTagLookup(temp,
michael@0 1410 kCLDLangTagsHintTable1,
michael@0 1411 kCLDTable1Size);
michael@0 1412 if (entry != NULL) {
michael@0 1413 // First table hit
michael@0 1414 MergeCLDLangPriorsMax(entry->onelangprior1, langpriors);
michael@0 1415 MergeCLDLangPriorsMax(entry->onelangprior2, langpriors);
michael@0 1416 } else {
michael@0 1417 // Try second table with language code truncated at first hyphen
michael@0 1418 char* hyphen = strchr(temp, '-');
michael@0 1419 if (hyphen != NULL) {*hyphen = '\0';}
michael@0 1420 len = strlen(temp);
michael@0 1421 if (len <= 3) { // Short enough to use
michael@0 1422 entry = DoLangTagLookup(temp,
michael@0 1423 kCLDLangTagsHintTable2,
michael@0 1424 kCLDTable2Size);
michael@0 1425 if (entry != NULL) {
michael@0 1426 // Second table hit
michael@0 1427 MergeCLDLangPriorsMax(entry->onelangprior1, langpriors);
michael@0 1428 MergeCLDLangPriorsMax(entry->onelangprior2, langpriors);
michael@0 1429 }
michael@0 1430 }
michael@0 1431 }
michael@0 1432 }
michael@0 1433 pos = comma + 1;
michael@0 1434 }
michael@0 1435 }
michael@0 1436
michael@0 1437 // Add hints to vector of langpriors
michael@0 1438 // Input is string after HTTP header Content-Language:
michael@0 1439 void SetCLDContentLangHint(const char* contentlang, CLDLangPriors* langpriors) {
michael@0 1440 string langtags = CopyOneQuotedString(contentlang, 0, strlen(contentlang));
michael@0 1441 SetCLDLangTagsHint(langtags, langpriors);
michael@0 1442 }
michael@0 1443
michael@0 1444 // Add hints to vector of langpriors
michael@0 1445 // Input is last element of hostname (no dot), e.g. from GetTLD()
michael@0 1446 void SetCLDTLDHint(const char* tld, CLDLangPriors* langpriors) {
michael@0 1447 int len = strlen(tld);
michael@0 1448 if (len > 3) {return;} // Ignore if more than three letters
michael@0 1449 char local_tld[4];
michael@0 1450 strncpy(local_tld, tld, 4);
michael@0 1451 local_tld[3] = '\0'; // Safety move
michael@0 1452 // Lowercase
michael@0 1453 for (int i = 0; i < len; ++i) {local_tld[i] |= 0x20;}
michael@0 1454 const TLDLookup* entry = DoTLDLookup(local_tld,
michael@0 1455 kCLDTLDHintTable,
michael@0 1456 kCLDTable3Size);
michael@0 1457 if (entry != NULL) {
michael@0 1458 // Table hit
michael@0 1459 MergeCLDLangPriorsBoost(entry->onelangprior1, langpriors);
michael@0 1460 MergeCLDLangPriorsBoost(entry->onelangprior2, langpriors);
michael@0 1461 }
michael@0 1462 }
michael@0 1463
michael@0 1464 // Add hints to vector of langpriors
michael@0 1465 // Input is from DetectEncoding()
michael@0 1466 void SetCLDEncodingHint(Encoding enc, CLDLangPriors* langpriors) {
michael@0 1467 OneCLDLangPrior olp;
michael@0 1468 switch (enc) {
michael@0 1469 case CHINESE_GB:
michael@0 1470 case GBK:
michael@0 1471 case GB18030:
michael@0 1472 case ISO_2022_CN:
michael@0 1473 case HZ_GB_2312:
michael@0 1474 olp = PackCLDPriorLangWeight(CHINESE, kCLDPriorEncodingWeight);
michael@0 1475 MergeCLDLangPriorsBoost(olp, langpriors);
michael@0 1476 break;
michael@0 1477 case CHINESE_BIG5:
michael@0 1478 case CHINESE_BIG5_CP950:
michael@0 1479 case BIG5_HKSCS:
michael@0 1480 olp = PackCLDPriorLangWeight(CHINESE_T, kCLDPriorEncodingWeight);
michael@0 1481 MergeCLDLangPriorsBoost(olp, langpriors);
michael@0 1482 break;
michael@0 1483 case JAPANESE_EUC_JP:
michael@0 1484 case JAPANESE_SHIFT_JIS:
michael@0 1485 case JAPANESE_CP932:
michael@0 1486 case JAPANESE_JIS: // ISO-2022-JP
michael@0 1487 olp = PackCLDPriorLangWeight(JAPANESE, kCLDPriorEncodingWeight);
michael@0 1488 MergeCLDLangPriorsBoost(olp, langpriors);
michael@0 1489 break;
michael@0 1490 case KOREAN_EUC_KR:
michael@0 1491 case ISO_2022_KR:
michael@0 1492 olp = PackCLDPriorLangWeight(KOREAN, kCLDPriorEncodingWeight);
michael@0 1493 MergeCLDLangPriorsBoost(olp, langpriors);
michael@0 1494 break;
michael@0 1495
michael@0 1496 default:
michael@0 1497 break;
michael@0 1498 }
michael@0 1499 }
michael@0 1500
michael@0 1501 // Add hints to vector of langpriors
michael@0 1502 // Input is from random source
michael@0 1503 void SetCLDLanguageHint(Language lang, CLDLangPriors* langpriors) {
michael@0 1504 OneCLDLangPrior olp = PackCLDPriorLangWeight(lang, kCLDPriorLanguageWeight);
michael@0 1505 MergeCLDLangPriorsBoost(olp, langpriors);
michael@0 1506 }
michael@0 1507
michael@0 1508
michael@0 1509 // Make printable string of priors
michael@0 1510 string DumpCLDLangPriors(const CLDLangPriors* langpriors) {
michael@0 1511 string retval;
michael@0 1512 for (int i = 0; i < langpriors->n; ++i) {
michael@0 1513 char temp[64];
michael@0 1514 sprintf(temp, "%s.%d ",
michael@0 1515 LanguageCode(GetCLDPriorLang(langpriors->prior[i])),
michael@0 1516 GetCLDPriorWeight(langpriors->prior[i]));
michael@0 1517 retval.append(temp);
michael@0 1518 }
michael@0 1519 return retval;
michael@0 1520 }
michael@0 1521
michael@0 1522
michael@0 1523
michael@0 1524
michael@0 1525 // Look for
michael@0 1526 // <html lang="en">
michael@0 1527 // <doc xml:lang="en">
michael@0 1528 // <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en-US">
michael@0 1529 // <meta http-equiv="content-language" content="en-GB" />
michael@0 1530 // <meta name="language" content="Srpski">
michael@0 1531 // <meta name="DC.language" scheme="RFC1766" content="en">
michael@0 1532 // <SPAN id="msg1" class="info" lang='en'>
michael@0 1533 //
michael@0 1534 // Do not trigger on
michael@0 1535 // <!-- lang=french ...-->
michael@0 1536 // <font lang=postscript ...>
michael@0 1537 // <link href="index.fr.html" hreflang="fr-FR" xml:lang="fr-FR" />
michael@0 1538 // <META name="Author" lang="fr" content="Arnaud Le Hors">
michael@0 1539 //
michael@0 1540 // Stop fairly quickly on mismatched quotes
michael@0 1541 //
michael@0 1542 // Allowed language characters
michael@0 1543 // a-z A-Z -_ , space\t
michael@0 1544 // Think about: GB2312, big5, shift-jis, euc-jp, ksc euc-kr
michael@0 1545 // zh-hans zh-TW cmn-Hani zh_cn.gb18030_CN zh-min-nan zh-yue
michael@0 1546 // de-x-mtfrom-en zh-tw-x-mtfrom-en (machine translation)
michael@0 1547 // GB2312 => gb
michael@0 1548 // Big5 => big
michael@0 1549 // zh_CN.gb18030_C => zh-cn
michael@0 1550 //
michael@0 1551 // Remove duplicates and extra spaces as we go
michael@0 1552 // Lowercase as we go.
michael@0 1553
michael@0 1554 // Get language tag hints from HTML body
michael@0 1555 // Normalize: remove spaces and make lowercase comma list
michael@0 1556
michael@0 1557 string GetLangTagsFromHtml(const char* utf8_body, int32 utf8_body_len,
michael@0 1558 int32 max_scan_bytes) {
michael@0 1559 string retval;
michael@0 1560 if (max_scan_bytes > utf8_body_len) {
michael@0 1561 max_scan_bytes = utf8_body_len;
michael@0 1562 }
michael@0 1563
michael@0 1564 int32 k = 0;
michael@0 1565 while (k < max_scan_bytes) {
michael@0 1566 int32 start_tag = FindTagStart(utf8_body, k, max_scan_bytes);
michael@0 1567 if (start_tag < 0) {break;}
michael@0 1568 int32 end_tag = FindTagEnd(utf8_body, start_tag + 1, max_scan_bytes);
michael@0 1569 // FindTagEnd exits on < > &
michael@0 1570 if (end_tag < 0) {break;}
michael@0 1571
michael@0 1572 // Skip <!--...>
michael@0 1573 // Skip <font ...>
michael@0 1574 // Skip <script ...>
michael@0 1575 // Skip <link ...>
michael@0 1576 // Skip <img ...>
michael@0 1577 // Skip <a ...>
michael@0 1578 if (FindAfter(utf8_body, start_tag + 1, end_tag, "!--") ||
michael@0 1579 FindAfter(utf8_body, start_tag + 1, end_tag, "font ") ||
michael@0 1580 FindAfter(utf8_body, start_tag + 1, end_tag, "script ") ||
michael@0 1581 FindAfter(utf8_body, start_tag + 1, end_tag, "link ") ||
michael@0 1582 FindAfter(utf8_body, start_tag + 1, end_tag, "img ") ||
michael@0 1583 FindAfter(utf8_body, start_tag + 1, end_tag, "a ")) {
michael@0 1584 k = end_tag + 1;
michael@0 1585 continue;
michael@0 1586 }
michael@0 1587
michael@0 1588 // Remember <meta ...>
michael@0 1589 bool in_meta = false;
michael@0 1590 if (FindAfter(utf8_body, start_tag + 1, end_tag, "meta ")) {
michael@0 1591 in_meta = true;
michael@0 1592 }
michael@0 1593
michael@0 1594 // Scan for each equal sign inside tag
michael@0 1595 bool content_is_lang = false;
michael@0 1596 int32 kk = start_tag + 1;
michael@0 1597 int32 equal_sign;
michael@0 1598 while ((equal_sign = FindEqualSign(utf8_body, kk, end_tag)) >= 0) {
michael@0 1599 // eq exits on < > &
michael@0 1600
michael@0 1601 // Look inside a meta tag
michael@0 1602 // <meta ... http-equiv="content-language" ...>
michael@0 1603 // <meta ... name="language" ...>
michael@0 1604 // <meta ... name="dc.language" ...>
michael@0 1605 if (in_meta) {
michael@0 1606 if (FindBefore(utf8_body, kk, equal_sign, " http-equiv") &&
michael@0 1607 FindAfter(utf8_body, equal_sign + 1, end_tag,
michael@0 1608 "content-language ")) {
michael@0 1609 content_is_lang = true;
michael@0 1610 } else if (FindBefore(utf8_body, kk, equal_sign, " name") &&
michael@0 1611 (FindAfter(utf8_body, equal_sign + 1, end_tag,
michael@0 1612 "dc.language ") ||
michael@0 1613 FindAfter(utf8_body, equal_sign + 1, end_tag,
michael@0 1614 "language "))) {
michael@0 1615 content_is_lang = true;
michael@0 1616 }
michael@0 1617 }
michael@0 1618
michael@0 1619 // Look inside any tag
michael@0 1620 // <meta ... content="lang-list" ...>
michael@0 1621 // <... lang="lang-list" ...>
michael@0 1622 // <... xml:lang="lang-list" ...>
michael@0 1623 if ((content_is_lang && FindBefore(utf8_body, kk, equal_sign,
michael@0 1624 " content")) ||
michael@0 1625 FindBefore(utf8_body, kk, equal_sign, " lang") ||
michael@0 1626 FindBefore(utf8_body, kk, equal_sign, ":lang")) {
michael@0 1627 string temp = CopyQuotedString(utf8_body, equal_sign + 1, end_tag);
michael@0 1628
michael@0 1629 // Append new lang tag(s) if not a duplicate
michael@0 1630 if (!temp.empty() && (retval.find(temp) == string::npos)) {
michael@0 1631 retval.append(temp);
michael@0 1632 }
michael@0 1633 }
michael@0 1634
michael@0 1635 kk = equal_sign + 1;
michael@0 1636 }
michael@0 1637 k = end_tag + 1;
michael@0 1638 }
michael@0 1639
michael@0 1640 // Strip last comma
michael@0 1641 if (retval.size() > 1) {
michael@0 1642 retval.erase(retval.size() - 1);
michael@0 1643 }
michael@0 1644 return retval;
michael@0 1645 }
michael@0 1646
michael@0 1647 } // End namespace CLD2
michael@0 1648
michael@0 1649 //==============================================================================
michael@0 1650
michael@0 1651

mercurial