Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
michael@0 | 1 | // Copyright 2013 Google Inc. All Rights Reserved. |
michael@0 | 2 | // |
michael@0 | 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
michael@0 | 4 | // you may not use this file except in compliance with the License. |
michael@0 | 5 | // You may obtain a copy of the License at |
michael@0 | 6 | // |
michael@0 | 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
michael@0 | 8 | // |
michael@0 | 9 | // Unless required by applicable law or agreed to in writing, software |
michael@0 | 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
michael@0 | 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
michael@0 | 12 | // See the License for the specific language governing permissions and |
michael@0 | 13 | // limitations under the License. |
michael@0 | 14 | |
michael@0 | 15 | // |
michael@0 | 16 | // Author: dsites@google.com (Dick Sites) |
michael@0 | 17 | // |
michael@0 | 18 | |
michael@0 | 19 | #include "compact_lang_det_hint_code.h" |
michael@0 | 20 | |
michael@0 | 21 | #include <stdlib.h> // for abs() |
michael@0 | 22 | #include <stdio.h> // for sprintf() |
michael@0 | 23 | #include <string.h> // |
michael@0 | 24 | #include "lang_script.h" |
michael@0 | 25 | #include "port.h" |
michael@0 | 26 | |
michael@0 | 27 | using namespace std; |
michael@0 | 28 | |
michael@0 | 29 | namespace CLD2 { |
michael@0 | 30 | |
// Prior weights for encoding hints and language hints.
// (A namespace-scope const int already has internal linkage in C++.)
const int kCLDPriorEncodingWeight = 4;   // 100x more likely
const int kCLDPriorLanguageWeight = 8;   // 10000x more likely
michael@0 | 33 | |
michael@0 | 34 | |
michael@0 | 35 | // Tables to map lang="..." language code lists to actual languages. |
michael@0 | 36 | // based on scraping and hand-edits, dsites June 2011 |
michael@0 | 37 | |
michael@0 | 38 | // n = f(string, &a) gives list of n<=4 language pairs: primary, secondary |
michael@0 | 39 | |
michael@0 | 40 | // For close pairs like ms/id, more weight on TLD and lang= |
michael@0 | 41 | // Alternately, weaker boost but mark others of set as negative; |
michael@0 | 42 | // makes "neither" an easier result. |
michael@0 | 43 | // lang=en low weight 4 |
michael@0 | 44 | // tld=lu boost lu maybe 4. but lang= always overcomes tld and encoding |
michael@0 | 45 | // (except maybe en) |
michael@0 | 46 | |
michael@0 | 47 | // TLD to separate, e.g., burundi from rwanda |
michael@0 | 48 | |
michael@0 | 49 | // Encoding lookup: OneLangProb array |
michael@0 | 50 | // TLD lookup: tld OneLangProb pairs |
michael@0 | 51 | |
michael@0 | 52 | |
michael@0 | 53 | typedef struct { |
michael@0 | 54 | const char* const langtag; // Lowercased, hyphen only lookup key |
michael@0 | 55 | const char* const langcode; // Canonical language codes; two if ambiguous |
michael@0 | 56 | OneCLDLangPrior onelangprior1; |
michael@0 | 57 | OneCLDLangPrior onelangprior2; |
michael@0 | 58 | } LangTagLookup; |
michael@0 | 59 | |
michael@0 | 60 | typedef struct { |
michael@0 | 61 | const char* const tld; // Lowercased, hyphen only lookup key |
michael@0 | 62 | OneCLDLangPrior onelangprior1; |
michael@0 | 63 | OneCLDLangPrior onelangprior2; |
michael@0 | 64 | } TLDLookup; |
michael@0 | 65 | |
michael@0 | 66 | |
// Hint weights, pre-shifted left by 10 bits. Table entries add one of these
// to a language id to boost it, or subtract one to demote it.
// Each unit of weight is about a factor of 3 in likelihood, so the
// multipliers quoted below are rounded.
#define W2   (2 << 10)    // 3**2  = 10x more likely
#define W4   (4 << 10)    // 3**4  = 100x more likely
#define W6   (6 << 10)    // 3**6  = 1000x more likely
#define W8   (8 << 10)    // 3**8  = 10K x more likely
#define W10  (10 << 10)   // 3**10 = 100K x more likely
#define W12  (12 << 10)   // 3**12 = 1M x more likely
michael@0 | 73 | |
michael@0 | 74 | // TODO: more about ba hr sr sr-ME and sl |
michael@0 | 75 | // Temporary state of affairs: |
michael@0 | 76 | // BOSNIAN CROATIAN MONTENEGRIN SERBIAN detecting just CROATIAN SERBIAN |
michael@0 | 77 | // Eventually, we want to do all four, but it requires a CLD change to handle |
michael@0 | 78 | // up to six languages per quadgram. |
michael@0 | 79 | |
michael@0 | 80 | |
michael@0 | 81 | // Close pairs boost one of pair, demote other. |
michael@0 | 82 | // Statistically close pairs: |
michael@0 | 83 | // INDONESIAN/MALAY difficult to distinguish -- extra word-based lookups used |
michael@0 | 84 | // |
michael@0 | 85 | // INDONESIAN MALAY coef=0.4698 Problematic w/o extra words |
michael@0 | 86 | // TIBETAN DZONGKHA coef=0.4571 |
michael@0 | 87 | // CZECH SLOVAK coef=0.4273 |
michael@0 | 88 | // NORWEGIAN NORWEGIAN_N coef=0.4182 |
michael@0 | 89 | // |
michael@0 | 90 | // HINDI MARATHI coef=0.3795 |
michael@0 | 91 | // ZULU XHOSA coef=0.3716 |
michael@0 | 92 | // |
michael@0 | 93 | // DANISH NORWEGIAN coef=0.3672 Usually OK |
michael@0 | 94 | // BIHARI HINDI coef=0.3668 Usually OK |
michael@0 | 95 | // ICELANDIC FAROESE coef=0.3519 Usually OK |
michael@0 | 96 | |
michael@0 | 97 | // |
michael@0 | 98 | // Table to look up lang= tags longer than three characters |
michael@0 | 99 | // Overrides table below, which is truncated at first hyphen |
michael@0 | 100 | // In alphabetical order for binary search |
michael@0 | 101 | static const int kCLDTable1Size = 213; |
michael@0 | 102 | static const LangTagLookup kCLDLangTagsHintTable1[kCLDTable1Size] = { |
michael@0 | 103 | {"abkhazian", "ab", ABKHAZIAN + W10, 0}, |
michael@0 | 104 | {"afar", "aa", AFAR + W10, 0}, |
michael@0 | 105 | {"afrikaans", "af", AFRIKAANS + W10, 0}, |
michael@0 | 106 | {"akan", "ak", AKAN + W10, 0}, |
michael@0 | 107 | {"albanian", "sq", ALBANIAN + W10, 0}, |
michael@0 | 108 | {"am-am", "hy", ARMENIAN + W10, 0}, // 1:2 Armenian, not ambiguous |
michael@0 | 109 | {"amharic", "am", AMHARIC + W10, 0}, |
michael@0 | 110 | {"arabic", "ar", ARABIC + W10, 0}, |
michael@0 | 111 | {"argentina", "es", SPANISH + W10, 0}, |
michael@0 | 112 | {"armenian", "hy", ARMENIAN + W10, 0}, |
michael@0 | 113 | {"assamese", "as", ASSAMESE + W10, 0}, |
michael@0 | 114 | {"aymara", "ay", AYMARA + W10, 0}, |
michael@0 | 115 | {"azerbaijani", "az", AZERBAIJANI + W10, 0}, |
michael@0 | 116 | |
michael@0 | 117 | {"bangla", "bn", BENGALI + W10, 0}, |
michael@0 | 118 | {"bashkir", "ba", BASHKIR + W10, 0}, |
michael@0 | 119 | {"basque", "eu", BASQUE + W10, 0}, |
michael@0 | 120 | {"belarusian", "be", BELARUSIAN + W10, 0}, |
michael@0 | 121 | {"bengali", "bn", BENGALI + W10, 0}, |
michael@0 | 122 | {"bihari", "bh", BIHARI + W10, HINDI - W4}, |
michael@0 | 123 | {"bislama", "bi", BISLAMA + W10, 0}, |
michael@0 | 124 | {"bosnian", "bs", BOSNIAN + W10, 0}, // Bosnian => Bosnian |
michael@0 | 125 | {"br-br", "pt", PORTUGUESE + W10, 0}, // 1:2 Portuguese, not ambiguous |
michael@0 | 126 | {"br-fr", "br", BRETON + W10, 0}, // 1:2 Breton, not ambiguous |
michael@0 | 127 | {"breton", "br", BRETON + W10, 0}, |
michael@0 | 128 | {"bulgarian", "bg", BULGARIAN + W10, 0}, |
michael@0 | 129 | {"burmese", "my", BURMESE + W10, 0}, // Myanmar |
michael@0 | 130 | |
michael@0 | 131 | {"catalan", "ca", CATALAN + W10, 0}, |
michael@0 | 132 | {"cherokee", "chr", CHEROKEE + W10, 0}, |
michael@0 | 133 | {"chichewa", "ny", NYANJA + W10, 0}, |
michael@0 | 134 | |
michael@0 | 135 | {"chinese", "zh", CHINESE + W10, 0}, |
michael@0 | 136 | {"chinese-t", "zhT", CHINESE_T + W10, 0}, |
michael@0 | 137 | {"chineset", "zhT", CHINESE_T + W10, 0}, |
michael@0 | 138 | {"corsican", "co", CORSICAN + W10, 0}, |
michael@0 | 139 | {"cpf-hat", "ht", HAITIAN_CREOLE + W10, 0}, // Creole, French-based |
michael@0 | 140 | {"croatian", "hr", CROATIAN + W10, 0}, |
michael@0 | 141 | {"czech", "cs", CZECH + W10, SLOVAK - W4}, |
michael@0 | 142 | |
michael@0 | 143 | {"danish", "da", DANISH + W10, NORWEGIAN - W4}, |
michael@0 | 144 | {"deutsch", "de", GERMAN + W10, 0}, |
michael@0 | 145 | {"dhivehi", "dv", DHIVEHI + W10, 0}, |
michael@0 | 146 | {"dutch", "nl", DUTCH + W10, 0}, |
michael@0 | 147 | {"dzongkha", "dz", DZONGKHA + W10, TIBETAN - W4}, |
michael@0 | 148 | |
michael@0 | 149 | {"ell-gr", "el", GREEK + W10, 0}, |
michael@0 | 150 | {"english", "en", ENGLISH + W4, 0}, |
michael@0 | 151 | {"esperanto", "eo", ESPERANTO + W10, 0}, |
michael@0 | 152 | {"estonian", "et", ESTONIAN + W10, 0}, |
michael@0 | 153 | {"euc-jp", "ja", JAPANESE + W10, 0}, // Japanese encoding |
michael@0 | 154 | {"euc-kr", "ko", KOREAN + W10, 0}, // Korean encoding |
michael@0 | 155 | |
michael@0 | 156 | {"faroese", "fo", FAROESE + W10, ICELANDIC - W4}, |
michael@0 | 157 | {"fijian", "fj", FIJIAN + W10, 0}, |
michael@0 | 158 | {"finnish", "fi", FINNISH + W10, 0}, |
michael@0 | 159 | {"fran", "fr", FRENCH + W10, 0}, // Truncated at non-ASCII |
michael@0 | 160 | {"francais", "fr", FRENCH + W10, 0}, |
michael@0 | 161 | {"french", "fr", FRENCH + W10, 0}, |
michael@0 | 162 | {"frisian", "fy", FRISIAN + W10, 0}, |
michael@0 | 163 | |
michael@0 | 164 | {"ga-es", "gl", GALICIAN + W10, 0}, // 1:2 Galician, not ambiguous |
michael@0 | 165 | {"galician", "gl", GALICIAN + W10, 0}, |
michael@0 | 166 | {"ganda", "lg", GANDA + W10, 0}, |
michael@0 | 167 | {"georgian", "ka", GEORGIAN + W10, 0}, |
michael@0 | 168 | {"german", "de", GERMAN + W10, 0}, |
michael@0 | 169 | {"greek", "el", GREEK + W10, 0}, |
michael@0 | 170 | {"greenlandic", "kl", GREENLANDIC + W10, 0}, |
michael@0 | 171 | {"guarani", "gn", GUARANI + W10, 0}, |
michael@0 | 172 | {"gujarati", "gu", GUJARATI + W10, 0}, |
michael@0 | 173 | |
michael@0 | 174 | {"haitian_creole", "ht", HAITIAN_CREOLE + W10, 0}, |
michael@0 | 175 | {"hausa", "ha", HAUSA + W10, 0}, |
michael@0 | 176 | {"hawaiian", "haw", HAWAIIAN + W10, 0}, |
michael@0 | 177 | {"hebrew", "iw", HEBREW + W10, 0}, |
michael@0 | 178 | {"hindi", "hi", HINDI + W10, MARATHI - W4}, |
michael@0 | 179 | {"hn-in", "hi", HINDI + W10, MARATHI - W4}, |
michael@0 | 180 | {"hungarian", "hu", HUNGARIAN + W10, 0}, |
michael@0 | 181 | |
michael@0 | 182 | {"icelandic", "is", ICELANDIC + W10, FAROESE - W4}, |
michael@0 | 183 | {"igbo", "ig", IGBO + W10, 0}, |
michael@0 | 184 | {"indonesian", "id", INDONESIAN + W10, MALAY - W4}, |
michael@0 | 185 | {"interlingua", "ia", INTERLINGUA + W10, 0}, |
michael@0 | 186 | {"interlingue", "ie", INTERLINGUE + W10, 0}, |
michael@0 | 187 | // 1:2 iu-Cans ik-Latn |
michael@0 | 188 | {"inuktitut", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2 |
michael@0 | 189 | {"inupiak", "ik,iu", INUPIAK + W10, INUKTITUT + W10}, // 1:2 |
michael@0 | 190 | {"ir-ie", "ga", IRISH + W10, 0}, // Irish |
michael@0 | 191 | {"irish", "ga", IRISH + W10, 0}, |
michael@0 | 192 | {"italian", "it", ITALIAN + W10, 0}, |
michael@0 | 193 | |
michael@0 | 194 | {"ja-euc", "ja", JAPANESE + W10, 0}, // Japanese encoding |
michael@0 | 195 | {"jan-jp", "ja", JAPANESE + W10, 0}, // Japanese encoding |
michael@0 | 196 | {"japanese", "ja", JAPANESE + W10, 0}, |
michael@0 | 197 | {"javanese", "jw", JAVANESE + W10, 0}, |
michael@0 | 198 | |
michael@0 | 199 | {"kannada", "kn", KANNADA + W10, 0}, |
michael@0 | 200 | {"kashmiri", "ks", KASHMIRI + W10, 0}, |
michael@0 | 201 | {"kazakh", "kk", KAZAKH + W10, 0}, |
michael@0 | 202 | {"khasi", "kha", KHASI + W10, 0}, |
michael@0 | 203 | {"khmer", "km", KHMER + W10, 0}, |
michael@0 | 204 | {"kinyarwanda", "rw", KINYARWANDA + W10, 0}, |
michael@0 | 205 | {"klingon", "tlh", X_KLINGON + W10, 0}, |
michael@0 | 206 | {"korean", "ko", KOREAN + W10, 0}, |
michael@0 | 207 | {"kurdish", "ku", KURDISH + W10, 0}, |
michael@0 | 208 | {"kyrgyz", "ky", KYRGYZ + W10, 0}, |
michael@0 | 209 | |
michael@0 | 210 | {"laothian", "lo", LAOTHIAN + W10, 0}, |
michael@0 | 211 | {"latin", "la", LATIN + W10, 0}, |
michael@0 | 212 | {"latvian", "lv", LATVIAN + W10, 0}, |
michael@0 | 213 | {"limbu", "sit", LIMBU + W10, 0}, |
michael@0 | 214 | {"lingala", "ln", LINGALA + W10, 0}, |
michael@0 | 215 | {"lithuanian", "lt", LITHUANIAN + W10, 0}, |
michael@0 | 216 | {"luxembourgish", "lb", LUXEMBOURGISH + W10, 0}, |
michael@0 | 217 | |
michael@0 | 218 | {"macedonian", "mk", MACEDONIAN + W10, 0}, |
michael@0 | 219 | {"malagasy", "mg", MALAGASY + W10, 0}, |
michael@0 | 220 | {"malay", "ms", MALAY + W10, INDONESIAN - W4}, |
michael@0 | 221 | {"malayalam", "ml", MALAYALAM + W10, 0}, |
michael@0 | 222 | {"maltese", "mt", MALTESE + W10, 0}, |
michael@0 | 223 | {"manx", "gv", MANX + W10, 0}, |
michael@0 | 224 | {"maori", "mi", MAORI + W10, 0}, |
michael@0 | 225 | {"marathi", "mr", MARATHI + W10, HINDI - W4}, |
michael@0 | 226 | {"mauritian_creole", "mfe", MAURITIAN_CREOLE + W10, 0}, |
michael@0 | 227 | {"moldavian", "mo", ROMANIAN + W10, 0}, |
michael@0 | 228 | {"mongolian", "mn", MONGOLIAN + W10, 0}, |
michael@0 | 229 | {"montenegrin", "sr-me", MONTENEGRIN + W10, 0}, |
michael@0 | 230 | {"myanmar", "my", BURMESE + W10, 0}, // Myanmar |
michael@0 | 231 | {"nauru", "na", NAURU + W10, 0}, |
michael@0 | 232 | {"ndebele", "nr", NDEBELE + W10, 0}, |
michael@0 | 233 | {"nepali", "ne", NEPALI + W10, 0}, |
michael@0 | 234 | {"no-bok", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, // Bokmaal |
michael@0 | 235 | {"no-bokmaal", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, |
michael@0 | 236 | {"no-nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, // Bokmaal |
michael@0 | 237 | {"no-no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, |
michael@0 | 238 | {"no-nyn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, // Nynorsk |
michael@0 | 239 | {"no-nynorsk", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, |
michael@0 | 240 | {"norwegian", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, |
michael@0 | 241 | {"norwegian_n", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, |
michael@0 | 242 | {"nyanja", "ny", NYANJA + W10, 0}, |
michael@0 | 243 | |
michael@0 | 244 | {"occitan", "oc", OCCITAN + W10, 0}, |
michael@0 | 245 | {"oriya", "or", ORIYA + W10, 0}, |
michael@0 | 246 | {"oromo", "om", OROMO + W10, 0}, |
michael@0 | 247 | {"parsi", "fa", PERSIAN + W10, 0}, |
michael@0 | 248 | |
michael@0 | 249 | {"pashto", "ps", PASHTO + W10, 0}, |
michael@0 | 250 | {"pedi", "nso", PEDI + W10, 0}, |
michael@0 | 251 | {"persian", "fa", PERSIAN + W10, 0}, |
michael@0 | 252 | {"polish", "pl", POLISH + W10, 0}, |
michael@0 | 253 | {"polska", "pl", POLISH + W10, 0}, |
michael@0 | 254 | {"polski", "pl", POLISH + W10, 0}, |
michael@0 | 255 | {"portugu", "pt", PORTUGUESE + W10, 0}, // Truncated at non-ASCII |
michael@0 | 256 | {"portuguese", "pt", PORTUGUESE + W10, 0}, |
michael@0 | 257 | {"punjabi", "pa", PUNJABI + W10, 0}, |
michael@0 | 258 | |
michael@0 | 259 | {"quechua", "qu", QUECHUA + W10, 0}, |
michael@0 | 260 | |
michael@0 | 261 | {"rhaeto_romance", "rm", RHAETO_ROMANCE + W10, 0}, |
michael@0 | 262 | {"romanian", "ro", ROMANIAN + W10, 0}, |
michael@0 | 263 | {"rundi", "rn", RUNDI + W10, 0}, |
michael@0 | 264 | {"russian", "ru", RUSSIAN + W10, 0}, |
michael@0 | 265 | |
michael@0 | 266 | {"samoan", "sm", SAMOAN + W10, 0}, |
michael@0 | 267 | {"sango", "sg", SANGO + W10, 0}, |
michael@0 | 268 | {"sanskrit", "sa", SANSKRIT + W10, 0}, |
michael@0 | 269 | {"scots", "sco", SCOTS + W10, ENGLISH - W4}, |
michael@0 | 270 | {"scots_gaelic", "gd", SCOTS_GAELIC + W10, 0}, |
michael@0 | 271 | {"serbian", "sr", SERBIAN + W10, 0}, |
michael@0 | 272 | {"seselwa", "crs", SESELWA + W10, 0}, |
michael@0 | 273 | {"sesotho", "st", SESOTHO + W10, 0}, |
michael@0 | 274 | {"shift-jis", "ja", JAPANESE + W10, 0}, // Japanese encoding |
michael@0 | 275 | {"shift-js", "ja", JAPANESE + W10, 0}, // Japanese encoding |
michael@0 | 276 | {"shona", "sn", SHONA + W10, 0}, |
michael@0 | 277 | {"si-lk", "si", SINHALESE + W10, 0}, // 1:2 Sri Lanka, not ambiguous |
michael@0 | 278 | {"si-si", "sl", SLOVENIAN + W10, 0}, // 1:2 Slovenia, not ambiguous |
michael@0 | 279 | {"si-sl", "sl", SLOVENIAN + W10, 0}, // 1:2 Slovenia, not ambiguous |
michael@0 | 280 | {"sindhi", "sd", SINDHI + W10, 0}, |
michael@0 | 281 | {"sinhalese", "si", SINHALESE + W10, 0}, |
michael@0 | 282 | {"siswant", "ss", SISWANT + W10, 0}, |
michael@0 | 283 | {"sit-np", "sit", LIMBU + W10, 0}, |
michael@0 | 284 | {"slovak", "sk", SLOVAK + W10, CZECH - W4}, |
michael@0 | 285 | {"slovenian", "sl", SLOVENIAN + W10, 0}, |
michael@0 | 286 | {"somali", "so", SOMALI + W10, 0}, |
michael@0 | 287 | {"spanish", "es", SPANISH + W10, 0}, |
michael@0 | 288 | {"sr-me", "sr-me", MONTENEGRIN + W10, 0}, // Montenegrin => Montenegrin |
michael@0 | 289 | {"sundanese", "su", SUNDANESE + W10, 0}, |
michael@0 | 290 | {"suomi", "fi", FINNISH + W10, 0}, // Finnish |
michael@0 | 291 | {"swahili", "sw", SWAHILI + W10, 0}, |
michael@0 | 292 | {"swedish", "sv", SWEDISH + W10, 0}, |
michael@0 | 293 | {"syriac", "syr", SYRIAC + W10, 0}, |
michael@0 | 294 | |
michael@0 | 295 | {"tagalog", "tl", TAGALOG + W10, 0}, |
michael@0 | 296 | {"tajik", "tg", TAJIK + W10, 0}, |
michael@0 | 297 | {"tamil", "ta", TAMIL + W10, 0}, |
michael@0 | 298 | {"tatar", "tt", TATAR + W10, 0}, |
michael@0 | 299 | {"tb-tb", "bo", TIBETAN + W10, DZONGKHA - W4}, // Tibet |
michael@0 | 300 | {"tchinese", "zhT", CHINESE_T + W10, 0}, |
michael@0 | 301 | {"telugu", "te", TELUGU + W10, 0}, |
michael@0 | 302 | {"thai", "th", THAI + W10, 0}, |
michael@0 | 303 | {"tibetan", "bo", TIBETAN + W10, DZONGKHA - W4}, |
michael@0 | 304 | {"tigrinya", "ti", TIGRINYA + W10, 0}, |
michael@0 | 305 | {"tonga", "to", TONGA + W10, 0}, |
michael@0 | 306 | {"tsonga", "ts", TSONGA + W10, 0}, |
michael@0 | 307 | {"tswana", "tn", TSWANA + W10, 0}, |
michael@0 | 308 | {"tt-ru", "tt", TATAR + W10, 0}, |
michael@0 | 309 | {"tur-tr", "tr", TURKISH + W10, 0}, |
michael@0 | 310 | {"turkish", "tr", TURKISH + W10, 0}, |
michael@0 | 311 | {"turkmen", "tk", TURKMEN + W10, 0}, |
michael@0 | 312 | {"uighur", "ug", UIGHUR + W10, 0}, |
michael@0 | 313 | {"ukrainian", "uk", UKRAINIAN + W10, 0}, |
michael@0 | 314 | {"urdu", "ur", URDU + W10, 0}, |
michael@0 | 315 | {"uzbek", "uz", UZBEK + W10, 0}, |
michael@0 | 316 | |
michael@0 | 317 | {"venda", "ve", VENDA + W10, 0}, |
michael@0 | 318 | {"vietnam", "vi", VIETNAMESE + W10, 0}, |
michael@0 | 319 | {"vietnamese", "vi", VIETNAMESE + W10, 0}, |
michael@0 | 320 | {"volapuk", "vo", VOLAPUK + W10, 0}, |
michael@0 | 321 | |
michael@0 | 322 | {"welsh", "cy", WELSH + W10, 0}, |
michael@0 | 323 | {"wolof", "wo", WOLOF + W10, 0}, |
michael@0 | 324 | |
michael@0 | 325 | {"xhosa", "xh", XHOSA + W10, ZULU - W4}, |
michael@0 | 326 | |
michael@0 | 327 | {"yiddish", "yi", YIDDISH + W10, 0}, |
michael@0 | 328 | {"yoruba", "yo", YORUBA + W10, 0}, |
michael@0 | 329 | |
michael@0 | 330 | {"zh-classical", "zhT", CHINESE_T + W10, 0}, |
michael@0 | 331 | {"zh-cn", "zh", CHINESE + W10, 0}, |
michael@0 | 332 | {"zh-hans", "zh", CHINESE + W10, 0}, |
michael@0 | 333 | {"zh-hant", "zhT", CHINESE_T + W10, 0}, |
michael@0 | 334 | {"zh-hk", "zhT", CHINESE_T + W10, 0}, |
michael@0 | 335 | {"zh-min-nan", "zhT", CHINESE_T + W10, 0}, // Min Nan => ChineseT |
michael@0 | 336 | {"zh-sg", "zhT", CHINESE_T + W10, 0}, |
michael@0 | 337 | {"zh-tw", "zhT", CHINESE_T + W10, 0}, |
michael@0 | 338 | {"zh-yue", "zh", CHINESE + W10, 0}, // Yue (Cantonese) => Chinese |
michael@0 | 339 | {"zhuang", "za", ZHUANG + W10, 0}, |
michael@0 | 340 | {"zulu", "zu", ZULU + W10, XHOSA - W4}, |
michael@0 | 341 | }; |
michael@0 | 342 | |
michael@0 | 343 | |
michael@0 | 344 | |
michael@0 | 345 | // Table to look up lang= tags of two/three characters after truncate at hyphen |
michael@0 | 346 | // In alphabetical order for binary search |
michael@0 | 347 | static const int kCLDTable2Size = 257; |
michael@0 | 348 | static const LangTagLookup kCLDLangTagsHintTable2[kCLDTable2Size] = { |
michael@0 | 349 | {"aa", "aa", AFAR + W10, 0}, |
michael@0 | 350 | {"ab", "ab", ABKHAZIAN + W10, 0}, |
michael@0 | 351 | {"af", "af", AFRIKAANS + W10, 0}, |
michael@0 | 352 | {"ak", "ak", AKAN + W10, 0}, |
michael@0 | 353 | {"al", "sq", ALBANIAN + W10, 0}, // Albania |
michael@0 | 354 | {"am", "am,hy", AMHARIC + W10, ARMENIAN + W10}, // 1:2 Amharic Armenian |
michael@0 | 355 | {"ar", "ar", ARABIC + W10, 0}, |
michael@0 | 356 | {"ara", "ar", ARABIC + W10, 0}, |
michael@0 | 357 | {"arm", "hy", ARMENIAN + W10, 0}, // Armenia |
michael@0 | 358 | {"arz", "ar", ARABIC + W10, 0}, // Egyptian Arabic |
michael@0 | 359 | {"as", "as", ASSAMESE + W10, 0}, |
michael@0 | 360 | {"at", "de", GERMAN + W10, 0}, // Austria |
michael@0 | 361 | {"au", "de", GERMAN + W10, 0}, // Austria |
michael@0 | 362 | {"ay", "ay", AYMARA + W10, 0}, |
michael@0 | 363 | {"az", "az", AZERBAIJANI + W10, 0}, |
michael@0 | 364 | {"aze", "az", AZERBAIJANI + W10, 0}, |
michael@0 | 365 | |
michael@0 | 366 | {"ba", "ba,bs", BASHKIR + W10, BOSNIAN + W10}, // 1:2 Bashkir Bosnia |
michael@0 | 367 | {"be", "be", BELARUSIAN + W10, 0}, |
michael@0 | 368 | {"bel", "be", BELARUSIAN + W10, 0}, |
michael@0 | 369 | {"bg", "bg", BULGARIAN + W10, 0}, |
michael@0 | 370 | {"bh", "bh", BIHARI + W10, HINDI - W4}, |
michael@0 | 371 | {"bi", "bi", BISLAMA + W10, 0}, |
michael@0 | 372 | {"big", "zhT", CHINESE_T + W10, 0}, // Big5 encoding |
michael@0 | 373 | {"bm", "ms", MALAY + W10, INDONESIAN - W4}, // Bahasa Malaysia |
michael@0 | 374 | {"bn", "bn", BENGALI + W10, 0}, |
michael@0 | 375 | {"bo", "bo", TIBETAN + W10, DZONGKHA - W4}, |
michael@0 | 376 | // 1:2 Breton, Brazil country code, both Latn .br TLD enough for pt to win |
michael@0 | 377 | {"br", "br,pt", BRETON + W10, PORTUGUESE + W8}, // 1:2 Breton, Brazil |
michael@0 | 378 | {"bs", "bs", BOSNIAN + W10, 0}, // Bosnian => Bosnian |
michael@0 | 379 | |
michael@0 | 380 | {"ca", "ca", CATALAN + W10, 0}, |
michael@0 | 381 | {"cat", "ca", CATALAN + W10, 0}, |
michael@0 | 382 | {"ch", "de,fr", GERMAN + W10, FRENCH + W10}, // 1:2 Switzerland |
michael@0 | 383 | {"chn", "zh", CHINESE + W10, 0}, |
michael@0 | 384 | {"chr", "chr", CHEROKEE + W10, 0}, |
michael@0 | 385 | {"ckb", "ku", KURDISH + W10, 0}, // Central Kurdish |
michael@0 | 386 | {"cn", "zh,zhT", CHINESE + W6, CHINESE_T + W4}, // Ambiguous, so weaker. |
michael@0 | 387 | // Offset by 2 so that TLD=tw or |
michael@0 | 388 | // enc=big5 will put zhT ahead |
michael@0 | 389 | {"co", "co", CORSICAN + W10, 0}, |
michael@0 | 390 | {"cro", "hr", CROATIAN + W10, 0}, // Croatia |
michael@0 | 391 | {"crs", "crs", SESELWA + W10, 0}, |
michael@0 | 392 | {"cs", "cs", CZECH + W10, SLOVAK - W4}, |
michael@0 | 393 | {"ct", "ca", CATALAN + W10, 0}, |
michael@0 | 394 | {"cy", "cy", WELSH + W10, 0}, |
michael@0 | 395 | {"cym", "cy", WELSH + W10, 0}, |
michael@0 | 396 | {"cz", "cs", CZECH + W10, SLOVAK - W4}, |
michael@0 | 397 | |
michael@0 | 398 | {"da", "da", DANISH + W10, NORWEGIAN - W4}, |
michael@0 | 399 | {"dan", "da", DANISH + W10, NORWEGIAN - W4}, |
michael@0 | 400 | {"de", "de", GERMAN + W10, 0}, |
michael@0 | 401 | {"deu", "de", GERMAN + W10, 0}, |
michael@0 | 402 | {"div", "dv", DHIVEHI + W10, 0}, |
michael@0 | 403 | {"dk", "da", DANISH + W10, NORWEGIAN - W4}, // Denmark |
michael@0 | 404 | {"dut", "nl", DUTCH + W10, 0}, // Dutch |
michael@0 | 405 | {"dv", "dv", DHIVEHI + W10, 0}, |
michael@0 | 406 | {"dz", "dz", DZONGKHA + W10, TIBETAN - W4}, |
michael@0 | 407 | |
michael@0 | 408 | {"ee", "et", ESTONIAN + W10, 0}, // Estonia |
michael@0 | 409 | {"eg", "ar", ARABIC + W10, 0}, // Egypt |
michael@0 | 410 | {"el", "el", GREEK + W10, 0}, |
michael@0 | 411 | {"en", "en", ENGLISH + W4, 0}, |
michael@0 | 412 | {"eng", "en", ENGLISH + W4, 0}, |
michael@0 | 413 | {"eo", "eo", ESPERANTO + W10, 0}, |
michael@0 | 414 | {"er", "ur", URDU + W10, 0}, // "Erdu" |
michael@0 | 415 | {"es", "es", SPANISH + W10, 0}, |
michael@0 | 416 | {"esp", "es", SPANISH + W10, 0}, |
michael@0 | 417 | {"est", "et", ESTONIAN + W10, 0}, |
michael@0 | 418 | {"et", "et", ESTONIAN + W10, 0}, |
michael@0 | 419 | {"eu", "eu", BASQUE + W10, 0}, |
michael@0 | 420 | |
michael@0 | 421 | {"fa", "fa", PERSIAN + W10, 0}, |
michael@0 | 422 | {"far", "fa", PERSIAN + W10, 0}, |
michael@0 | 423 | {"fi", "fi", FINNISH + W10, 0}, |
michael@0 | 424 | {"fil", "tl", TAGALOG + W10, 0}, // Philippines |
michael@0 | 425 | {"fj", "fj", FIJIAN + W10, 0}, |
michael@0 | 426 | {"fo", "fo", FAROESE + W10, ICELANDIC - W4}, |
michael@0 | 427 | {"fr", "fr", FRENCH + W10, 0}, |
michael@0 | 428 | {"fra", "fr", FRENCH + W10, 0}, |
michael@0 | 429 | {"fre", "fr", FRENCH + W10, 0}, |
michael@0 | 430 | {"fy", "fy", FRISIAN + W10, 0}, |
michael@0 | 431 | |
michael@0 | 432 | {"ga", "ga,gl", IRISH + W10, GALICIAN + W10}, // 1:2 Irish, Galician |
michael@0 | 433 | {"gae", "gd,ga", SCOTS_GAELIC + W10, IRISH + W10}, // 1:2 Gaelic, either |
michael@0 | 434 | {"gal", "gl", GALICIAN + W10, 0}, |
michael@0 | 435 | {"gb", "zh", CHINESE + W10, 0}, // GB2312 encoding |
michael@0 | 436 | {"gbk", "zh", CHINESE + W10, 0}, // GBK encoding |
michael@0 | 437 | {"gd", "gd", SCOTS_GAELIC + W10, 0}, |
michael@0 | 438 | {"ge", "ka", GEORGIAN + W10, 0}, // Georgia |
michael@0 | 439 | {"geo", "ka", GEORGIAN + W10, 0}, |
michael@0 | 440 | {"ger", "de", GERMAN + W10, 0}, |
michael@0 | 441 | {"gl", "gl", GALICIAN + W10, 0}, // Also Greenland; hard to confuse |
michael@0 | 442 | {"gn", "gn", GUARANI + W10, 0}, |
michael@0 | 443 | {"gr", "el", GREEK + W10, 0}, // Greece |
michael@0 | 444 | {"gu", "gu", GUJARATI + W10, 0}, |
michael@0 | 445 | {"gv", "gv", MANX + W10, 0}, |
michael@0 | 446 | |
michael@0 | 447 | {"ha", "ha", HAUSA + W10, 0}, |
michael@0 | 448 | {"hat", "ht", HAITIAN_CREOLE + W10, 0}, // Haiti |
michael@0 | 449 | {"haw", "haw", HAWAIIAN + W10, 0}, |
michael@0 | 450 | {"hb", "iw", HEBREW + W10, 0}, |
michael@0 | 451 | {"he", "iw", HEBREW + W10, 0}, |
michael@0 | 452 | {"heb", "iw", HEBREW + W10, 0}, |
michael@0 | 453 | {"hi", "hi", HINDI + W10, MARATHI - W4}, |
michael@0 | 454 | {"hk", "zhT", CHINESE_T + W10, 0}, // Hong Kong |
michael@0 | 455 | {"hr", "hr", CROATIAN + W10, 0}, |
michael@0 | 456 | {"ht", "ht", HAITIAN_CREOLE + W10, 0}, |
michael@0 | 457 | {"hu", "hu", HUNGARIAN + W10, 0}, |
michael@0 | 458 | {"hun", "hu", HUNGARIAN + W10, 0}, |
michael@0 | 459 | {"hy", "hy", ARMENIAN + W10, 0}, |
michael@0 | 460 | |
michael@0 | 461 | {"ia", "ia", INTERLINGUA + W10, 0}, |
michael@0 | 462 | {"ice", "is", ICELANDIC + W10, FAROESE - W4}, // Iceland |
michael@0 | 463 | {"id", "id", INDONESIAN + W10, MALAY - W4}, |
michael@0 | 464 | {"ids", "id", INDONESIAN + W10, MALAY - W4}, |
michael@0 | 465 | {"ie", "ie", INTERLINGUE + W10, 0}, |
michael@0 | 466 | {"ig", "ig", IGBO + W10, 0}, |
michael@0 | 467 | // 1:2 iu-Cans ik-Latn |
michael@0 | 468 | {"ik", "ik,iu", INUPIAK + W10, INUKTITUT + W10}, // 1:2 |
michael@0 | 469 | {"in", "id", INDONESIAN + W10, MALAY - W4}, |
michael@0 | 470 | {"ind", "id", INDONESIAN + W10, MALAY - W4}, // Indonesia |
michael@0 | 471 | {"inu", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2 |
michael@0 | 472 | {"is", "is", ICELANDIC + W10, FAROESE - W4}, |
michael@0 | 473 | {"it", "it", ITALIAN + W10, 0}, |
michael@0 | 474 | {"ita", "it", ITALIAN + W10, 0}, |
michael@0 | 475 | {"iu", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2 |
michael@0 | 476 | {"iw", "iw", HEBREW + W10, 0}, |
michael@0 | 477 | |
michael@0 | 478 | {"ja", "ja", JAPANESE + W10, 0}, |
michael@0 | 479 | {"jp", "ja", JAPANESE + W10, 0}, // Japan |
michael@0 | 480 | {"jpn", "ja", JAPANESE + W10, 0}, |
michael@0 | 481 | {"jv", "jw", JAVANESE + W10, 0}, |
michael@0 | 482 | {"jw", "jw", JAVANESE + W10, 0}, |
michael@0 | 483 | |
michael@0 | 484 | {"ka", "ka", GEORGIAN + W10, 0}, |
michael@0 | 485 | {"kc", "qu", QUECHUA + W10, 0}, // (K)Quechua |
michael@0 | 486 | {"kg", "ky", KYRGYZ + W10, 0}, // Kyrgyzstan |
michael@0 | 487 | {"kh", "km", KHMER + W10, 0}, // Country code Khmer (Cambodia) |
michael@0 | 488 | {"kha", "kha", KHASI + W10, 0}, |
michael@0 | 489 | {"kk", "kk", KAZAKH + W10, 0}, // Kazakh |
michael@0 | 490 | {"kl", "kl", GREENLANDIC + W10, 0}, |
michael@0 | 491 | {"km", "km", KHMER + W10, 0}, |
michael@0 | 492 | {"kn", "kn", KANNADA + W10, 0}, |
michael@0 | 493 | {"ko", "ko", KOREAN + W10, 0}, |
michael@0 | 494 | {"kor", "ko", KOREAN + W10, 0}, |
michael@0 | 495 | {"kr", "ko", KOREAN + W10, 0}, // Country code Korea |
michael@0 | 496 | {"ks", "ks", KASHMIRI + W10, 0}, |
michael@0 | 497 | {"ksc", "ko", KOREAN + W10, 0}, // KSC encoding |
michael@0 | 498 | {"ku", "ku", KURDISH + W10, 0}, |
michael@0 | 499 | {"ky", "ky", KYRGYZ + W10, 0}, |
michael@0 | 500 | {"kz", "kk", KAZAKH + W10, 0}, // Kazakhstan |
michael@0 | 501 | {"la", "la", LATIN + W10, 0}, |
michael@0 | 502 | {"lao", "lo", LAOTHIAN + W10, 0}, // Laos |
michael@0 | 503 | |
michael@0 | 504 | {"lb", "lb", LUXEMBOURGISH + W10, 0}, |
michael@0 | 505 | {"lg", "lg", GANDA + W10, 0}, |
michael@0 | 506 | {"lit", "lt", LITHUANIAN + W10, 0}, |
michael@0 | 507 | {"ln", "ln", LINGALA + W10, 0}, |
michael@0 | 508 | {"lo", "lo", LAOTHIAN + W10, 0}, |
michael@0 | 509 | {"lt", "lt", LITHUANIAN + W10, 0}, |
michael@0 | 510 | {"ltu", "lt", LITHUANIAN + W10, 0}, |
michael@0 | 511 | {"lv", "lv", LATVIAN + W10, 0}, |
michael@0 | 512 | |
michael@0 | 513 | {"mfe", "mfe", MAURITIAN_CREOLE + W10, 0}, |
michael@0 | 514 | {"mg", "mg", MALAGASY + W10, 0}, |
michael@0 | 515 | {"mi", "mi", MAORI + W10, 0}, |
michael@0 | 516 | {"mk", "mk", MACEDONIAN + W10, 0}, |
michael@0 | 517 | {"ml", "ml", MALAYALAM + W10, 0}, |
michael@0 | 518 | {"mn", "mn", MONGOLIAN + W10, 0}, |
michael@0 | 519 | {"mo", "mo", ROMANIAN + W10, 0}, |
michael@0 | 520 | {"mon", "mn", MONGOLIAN + W10, 0}, // Mongolian |
michael@0 | 521 | {"mr", "mr", MARATHI + W10, HINDI - W4}, |
michael@0 | 522 | {"ms", "ms", MALAY + W10, INDONESIAN - W4}, |
michael@0 | 523 | {"mt", "mt", MALTESE + W10, 0}, |
michael@0 | 524 | {"mx", "es", SPANISH + W10, 0}, // Mexico |
michael@0 | 525 | {"my", "my,ms", BURMESE + W10, MALAY + W10}, // Myanmar, Malaysia |
michael@0 | 526 | |
michael@0 | 527 | {"na", "na", NAURU + W10, 0}, |
michael@0 | 528 | {"nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, |
michael@0 | 529 | {"ne", "ne", NEPALI + W10, 0}, |
michael@0 | 530 | {"nl", "nl", DUTCH + W10, 0}, |
michael@0 | 531 | {"nn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4}, |
michael@0 | 532 | {"no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4}, |
michael@0 | 533 | {"nr", "nr", NDEBELE + W10, 0}, |
michael@0 | 534 | {"nso", "nso", PEDI + W10, 0}, |
michael@0 | 535 | {"ny", "ny", NYANJA + W10, 0}, |
michael@0 | 536 | |
michael@0 | 537 | {"oc", "oc", OCCITAN + W10, 0}, |
michael@0 | 538 | {"om", "om", OROMO + W10, 0}, |
michael@0 | 539 | {"or", "or", ORIYA + W10, 0}, |
michael@0 | 540 | |
michael@0 | 541 | {"pa", "pa,ps", PUNJABI + W10, PASHTO + W10}, // 1:2 pa-Guru ps-Arab |
michael@0 | 542 | {"per", "fa", PERSIAN + W10, 0}, |
michael@0 | 543 | {"ph", "tl", TAGALOG + W10, 0}, // Philippines |
michael@0 | 544 | {"pk", "ur", URDU + W10, 0}, // Pakistan |
michael@0 | 545 | {"pl", "pl", POLISH + W10, 0}, |
michael@0 | 546 | {"pnb", "pa", PUNJABI + W10, 0}, // Western Punjabi |
michael@0 | 547 | {"pol", "pl", POLISH + W10, 0}, |
michael@0 | 548 | {"por", "pt", PORTUGUESE + W10, 0}, |
michael@0 | 549 | {"ps", "ps", PASHTO + W10, 0}, |
michael@0 | 550 | {"pt", "pt", PORTUGUESE + W10, 0}, |
michael@0 | 551 | {"ptg", "pt", PORTUGUESE + W10, 0}, |
michael@0 | 552 | {"qc", "fr", FRENCH + W10, 0}, // Quebec "country" code |
michael@0 | 553 | {"qu", "qu", QUECHUA + W10, 0}, |
michael@0 | 554 | |
michael@0 | 555 | {"rm", "rm", RHAETO_ROMANCE + W10, 0}, |
michael@0 | 556 | {"rn", "rn", RUNDI + W10, 0}, |
michael@0 | 557 | {"ro", "ro", ROMANIAN + W10, 0}, |
michael@0 | 558 | {"rs", "sr", SERBIAN + W10, 0}, // Serbia country code |
michael@0 | 559 | {"ru", "ru", RUSSIAN + W10, 0}, |
michael@0 | 560 | {"rus", "ru", RUSSIAN + W10, 0}, |
michael@0 | 561 | {"rw", "rw", KINYARWANDA + W10, 0}, |
michael@0 | 562 | |
michael@0 | 563 | {"sa", "sa", SANSKRIT + W10, 0}, |
michael@0 | 564 | {"sco", "sco", SCOTS + W10, ENGLISH - W4}, |
michael@0 | 565 | {"sd", "sd", SINDHI + W10, 0}, |
michael@0 | 566 | {"se", "sv", SWEDISH + W10, 0}, |
michael@0 | 567 | {"sg", "sg", SANGO + W10, 0}, |
michael@0 | 568 | {"si", "si,sl", SINHALESE + W10, SLOVENIAN + W10}, // 1:2 Sinhalese, Slovinia |
michael@0 | 569 | {"sk", "sk", SLOVAK + W10, CZECH - W4}, |
michael@0 | 570 | {"sl", "sl", SLOVENIAN + W10, 0}, |
michael@0 | 571 | {"slo", "sl", SLOVENIAN + W10, 0}, |
michael@0 | 572 | {"sm", "sm", SAMOAN + W10, 0}, |
michael@0 | 573 | {"sn", "sn", SHONA + W10, 0}, |
michael@0 | 574 | {"so", "so", SOMALI + W10, 0}, |
michael@0 | 575 | {"sp", "es", SPANISH + W10, 0}, |
michael@0 | 576 | {"sq", "sq", ALBANIAN + W10, 0}, |
michael@0 | 577 | {"sr", "sr", SERBIAN + W10, 0}, |
michael@0 | 578 | {"srb", "sr", SERBIAN + W10, 0}, |
michael@0 | 579 | {"srl", "sr", SERBIAN + W10, 0}, // Serbian Latin |
michael@0 | 580 | {"srp", "sr", SERBIAN + W10, 0}, |
michael@0 | 581 | {"ss", "ss", SISWANT + W10, 0}, |
michael@0 | 582 | {"st", "st", SESOTHO + W10, 0}, |
michael@0 | 583 | {"su", "su", SUNDANESE + W10, 0}, |
michael@0 | 584 | {"sv", "sv", SWEDISH + W10, 0}, |
michael@0 | 585 | {"sve", "sv", SWEDISH + W10, 0}, |
michael@0 | 586 | {"sw", "sw", SWAHILI + W10, 0}, |
michael@0 | 587 | {"swe", "sv", SWEDISH + W10, 0}, |
michael@0 | 588 | {"sy", "syr", SYRIAC + W10, 0}, |
michael@0 | 589 | {"syr", "syr", SYRIAC + W10, 0}, |
michael@0 | 590 | |
michael@0 | 591 | {"ta", "ta", TAMIL + W10, 0}, |
michael@0 | 592 | {"te", "te", TELUGU + W10, 0}, |
michael@0 | 593 | {"tg", "tg", TAJIK + W10, 0}, |
michael@0 | 594 | {"th", "th", THAI + W10, 0}, |
michael@0 | 595 | {"ti", "ti,bo", TIGRINYA + W10, TIBETAN + W10}, // 1:2 Tigrinya, Tibet |
michael@0 | 596 | {"tj", "tg", TAJIK + W10, 0}, // Tajikistan |
michael@0 | 597 | {"tk", "tk", TURKMEN + W10, 0}, |
michael@0 | 598 | {"tl", "tl", TAGALOG + W10, 0}, |
michael@0 | 599 | {"tlh", "tlh", X_KLINGON + W10, 0}, |
michael@0 | 600 | {"tn", "tn", TSWANA + W10, 0}, |
michael@0 | 601 | {"to", "to", TONGA + W10, 0}, |
michael@0 | 602 | {"tr", "tr", TURKISH + W10, 0}, |
michael@0 | 603 | {"ts", "ts", TSONGA + W10, 0}, |
michael@0 | 604 | {"tt", "tt", TATAR + W10, 0}, |
michael@0 | 605 | {"tw", "ak,zhT", AKAN + W10, CHINESE_T + W10}, // 1:2 Twi => Akan, Taiwan |
michael@0 | 606 | {"twi", "ak", AKAN + W10, 0}, // Twi => Akan |
michael@0 | 607 | |
michael@0 | 608 | {"ua", "uk", UKRAINIAN + W10, 0}, // Ukraine |
michael@0 | 609 | {"ug", "ug", UIGHUR + W10, 0}, |
michael@0 | 610 | {"uk", "uk", UKRAINIAN + W10, 0}, |
michael@0 | 611 | {"ur", "ur", URDU + W10, 0}, |
michael@0 | 612 | {"uz", "uz", UZBEK + W10, 0}, |
michael@0 | 613 | |
michael@0 | 614 | {"va", "ca", CATALAN + W10, 0}, // Valencia => Catalan |
michael@0 | 615 | {"val", "ca", CATALAN + W10, 0}, // Valencia => Catalan |
michael@0 | 616 | {"ve", "ve", VENDA + W10, 0}, |
michael@0 | 617 | {"vi", "vi", VIETNAMESE + W10, 0}, |
michael@0 | 618 | {"vie", "vi", VIETNAMESE + W10, 0}, |
michael@0 | 619 | {"vn", "vi", VIETNAMESE + W10, 0}, |
michael@0 | 620 | {"vo", "vo", VOLAPUK + W10, 0}, |
michael@0 | 621 | |
michael@0 | 622 | {"wo", "wo", WOLOF + W10, 0}, |
michael@0 | 623 | |
michael@0 | 624 | {"xh", "xh", XHOSA + W10, ZULU - W4}, |
michael@0 | 625 | {"xho", "xh", XHOSA + W10, ZULU - W4}, |
michael@0 | 626 | |
michael@0 | 627 | {"yi", "yi", YIDDISH + W10, 0}, |
michael@0 | 628 | {"yo", "yo", YORUBA + W10, 0}, |
michael@0 | 629 | |
michael@0 | 630 | {"za", "za", ZHUANG + W10, 0}, |
michael@0 | 631 | {"zh", "zh", CHINESE + W10, 0}, |
michael@0 | 632 | {"zht", "zhT", CHINESE_T + W10, 0}, |
michael@0 | 633 | {"zu", "zu", ZULU + W10, XHOSA - W4}, |
michael@0 | 634 | }; |
michael@0 | 635 | |
michael@0 | 636 | |
michael@0 | 637 | // Possibly map to tl: |
michael@0 | 638 | // -LangTags tl-Latn /7val.com/ ,bcl 2 Central Bicolano |
michael@0 | 639 | // -LangTags tl-Latn /7val.com/ ,ceb 6 Cebuano |
michael@0 | 640 | // -LangTags tl-Latn /7val.com/ ,war 1 Waray |
michael@0 | 641 | |
michael@0 | 642 | |
michael@0 | 643 | |
michael@0 | 644 | // Table to look up country TLD (no general TLD) |
michael@0 | 645 | // In alphabetical order for binary search |
michael@0 | 646 | static const int kCLDTable3Size = 181; |
michael@0 | 647 | static const TLDLookup kCLDTLDHintTable[kCLDTable3Size] = { |
michael@0 | 648 | {"ac", JAPANESE + W2, 0}, |
michael@0 | 649 | {"ad", CATALAN + W4, 0}, |
michael@0 | 650 | {"ae", ARABIC + W4, 0}, |
michael@0 | 651 | {"af", PASHTO + W4, PERSIAN + W4}, |
michael@0 | 652 | {"ag", GERMAN + W2, 0}, // meager |
michael@0 | 653 | // {"ai", 0, 0}, // meager |
michael@0 | 654 | {"al", ALBANIAN + W4, 0}, |
michael@0 | 655 | {"am", ARMENIAN + W4, 0}, |
michael@0 | 656 | {"an", DUTCH + W4, 0}, // meager |
michael@0 | 657 | {"ao", PORTUGUESE + W4, 0}, |
michael@0 | 658 | // {"aq", 0, 0}, // meager |
michael@0 | 659 | {"ar", SPANISH + W4, 0}, |
michael@0 | 660 | // {"as", 0, 0}, |
michael@0 | 661 | {"at", GERMAN + W4, 0}, |
michael@0 | 662 | {"au", ENGLISH + W2, 0}, |
michael@0 | 663 | {"aw", DUTCH + W4, 0}, |
michael@0 | 664 | {"ax", SWEDISH + W4, 0}, |
michael@0 | 665 | {"az", AZERBAIJANI + W4, 0}, |
michael@0 | 666 | |
michael@0 | 667 | {"ba", BOSNIAN + W8, CROATIAN - W4}, |
michael@0 | 668 | // {"bb", 0, 0}, |
michael@0 | 669 | {"bd", BENGALI + W4, 0}, |
michael@0 | 670 | {"be", DUTCH + W4, FRENCH + W4}, |
michael@0 | 671 | {"bf", FRENCH + W4, 0}, |
michael@0 | 672 | {"bg", BULGARIAN + W4, 0}, |
michael@0 | 673 | {"bh", ARABIC + W4, 0}, |
michael@0 | 674 | {"bi", RUNDI + W4, FRENCH + W4}, |
michael@0 | 675 | {"bj", FRENCH + W4, 0}, |
michael@0 | 676 | {"bm", ENGLISH + W2, 0}, |
michael@0 | 677 | {"bn", MALAY + W4, INDONESIAN - W4}, |
michael@0 | 678 | {"bo", SPANISH + W4, AYMARA + W2}, // and GUARANI QUECHUA |
michael@0 | 679 | {"br", PORTUGUESE + W4, 0}, |
michael@0 | 680 | // {"bs", 0, 0}, |
michael@0 | 681 | {"bt", DZONGKHA + W10, TIBETAN - W10}, // Strong presumption of Dzongha |
michael@0 | 682 | {"bw", TSWANA + W4, 0}, |
michael@0 | 683 | {"by", BELARUSIAN + W4, 0}, |
michael@0 | 684 | // {"bz", 0, 0}, |
michael@0 | 685 | |
michael@0 | 686 | {"ca", FRENCH + W4, ENGLISH + W2}, |
michael@0 | 687 | {"cat", CATALAN + W4, 0}, |
michael@0 | 688 | {"cc", 0, 0}, |
michael@0 | 689 | {"cd", FRENCH + W4, 0}, |
michael@0 | 690 | {"cf", FRENCH + W4, 0}, |
michael@0 | 691 | {"cg", FRENCH + W4, 0}, |
michael@0 | 692 | {"ch", GERMAN + W4, FRENCH + W4}, |
michael@0 | 693 | {"ci", FRENCH + W4, 0}, |
michael@0 | 694 | // {"ck", 0, 0}, |
michael@0 | 695 | {"cl", SPANISH + W4, 0}, |
michael@0 | 696 | {"cm", FRENCH + W4, 0}, |
michael@0 | 697 | {"cn", CHINESE + W4, 0}, |
michael@0 | 698 | {"co", SPANISH + W4, 0}, |
michael@0 | 699 | {"cr", SPANISH + W4, 0}, |
michael@0 | 700 | {"cu", SPANISH + W4, 0}, |
michael@0 | 701 | {"cv", PORTUGUESE + W4, 0}, |
michael@0 | 702 | // {"cx", 0, 0}, |
michael@0 | 703 | {"cy", GREEK + W4, TURKISH + W4}, |
michael@0 | 704 | {"cz", CZECH + W4, SLOVAK - W4}, |
michael@0 | 705 | |
michael@0 | 706 | {"de", GERMAN + W4, 0}, |
michael@0 | 707 | {"dj", 0, 0}, |
michael@0 | 708 | {"dk", DANISH + W4, NORWEGIAN - W4}, |
michael@0 | 709 | {"dm", 0, 0}, |
michael@0 | 710 | {"do", SPANISH + W4, 0}, |
michael@0 | 711 | {"dz", FRENCH + W4, ARABIC + W4}, |
michael@0 | 712 | |
michael@0 | 713 | {"ec", SPANISH + W4, 0}, |
michael@0 | 714 | {"ee", ESTONIAN + W4, 0}, |
michael@0 | 715 | {"eg", ARABIC + W4, 0}, |
michael@0 | 716 | {"er", AFAR + W4, 0}, |
michael@0 | 717 | {"es", SPANISH + W4, 0}, |
michael@0 | 718 | {"et", AMHARIC + W4, AFAR + W4}, |
michael@0 | 719 | |
michael@0 | 720 | {"fi", FINNISH + W4, 0}, |
michael@0 | 721 | {"fj", FIJIAN + W4, 0}, |
michael@0 | 722 | // {"fk", 0, 0}, |
michael@0 | 723 | // {"fm", 0, 0}, |
michael@0 | 724 | {"fo", FAROESE + W4, ICELANDIC - W4}, |
michael@0 | 725 | {"fr", FRENCH + W4, 0}, |
michael@0 | 726 | |
michael@0 | 727 | {"ga", FRENCH + W4, 0}, |
michael@0 | 728 | {"gd", 0, 0}, |
michael@0 | 729 | {"ge", GEORGIAN + W4, 0}, |
michael@0 | 730 | {"gf", FRENCH + W4, 0}, |
michael@0 | 731 | // {"gg", 0, 0}, |
michael@0 | 732 | // {"gh", 0, 0}, |
michael@0 | 733 | // {"gi", 0, 0}, |
michael@0 | 734 | {"gl", GREENLANDIC + W4, DANISH + W4}, |
michael@0 | 735 | // {"gm", 0, 0}, |
michael@0 | 736 | {"gn", FRENCH + W4, 0}, |
michael@0 | 737 | // {"gp", 0, 0}, |
michael@0 | 738 | // {"gq", 0, 0}, |
michael@0 | 739 | {"gr", GREEK + W4, 0}, |
michael@0 | 740 | // {"gs", 0, 0}, |
michael@0 | 741 | {"gt", SPANISH + W4, 0}, |
michael@0 | 742 | // {"gu", 0, 0}, |
michael@0 | 743 | // {"gy", 0, 0}, |
michael@0 | 744 | |
michael@0 | 745 | {"hk", CHINESE_T + W4, 0}, |
michael@0 | 746 | // {"hm", 0, 0}, |
michael@0 | 747 | {"hn", SPANISH + W4, 0}, |
michael@0 | 748 | {"hr", CROATIAN + W8, BOSNIAN - W4}, |
michael@0 | 749 | {"ht", HAITIAN_CREOLE + W4, FRENCH + W4}, |
michael@0 | 750 | {"hu", HUNGARIAN + W4, 0}, |
michael@0 | 751 | |
michael@0 | 752 | {"id", INDONESIAN + W4, MALAY - W4}, |
michael@0 | 753 | {"ie", IRISH + W4, 0}, |
michael@0 | 754 | {"il", HEBREW + W4, 0}, |
michael@0 | 755 | {"im", MANX + W4, 0}, |
michael@0 | 756 | // {"in", 0, 0}, |
michael@0 | 757 | // {"io", 0, 0}, |
michael@0 | 758 | {"iq", ARABIC + W4, 0}, |
michael@0 | 759 | {"ir", PERSIAN + W4, 0}, |
michael@0 | 760 | {"is", ICELANDIC + W4, FAROESE - W4}, |
michael@0 | 761 | {"it", ITALIAN + W4, 0}, |
michael@0 | 762 | |
michael@0 | 763 | // {"je", 0, 0}, |
michael@0 | 764 | // {"jm", 0, 0}, |
michael@0 | 765 | {"jo", ARABIC + W4, 0}, |
michael@0 | 766 | {"jp", JAPANESE + W4, 0}, |
michael@0 | 767 | |
michael@0 | 768 | // {"ke", 0, 0}, |
michael@0 | 769 | {"kg", KYRGYZ + W4, 0}, |
michael@0 | 770 | {"kh", KHMER + W4, 0}, |
michael@0 | 771 | // {"ki", 0, 0}, |
michael@0 | 772 | {"km", FRENCH + W4, 0}, |
michael@0 | 773 | // {"kn", 0, 0}, |
michael@0 | 774 | {"kp", KOREAN + W4, 0}, |
michael@0 | 775 | {"kr", KOREAN + W4, 0}, |
michael@0 | 776 | {"kw", ARABIC + W4, 0}, |
michael@0 | 777 | // {"ky", 0, 0}, |
michael@0 | 778 | {"kz", KAZAKH + W4, 0}, |
michael@0 | 779 | |
michael@0 | 780 | {"la", LAOTHIAN + W4, 0}, |
michael@0 | 781 | {"lb", ARABIC + W4, FRENCH + W4}, |
michael@0 | 782 | // {"lc", 0, 0}, |
michael@0 | 783 | {"li", GERMAN + W4, 0}, |
michael@0 | 784 | {"lk", SINHALESE + W4, 0}, |
michael@0 | 785 | // {"lr", 0, 0}, |
michael@0 | 786 | {"ls", SESOTHO + W4, 0}, |
michael@0 | 787 | {"lt", LITHUANIAN + W4, 0}, |
michael@0 | 788 | {"lu", LUXEMBOURGISH + W4}, |
michael@0 | 789 | {"lv", LATVIAN + W4, 0}, |
michael@0 | 790 | {"ly", ARABIC + W4, 0}, |
michael@0 | 791 | |
michael@0 | 792 | {"ma", FRENCH + W4, 0}, |
michael@0 | 793 | {"mc", FRENCH + W4, 0}, |
michael@0 | 794 | {"md", ROMANIAN + W4, 0}, |
michael@0 | 795 | {"me", MONTENEGRIN + W8, SERBIAN - W4}, |
michael@0 | 796 | {"mg", FRENCH + W4, 0}, |
michael@0 | 797 | {"mk", MACEDONIAN + W4, 0}, |
michael@0 | 798 | {"ml", FRENCH + W4, 0}, |
michael@0 | 799 | {"mm", BURMESE + W4, 0}, |
michael@0 | 800 | {"mn", MONGOLIAN + W4, 0}, |
michael@0 | 801 | {"mo", CHINESE_T + W4, PORTUGUESE + W4}, |
michael@0 | 802 | // {"mp", 0, 0}, |
michael@0 | 803 | {"mq", FRENCH + W4, 0}, |
michael@0 | 804 | {"mr", FRENCH + W4, ARABIC + W4}, |
michael@0 | 805 | // {"ms", 0, 0}, |
michael@0 | 806 | {"mt", MALTESE + W4, 0}, |
michael@0 | 807 | // {"mu", 0, 0}, |
michael@0 | 808 | {"mv", DHIVEHI + W4, 0}, |
michael@0 | 809 | // {"mw", 0, 0}, |
michael@0 | 810 | {"mx", SPANISH + W4, 0}, |
michael@0 | 811 | {"my", MALAY + W4, INDONESIAN - W4}, |
michael@0 | 812 | {"mz", PORTUGUESE + W4, 0}, |
michael@0 | 813 | |
michael@0 | 814 | {"na", 0, 0}, // Namibia |
michael@0 | 815 | {"nc", FRENCH + W4, 0}, |
michael@0 | 816 | {"ne", FRENCH + W4, 0}, |
michael@0 | 817 | {"nf", FRENCH + W4, 0}, |
michael@0 | 818 | // {"ng", 0, 0}, |
michael@0 | 819 | {"ni", SPANISH + W4, 0}, |
michael@0 | 820 | {"nl", DUTCH + W4, 0}, |
michael@0 | 821 | {"no", NORWEGIAN + W4, NORWEGIAN_N + W2}, |
michael@0 | 822 | {"np", NEPALI + W4, 0}, |
michael@0 | 823 | {"nr", NAURU + W4, 0}, |
michael@0 | 824 | {"nu", SWEDISH + W4, 0}, |
michael@0 | 825 | {"nz", MAORI + W4, ENGLISH + W2}, |
michael@0 | 826 | |
michael@0 | 827 | {"om", ARABIC + W4, 0}, |
michael@0 | 828 | |
michael@0 | 829 | {"pa", SPANISH + W4, 0}, |
michael@0 | 830 | {"pe", SPANISH + W4, QUECHUA + W2}, // also AYMARA |
michael@0 | 831 | {"pf", FRENCH + W4, 0}, |
michael@0 | 832 | // {"pg", 0, 0}, |
michael@0 | 833 | {"ph", TAGALOG + W4, 0}, |
michael@0 | 834 | {"pk", URDU + W4, 0}, |
michael@0 | 835 | {"pl", POLISH + W4, 0}, |
michael@0 | 836 | // {"pn", 0, 0}, |
michael@0 | 837 | {"pr", SPANISH + W4, 0}, |
michael@0 | 838 | {"ps", ARABIC + W4, 0}, |
michael@0 | 839 | {"pt", PORTUGUESE + W4, 0}, |
michael@0 | 840 | {"py", SPANISH + W4, GUARANI + W2}, |
michael@0 | 841 | |
michael@0 | 842 | {"qa", ARABIC + W4, 0}, |
michael@0 | 843 | |
michael@0 | 844 | {"re", FRENCH + W4, 0}, |
michael@0 | 845 | {"ro", ROMANIAN + W4, 0}, |
michael@0 | 846 | {"rs", SERBIAN + W8, MONTENEGRIN - W4}, |
michael@0 | 847 | {"ru", RUSSIAN + W4, 0}, |
michael@0 | 848 | {"rw", KINYARWANDA + W4, FRENCH + W2}, |
michael@0 | 849 | |
michael@0 | 850 | {"sa", ARABIC + W4, 0}, |
michael@0 | 851 | // {"sb", 0, 0}, |
michael@0 | 852 | {"sc", SESELWA + W4, 0}, |
michael@0 | 853 | {"sd", ARABIC + W4, 0}, |
michael@0 | 854 | {"se", SWEDISH + W4, 0}, |
michael@0 | 855 | // {"sg", 0, 0}, |
michael@0 | 856 | // {"sh", 0, 0}, |
michael@0 | 857 | {"si", SLOVENIAN + W4, 0}, |
michael@0 | 858 | {"sk", SLOVAK + W4, CZECH - W4}, |
michael@0 | 859 | // {"sl", 0, 0}, |
michael@0 | 860 | {"sm", ITALIAN + W4, 0}, |
michael@0 | 861 | {"sn", FRENCH + W4, 0}, |
michael@0 | 862 | // {"sr", 0, 0}, |
michael@0 | 863 | {"ss", ARABIC + W4, 0}, // Presumed South Sudan TLD. dsites 2011.07.07 |
michael@0 | 864 | // {"st", 0, 0}, |
michael@0 | 865 | {"su", RUSSIAN + W4, 0}, |
michael@0 | 866 | {"sv", SPANISH + W4, 0}, |
michael@0 | 867 | {"sy", ARABIC + W4, 0}, |
michael@0 | 868 | // {"sz", 0, 0}, |
michael@0 | 869 | |
michael@0 | 870 | // {"tc", 0, 0}, |
michael@0 | 871 | {"td", FRENCH + W4, 0}, |
michael@0 | 872 | // {"tf", 0, 0}, |
michael@0 | 873 | {"tg", FRENCH + W4, 0}, |
michael@0 | 874 | {"th", THAI + W4, 0}, |
michael@0 | 875 | // Tibet has no country code (see .cn) |
michael@0 | 876 | {"tj", TAJIK + W4, 0}, |
michael@0 | 877 | // {"tk", 0, 0}, |
michael@0 | 878 | // {"tl", 0, 0}, |
michael@0 | 879 | {"tm", TURKISH + W4, 0}, |
michael@0 | 880 | {"tn", FRENCH + W4, ARABIC + W4}, |
michael@0 | 881 | // {"to", 0, 0}, |
michael@0 | 882 | {"tp", JAPANESE + W4, 0}, |
michael@0 | 883 | {"tr", TURKISH + W4, 0}, |
michael@0 | 884 | // {"tt", 0, 0}, |
michael@0 | 885 | // {"tv", 0, 0}, |
michael@0 | 886 | {"tw", CHINESE_T + W4, 0}, |
michael@0 | 887 | {"tz", SWAHILI + W4, AKAN + W4}, |
michael@0 | 888 | |
michael@0 | 889 | {"ua", UKRAINIAN + W4, 0}, |
michael@0 | 890 | {"ug", GANDA + W4, 0}, |
michael@0 | 891 | {"uk", ENGLISH + W2, 0}, |
michael@0 | 892 | {"us", ENGLISH + W2, 0}, |
michael@0 | 893 | {"uy", SPANISH + W4, 0}, |
michael@0 | 894 | {"uz", UZBEK + W4, 0}, |
michael@0 | 895 | |
michael@0 | 896 | {"va", ITALIAN + W4, LATIN + W2}, |
michael@0 | 897 | // {"vc", 0, 0}, |
michael@0 | 898 | {"ve", SPANISH + W4, 0}, |
michael@0 | 899 | // {"vg", 0, 0}, |
michael@0 | 900 | // {"vi", 0, 0}, |
michael@0 | 901 | {"vn", VIETNAMESE + W4, 0}, |
michael@0 | 902 | // {"vu", 0, 0}, |
michael@0 | 903 | |
michael@0 | 904 | {"wf", FRENCH + W4, 0}, |
michael@0 | 905 | // {"ws", 0, 0}, |
michael@0 | 906 | |
michael@0 | 907 | {"ye", ARABIC + W4, 0}, |
michael@0 | 908 | |
michael@0 | 909 | {"za", AFRIKAANS + W4, 0}, |
michael@0 | 910 | // {"zm", 0, 0}, |
michael@0 | 911 | // {"zw", 0, 0}, |
michael@0 | 912 | }; |
michael@0 | 913 | |
michael@0 | 914 | #undef W2 |
michael@0 | 915 | #undef W4 |
michael@0 | 916 | #undef W6 |
michael@0 | 917 | #undef W8 |
michael@0 | 918 | #undef W10 |
michael@0 | 919 | #undef W12 |
michael@0 | 920 | |
michael@0 | 921 | |
michael@0 | 922 | |
michael@0 | 923 | |
michael@0 | 924 | |
michael@0 | 925 | inline void SetCLDPriorWeight(int w, OneCLDLangPrior* olp) { |
michael@0 | 926 | *olp = (*olp & 0x3ff) + (w << 10); |
michael@0 | 927 | } |
michael@0 | 928 | inline void SetCLDPriorLang(Language lang, OneCLDLangPrior* olp) { |
michael@0 | 929 | *olp = (*olp & ~0x3ff) + lang; |
michael@0 | 930 | } |
michael@0 | 931 | |
michael@0 | 932 | OneCLDLangPrior PackCLDPriorLangWeight(Language lang, int w) { |
michael@0 | 933 | return (w << 10) + lang; |
michael@0 | 934 | } |
michael@0 | 935 | |
// Return the larger of a and b (a when they are equal).
inline int MaxInt(int a, int b) {
  if (a < b) {return b;}
  return a;
}
michael@0 | 939 | |
michael@0 | 940 | // Merge in another language prior, taking max if already there |
michael@0 | 941 | void MergeCLDLangPriorsMax(OneCLDLangPrior olp, CLDLangPriors* lps) { |
michael@0 | 942 | if (olp == 0) {return;} |
michael@0 | 943 | Language target_lang = GetCLDPriorLang(olp); |
michael@0 | 944 | for (int i = 0; i < lps->n; ++i) { |
michael@0 | 945 | if (GetCLDPriorLang(lps->prior[i]) == target_lang) { |
michael@0 | 946 | int new_weight = MaxInt(GetCLDPriorWeight(lps->prior[i]), |
michael@0 | 947 | GetCLDPriorWeight(olp)); |
michael@0 | 948 | SetCLDPriorWeight(new_weight, &lps->prior[i]); |
michael@0 | 949 | return; |
michael@0 | 950 | } |
michael@0 | 951 | } |
michael@0 | 952 | // Not found; add it if room |
michael@0 | 953 | if (lps->n >= kMaxOneCLDLangPrior) {return;} |
michael@0 | 954 | lps->prior[lps->n++] = olp; |
michael@0 | 955 | } |
michael@0 | 956 | |
michael@0 | 957 | // Merge in another language prior, boosting 10x if already there |
michael@0 | 958 | void MergeCLDLangPriorsBoost(OneCLDLangPrior olp, CLDLangPriors* lps) { |
michael@0 | 959 | if (olp == 0) {return;} |
michael@0 | 960 | Language target_lang = GetCLDPriorLang(olp); |
michael@0 | 961 | for (int i = 0; i < lps->n; ++i) { |
michael@0 | 962 | if (GetCLDPriorLang(lps->prior[i]) == target_lang) { |
michael@0 | 963 | int new_weight = GetCLDPriorWeight(lps->prior[i]) + 2; |
michael@0 | 964 | SetCLDPriorWeight(new_weight, &lps->prior[i]); |
michael@0 | 965 | return; |
michael@0 | 966 | } |
michael@0 | 967 | } |
michael@0 | 968 | // Not found; add it if room |
michael@0 | 969 | if (lps->n >= kMaxOneCLDLangPrior) {return;} |
michael@0 | 970 | lps->prior[lps->n++] = olp; |
michael@0 | 971 | } |
michael@0 | 972 | |
michael@0 | 973 | |
michael@0 | 974 | // Trim language priors to no more than max_entries, keeping largest abs weights |
michael@0 | 975 | void TrimCLDLangPriors(int max_entries, CLDLangPriors* lps) { |
michael@0 | 976 | if (lps->n <= max_entries) {return;} |
michael@0 | 977 | |
michael@0 | 978 | // Insertion sort in-place by abs(weight) |
michael@0 | 979 | for (int i = 0; i < lps->n; ++i) { |
michael@0 | 980 | OneCLDLangPrior temp_olp = lps->prior[i]; |
michael@0 | 981 | int w = abs(GetCLDPriorWeight(temp_olp)); |
michael@0 | 982 | int kk = i; |
michael@0 | 983 | for (; kk > 0; --kk) { |
michael@0 | 984 | if (abs(GetCLDPriorWeight(lps->prior[kk - 1])) < w) { |
michael@0 | 985 | // Move down and continue |
michael@0 | 986 | lps->prior[kk] = lps->prior[kk - 1]; |
michael@0 | 987 | } else { |
michael@0 | 988 | // abs(weight[kk - 1]) >= w, time to stop |
michael@0 | 989 | break; |
michael@0 | 990 | } |
michael@0 | 991 | } |
michael@0 | 992 | lps->prior[kk] = temp_olp; |
michael@0 | 993 | } |
michael@0 | 994 | |
michael@0 | 995 | lps->n = max_entries; |
michael@0 | 996 | } |
michael@0 | 997 | |
// Return how many ',' characters langtags contains.
int CountCommas(const string& langtags) {
  int total = 0;
  for (string::const_iterator it = langtags.begin();
       it != langtags.end(); ++it) {
    total += (*it == ',') ? 1 : 0;
  }
  return total;
}
michael@0 | 1005 | |
michael@0 | 1006 | // Binary lookup on language tag |
michael@0 | 1007 | const LangTagLookup* DoLangTagLookup(const char* key, |
michael@0 | 1008 | const LangTagLookup* tbl, int tbl_size) { |
michael@0 | 1009 | // Key is always in range [lo..hi) |
michael@0 | 1010 | int lo = 0; |
michael@0 | 1011 | int hi = tbl_size; |
michael@0 | 1012 | while (lo < hi) { |
michael@0 | 1013 | int mid = (lo + hi) >> 1; |
michael@0 | 1014 | int comp = strcmp(tbl[mid].langtag, key); |
michael@0 | 1015 | if (comp < 0) { |
michael@0 | 1016 | lo = mid + 1; |
michael@0 | 1017 | } else if (comp > 0) { |
michael@0 | 1018 | hi = mid; |
michael@0 | 1019 | } else { |
michael@0 | 1020 | return &tbl[mid]; |
michael@0 | 1021 | } |
michael@0 | 1022 | } |
michael@0 | 1023 | return NULL; |
michael@0 | 1024 | } |
michael@0 | 1025 | |
michael@0 | 1026 | // Binary lookup on tld |
michael@0 | 1027 | const TLDLookup* DoTLDLookup(const char* key, |
michael@0 | 1028 | const TLDLookup* tbl, int tbl_size) { |
michael@0 | 1029 | // Key is always in range [lo..hi) |
michael@0 | 1030 | int lo = 0; |
michael@0 | 1031 | int hi = tbl_size; |
michael@0 | 1032 | while (lo < hi) { |
michael@0 | 1033 | int mid = (lo + hi) >> 1; |
michael@0 | 1034 | int comp = strcmp(tbl[mid].tld, key); |
michael@0 | 1035 | if (comp < 0) { |
michael@0 | 1036 | lo = mid + 1; |
michael@0 | 1037 | } else if (comp > 0) { |
michael@0 | 1038 | hi = mid; |
michael@0 | 1039 | } else { |
michael@0 | 1040 | return &tbl[mid]; |
michael@0 | 1041 | } |
michael@0 | 1042 | } |
michael@0 | 1043 | return NULL; |
michael@0 | 1044 | } |
michael@0 | 1045 | |
michael@0 | 1046 | |
michael@0 | 1047 | |
michael@0 | 1048 | // Trim language tag string to canonical form for each language |
michael@0 | 1049 | // Input is from GetLangTagsFromHtml(), already lowercased |
michael@0 | 1050 | string TrimCLDLangTagsHint(const string& langtags) { |
michael@0 | 1051 | string retval; |
michael@0 | 1052 | if (langtags.empty()) {return retval;} |
michael@0 | 1053 | int commas = CountCommas(langtags); |
michael@0 | 1054 | if (commas > 4) {return retval;} // Ignore if too many language tags |
michael@0 | 1055 | |
michael@0 | 1056 | char temp[20]; |
michael@0 | 1057 | int pos = 0; |
michael@0 | 1058 | while (pos < static_cast<int>(langtags.size())) { |
michael@0 | 1059 | int comma = langtags.find(',', pos); |
michael@0 | 1060 | if (comma == string::npos) {comma = langtags.size();} // fake trailing comma |
michael@0 | 1061 | int len = comma - pos; |
michael@0 | 1062 | if (len <= 16) { |
michael@0 | 1063 | // Short enough to use |
michael@0 | 1064 | memcpy(temp, &langtags[pos], len); |
michael@0 | 1065 | temp[len] = '\0'; |
michael@0 | 1066 | const LangTagLookup* entry = DoLangTagLookup(temp, |
michael@0 | 1067 | kCLDLangTagsHintTable1, |
michael@0 | 1068 | kCLDTable1Size); |
michael@0 | 1069 | if (entry != NULL) { |
michael@0 | 1070 | // First table hit |
michael@0 | 1071 | retval.append(entry->langcode); // may be "code1,code2" |
michael@0 | 1072 | retval.append(1, ','); |
michael@0 | 1073 | } else { |
michael@0 | 1074 | // Try second table with language code truncated at first hyphen |
michael@0 | 1075 | char* hyphen = strchr(temp, '-'); |
michael@0 | 1076 | if (hyphen != NULL) {*hyphen = '\0';} |
michael@0 | 1077 | len = strlen(temp); |
michael@0 | 1078 | if (len <= 3) { // Short enough to use |
michael@0 | 1079 | entry = DoLangTagLookup(temp, |
michael@0 | 1080 | kCLDLangTagsHintTable2, |
michael@0 | 1081 | kCLDTable2Size); |
michael@0 | 1082 | if (entry != NULL) { |
michael@0 | 1083 | // Second table hit |
michael@0 | 1084 | retval.append(entry->langcode); // may be "code1,code2" |
michael@0 | 1085 | retval.append(1, ','); |
michael@0 | 1086 | } |
michael@0 | 1087 | } |
michael@0 | 1088 | } |
michael@0 | 1089 | } |
michael@0 | 1090 | pos = comma + 1; |
michael@0 | 1091 | } |
michael@0 | 1092 | |
michael@0 | 1093 | // Remove trainling comma, if any |
michael@0 | 1094 | if (!retval.empty()) {retval.resize(retval.size() - 1);} |
michael@0 | 1095 | return retval; |
michael@0 | 1096 | } |
michael@0 | 1097 | |
michael@0 | 1098 | |
michael@0 | 1099 | |
michael@0 | 1100 | //============================================================================== |
michael@0 | 1101 | |
michael@0 | 1102 | // Little state machine to scan insides of language attribute quoted-string. |
michael@0 | 1103 | // Each language code is lowercased and copied to the output string. Underscore |
michael@0 | 1104 | // is mapped to minus. Space, tab, and comma are all mapped to comma, and |
michael@0 | 1105 | // multiple consecutive commas are removed. |
michael@0 | 1106 | // Each language code in the output list will be followed by a single comma. |
michael@0 | 1107 | |
michael@0 | 1108 | // There are three states, and we start in state 1: |
michael@0 | 1109 | // State 0: After a letter. |
michael@0 | 1110 | // Copy all letters/minus[0], copy comma[1]; all others copy comma and skip [2] |
michael@0 | 1111 | // State 1: Just after a comma. |
michael@0 | 1112 | // Copy letter [0], Ignore subsequent commas[1]. minus and all others skip [2] |
michael@0 | 1113 | // State 2: Skipping. |
michael@0 | 1114 | // All characters except comma skip and stay in [2]. comma goes to [1] |
michael@0 | 1115 | |
michael@0 | 1116 | // The thing that is copied is kLangCodeRemap[c] when going to state 0, |
michael@0 | 1117 | // and always comma when going to state 1 or 2. The design depends on copying |
michael@0 | 1118 | // a comma at the *beginning* of skipping, and in state 2 never doing a copy. |
michael@0 | 1119 | |
michael@0 | 1120 | // We pack all this into 8 bits: |
michael@0 | 1121 | // +--+---+---+ |
michael@0 | 1122 | // |78|654|321| |
michael@0 | 1123 | // +--+---+---+ |
michael@0 | 1124 | // |
michael@0 | 1125 | // Shift byte right by 3*state, giving [0] 321, [1] 654, [2] .78 |
michael@0 | 1126 | // where . is always zero |
michael@0 | 1127 | // Of these 3 bits, low two are next state ss, high bit is copy bit C. |
michael@0 | 1128 | // If C=1 and ss == 0, copy kLangCodeRemap[c], else copy a comma |
michael@0 | 1129 | |
// Per-state 3-bit actions: the low two bits are the next state and the 4 bit
// is the copy flag. SKIPn moves to state n without copying; COPYn moves to
// state n and copies.
#define SKIP0 0
#define SKIP1 1
#define SKIP2 2
#define COPY0 4       // copy kLangCodeRemap[c]
#define COPY1 5       // copy ','
#define COPY2 6       // copy ','

// These combined actions pack three states into one byte.
// Ninth bit must be zero, so all state 2 values must be skips.
// (State 2's field starts at bit 6 of an 8-bit byte, so its copy flag would be
// a ninth bit that does not exist.)
//              state[2]       state[1]       state[0]
#define LTR   ((SKIP2 << 6) + (COPY0 << 3) + COPY0)
#define MINUS ((SKIP2 << 6) + (COPY2 << 3) + COPY0)
#define COMMA ((SKIP1 << 6) + (SKIP1 << 3) + COPY1)
#define Bad   ((SKIP2 << 6) + (COPY2 << 3) + COPY2)

// Treat as letter: a-z, A-Z
// Treat as minus: 2D minus, 5F underscore
// Treat as comma: 09 tab, 20 space, 2C comma

// Packed action for each input byte value; one source row covers 16 values.
static const unsigned char kLangCodeAction[256] = {
  // 00-3F: only tab (09), space (20), comma (2C) and minus (2D) are not Bad
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,COMMA,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  COMMA,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,COMMA,MINUS,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,

  // 40-7F: A-Z and a-z are LTR, underscore (5F) is MINUS
  Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,
  LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,Bad,Bad,Bad,Bad,MINUS,
  Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,
  LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR, LTR,LTR,LTR,Bad,Bad,Bad,Bad,Bad,

  // 80-BF: all Bad
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,

  // C0-FF: all Bad
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad, Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
};

// This does lowercasing, maps underscore to minus, and maps tab/space to comma
// Indexed by input byte value; every byte not mentioned above maps to 0.
static const unsigned char kLangCodeRemap[256] = {
  0,0,0,0,0,0,0,0, 0,',',0,0,0,0,0,0,               // 09 tab
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  ',',0,0,0,0,0,0,0, 0,0,0,0,',','-',0,0,           // 20 space 2C comma 2D minus
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

  // 40-5F: uppercase letters map to their lowercase form
  0,'a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o',
  'p','q','r','s','t','u','v','w', 'x','y','z',0,0,0,0,'-',   // 5F underscore
  // 60-7F: lowercase letters map to themselves
  0,'a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o',
  'p','q','r','s','t','u','v','w', 'x','y','z',0,0,0,0,0,

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};

// The table-building shorthands are local to the two tables above.
#undef LTR
#undef MINUS
#undef COMMA
#undef Bad

#undef SKIP0
#undef SKIP1
#undef SKIP2
#undef COPY0
#undef COPY1
#undef COPY2
michael@0 | 1205 | |
michael@0 | 1206 | |
michael@0 | 1207 | // Find opening '<' for HTML tag |
michael@0 | 1208 | // Note: this is all somewhat insensitive to mismatched quotes |
michael@0 | 1209 | int32 FindTagStart(const char* utf8_body, int32 pos, int32 max_pos) { |
michael@0 | 1210 | int i = pos; |
michael@0 | 1211 | // Advance i by 4 if none of the next 4 bytes are '<' |
michael@0 | 1212 | for (i = pos; i < (max_pos - 3); i += 4) { |
michael@0 | 1213 | // Fast check for any < |
michael@0 | 1214 | const char* p = &utf8_body[i]; |
michael@0 | 1215 | uint32 s0123 = UNALIGNED_LOAD32(p); |
michael@0 | 1216 | uint32 temp = s0123 ^ 0x3c3c3c3c; // <<<< |
michael@0 | 1217 | if (((temp - 0x01010101) & (~temp & 0x80808080)) != 0) { |
michael@0 | 1218 | // At least one byte is '<' |
michael@0 | 1219 | break; |
michael@0 | 1220 | } |
michael@0 | 1221 | } |
michael@0 | 1222 | // Continue, advancing i by 1 |
michael@0 | 1223 | for (; i < max_pos; ++i) { |
michael@0 | 1224 | if (utf8_body[i] == '<') {return i;} |
michael@0 | 1225 | } |
michael@0 | 1226 | return -1; |
michael@0 | 1227 | } |
michael@0 | 1228 | |
michael@0 | 1229 | |
michael@0 | 1230 | // Find closing '>' for HTML tag. Also stop on < and & (simplistic parsing) |
michael@0 | 1231 | int32 FindTagEnd(const char* utf8_body, int32 pos, int32 max_pos) { |
michael@0 | 1232 | // Always outside quotes |
michael@0 | 1233 | for (int i = pos; i < max_pos; ++i) { |
michael@0 | 1234 | char c = utf8_body[i]; |
michael@0 | 1235 | if (c == '>') {return i;} |
michael@0 | 1236 | if (c == '<') {return i - 1;} |
michael@0 | 1237 | if (c == '&') {return i - 1;} |
michael@0 | 1238 | } |
michael@0 | 1239 | return -1; // nothing found |
michael@0 | 1240 | } |
michael@0 | 1241 | |
michael@0 | 1242 | // Find opening quote or apostrophe, skipping spaces |
michael@0 | 1243 | // Note: this is all somewhat insensitive to mismatched quotes |
michael@0 | 1244 | int32 FindQuoteStart(const char* utf8_body, int32 pos, int32 max_pos) { |
michael@0 | 1245 | for (int i = pos; i < max_pos; ++i) { |
michael@0 | 1246 | char c = utf8_body[i]; |
michael@0 | 1247 | if (c == '"') {return i;} |
michael@0 | 1248 | if (c == '\'') {return i;} |
michael@0 | 1249 | if (c != ' ') {return -1;} |
michael@0 | 1250 | } |
michael@0 | 1251 | return -1; |
michael@0 | 1252 | } |
michael@0 | 1253 | |
michael@0 | 1254 | // Find closing quot/apos. Also stop on = > < and & (simplistic parsing) |
michael@0 | 1255 | int32 FindQuoteEnd(const char* utf8_body, int32 pos, int32 max_pos) { |
michael@0 | 1256 | // Always outside quotes |
michael@0 | 1257 | for (int i = pos; i < max_pos; ++i) { |
michael@0 | 1258 | char c = utf8_body[i]; |
michael@0 | 1259 | if (c == '"') {return i;} |
michael@0 | 1260 | if (c == '\'') {return i;} |
michael@0 | 1261 | if (c == '>') {return i - 1;} |
michael@0 | 1262 | if (c == '=') {return i - 1;} |
michael@0 | 1263 | if (c == '<') {return i - 1;} |
michael@0 | 1264 | if (c == '&') {return i - 1;} |
michael@0 | 1265 | } |
michael@0 | 1266 | return -1; // nothing found |
michael@0 | 1267 | } |
michael@0 | 1268 | |
michael@0 | 1269 | int32 FindEqualSign(const char* utf8_body, int32 pos, int32 max_pos) { |
michael@0 | 1270 | // Outside quotes/apostrophes loop |
michael@0 | 1271 | for (int i = pos; i < max_pos; ++i) { |
michael@0 | 1272 | char c = utf8_body[i]; |
michael@0 | 1273 | if (c == '=') { // Found bare equal sign inside tag |
michael@0 | 1274 | return i; |
michael@0 | 1275 | } else if (c == '"') { |
michael@0 | 1276 | // Inside quotes loop |
michael@0 | 1277 | int j; |
michael@0 | 1278 | for (j = i + 1; j < max_pos; ++j) { |
michael@0 | 1279 | if (utf8_body[j] == '"') { |
michael@0 | 1280 | break; |
michael@0 | 1281 | } else if (utf8_body[j] == '\\') { |
michael@0 | 1282 | ++j; |
michael@0 | 1283 | } |
michael@0 | 1284 | } |
michael@0 | 1285 | i = j; |
michael@0 | 1286 | } else if (c == '\'') { |
michael@0 | 1287 | // Inside apostrophes loop |
michael@0 | 1288 | int j; |
michael@0 | 1289 | for (j = i + 1; j < max_pos; ++j) { |
michael@0 | 1290 | if (utf8_body[j] == '\'') { |
michael@0 | 1291 | break; |
michael@0 | 1292 | } else if (utf8_body[j] == '\\') { |
michael@0 | 1293 | ++j; |
michael@0 | 1294 | } |
michael@0 | 1295 | } |
michael@0 | 1296 | i = j; |
michael@0 | 1297 | } |
michael@0 | 1298 | |
michael@0 | 1299 | } |
michael@0 | 1300 | return -1; // nothing found |
michael@0 | 1301 | } |
michael@0 | 1302 | |
michael@0 | 1303 | // Scan backwards for case-insensitive string s in [min_pos..pos) |
michael@0 | 1304 | // Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f] |
michael@0 | 1305 | // Cheap lowercase. Control codes will masquerade as 20..3f |
michael@0 | 1306 | bool FindBefore(const char* utf8_body, |
michael@0 | 1307 | int32 min_pos, int32 pos, const char* s) { |
michael@0 | 1308 | int len = strlen(s); |
michael@0 | 1309 | if ((pos - min_pos) < len) {return false;} // Too small to fit s |
michael@0 | 1310 | |
michael@0 | 1311 | // Skip trailing spaces |
michael@0 | 1312 | int i = pos; |
michael@0 | 1313 | while ((i > (min_pos + len)) && (utf8_body[i - 1] == ' ')) {--i;} |
michael@0 | 1314 | i -= len; |
michael@0 | 1315 | if (i < min_pos) {return false;} // pos - min_pos < len, so s can't be found |
michael@0 | 1316 | |
michael@0 | 1317 | const char* p = &utf8_body[i]; |
michael@0 | 1318 | for (int j = 0; j < len; ++j) { |
michael@0 | 1319 | if ((p[j] | 0x20) != s[j]) {return false;} // Unequal byte |
michael@0 | 1320 | } |
michael@0 | 1321 | return true; // All bytes equal at i |
michael@0 | 1322 | } |
michael@0 | 1323 | |
michael@0 | 1324 | // Scan forwards for case-insensitive string s in [pos..max_pos) |
michael@0 | 1325 | // Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f] |
michael@0 | 1326 | // Cheap lowercase. Control codes will masquerade as 20..3f |
michael@0 | 1327 | // Allows but does not require quoted/apostrophe string |
michael@0 | 1328 | bool FindAfter(const char* utf8_body, |
michael@0 | 1329 | int32 pos, int32 max_pos, const char* s) { |
michael@0 | 1330 | int len = strlen(s); |
michael@0 | 1331 | if ((max_pos - pos) < len) {return false;} // Too small to fit s |
michael@0 | 1332 | |
michael@0 | 1333 | // Skip leading spaces, quote, apostrophe |
michael@0 | 1334 | int i = pos; |
michael@0 | 1335 | while (i < (max_pos - len)) { |
michael@0 | 1336 | unsigned char c = utf8_body[i]; |
michael@0 | 1337 | if ((c == ' ') || (c == '"') || (c == '\'')) {++i;} |
michael@0 | 1338 | else {break;} |
michael@0 | 1339 | } |
michael@0 | 1340 | |
michael@0 | 1341 | const char* p = &utf8_body[i]; |
michael@0 | 1342 | for (int j = 0; j < len; ++j) { |
michael@0 | 1343 | if ((p[j] | 0x20) != s[j]) {return false;} // Unequal byte |
michael@0 | 1344 | } |
michael@0 | 1345 | return true; // All bytes equal |
michael@0 | 1346 | } |
michael@0 | 1347 | |
michael@0 | 1348 | |
michael@0 | 1349 | |
michael@0 | 1350 | // Copy attribute value in [pos..max_pos) |
michael@0 | 1351 | // pos is just after an opening quote/apostrophe and max_pos is the ending one |
michael@0 | 1352 | // String must all be on a single line. |
michael@0 | 1353 | // Return slightly-normalized language list, empty or ending in comma |
michael@0 | 1354 | // Does lowercasing and removes excess punctuation/space |
michael@0 | 1355 | string CopyOneQuotedString(const char* utf8_body, |
michael@0 | 1356 | int32 pos, int32 max_pos) { |
michael@0 | 1357 | string s; |
michael@0 | 1358 | int state = 1; // Front is logically just after a comma |
michael@0 | 1359 | for (int i = pos; i < max_pos; ++i) { |
michael@0 | 1360 | unsigned char c = utf8_body[i]; |
michael@0 | 1361 | int e = kLangCodeAction[c] >> (3 * state); |
michael@0 | 1362 | state = e & 3; // Update to next state |
michael@0 | 1363 | if ((e & 4) != 0) { |
michael@0 | 1364 | // Copy a remapped byte if going to state 0, else copy a comma |
michael@0 | 1365 | if (state == 0) { |
michael@0 | 1366 | s.append(1, kLangCodeRemap[c]); |
michael@0 | 1367 | } else { |
michael@0 | 1368 | s.append(1, ','); |
michael@0 | 1369 | } |
michael@0 | 1370 | } |
michael@0 | 1371 | } |
michael@0 | 1372 | |
michael@0 | 1373 | // Add final comma if needed |
michael@0 | 1374 | if (state == 0) { |
michael@0 | 1375 | s.append(1, ','); |
michael@0 | 1376 | } |
michael@0 | 1377 | return s; |
michael@0 | 1378 | } |
michael@0 | 1379 | |
michael@0 | 1380 | // Find and copy attribute value: quoted string in [pos..max_pos) |
michael@0 | 1381 | // Return slightly-normalized language list, empty or ending in comma |
michael@0 | 1382 | string CopyQuotedString(const char* utf8_body, |
michael@0 | 1383 | int32 pos, int32 max_pos) { |
michael@0 | 1384 | int32 start_quote = FindQuoteStart(utf8_body, pos, max_pos); |
michael@0 | 1385 | if (start_quote < 0) {return string("");} |
michael@0 | 1386 | int32 end_quote = FindQuoteEnd(utf8_body, start_quote + 1, max_pos); |
michael@0 | 1387 | if (end_quote < 0) {return string("");} |
michael@0 | 1388 | |
michael@0 | 1389 | return CopyOneQuotedString(utf8_body, start_quote + 1, end_quote); |
michael@0 | 1390 | } |
michael@0 | 1391 | |
michael@0 | 1392 | // Add hints to vector of langpriors |
michael@0 | 1393 | // Input is from GetLangTagsFromHtml(), already lowercased |
michael@0 | 1394 | void SetCLDLangTagsHint(const string& langtags, CLDLangPriors* langpriors) { |
michael@0 | 1395 | if (langtags.empty()) {return;} |
michael@0 | 1396 | int commas = CountCommas(langtags); |
michael@0 | 1397 | if (commas > 4) {return;} // Ignore if too many language tags |
michael@0 | 1398 | |
michael@0 | 1399 | char temp[20]; |
michael@0 | 1400 | int pos = 0; |
michael@0 | 1401 | while (pos < static_cast<int>(langtags.size())) { |
michael@0 | 1402 | int comma = langtags.find(',', pos); |
michael@0 | 1403 | if (comma == string::npos) {comma = langtags.size();} // fake trailing comma |
michael@0 | 1404 | int len = comma - pos; |
michael@0 | 1405 | if (len <= 16) { |
michael@0 | 1406 | // Short enough to use |
michael@0 | 1407 | memcpy(temp, &langtags[pos], len); |
michael@0 | 1408 | temp[len] = '\0'; |
michael@0 | 1409 | const LangTagLookup* entry = DoLangTagLookup(temp, |
michael@0 | 1410 | kCLDLangTagsHintTable1, |
michael@0 | 1411 | kCLDTable1Size); |
michael@0 | 1412 | if (entry != NULL) { |
michael@0 | 1413 | // First table hit |
michael@0 | 1414 | MergeCLDLangPriorsMax(entry->onelangprior1, langpriors); |
michael@0 | 1415 | MergeCLDLangPriorsMax(entry->onelangprior2, langpriors); |
michael@0 | 1416 | } else { |
michael@0 | 1417 | // Try second table with language code truncated at first hyphen |
michael@0 | 1418 | char* hyphen = strchr(temp, '-'); |
michael@0 | 1419 | if (hyphen != NULL) {*hyphen = '\0';} |
michael@0 | 1420 | len = strlen(temp); |
michael@0 | 1421 | if (len <= 3) { // Short enough to use |
michael@0 | 1422 | entry = DoLangTagLookup(temp, |
michael@0 | 1423 | kCLDLangTagsHintTable2, |
michael@0 | 1424 | kCLDTable2Size); |
michael@0 | 1425 | if (entry != NULL) { |
michael@0 | 1426 | // Second table hit |
michael@0 | 1427 | MergeCLDLangPriorsMax(entry->onelangprior1, langpriors); |
michael@0 | 1428 | MergeCLDLangPriorsMax(entry->onelangprior2, langpriors); |
michael@0 | 1429 | } |
michael@0 | 1430 | } |
michael@0 | 1431 | } |
michael@0 | 1432 | } |
michael@0 | 1433 | pos = comma + 1; |
michael@0 | 1434 | } |
michael@0 | 1435 | } |
michael@0 | 1436 | |
michael@0 | 1437 | // Add hints to vector of langpriors |
michael@0 | 1438 | // Input is string after HTTP header Content-Language: |
michael@0 | 1439 | void SetCLDContentLangHint(const char* contentlang, CLDLangPriors* langpriors) { |
michael@0 | 1440 | string langtags = CopyOneQuotedString(contentlang, 0, strlen(contentlang)); |
michael@0 | 1441 | SetCLDLangTagsHint(langtags, langpriors); |
michael@0 | 1442 | } |
michael@0 | 1443 | |
michael@0 | 1444 | // Add hints to vector of langpriors |
michael@0 | 1445 | // Input is last element of hostname (no dot), e.g. from GetTLD() |
michael@0 | 1446 | void SetCLDTLDHint(const char* tld, CLDLangPriors* langpriors) { |
michael@0 | 1447 | int len = strlen(tld); |
michael@0 | 1448 | if (len > 3) {return;} // Ignore if more than three letters |
michael@0 | 1449 | char local_tld[4]; |
michael@0 | 1450 | strncpy(local_tld, tld, 4); |
michael@0 | 1451 | local_tld[3] = '\0'; // Safety move |
michael@0 | 1452 | // Lowercase |
michael@0 | 1453 | for (int i = 0; i < len; ++i) {local_tld[i] |= 0x20;} |
michael@0 | 1454 | const TLDLookup* entry = DoTLDLookup(local_tld, |
michael@0 | 1455 | kCLDTLDHintTable, |
michael@0 | 1456 | kCLDTable3Size); |
michael@0 | 1457 | if (entry != NULL) { |
michael@0 | 1458 | // Table hit |
michael@0 | 1459 | MergeCLDLangPriorsBoost(entry->onelangprior1, langpriors); |
michael@0 | 1460 | MergeCLDLangPriorsBoost(entry->onelangprior2, langpriors); |
michael@0 | 1461 | } |
michael@0 | 1462 | } |
michael@0 | 1463 | |
michael@0 | 1464 | // Add hints to vector of langpriors |
michael@0 | 1465 | // Input is from DetectEncoding() |
michael@0 | 1466 | void SetCLDEncodingHint(Encoding enc, CLDLangPriors* langpriors) { |
michael@0 | 1467 | OneCLDLangPrior olp; |
michael@0 | 1468 | switch (enc) { |
michael@0 | 1469 | case CHINESE_GB: |
michael@0 | 1470 | case GBK: |
michael@0 | 1471 | case GB18030: |
michael@0 | 1472 | case ISO_2022_CN: |
michael@0 | 1473 | case HZ_GB_2312: |
michael@0 | 1474 | olp = PackCLDPriorLangWeight(CHINESE, kCLDPriorEncodingWeight); |
michael@0 | 1475 | MergeCLDLangPriorsBoost(olp, langpriors); |
michael@0 | 1476 | break; |
michael@0 | 1477 | case CHINESE_BIG5: |
michael@0 | 1478 | case CHINESE_BIG5_CP950: |
michael@0 | 1479 | case BIG5_HKSCS: |
michael@0 | 1480 | olp = PackCLDPriorLangWeight(CHINESE_T, kCLDPriorEncodingWeight); |
michael@0 | 1481 | MergeCLDLangPriorsBoost(olp, langpriors); |
michael@0 | 1482 | break; |
michael@0 | 1483 | case JAPANESE_EUC_JP: |
michael@0 | 1484 | case JAPANESE_SHIFT_JIS: |
michael@0 | 1485 | case JAPANESE_CP932: |
michael@0 | 1486 | case JAPANESE_JIS: // ISO-2022-JP |
michael@0 | 1487 | olp = PackCLDPriorLangWeight(JAPANESE, kCLDPriorEncodingWeight); |
michael@0 | 1488 | MergeCLDLangPriorsBoost(olp, langpriors); |
michael@0 | 1489 | break; |
michael@0 | 1490 | case KOREAN_EUC_KR: |
michael@0 | 1491 | case ISO_2022_KR: |
michael@0 | 1492 | olp = PackCLDPriorLangWeight(KOREAN, kCLDPriorEncodingWeight); |
michael@0 | 1493 | MergeCLDLangPriorsBoost(olp, langpriors); |
michael@0 | 1494 | break; |
michael@0 | 1495 | |
michael@0 | 1496 | default: |
michael@0 | 1497 | break; |
michael@0 | 1498 | } |
michael@0 | 1499 | } |
michael@0 | 1500 | |
michael@0 | 1501 | // Add hints to vector of langpriors |
michael@0 | 1502 | // Input is from random source |
michael@0 | 1503 | void SetCLDLanguageHint(Language lang, CLDLangPriors* langpriors) { |
michael@0 | 1504 | OneCLDLangPrior olp = PackCLDPriorLangWeight(lang, kCLDPriorLanguageWeight); |
michael@0 | 1505 | MergeCLDLangPriorsBoost(olp, langpriors); |
michael@0 | 1506 | } |
michael@0 | 1507 | |
michael@0 | 1508 | |
michael@0 | 1509 | // Make printable string of priors |
michael@0 | 1510 | string DumpCLDLangPriors(const CLDLangPriors* langpriors) { |
michael@0 | 1511 | string retval; |
michael@0 | 1512 | for (int i = 0; i < langpriors->n; ++i) { |
michael@0 | 1513 | char temp[64]; |
michael@0 | 1514 | sprintf(temp, "%s.%d ", |
michael@0 | 1515 | LanguageCode(GetCLDPriorLang(langpriors->prior[i])), |
michael@0 | 1516 | GetCLDPriorWeight(langpriors->prior[i])); |
michael@0 | 1517 | retval.append(temp); |
michael@0 | 1518 | } |
michael@0 | 1519 | return retval; |
michael@0 | 1520 | } |
michael@0 | 1521 | |
michael@0 | 1522 | |
michael@0 | 1523 | |
michael@0 | 1524 | |
michael@0 | 1525 | // Look for |
michael@0 | 1526 | // <html lang="en"> |
michael@0 | 1527 | // <doc xml:lang="en"> |
michael@0 | 1528 | // <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en-US"> |
michael@0 | 1529 | // <meta http-equiv="content-language" content="en-GB" /> |
michael@0 | 1530 | // <meta name="language" content="Srpski"> |
michael@0 | 1531 | // <meta name="DC.language" scheme="RFCOMMA766" content="en"> |
michael@0 | 1532 | // <SPAN id="msg1" class="info" lang='en'> |
michael@0 | 1533 | // |
michael@0 | 1534 | // Do not trigger on |
michael@0 | 1535 | // <!-- lang=french ...--> |
michael@0 | 1536 | // <font lang=postscript ...> |
michael@0 | 1537 | // <link href="index.fr.html" hreflang="fr-FR" xml:lang="fr-FR" /> |
michael@0 | 1538 | // <META name="Author" lang="fr" content="Arnaud Le Hors"> |
michael@0 | 1539 | // |
michael@0 | 1540 | // Stop fairly quickly on mismatched quotes |
michael@0 | 1541 | // |
michael@0 | 1542 | // Allowed language characters |
michael@0 | 1543 | // a-z A-Z -_ , space\t |
michael@0 | 1544 | // Think about: GB2312, big5, shift-jis, euc-jp, ksc euc-kr |
michael@0 | 1545 | // zh-hans zh-TW cmn-Hani zh_cn.gb18030_CN zh-min-nan zh-yue |
michael@0 | 1546 | // de-x-mtfrom-en zh-tw-x-mtfrom-en (machine translation) |
michael@0 | 1547 | // GB2312 => gb |
michael@0 | 1548 | // Big5 => big |
michael@0 | 1549 | // zh_CN.gb18030_C => zh-cn |
michael@0 | 1550 | // |
michael@0 | 1551 | // Remove duplicates and extra spaces as we go |
michael@0 | 1552 | // Lowercase as we go. |
michael@0 | 1553 | |
michael@0 | 1554 | // Get language tag hints from HTML body |
michael@0 | 1555 | // Normalize: remove spaces and make lowercase comma list |
michael@0 | 1556 | |
michael@0 | 1557 | string GetLangTagsFromHtml(const char* utf8_body, int32 utf8_body_len, |
michael@0 | 1558 | int32 max_scan_bytes) { |
michael@0 | 1559 | string retval; |
michael@0 | 1560 | if (max_scan_bytes > utf8_body_len) { |
michael@0 | 1561 | max_scan_bytes = utf8_body_len; |
michael@0 | 1562 | } |
michael@0 | 1563 | |
michael@0 | 1564 | int32 k = 0; |
michael@0 | 1565 | while (k < max_scan_bytes) { |
michael@0 | 1566 | int32 start_tag = FindTagStart(utf8_body, k, max_scan_bytes); |
michael@0 | 1567 | if (start_tag < 0) {break;} |
michael@0 | 1568 | int32 end_tag = FindTagEnd(utf8_body, start_tag + 1, max_scan_bytes); |
michael@0 | 1569 | // FindTagEnd exits on < > & |
michael@0 | 1570 | if (end_tag < 0) {break;} |
michael@0 | 1571 | |
michael@0 | 1572 | // Skip <!--...> |
michael@0 | 1573 | // Skip <font ...> |
michael@0 | 1574 | // Skip <script ...> |
michael@0 | 1575 | // Skip <link ...> |
michael@0 | 1576 | // Skip <img ...> |
michael@0 | 1577 | // Skip <a ...> |
michael@0 | 1578 | if (FindAfter(utf8_body, start_tag + 1, end_tag, "!--") || |
michael@0 | 1579 | FindAfter(utf8_body, start_tag + 1, end_tag, "font ") || |
michael@0 | 1580 | FindAfter(utf8_body, start_tag + 1, end_tag, "script ") || |
michael@0 | 1581 | FindAfter(utf8_body, start_tag + 1, end_tag, "link ") || |
michael@0 | 1582 | FindAfter(utf8_body, start_tag + 1, end_tag, "img ") || |
michael@0 | 1583 | FindAfter(utf8_body, start_tag + 1, end_tag, "a ")) { |
michael@0 | 1584 | k = end_tag + 1; |
michael@0 | 1585 | continue; |
michael@0 | 1586 | } |
michael@0 | 1587 | |
michael@0 | 1588 | // Remember <meta ...> |
michael@0 | 1589 | bool in_meta = false; |
michael@0 | 1590 | if (FindAfter(utf8_body, start_tag + 1, end_tag, "meta ")) { |
michael@0 | 1591 | in_meta = true; |
michael@0 | 1592 | } |
michael@0 | 1593 | |
michael@0 | 1594 | // Scan for each equal sign inside tag |
michael@0 | 1595 | bool content_is_lang = false; |
michael@0 | 1596 | int32 kk = start_tag + 1; |
michael@0 | 1597 | int32 equal_sign; |
michael@0 | 1598 | while ((equal_sign = FindEqualSign(utf8_body, kk, end_tag)) >= 0) { |
michael@0 | 1599 | // eq exits on < > & |
michael@0 | 1600 | |
michael@0 | 1601 | // Look inside a meta tag |
michael@0 | 1602 | // <meta ... http-equiv="content-language" ...> |
michael@0 | 1603 | // <meta ... name="language" ...> |
michael@0 | 1604 | // <meta ... name="dc.language" ...> |
michael@0 | 1605 | if (in_meta) { |
michael@0 | 1606 | if (FindBefore(utf8_body, kk, equal_sign, " http-equiv") && |
michael@0 | 1607 | FindAfter(utf8_body, equal_sign + 1, end_tag, |
michael@0 | 1608 | "content-language ")) { |
michael@0 | 1609 | content_is_lang = true; |
michael@0 | 1610 | } else if (FindBefore(utf8_body, kk, equal_sign, " name") && |
michael@0 | 1611 | (FindAfter(utf8_body, equal_sign + 1, end_tag, |
michael@0 | 1612 | "dc.language ") || |
michael@0 | 1613 | FindAfter(utf8_body, equal_sign + 1, end_tag, |
michael@0 | 1614 | "language "))) { |
michael@0 | 1615 | content_is_lang = true; |
michael@0 | 1616 | } |
michael@0 | 1617 | } |
michael@0 | 1618 | |
michael@0 | 1619 | // Look inside any tag |
michael@0 | 1620 | // <meta ... content="lang-list" ...> |
michael@0 | 1621 | // <... lang="lang-list" ...> |
michael@0 | 1622 | // <... xml:lang="lang-list" ...> |
michael@0 | 1623 | if ((content_is_lang && FindBefore(utf8_body, kk, equal_sign, |
michael@0 | 1624 | " content")) || |
michael@0 | 1625 | FindBefore(utf8_body, kk, equal_sign, " lang") || |
michael@0 | 1626 | FindBefore(utf8_body, kk, equal_sign, ":lang")) { |
michael@0 | 1627 | string temp = CopyQuotedString(utf8_body, equal_sign + 1, end_tag); |
michael@0 | 1628 | |
michael@0 | 1629 | // Append new lang tag(s) if not a duplicate |
michael@0 | 1630 | if (!temp.empty() && (retval.find(temp) == string::npos)) { |
michael@0 | 1631 | retval.append(temp); |
michael@0 | 1632 | } |
michael@0 | 1633 | } |
michael@0 | 1634 | |
michael@0 | 1635 | kk = equal_sign + 1; |
michael@0 | 1636 | } |
michael@0 | 1637 | k = end_tag + 1; |
michael@0 | 1638 | } |
michael@0 | 1639 | |
michael@0 | 1640 | // Strip last comma |
michael@0 | 1641 | if (retval.size() > 1) { |
michael@0 | 1642 | retval.erase(retval.size() - 1); |
michael@0 | 1643 | } |
michael@0 | 1644 | return retval; |
michael@0 | 1645 | } |
michael@0 | 1646 | |
michael@0 | 1647 | } // End namespace CLD2 |
michael@0 | 1648 | |
michael@0 | 1649 | //============================================================================== |
michael@0 | 1650 | |
michael@0 | 1651 |