browser/components/translation/cld2/internal/compact_lang_det_hint_code.cc

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/browser/components/translation/cld2/internal/compact_lang_det_hint_code.cc	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1651 @@
     1.4 +// Copyright 2013 Google Inc. All Rights Reserved.
     1.5 +//
     1.6 +// Licensed under the Apache License, Version 2.0 (the "License");
     1.7 +// you may not use this file except in compliance with the License.
     1.8 +// You may obtain a copy of the License at
     1.9 +//
    1.10 +//     http://www.apache.org/licenses/LICENSE-2.0
    1.11 +//
    1.12 +// Unless required by applicable law or agreed to in writing, software
    1.13 +// distributed under the License is distributed on an "AS IS" BASIS,
    1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    1.15 +// See the License for the specific language governing permissions and
    1.16 +// limitations under the License.
    1.17 +
    1.18 +//
    1.19 +// Author: dsites@google.com (Dick Sites)
    1.20 +//
    1.21 +
    1.22 +#include "compact_lang_det_hint_code.h"
    1.23 +
    1.24 +#include <stdlib.h>     // for abs()
    1.25 +#include <stdio.h>      // for sprintf()
    1.26 +#include <string.h>     //
    1.27 +#include "lang_script.h"
    1.28 +#include "port.h"
    1.29 +
    1.30 +using namespace std;
    1.31 +
    1.32 +namespace CLD2 {
    1.33 +
    1.34 +static const int kCLDPriorEncodingWeight = 4;   // Weight 4: ~100x more likely
    1.35 +static const int kCLDPriorLanguageWeight = 8;   // Weight 8: ~10000x more likely
    1.36 +
    1.37 +
    1.38 +// Tables to map lang="..." language code lists to actual languages.
    1.39 +// based on scraping and hand-edits, dsites June 2011
    1.40 +
    1.41 +// n = f(string, &a) gives list of n<=4 language pairs: primary, secondary
    1.42 +
    1.43 +// For close pairs like ms/id, more weight on TLD and lang=
    1.44 +// Alternately, weaker boost but mark others of set as negative;
    1.45 +// makes "neither" an easier result.
    1.46 +// lang=en low weight 4
    1.47 +// tld=lu boost lu maybe 4. but lang= always overcomes tld and encoding
    1.48 +// (except maybe en)
    1.49 +
    1.50 +// TLD to separate, e.g., burundi from rwanda
    1.51 +
    1.52 +// Encoding lookup: OneLangProb array
    1.53 +// TLD lookup:   tld OneLangProb pairs
    1.54 +
    1.55 +
    1.56 +typedef struct {
    1.57 +  const char* const langtag;    // Lookup key: lowercased lang= tag, hyphen-only form
    1.58 +  const char* const langcode;   // Canonical language code(s); two, comma-separated, if ambiguous
    1.59 +  OneCLDLangPrior onelangprior1;  // First language plus its weight, e.g. CZECH + W10
    1.60 +  OneCLDLangPrior onelangprior2;  // Second language plus (possibly negative) weight; 0 if none
    1.61 +} LangTagLookup;
    1.62 +
    1.63 +typedef struct {
    1.64 +  const char* const tld;        // Lookup key: lowercased top-level domain, e.g. "de"
    1.65 +  OneCLDLangPrior onelangprior1;  // First language plus its weight, e.g. GERMAN + W4
    1.66 +  OneCLDLangPrior onelangprior2;  // Second language plus (possibly negative) weight; 0 if none
    1.67 +} TLDLookup;
    1.68 +
    1.69 +
    1.70 +#define W2 (2 << 10)            // weight 2 in bits 10 and up: 3**2, roughly 10x more likely
    1.71 +#define W4 (4 << 10)            // weight 4: 3**4, roughly 100x more likely
    1.72 +#define W6 (6 << 10)            // weight 6: 3**6, roughly 1000x more likely
    1.73 +#define W8 (8 << 10)            // weight 8: 3**8, roughly 10K x more likely
    1.74 +#define W10 (10 << 10)          // weight 10: 3**10, roughly 100K x more likely
    1.75 +#define W12 (12 << 10)          // weight 12: 3**12, roughly 1M x more likely
    1.76 +
    1.77 +// TODO: more about ba hr sr sr-ME and sl
    1.78 +// Temporary state of affairs:
    1.79 +//   BOSNIAN CROATIAN MONTENEGRIN SERBIAN detecting just CROATIAN SERBIAN
    1.80 +// Eventually, we want to do all four, but it requires a CLD change to handle
    1.81 +// up to six languages per quadgram.
    1.82 +
    1.83 +
    1.84 +// Close pairs boost one of pair, demote other.
    1.85 +//   Statistically close pairs:
    1.86 +//   INDONESIAN/MALAY difficult to distinguish -- extra word-based lookups used
    1.87 +//
    1.88 +//   INDONESIAN MALAY coef=0.4698        Problematic w/o extra words
    1.89 +//   TIBETAN DZONGKHA coef=0.4571
    1.90 +//   CZECH SLOVAK coef=0.4273
    1.91 +//   NORWEGIAN NORWEGIAN_N coef=0.4182
    1.92 +//
    1.93 +//   HINDI MARATHI coef=0.3795
    1.94 +//   ZULU XHOSA coef=0.3716
    1.95 +//
    1.96 +//   DANISH NORWEGIAN coef=0.3672        Usually OK
    1.97 +//   BIHARI HINDI coef=0.3668            Usually OK
    1.98 +//   ICELANDIC FAROESE coef=0.3519       Usually OK
    1.99 +
   1.100 +//
   1.101 +// Table to look up lang= tags longer than three characters (names, hyphenated tags)
   1.102 +// Overrides the table below, whose lookup key is truncated at the first hyphen
   1.103 +// Keep entries sorted by langtag (hyphen sorts before letters) for binary search
   1.104 +static const int kCLDTable1Size = 213;  // Number of entries in kCLDLangTagsHintTable1
   1.105 +static const LangTagLookup kCLDLangTagsHintTable1[kCLDTable1Size] = {
   1.106 +  {"abkhazian", "ab", ABKHAZIAN + W10, 0},
   1.107 +  {"afar", "aa", AFAR + W10, 0},
   1.108 +  {"afrikaans", "af", AFRIKAANS + W10, 0},
   1.109 +  {"akan", "ak", AKAN + W10, 0},
   1.110 +  {"albanian", "sq", ALBANIAN + W10, 0},
   1.111 +  {"am-am", "hy", ARMENIAN + W10, 0},        // 1:2 Armenian, not ambiguous
   1.112 +  {"amharic", "am", AMHARIC + W10, 0},
   1.113 +  {"arabic", "ar", ARABIC + W10, 0},
   1.114 +  {"argentina", "es", SPANISH + W10, 0},
   1.115 +  {"armenian", "hy", ARMENIAN + W10, 0},
   1.116 +  {"assamese", "as", ASSAMESE + W10, 0},
   1.117 +  {"aymara", "ay", AYMARA + W10, 0},
   1.118 +  {"azerbaijani", "az", AZERBAIJANI + W10, 0},
   1.119 +
   1.120 +  {"bangla", "bn", BENGALI + W10, 0},
   1.121 +  {"bashkir", "ba", BASHKIR + W10, 0},
   1.122 +  {"basque", "eu", BASQUE + W10, 0},
   1.123 +  {"belarusian", "be", BELARUSIAN + W10, 0},
   1.124 +  {"bengali", "bn", BENGALI + W10, 0},
   1.125 +  {"bihari", "bh", BIHARI + W10, HINDI - W4},
   1.126 +  {"bislama", "bi", BISLAMA + W10, 0},
   1.127 +  {"bosnian", "bs", BOSNIAN + W10, 0},      // Bosnian => Bosnian
   1.128 +  {"br-br", "pt", PORTUGUESE + W10, 0},     // 1:2 Portuguese, not ambiguous
   1.129 +  {"br-fr", "br", BRETON + W10, 0},         // 1:2 Breton, not ambiguous
   1.130 +  {"breton", "br", BRETON + W10, 0},
   1.131 +  {"bulgarian", "bg", BULGARIAN + W10, 0},
   1.132 +  {"burmese", "my", BURMESE + W10, 0},      // Myanmar
   1.133 +
   1.134 +  {"catalan", "ca", CATALAN + W10, 0},
   1.135 +  {"cherokee", "chr", CHEROKEE + W10, 0},
   1.136 +  {"chichewa", "ny", NYANJA + W10, 0},
   1.137 +
   1.138 +  {"chinese", "zh", CHINESE + W10, 0},
   1.139 +  {"chinese-t", "zhT", CHINESE_T + W10, 0},
   1.140 +  {"chineset", "zhT", CHINESE_T + W10, 0},
   1.141 +  {"corsican", "co", CORSICAN + W10, 0},
   1.142 +  {"cpf-hat", "ht", HAITIAN_CREOLE + W10, 0}, // Creole, French-based
   1.143 +  {"croatian", "hr", CROATIAN + W10, 0},
   1.144 +  {"czech", "cs", CZECH + W10, SLOVAK - W4},
   1.145 +
   1.146 +  {"danish", "da", DANISH + W10, NORWEGIAN - W4},
   1.147 +  {"deutsch", "de", GERMAN + W10, 0},
   1.148 +  {"dhivehi", "dv", DHIVEHI + W10, 0},
   1.149 +  {"dutch", "nl", DUTCH + W10, 0},
   1.150 +  {"dzongkha", "dz", DZONGKHA + W10,  TIBETAN - W4},
   1.151 +
   1.152 +  {"ell-gr", "el", GREEK + W10, 0},
   1.153 +  {"english", "en", ENGLISH + W4, 0},
   1.154 +  {"esperanto", "eo", ESPERANTO + W10, 0},
   1.155 +  {"estonian", "et", ESTONIAN + W10, 0},
   1.156 +  {"euc-jp", "ja", JAPANESE + W10, 0},       // Japanese encoding
   1.157 +  {"euc-kr", "ko", KOREAN + W10, 0},         // Korean encoding
   1.158 +
   1.159 +  {"faroese", "fo", FAROESE + W10, ICELANDIC - W4},
   1.160 +  {"fijian", "fj", FIJIAN + W10, 0},
   1.161 +  {"finnish", "fi", FINNISH + W10, 0},
   1.162 +  {"fran", "fr", FRENCH + W10, 0},            // Truncated at non-ASCII
   1.163 +  {"francais", "fr", FRENCH + W10, 0},
   1.164 +  {"french", "fr", FRENCH + W10, 0},
   1.165 +  {"frisian", "fy", FRISIAN + W10, 0},
   1.166 +
   1.167 +  {"ga-es", "gl", GALICIAN + W10, 0},         // 1:2 Galician, not ambiguous
   1.168 +  {"galician", "gl", GALICIAN + W10, 0},
   1.169 +  {"ganda", "lg", GANDA + W10, 0},
   1.170 +  {"georgian", "ka", GEORGIAN + W10, 0},
   1.171 +  {"german", "de", GERMAN + W10, 0},
   1.172 +  {"greek", "el", GREEK + W10, 0},
   1.173 +  {"greenlandic", "kl", GREENLANDIC + W10, 0},
   1.174 +  {"guarani", "gn", GUARANI + W10, 0},
   1.175 +  {"gujarati", "gu", GUJARATI + W10, 0},
   1.176 +
   1.177 +  {"haitian_creole", "ht", HAITIAN_CREOLE + W10, 0},
   1.178 +  {"hausa", "ha", HAUSA + W10, 0},
   1.179 +  {"hawaiian", "haw", HAWAIIAN + W10, 0},
   1.180 +  {"hebrew", "iw", HEBREW + W10, 0},
   1.181 +  {"hindi", "hi", HINDI + W10, MARATHI - W4},
   1.182 +  {"hn-in", "hi", HINDI + W10, MARATHI - W4},
   1.183 +  {"hungarian", "hu", HUNGARIAN + W10, 0},
   1.184 +
   1.185 +  {"icelandic", "is", ICELANDIC + W10, FAROESE - W4},
   1.186 +  {"igbo", "ig", IGBO + W10, 0},
   1.187 +  {"indonesian", "id", INDONESIAN + W10, MALAY - W4},
   1.188 +  {"interlingua", "ia", INTERLINGUA + W10, 0},
   1.189 +  {"interlingue", "ie", INTERLINGUE + W10, 0},
   1.190 +  // 1:2 iu-Cans ik-Latn
   1.191 +  {"inuktitut", "iu,ik", INUKTITUT + W10, INUPIAK + W10}, // 1:2
   1.192 +  {"inupiak", "ik,iu", INUPIAK + W10, INUKTITUT + W10},   // 1:2
   1.193 +  {"ir-ie", "ga", IRISH + W10, 0},          // Irish
   1.194 +  {"irish", "ga", IRISH + W10, 0},
   1.195 +  {"italian", "it", ITALIAN + W10, 0},
   1.196 +
   1.197 +  {"ja-euc", "ja", JAPANESE + W10, 0},      // Japanese encoding
   1.198 +  {"jan-jp", "ja", JAPANESE + W10, 0},      // Japanese encoding
   1.199 +  {"japanese", "ja", JAPANESE + W10, 0},
   1.200 +  {"javanese", "jw", JAVANESE + W10, 0},
   1.201 +
   1.202 +  {"kannada", "kn", KANNADA + W10, 0},
   1.203 +  {"kashmiri", "ks", KASHMIRI + W10, 0},
   1.204 +  {"kazakh", "kk", KAZAKH + W10, 0},
   1.205 +  {"khasi", "kha", KHASI + W10, 0},
   1.206 +  {"khmer", "km", KHMER + W10, 0},
   1.207 +  {"kinyarwanda", "rw", KINYARWANDA + W10, 0},
   1.208 +  {"klingon", "tlh", X_KLINGON + W10, 0},
   1.209 +  {"korean", "ko", KOREAN + W10, 0},
   1.210 +  {"kurdish", "ku", KURDISH + W10, 0},
   1.211 +  {"kyrgyz", "ky", KYRGYZ + W10, 0},
   1.212 +
   1.213 +  {"laothian", "lo", LAOTHIAN + W10, 0},
   1.214 +  {"latin", "la", LATIN + W10, 0},
   1.215 +  {"latvian", "lv", LATVIAN + W10, 0},
   1.216 +  {"limbu", "sit", LIMBU + W10, 0},
   1.217 +  {"lingala", "ln", LINGALA + W10, 0},
   1.218 +  {"lithuanian", "lt", LITHUANIAN + W10, 0},
   1.219 +  {"luxembourgish", "lb", LUXEMBOURGISH + W10, 0},
   1.220 +
   1.221 +  {"macedonian", "mk", MACEDONIAN + W10, 0},
   1.222 +  {"malagasy", "mg", MALAGASY + W10, 0},
   1.223 +  {"malay", "ms", MALAY + W10, INDONESIAN - W4},
   1.224 +  {"malayalam", "ml", MALAYALAM + W10, 0},
   1.225 +  {"maltese", "mt", MALTESE + W10, 0},
   1.226 +  {"manx", "gv", MANX + W10, 0},
   1.227 +  {"maori", "mi", MAORI + W10, 0},
   1.228 +  {"marathi", "mr", MARATHI + W10, HINDI - W4},
   1.229 +  {"mauritian_creole", "mfe", MAURITIAN_CREOLE + W10, 0},
   1.230 +  {"moldavian", "mo", ROMANIAN + W10, 0},
   1.231 +  {"mongolian", "mn", MONGOLIAN + W10, 0},
   1.232 +  {"montenegrin", "sr-me", MONTENEGRIN + W10, 0},
   1.233 +  {"myanmar", "my", BURMESE + W10, 0},      // Myanmar
   1.234 +  {"nauru", "na", NAURU + W10, 0},
   1.235 +  {"ndebele", "nr", NDEBELE + W10, 0},
   1.236 +  {"nepali", "ne", NEPALI + W10, 0},
   1.237 +  {"no-bok", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},       // Bokmaal
   1.238 +  {"no-bokmaal", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
   1.239 +  {"no-nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},        // Bokmaal
   1.240 +  {"no-no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
   1.241 +  {"no-nyn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},       // Nynorsk
   1.242 +  {"no-nynorsk", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},
   1.243 +  {"norwegian", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
   1.244 +  {"norwegian_n", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},
   1.245 +  {"nyanja", "ny", NYANJA + W10, 0},
   1.246 +
   1.247 +  {"occitan", "oc", OCCITAN + W10, 0},
   1.248 +  {"oriya", "or", ORIYA + W10, 0},
   1.249 +  {"oromo", "om", OROMO + W10, 0},
   1.250 +  {"parsi", "fa", PERSIAN + W10, 0},
   1.251 +
   1.252 +  {"pashto", "ps", PASHTO + W10, 0},
   1.253 +  {"pedi", "nso", PEDI + W10, 0},
   1.254 +  {"persian", "fa", PERSIAN + W10, 0},
   1.255 +  {"polish", "pl", POLISH + W10, 0},
   1.256 +  {"polska", "pl", POLISH + W10, 0},
   1.257 +  {"polski", "pl", POLISH + W10, 0},
   1.258 +  {"portugu", "pt", PORTUGUESE + W10, 0},     // Truncated at non-ASCII
   1.259 +  {"portuguese", "pt", PORTUGUESE + W10, 0},
   1.260 +  {"punjabi", "pa", PUNJABI + W10, 0},
   1.261 +
   1.262 +  {"quechua", "qu", QUECHUA + W10, 0},
   1.263 +
   1.264 +  {"rhaeto_romance", "rm", RHAETO_ROMANCE + W10, 0},
   1.265 +  {"romanian", "ro", ROMANIAN + W10, 0},
   1.266 +  {"rundi", "rn", RUNDI + W10, 0},
   1.267 +  {"russian", "ru", RUSSIAN + W10, 0},
   1.268 +
   1.269 +  {"samoan", "sm", SAMOAN + W10, 0},
   1.270 +  {"sango", "sg", SANGO + W10, 0},
   1.271 +  {"sanskrit", "sa", SANSKRIT + W10, 0},
   1.272 +  {"scots", "sco", SCOTS + W10, ENGLISH - W4},
   1.273 +  {"scots_gaelic", "gd", SCOTS_GAELIC + W10, 0},
   1.274 +  {"serbian", "sr", SERBIAN + W10, 0},
   1.275 +  {"seselwa", "crs", SESELWA + W10, 0},
   1.276 +  {"sesotho", "st", SESOTHO + W10, 0},
   1.277 +  {"shift-jis", "ja", JAPANESE + W10, 0},   // Japanese encoding
   1.278 +  {"shift-js", "ja", JAPANESE + W10, 0},    // Japanese encoding
   1.279 +  {"shona", "sn", SHONA + W10, 0},
   1.280 +  {"si-lk", "si", SINHALESE + W10, 0},      // 1:2 Sri Lanka, not ambiguous
   1.281 +  {"si-si", "sl", SLOVENIAN + W10, 0},      // 1:2 Slovenia, not ambiguous
   1.282 +  {"si-sl", "sl", SLOVENIAN + W10, 0},      // 1:2 Slovenia, not ambiguous
   1.283 +  {"sindhi", "sd", SINDHI + W10, 0},
   1.284 +  {"sinhalese", "si", SINHALESE + W10, 0},
   1.285 +  {"siswant", "ss", SISWANT + W10, 0},
   1.286 +  {"sit-np", "sit", LIMBU + W10, 0},
   1.287 +  {"slovak", "sk", SLOVAK + W10, CZECH - W4},
   1.288 +  {"slovenian", "sl", SLOVENIAN + W10, 0},
   1.289 +  {"somali", "so", SOMALI + W10, 0},
   1.290 +  {"spanish", "es", SPANISH + W10, 0},
   1.291 +  {"sr-me", "sr-me", MONTENEGRIN + W10, 0}, // Montenegrin => Montenegrin
   1.292 +  {"sundanese", "su", SUNDANESE + W10, 0},
   1.293 +  {"suomi", "fi", FINNISH + W10, 0},        // Finnish
   1.294 +  {"swahili", "sw", SWAHILI + W10, 0},
   1.295 +  {"swedish", "sv", SWEDISH + W10, 0},
   1.296 +  {"syriac", "syr", SYRIAC + W10, 0},
   1.297 +
   1.298 +  {"tagalog", "tl", TAGALOG + W10, 0},
   1.299 +  {"tajik", "tg", TAJIK + W10, 0},
   1.300 +  {"tamil", "ta", TAMIL + W10, 0},
   1.301 +  {"tatar", "tt", TATAR + W10, 0},
   1.302 +  {"tb-tb", "bo", TIBETAN + W10, DZONGKHA - W4},        // Tibet
   1.303 +  {"tchinese", "zhT", CHINESE_T + W10, 0},
   1.304 +  {"telugu", "te", TELUGU + W10, 0},
   1.305 +  {"thai", "th", THAI + W10, 0},
   1.306 +  {"tibetan", "bo", TIBETAN + W10, DZONGKHA - W4},
   1.307 +  {"tigrinya", "ti", TIGRINYA + W10, 0},
   1.308 +  {"tonga", "to", TONGA + W10, 0},
   1.309 +  {"tsonga", "ts", TSONGA + W10, 0},
   1.310 +  {"tswana", "tn", TSWANA + W10, 0},
   1.311 +  {"tt-ru", "tt", TATAR + W10, 0},
   1.312 +  {"tur-tr", "tr", TURKISH + W10, 0},
   1.313 +  {"turkish", "tr", TURKISH + W10, 0},
   1.314 +  {"turkmen", "tk", TURKMEN + W10, 0},
   1.315 +  {"uighur", "ug", UIGHUR + W10, 0},
   1.316 +  {"ukrainian", "uk", UKRAINIAN + W10, 0},
   1.317 +  {"urdu", "ur", URDU + W10, 0},
   1.318 +  {"uzbek", "uz", UZBEK + W10, 0},
   1.319 +
   1.320 +  {"venda", "ve", VENDA + W10, 0},
   1.321 +  {"vietnam", "vi", VIETNAMESE + W10, 0},
   1.322 +  {"vietnamese", "vi", VIETNAMESE + W10, 0},
   1.323 +  {"volapuk", "vo", VOLAPUK + W10, 0},
   1.324 +
   1.325 +  {"welsh", "cy", WELSH + W10, 0},
   1.326 +  {"wolof", "wo", WOLOF + W10, 0},
   1.327 +
   1.328 +  {"xhosa", "xh", XHOSA + W10, ZULU - W4},
   1.329 +
   1.330 +  {"yiddish", "yi", YIDDISH + W10, 0},
   1.331 +  {"yoruba", "yo", YORUBA + W10, 0},
   1.332 +
   1.333 +  {"zh-classical", "zhT", CHINESE_T + W10, 0},
   1.334 +  {"zh-cn", "zh", CHINESE + W10, 0},
   1.335 +  {"zh-hans", "zh", CHINESE + W10, 0},
   1.336 +  {"zh-hant", "zhT", CHINESE_T + W10, 0},
   1.337 +  {"zh-hk", "zhT", CHINESE_T + W10, 0},
   1.338 +  {"zh-min-nan", "zhT", CHINESE_T + W10, 0}, // Min Nan => ChineseT
   1.339 +  {"zh-sg", "zhT", CHINESE_T + W10, 0},
   1.340 +  {"zh-tw", "zhT", CHINESE_T + W10, 0},
   1.341 +  {"zh-yue", "zh", CHINESE + W10, 0},       // Yue (Cantonese) => Chinese
   1.342 +  {"zhuang", "za", ZHUANG + W10, 0},
   1.343 +  {"zulu", "zu", ZULU + W10, XHOSA - W4},
   1.344 +};
   1.345 +
   1.346 +
   1.347 +
   1.348 +// Table to look up lang= tags of two/three characters, after truncating at the first hyphen
   1.349 +// Keep entries sorted by langtag for binary search
   1.350 +static const int kCLDTable2Size = 257;  // Number of entries in kCLDLangTagsHintTable2
   1.351 +static const LangTagLookup kCLDLangTagsHintTable2[kCLDTable2Size] = {
   1.352 +  {"aa", "aa", AFAR + W10, 0},
   1.353 +  {"ab", "ab", ABKHAZIAN + W10, 0},
   1.354 +  {"af", "af", AFRIKAANS + W10, 0},
   1.355 +  {"ak", "ak", AKAN + W10, 0},
   1.356 +  {"al", "sq", ALBANIAN + W10, 0},          // Albania
   1.357 +  {"am", "am,hy", AMHARIC + W10, ARMENIAN + W10},  // 1:2 Amharic Armenian
   1.358 +  {"ar", "ar", ARABIC + W10, 0},
   1.359 +  {"ara", "ar", ARABIC + W10, 0},
   1.360 +  {"arm", "hy", ARMENIAN + W10, 0},         // Armenia
   1.361 +  {"arz", "ar", ARABIC + W10, 0},           // Egyptian Arabic
   1.362 +  {"as", "as", ASSAMESE + W10, 0},
   1.363 +  {"at", "de", GERMAN + W10, 0},            // Austria
   1.364 +  {"au", "de", GERMAN + W10, 0},            // Austria
   1.365 +  {"ay", "ay", AYMARA + W10, 0},
   1.366 +  {"az", "az", AZERBAIJANI + W10, 0},
   1.367 +  {"aze", "az", AZERBAIJANI + W10, 0},
   1.368 +
   1.369 +  {"ba", "ba,bs", BASHKIR + W10, BOSNIAN + W10},  // 1:2  Bashkir Bosnia
   1.370 +  {"be", "be", BELARUSIAN + W10, 0},
   1.371 +  {"bel", "be", BELARUSIAN + W10, 0},
   1.372 +  {"bg", "bg", BULGARIAN + W10, 0},
   1.373 +  {"bh", "bh", BIHARI + W10, HINDI - W4},
   1.374 +  {"bi", "bi", BISLAMA + W10, 0},
   1.375 +  {"big", "zhT", CHINESE_T + W10, 0},        // Big5 encoding
   1.376 +  {"bm", "ms", MALAY + W10, INDONESIAN - W4},             // Bahasa Malaysia
   1.377 +  {"bn", "bn", BENGALI + W10, 0},
   1.378 +  {"bo", "bo", TIBETAN + W10, DZONGKHA - W4},
   1.379 +  // 1:2 Breton, Brazil country code, both Latn .br TLD enough for pt to win
   1.380 +  {"br", "br,pt", BRETON + W10, PORTUGUESE + W8}, // 1:2 Breton, Brazil
   1.381 +  {"bs", "bs", BOSNIAN + W10, 0},           // Bosnian => Bosnian
   1.382 +
   1.383 +  {"ca", "ca", CATALAN + W10, 0},
   1.384 +  {"cat", "ca", CATALAN + W10, 0},
   1.385 +  {"ch", "de,fr", GERMAN + W10, FRENCH + W10},    // 1:2 Switzerland
   1.386 +  {"chn", "zh", CHINESE + W10, 0},
   1.387 +  {"chr", "chr", CHEROKEE + W10, 0},
   1.388 +  {"ckb", "ku", KURDISH + W10, 0},          // Central Kurdish
   1.389 +  {"cn", "zh,zhT", CHINESE + W6, CHINESE_T + W4},   // Ambiguous, so weaker.
   1.390 +                                                // Offset by 2 so that TLD=tw or
   1.391 +                                                // enc=big5 will put zhT ahead
   1.392 +  {"co", "co", CORSICAN + W10, 0},
   1.393 +  {"cro", "hr", CROATIAN + W10, 0},          // Croatia
   1.394 +  {"crs", "crs", SESELWA + W10, 0},
   1.395 +  {"cs", "cs", CZECH + W10, SLOVAK - W4},
   1.396 +  {"ct", "ca", CATALAN + W10, 0},
   1.397 +  {"cy", "cy", WELSH + W10, 0},
   1.398 +  {"cym", "cy", WELSH + W10, 0},
   1.399 +  {"cz", "cs", CZECH + W10, SLOVAK - W4},
   1.400 +
   1.401 +  {"da", "da", DANISH + W10, NORWEGIAN - W4},
   1.402 +  {"dan", "da", DANISH + W10, NORWEGIAN - W4},
   1.403 +  {"de", "de", GERMAN + W10, 0},
   1.404 +  {"deu", "de", GERMAN + W10, 0},
   1.405 +  {"div", "dv", DHIVEHI + W10, 0},
   1.406 +  {"dk", "da", DANISH + W10, NORWEGIAN - W4},            // Denmark
   1.407 +  {"dut", "nl", DUTCH + W10, 0},            // Dutch
   1.408 +  {"dv", "dv", DHIVEHI + W10, 0},
   1.409 +  {"dz", "dz", DZONGKHA + W10, TIBETAN - W4},
   1.410 +
   1.411 +  {"ee", "et", ESTONIAN + W10, 0},          // Estonia
   1.412 +  {"eg", "ar", ARABIC + W10, 0},            // Egypt
   1.413 +  {"el", "el", GREEK + W10, 0},
   1.414 +  {"en", "en", ENGLISH + W4, 0},
   1.415 +  {"eng", "en", ENGLISH + W4, 0},
   1.416 +  {"eo", "eo", ESPERANTO + W10, 0},
   1.417 +  {"er", "ur", URDU + W10, 0},              // "Erdu"
   1.418 +  {"es", "es", SPANISH + W10, 0},
   1.419 +  {"esp", "es", SPANISH + W10, 0},
   1.420 +  {"est", "et", ESTONIAN + W10, 0},
   1.421 +  {"et", "et", ESTONIAN + W10, 0},
   1.422 +  {"eu", "eu", BASQUE + W10, 0},
   1.423 +
   1.424 +  {"fa", "fa", PERSIAN + W10, 0},
   1.425 +  {"far", "fa", PERSIAN + W10, 0},
   1.426 +  {"fi", "fi", FINNISH + W10, 0},
   1.427 +  {"fil", "tl", TAGALOG + W10, 0},          // Philippines
   1.428 +  {"fj", "fj", FIJIAN + W10, 0},
   1.429 +  {"fo", "fo", FAROESE + W10, ICELANDIC - W4},
   1.430 +  {"fr", "fr", FRENCH + W10, 0},
   1.431 +  {"fra", "fr", FRENCH + W10, 0},
   1.432 +  {"fre", "fr", FRENCH + W10, 0},
   1.433 +  {"fy", "fy", FRISIAN + W10, 0},
   1.434 +
   1.435 +  {"ga", "ga,gl", IRISH + W10, GALICIAN + W10},       // 1:2 Irish, Galician
   1.436 +  {"gae", "gd,ga", SCOTS_GAELIC + W10, IRISH + W10},  // 1:2 Gaelic, either
   1.437 +  {"gal", "gl", GALICIAN + W10, 0},
   1.438 +  {"gb", "zh", CHINESE + W10, 0},           // GB2312 encoding
   1.439 +  {"gbk", "zh", CHINESE + W10, 0},          // GBK encoding
   1.440 +  {"gd", "gd", SCOTS_GAELIC + W10, 0},
   1.441 +  {"ge", "ka", GEORGIAN + W10, 0},          // Georgia
   1.442 +  {"geo", "ka", GEORGIAN + W10, 0},
   1.443 +  {"ger", "de", GERMAN + W10, 0},
   1.444 +  {"gl", "gl", GALICIAN + W10, 0},          // Also Greenland; hard to confuse
   1.445 +  {"gn", "gn", GUARANI + W10, 0},
   1.446 +  {"gr", "el", GREEK + W10, 0},             // Greece
   1.447 +  {"gu", "gu", GUJARATI + W10, 0},
   1.448 +  {"gv", "gv", MANX + W10, 0},
   1.449 +
   1.450 +  {"ha", "ha", HAUSA + W10, 0},
   1.451 +  {"hat", "ht", HAITIAN_CREOLE + W10, 0},   // Haiti
   1.452 +  {"haw", "haw", HAWAIIAN + W10, 0},
   1.453 +  {"hb", "iw", HEBREW + W10, 0},
   1.454 +  {"he", "iw", HEBREW + W10, 0},
   1.455 +  {"heb", "iw", HEBREW + W10, 0},
   1.456 +  {"hi", "hi", HINDI + W10, MARATHI - W4},
   1.457 +  {"hk", "zhT", CHINESE_T + W10, 0},          // Hong Kong
   1.458 +  {"hr", "hr", CROATIAN + W10, 0},
   1.459 +  {"ht", "ht", HAITIAN_CREOLE + W10, 0},
   1.460 +  {"hu", "hu", HUNGARIAN + W10, 0},
   1.461 +  {"hun", "hu", HUNGARIAN + W10, 0},
   1.462 +  {"hy", "hy", ARMENIAN + W10, 0},
   1.463 +
   1.464 +  {"ia", "ia", INTERLINGUA + W10, 0},
   1.465 +  {"ice", "is", ICELANDIC + W10, FAROESE - W4},        // Iceland
   1.466 +  {"id", "id", INDONESIAN + W10, MALAY - W4},
   1.467 +  {"ids", "id", INDONESIAN + W10, MALAY - W4},
   1.468 +  {"ie", "ie", INTERLINGUE + W10, 0},
   1.469 +  {"ig", "ig", IGBO + W10, 0},
   1.470 +  // 1:2 iu-Cans ik-Latn
   1.471 +  {"ik", "ik,iu", INUPIAK + W10, INUKTITUT + W10},        // 1:2
   1.472 +  {"in", "id", INDONESIAN + W10, MALAY - W4},
   1.473 +  {"ind", "id", INDONESIAN + W10, MALAY - W4},       // Indonesia
   1.474 +  {"inu", "iu,ik", INUKTITUT + W10, INUPIAK + W10},       // 1:2
   1.475 +  {"is", "is", ICELANDIC + W10, FAROESE - W4},
   1.476 +  {"it", "it", ITALIAN + W10, 0},
   1.477 +  {"ita", "it", ITALIAN + W10, 0},
   1.478 +  {"iu", "iu,ik", INUKTITUT + W10, INUPIAK + W10},        // 1:2
   1.479 +  {"iw", "iw", HEBREW + W10, 0},
   1.480 +
   1.481 +  {"ja", "ja", JAPANESE + W10, 0},
   1.482 +  {"jp", "ja", JAPANESE + W10, 0},          // Japan
   1.483 +  {"jpn", "ja", JAPANESE + W10, 0},
   1.484 +  {"jv", "jw", JAVANESE + W10, 0},
   1.485 +  {"jw", "jw", JAVANESE + W10, 0},
   1.486 +
   1.487 +  {"ka", "ka", GEORGIAN + W10, 0},
   1.488 +  {"kc", "qu", QUECHUA + W10, 0},           // (K)Quechua
   1.489 +  {"kg", "ky", KYRGYZ + W10, 0},            // Kyrgyzstan
   1.490 +  {"kh", "km", KHMER + W10, 0},             // Country code Khmer (Cambodia)
   1.491 +  {"kha", "kha", KHASI + W10, 0},
   1.492 +  {"kk", "kk", KAZAKH + W10, 0},            // Kazakh
   1.493 +  {"kl", "kl", GREENLANDIC + W10, 0},
   1.494 +  {"km", "km", KHMER + W10, 0},
   1.495 +  {"kn", "kn", KANNADA + W10, 0},
   1.496 +  {"ko", "ko", KOREAN + W10, 0},
   1.497 +  {"kor", "ko", KOREAN + W10, 0},
   1.498 +  {"kr", "ko", KOREAN + W10, 0},            // Country code Korea
   1.499 +  {"ks", "ks", KASHMIRI + W10, 0},
   1.500 +  {"ksc", "ko", KOREAN + W10, 0},           // KSC encoding
   1.501 +  {"ku", "ku", KURDISH + W10, 0},
   1.502 +  {"ky", "ky", KYRGYZ + W10, 0},
   1.503 +  {"kz", "kk", KAZAKH + W10, 0},            // Kazakhstan
   1.504 +  {"la", "la", LATIN + W10, 0},
   1.505 +  {"lao", "lo", LAOTHIAN + W10, 0},         // Laos
   1.506 +
   1.507 +  {"lb", "lb", LUXEMBOURGISH + W10, 0},
   1.508 +  {"lg", "lg", GANDA + W10, 0},
   1.509 +  {"lit", "lt", LITHUANIAN + W10, 0},
   1.510 +  {"ln", "ln", LINGALA + W10, 0},
   1.511 +  {"lo", "lo", LAOTHIAN + W10, 0},
   1.512 +  {"lt", "lt", LITHUANIAN + W10, 0},
   1.513 +  {"ltu", "lt", LITHUANIAN + W10, 0},
   1.514 +  {"lv", "lv", LATVIAN + W10, 0},
   1.515 +
   1.516 +  {"mfe", "mfe", MAURITIAN_CREOLE + W10, 0},
   1.517 +  {"mg", "mg", MALAGASY + W10, 0},
   1.518 +  {"mi", "mi", MAORI + W10, 0},
   1.519 +  {"mk", "mk", MACEDONIAN + W10, 0},
   1.520 +  {"ml", "ml", MALAYALAM + W10, 0},
   1.521 +  {"mn", "mn", MONGOLIAN + W10, 0},
   1.522 +  {"mo", "mo", ROMANIAN + W10, 0},
   1.523 +  {"mon", "mn", MONGOLIAN + W10, 0},        // Mongolian
   1.524 +  {"mr", "mr", MARATHI + W10, HINDI - W4},
   1.525 +  {"ms", "ms", MALAY + W10, INDONESIAN - W4},
   1.526 +  {"mt", "mt", MALTESE + W10, 0},
   1.527 +  {"mx", "es", SPANISH + W10, 0},           // Mexico
   1.528 +  {"my", "my,ms", BURMESE + W10, MALAY + W10}, // Myanmar, Malaysia
   1.529 +
   1.530 +  {"na", "na", NAURU + W10, 0},
   1.531 +  {"nb", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
   1.532 +  {"ne", "ne", NEPALI + W10, 0},
   1.533 +  {"nl", "nl", DUTCH + W10, 0},
   1.534 +  {"nn", "nn", NORWEGIAN_N + W10, NORWEGIAN - W4},
   1.535 +  {"no", "no", NORWEGIAN + W10, NORWEGIAN_N - W4},
   1.536 +  {"nr", "nr", NDEBELE + W10, 0},
   1.537 +  {"nso", "nso", PEDI + W10, 0},
   1.538 +  {"ny", "ny", NYANJA + W10, 0},
   1.539 +
   1.540 +  {"oc", "oc", OCCITAN + W10, 0},
   1.541 +  {"om", "om", OROMO + W10, 0},
   1.542 +  {"or", "or", ORIYA + W10, 0},
   1.543 +
   1.544 +  {"pa", "pa,ps", PUNJABI + W10, PASHTO + W10},   // 1:2 pa-Guru ps-Arab
   1.545 +  {"per", "fa", PERSIAN + W10, 0},
   1.546 +  {"ph", "tl", TAGALOG + W10, 0},           // Philippines
   1.547 +  {"pk", "ur", URDU + W10, 0},              // Pakistan
   1.548 +  {"pl", "pl", POLISH + W10, 0},
   1.549 +  {"pnb", "pa", PUNJABI + W10, 0},          // Western Punjabi
   1.550 +  {"pol", "pl", POLISH + W10, 0},
   1.551 +  {"por", "pt", PORTUGUESE + W10, 0},
   1.552 +  {"ps", "ps", PASHTO + W10, 0},
   1.553 +  {"pt", "pt", PORTUGUESE + W10, 0},
   1.554 +  {"ptg", "pt", PORTUGUESE + W10, 0},
   1.555 +  {"qc", "fr", FRENCH + W10, 0},            // Quebec "country" code
   1.556 +  {"qu", "qu", QUECHUA + W10, 0},
   1.557 +
   1.558 +  {"rm", "rm", RHAETO_ROMANCE + W10, 0},
   1.559 +  {"rn", "rn", RUNDI + W10, 0},
   1.560 +  {"ro", "ro", ROMANIAN + W10, 0},
   1.561 +  {"rs", "sr", SERBIAN + W10, 0},           // Serbia country code
   1.562 +  {"ru", "ru", RUSSIAN + W10, 0},
   1.563 +  {"rus", "ru", RUSSIAN + W10, 0},
   1.564 +  {"rw", "rw", KINYARWANDA + W10, 0},
   1.565 +
   1.566 +  {"sa", "sa", SANSKRIT + W10, 0},
   1.567 +  {"sco", "sco", SCOTS + W10, ENGLISH - W4},
   1.568 +  {"sd", "sd", SINDHI + W10, 0},
   1.569 +  {"se", "sv", SWEDISH + W10, 0},
   1.570 +  {"sg", "sg", SANGO + W10, 0},
   1.571 +  {"si", "si,sl", SINHALESE + W10, SLOVENIAN + W10},  // 1:2 Sinhalese, Slovenia
   1.572 +  {"sk", "sk", SLOVAK + W10, CZECH - W4},
   1.573 +  {"sl", "sl", SLOVENIAN + W10, 0},
   1.574 +  {"slo", "sl", SLOVENIAN + W10, 0},
   1.575 +  {"sm", "sm", SAMOAN + W10, 0},
   1.576 +  {"sn", "sn", SHONA + W10, 0},
   1.577 +  {"so", "so", SOMALI + W10, 0},
   1.578 +  {"sp", "es", SPANISH + W10, 0},
   1.579 +  {"sq", "sq", ALBANIAN + W10, 0},
   1.580 +  {"sr", "sr", SERBIAN + W10, 0},
   1.581 +  {"srb", "sr", SERBIAN + W10, 0},
   1.582 +  {"srl", "sr", SERBIAN + W10, 0},          // Serbian Latin
   1.583 +  {"srp", "sr", SERBIAN + W10, 0},
   1.584 +  {"ss", "ss", SISWANT + W10, 0},
   1.585 +  {"st", "st", SESOTHO + W10, 0},
   1.586 +  {"su", "su", SUNDANESE + W10, 0},
   1.587 +  {"sv", "sv", SWEDISH + W10, 0},
   1.588 +  {"sve", "sv", SWEDISH + W10, 0},
   1.589 +  {"sw", "sw", SWAHILI + W10, 0},
   1.590 +  {"swe", "sv", SWEDISH + W10, 0},
   1.591 +  {"sy", "syr", SYRIAC + W10, 0},
   1.592 +  {"syr", "syr", SYRIAC + W10, 0},
   1.593 +
   1.594 +  {"ta", "ta", TAMIL + W10, 0},
   1.595 +  {"te", "te", TELUGU + W10, 0},
   1.596 +  {"tg", "tg", TAJIK + W10, 0},
   1.597 +  {"th", "th", THAI + W10, 0},
   1.598 +  {"ti", "ti,bo", TIGRINYA + W10, TIBETAN + W10},    // 1:2 Tigrinya, Tibet
   1.599 +  {"tj", "tg", TAJIK + W10, 0},             // Tajikistan
   1.600 +  {"tk", "tk", TURKMEN + W10, 0},
   1.601 +  {"tl", "tl", TAGALOG + W10, 0},
   1.602 +  {"tlh", "tlh", X_KLINGON + W10, 0},
   1.603 +  {"tn", "tn", TSWANA + W10, 0},
   1.604 +  {"to", "to", TONGA + W10, 0},
   1.605 +  {"tr", "tr", TURKISH + W10, 0},
   1.606 +  {"ts", "ts", TSONGA + W10, 0},
   1.607 +  {"tt", "tt", TATAR + W10, 0},
   1.608 +  {"tw", "ak,zhT", AKAN + W10, CHINESE_T + W10},   // 1:2 Twi => Akan, Taiwan
   1.609 +  {"twi", "ak", AKAN + W10, 0},             // Twi => Akan
   1.610 +
   1.611 +  {"ua", "uk", UKRAINIAN + W10, 0},         // Ukraine
   1.612 +  {"ug", "ug", UIGHUR + W10, 0},
   1.613 +  {"uk", "uk", UKRAINIAN + W10, 0},
   1.614 +  {"ur", "ur", URDU + W10, 0},
   1.615 +  {"uz", "uz", UZBEK + W10, 0},
   1.616 +
   1.617 +  {"va", "ca", CATALAN + W10, 0},           // Valencia => Catalan
   1.618 +  {"val", "ca", CATALAN + W10, 0},          // Valencia => Catalan
   1.619 +  {"ve", "ve", VENDA + W10, 0},
   1.620 +  {"vi", "vi", VIETNAMESE + W10, 0},
   1.621 +  {"vie", "vi", VIETNAMESE + W10, 0},
   1.622 +  {"vn", "vi", VIETNAMESE + W10, 0},
   1.623 +  {"vo", "vo", VOLAPUK + W10, 0},
   1.624 +
   1.625 +  {"wo", "wo", WOLOF + W10, 0},
   1.626 +
   1.627 +  {"xh", "xh", XHOSA + W10, ZULU - W4},
   1.628 +  {"xho", "xh", XHOSA + W10, ZULU - W4},
   1.629 +
   1.630 +  {"yi", "yi", YIDDISH + W10, 0},
   1.631 +  {"yo", "yo", YORUBA + W10, 0},
   1.632 +
   1.633 +  {"za", "za", ZHUANG + W10, 0},
   1.634 +  {"zh", "zh", CHINESE + W10, 0},
   1.635 +  {"zht", "zhT", CHINESE_T + W10, 0},
   1.636 +  {"zu", "zu", ZULU + W10, XHOSA - W4},
   1.637 +};
   1.638 +
   1.639 +
   1.640 +// Possibly map to tl:
   1.641 +// -LangTags tl-Latn /7val.com/ ,bcl 2 Central Bicolano
   1.642 +// -LangTags tl-Latn /7val.com/ ,ceb 6 Cebuano
   1.643 +// -LangTags tl-Latn /7val.com/ ,war 1 Waray
   1.644 +
   1.645 +
   1.646 +
   1.647 +// Table to look up country TLD (no general TLD)
   1.648 +// In alphabetical order for binary search
   1.649 +static const int kCLDTable3Size = 181;
   1.650 +static const TLDLookup kCLDTLDHintTable[kCLDTable3Size] = {
   1.651 +  {"ac", JAPANESE + W2, 0},
   1.652 +  {"ad", CATALAN + W4, 0},
   1.653 +  {"ae", ARABIC + W4, 0},
   1.654 +  {"af", PASHTO + W4, PERSIAN + W4},
   1.655 +  {"ag", GERMAN + W2, 0},                // meager
   1.656 +  // {"ai", 0, 0},                          // meager
   1.657 +  {"al", ALBANIAN + W4, 0},
   1.658 +  {"am", ARMENIAN + W4, 0},
   1.659 +  {"an", DUTCH + W4, 0},                 // meager
   1.660 +  {"ao", PORTUGUESE + W4, 0},
   1.661 +  // {"aq", 0, 0},                          // meager
   1.662 +  {"ar", SPANISH + W4, 0},
   1.663 +  // {"as", 0, 0},
   1.664 +  {"at", GERMAN + W4, 0},
   1.665 +  {"au", ENGLISH + W2, 0},
   1.666 +  {"aw", DUTCH + W4, 0},
   1.667 +  {"ax", SWEDISH + W4, 0},
   1.668 +  {"az", AZERBAIJANI + W4, 0},
   1.669 +
   1.670 +  {"ba", BOSNIAN + W8, CROATIAN - W4},
   1.671 +  // {"bb", 0, 0},
   1.672 +  {"bd", BENGALI + W4, 0},
   1.673 +  {"be", DUTCH + W4, FRENCH + W4},
   1.674 +  {"bf", FRENCH + W4, 0},
   1.675 +  {"bg", BULGARIAN + W4, 0},
   1.676 +  {"bh", ARABIC + W4, 0},
   1.677 +  {"bi", RUNDI + W4, FRENCH + W4},
   1.678 +  {"bj", FRENCH + W4, 0},
   1.679 +  {"bm", ENGLISH + W2, 0},
   1.680 +  {"bn", MALAY + W4, INDONESIAN - W4},
   1.681 +  {"bo", SPANISH + W4, AYMARA + W2},   // and GUARANI QUECHUA
   1.682 +  {"br", PORTUGUESE + W4, 0},
   1.683 +  // {"bs", 0, 0},
   1.684 +  {"bt", DZONGKHA + W10, TIBETAN - W10},      // Strong presumption of Dzongha
   1.685 +  {"bw", TSWANA + W4, 0},
   1.686 +  {"by", BELARUSIAN + W4, 0},
   1.687 +  // {"bz", 0, 0},
   1.688 +
   1.689 +  {"ca", FRENCH + W4, ENGLISH + W2},
   1.690 +  {"cat", CATALAN + W4, 0},
   1.691 +  {"cc", 0, 0},
   1.692 +  {"cd", FRENCH + W4, 0},
   1.693 +  {"cf", FRENCH + W4, 0},
   1.694 +  {"cg", FRENCH + W4, 0},
   1.695 +  {"ch", GERMAN + W4, FRENCH + W4},
   1.696 +  {"ci", FRENCH + W4, 0},
   1.697 +  // {"ck", 0, 0},
   1.698 +  {"cl", SPANISH + W4, 0},
   1.699 +  {"cm", FRENCH + W4, 0},
   1.700 +  {"cn", CHINESE + W4, 0},
   1.701 +  {"co", SPANISH + W4, 0},
   1.702 +  {"cr", SPANISH + W4, 0},
   1.703 +  {"cu", SPANISH + W4, 0},
   1.704 +  {"cv", PORTUGUESE + W4, 0},
   1.705 +  // {"cx", 0, 0},
   1.706 +  {"cy", GREEK + W4, TURKISH + W4},
   1.707 +  {"cz", CZECH + W4, SLOVAK - W4},
   1.708 +
   1.709 +  {"de", GERMAN + W4, 0},
   1.710 +  {"dj", 0, 0},
   1.711 +  {"dk", DANISH + W4, NORWEGIAN - W4},
   1.712 +  {"dm", 0, 0},
   1.713 +  {"do", SPANISH + W4, 0},
   1.714 +  {"dz", FRENCH + W4, ARABIC + W4},
   1.715 +
   1.716 +  {"ec", SPANISH + W4, 0},
   1.717 +  {"ee", ESTONIAN + W4, 0},
   1.718 +  {"eg", ARABIC + W4, 0},
   1.719 +  {"er", AFAR + W4, 0},
   1.720 +  {"es", SPANISH + W4, 0},
   1.721 +  {"et", AMHARIC + W4, AFAR + W4},
   1.722 +
   1.723 +  {"fi", FINNISH + W4, 0},
   1.724 +  {"fj", FIJIAN + W4, 0},
   1.725 +  // {"fk", 0, 0},
   1.726 +  // {"fm", 0, 0},
   1.727 +  {"fo", FAROESE + W4, ICELANDIC - W4},
   1.728 +  {"fr", FRENCH + W4, 0},
   1.729 +
   1.730 +  {"ga", FRENCH + W4, 0},
   1.731 +  {"gd", 0, 0},
   1.732 +  {"ge", GEORGIAN + W4, 0},
   1.733 +  {"gf", FRENCH + W4, 0},
   1.734 +  // {"gg", 0, 0},
   1.735 +  // {"gh", 0, 0},
   1.736 +  // {"gi", 0, 0},
   1.737 +  {"gl", GREENLANDIC + W4, DANISH + W4},
   1.738 +  // {"gm", 0, 0},
   1.739 +  {"gn", FRENCH + W4, 0},
   1.740 +  // {"gp", 0, 0},
   1.741 +  // {"gq", 0, 0},
   1.742 +  {"gr", GREEK + W4, 0},
   1.743 +  // {"gs", 0, 0},
   1.744 +  {"gt", SPANISH + W4, 0},
   1.745 +  // {"gu", 0, 0},
   1.746 +  // {"gy", 0, 0},
   1.747 +
   1.748 +  {"hk", CHINESE_T + W4, 0},
   1.749 +  // {"hm", 0, 0},
   1.750 +  {"hn", SPANISH + W4, 0},
   1.751 +  {"hr", CROATIAN + W8, BOSNIAN - W4},
   1.752 +  {"ht", HAITIAN_CREOLE + W4, FRENCH + W4},
   1.753 +  {"hu", HUNGARIAN + W4, 0},
   1.754 +
   1.755 +  {"id", INDONESIAN + W4, MALAY - W4},
   1.756 +  {"ie", IRISH + W4, 0},
   1.757 +  {"il", HEBREW + W4, 0},
   1.758 +  {"im", MANX + W4, 0},
   1.759 +  // {"in", 0, 0},
   1.760 +  // {"io", 0, 0},
   1.761 +  {"iq", ARABIC + W4, 0},
   1.762 +  {"ir", PERSIAN + W4, 0},
   1.763 +  {"is", ICELANDIC + W4, FAROESE - W4},
   1.764 +  {"it", ITALIAN + W4, 0},
   1.765 +
   1.766 +  // {"je", 0, 0},
   1.767 +  // {"jm", 0, 0},
   1.768 +  {"jo", ARABIC + W4, 0},
   1.769 +  {"jp", JAPANESE + W4, 0},
   1.770 +
   1.771 +  // {"ke", 0, 0},
   1.772 +  {"kg", KYRGYZ + W4, 0},
   1.773 +  {"kh", KHMER + W4, 0},
   1.774 +  // {"ki", 0, 0},
   1.775 +  {"km", FRENCH + W4, 0},
   1.776 +  // {"kn", 0, 0},
   1.777 +  {"kp", KOREAN + W4, 0},
   1.778 +  {"kr", KOREAN + W4, 0},
   1.779 +  {"kw", ARABIC + W4, 0},
   1.780 +  // {"ky", 0, 0},
   1.781 +  {"kz", KAZAKH + W4, 0},
   1.782 +
   1.783 +  {"la", LAOTHIAN + W4, 0},
   1.784 +  {"lb", ARABIC + W4, FRENCH + W4},
   1.785 +  // {"lc", 0, 0},
   1.786 +  {"li", GERMAN + W4, 0},
   1.787 +  {"lk", SINHALESE + W4, 0},
   1.788 +  // {"lr", 0, 0},
   1.789 +  {"ls", SESOTHO + W4, 0},
   1.790 +  {"lt", LITHUANIAN + W4, 0},
   1.791 +  {"lu", LUXEMBOURGISH + W4},
   1.792 +  {"lv", LATVIAN + W4, 0},
   1.793 +  {"ly", ARABIC + W4, 0},
   1.794 +
   1.795 +  {"ma", FRENCH + W4, 0},
   1.796 +  {"mc", FRENCH + W4, 0},
   1.797 +  {"md", ROMANIAN + W4, 0},
   1.798 +  {"me", MONTENEGRIN + W8, SERBIAN - W4},
   1.799 +  {"mg", FRENCH + W4, 0},
   1.800 +  {"mk", MACEDONIAN + W4, 0},
   1.801 +  {"ml", FRENCH + W4, 0},
   1.802 +  {"mm", BURMESE + W4, 0},
   1.803 +  {"mn", MONGOLIAN + W4, 0},
   1.804 +  {"mo", CHINESE_T + W4, PORTUGUESE + W4},
   1.805 +  // {"mp", 0, 0},
   1.806 +  {"mq", FRENCH + W4, 0},
   1.807 +  {"mr", FRENCH + W4, ARABIC + W4},
   1.808 +  // {"ms", 0, 0},
   1.809 +  {"mt", MALTESE + W4, 0},
   1.810 +  // {"mu", 0, 0},
   1.811 +  {"mv", DHIVEHI + W4, 0},
   1.812 +  // {"mw", 0, 0},
   1.813 +  {"mx", SPANISH + W4, 0},
   1.814 +  {"my", MALAY + W4, INDONESIAN - W4},
   1.815 +  {"mz", PORTUGUESE + W4, 0},
   1.816 +
   1.817 +  {"na", 0, 0},            // Namibia
   1.818 +  {"nc", FRENCH + W4, 0},
   1.819 +  {"ne", FRENCH + W4, 0},
   1.820 +  {"nf", FRENCH + W4, 0},
   1.821 +  // {"ng", 0, 0},
   1.822 +  {"ni", SPANISH + W4, 0},
   1.823 +  {"nl", DUTCH + W4, 0},
   1.824 +  {"no", NORWEGIAN + W4, NORWEGIAN_N + W2},
   1.825 +  {"np", NEPALI + W4, 0},
   1.826 +  {"nr", NAURU + W4, 0},
   1.827 +  {"nu", SWEDISH + W4, 0},
   1.828 +  {"nz", MAORI + W4, ENGLISH + W2},
   1.829 +
   1.830 +  {"om", ARABIC + W4, 0},
   1.831 +
   1.832 +  {"pa", SPANISH + W4, 0},
   1.833 +  {"pe", SPANISH + W4, QUECHUA + W2},   // also AYMARA
   1.834 +  {"pf", FRENCH + W4, 0},
   1.835 +  // {"pg", 0, 0},
   1.836 +  {"ph", TAGALOG + W4, 0},
   1.837 +  {"pk", URDU + W4, 0},
   1.838 +  {"pl", POLISH + W4, 0},
   1.839 +  // {"pn", 0, 0},
   1.840 +  {"pr", SPANISH + W4, 0},
   1.841 +  {"ps", ARABIC + W4, 0},
   1.842 +  {"pt", PORTUGUESE + W4, 0},
   1.843 +  {"py", SPANISH + W4, GUARANI + W2},
   1.844 +
   1.845 +  {"qa", ARABIC + W4, 0},
   1.846 +
   1.847 +  {"re", FRENCH + W4, 0},
   1.848 +  {"ro", ROMANIAN + W4, 0},
   1.849 +  {"rs", SERBIAN + W8, MONTENEGRIN - W4},
   1.850 +  {"ru", RUSSIAN + W4, 0},
   1.851 +  {"rw", KINYARWANDA + W4, FRENCH + W2},
   1.852 +
   1.853 +  {"sa", ARABIC + W4, 0},
   1.854 +  // {"sb", 0, 0},
   1.855 +  {"sc", SESELWA + W4, 0},
   1.856 +  {"sd", ARABIC + W4, 0},
   1.857 +  {"se", SWEDISH + W4, 0},
   1.858 +  // {"sg", 0, 0},
   1.859 +  // {"sh", 0, 0},
   1.860 +  {"si", SLOVENIAN + W4, 0},
   1.861 +  {"sk", SLOVAK + W4, CZECH - W4},
   1.862 +  // {"sl", 0, 0},
   1.863 +  {"sm", ITALIAN + W4, 0},
   1.864 +  {"sn", FRENCH + W4, 0},
   1.865 +  // {"sr", 0, 0},
   1.866 +  {"ss", ARABIC + W4, 0},     // Presumed South Sudan TLD. dsites 2011.07.07
   1.867 +  // {"st", 0, 0},
   1.868 +  {"su", RUSSIAN + W4, 0},
   1.869 +  {"sv", SPANISH + W4, 0},
   1.870 +  {"sy", ARABIC + W4, 0},
   1.871 +  // {"sz", 0, 0},
   1.872 +
   1.873 +  // {"tc", 0, 0},
   1.874 +  {"td", FRENCH + W4, 0},
   1.875 +  // {"tf", 0, 0},
   1.876 +  {"tg", FRENCH + W4, 0},
   1.877 +  {"th", THAI + W4, 0},
   1.878 +                              // Tibet has no country code (see .cn)
   1.879 +  {"tj", TAJIK + W4, 0},
   1.880 +  // {"tk", 0, 0},
   1.881 +  // {"tl", 0, 0},
   1.882 +  {"tm", TURKISH + W4, 0},
   1.883 +  {"tn", FRENCH + W4, ARABIC + W4},
   1.884 +  // {"to", 0, 0},
   1.885 +  {"tp", JAPANESE + W4, 0},
   1.886 +  {"tr", TURKISH + W4, 0},
   1.887 +  // {"tt", 0, 0},
   1.888 +  // {"tv", 0, 0},
   1.889 +  {"tw", CHINESE_T + W4, 0},
   1.890 +  {"tz", SWAHILI + W4, AKAN + W4},
   1.891 +
   1.892 +  {"ua", UKRAINIAN + W4, 0},
   1.893 +  {"ug", GANDA + W4, 0},
   1.894 +  {"uk", ENGLISH + W2, 0},
   1.895 +  {"us", ENGLISH + W2, 0},
   1.896 +  {"uy", SPANISH + W4, 0},
   1.897 +  {"uz", UZBEK + W4, 0},
   1.898 +
   1.899 +  {"va", ITALIAN + W4, LATIN + W2},
   1.900 +  // {"vc", 0, 0},
   1.901 +  {"ve", SPANISH + W4, 0},
   1.902 +  // {"vg", 0, 0},
   1.903 +  // {"vi", 0, 0},
   1.904 +  {"vn", VIETNAMESE + W4, 0},
   1.905 +  // {"vu", 0, 0},
   1.906 +
   1.907 +  {"wf", FRENCH + W4, 0},
   1.908 +  // {"ws", 0, 0},
   1.909 +
   1.910 +  {"ye", ARABIC + W4, 0},
   1.911 +
   1.912 +  {"za", AFRIKAANS + W4, 0},
   1.913 +  // {"zm", 0, 0},
   1.914 +  // {"zw", 0, 0},
   1.915 +};
   1.916 +
// Retire the W* weight shorthands used by the hint tables above.
#undef W2
#undef W4
#undef W6
#undef W8
#undef W10
#undef W12
   1.923 +
   1.924 +
   1.925 +
   1.926 +
   1.927 +
   1.928 +inline void SetCLDPriorWeight(int w, OneCLDLangPrior* olp) {
   1.929 +  *olp = (*olp & 0x3ff) + (w << 10);
   1.930 +}
   1.931 +inline void SetCLDPriorLang(Language lang, OneCLDLangPrior* olp) {
   1.932 +  *olp = (*olp & ~0x3ff) + lang;
   1.933 +}
   1.934 +
   1.935 +OneCLDLangPrior PackCLDPriorLangWeight(Language lang, int w) {
   1.936 +  return (w << 10) + lang;
   1.937 +}
   1.938 +
   1.939 +inline int MaxInt(int a, int b) {
   1.940 +  return (a >= b) ? a : b;
   1.941 +}
   1.942 +
   1.943 +// Merge in another language prior, taking max if already there
   1.944 +void MergeCLDLangPriorsMax(OneCLDLangPrior olp, CLDLangPriors* lps) {
   1.945 +  if (olp == 0) {return;}
   1.946 +  Language target_lang = GetCLDPriorLang(olp);
   1.947 +  for (int i = 0; i < lps->n; ++i) {
   1.948 +    if (GetCLDPriorLang(lps->prior[i]) == target_lang) {
   1.949 +      int new_weight = MaxInt(GetCLDPriorWeight(lps->prior[i]),
   1.950 +                              GetCLDPriorWeight(olp));
   1.951 +      SetCLDPriorWeight(new_weight, &lps->prior[i]);
   1.952 +      return;
   1.953 +    }
   1.954 +  }
   1.955 +  // Not found; add it if room
   1.956 +  if (lps->n >= kMaxOneCLDLangPrior) {return;}
   1.957 +  lps->prior[lps->n++] = olp;
   1.958 +}
   1.959 +
   1.960 +// Merge in another language prior, boosting 10x if already there
   1.961 +void MergeCLDLangPriorsBoost(OneCLDLangPrior olp, CLDLangPriors* lps) {
   1.962 +  if (olp == 0) {return;}
   1.963 +  Language target_lang = GetCLDPriorLang(olp);
   1.964 +  for (int i = 0; i < lps->n; ++i) {
   1.965 +    if (GetCLDPriorLang(lps->prior[i]) == target_lang) {
   1.966 +      int new_weight = GetCLDPriorWeight(lps->prior[i]) + 2;
   1.967 +      SetCLDPriorWeight(new_weight, &lps->prior[i]);
   1.968 +      return;
   1.969 +    }
   1.970 +  }
   1.971 +  // Not found; add it if room
   1.972 +  if (lps->n >= kMaxOneCLDLangPrior) {return;}
   1.973 +  lps->prior[lps->n++] = olp;
   1.974 +}
   1.975 +
   1.976 +
   1.977 +// Trim language priors to no more than max_entries, keeping largest abs weights
   1.978 +void TrimCLDLangPriors(int max_entries, CLDLangPriors* lps) {
   1.979 +  if (lps->n <= max_entries) {return;}
   1.980 +
   1.981 +  // Insertion sort in-place by abs(weight)
   1.982 +  for (int i = 0; i < lps->n; ++i) {
   1.983 +    OneCLDLangPrior temp_olp = lps->prior[i];
   1.984 +    int w = abs(GetCLDPriorWeight(temp_olp));
   1.985 +    int kk = i;
   1.986 +    for (; kk > 0; --kk) {
   1.987 +      if (abs(GetCLDPriorWeight(lps->prior[kk - 1])) < w) {
   1.988 +        // Move down and continue
   1.989 +        lps->prior[kk] = lps->prior[kk - 1];
   1.990 +      } else {
   1.991 +        // abs(weight[kk - 1]) >= w, time to stop
   1.992 +        break;
   1.993 +      }
   1.994 +    }
   1.995 +    lps->prior[kk] = temp_olp;
   1.996 +  }
   1.997 +
   1.998 +  lps->n = max_entries;
   1.999 +}
  1.1000 +
  1.1001 +int CountCommas(const string& langtags) {
  1.1002 +  int commas = 0;
  1.1003 +  for (int i = 0; i < static_cast<int>(langtags.size()); ++i) {
  1.1004 +    if (langtags[i] == ',') {++commas;}
  1.1005 +  }
  1.1006 +  return commas;
  1.1007 +}
  1.1008 +
  1.1009 +// Binary lookup on language tag
  1.1010 +const LangTagLookup* DoLangTagLookup(const char* key,
  1.1011 +                                     const LangTagLookup* tbl, int tbl_size) {
  1.1012 +  // Key is always in range [lo..hi)
  1.1013 +  int lo = 0;
  1.1014 +  int hi = tbl_size;
  1.1015 +  while (lo < hi) {
  1.1016 +    int mid = (lo + hi) >> 1;
  1.1017 +    int comp = strcmp(tbl[mid].langtag, key);
  1.1018 +    if (comp < 0) {
  1.1019 +      lo = mid + 1;
  1.1020 +    } else if (comp > 0) {
  1.1021 +      hi = mid;
  1.1022 +    } else {
  1.1023 +      return &tbl[mid];
  1.1024 +    }
  1.1025 +  }
  1.1026 +  return NULL;
  1.1027 +}
  1.1028 +
  1.1029 +// Binary lookup on tld
  1.1030 +const TLDLookup* DoTLDLookup(const char* key,
  1.1031 +                             const TLDLookup* tbl, int tbl_size) {
  1.1032 +  // Key is always in range [lo..hi)
  1.1033 +  int lo = 0;
  1.1034 +  int hi = tbl_size;
  1.1035 +  while (lo < hi) {
  1.1036 +    int mid = (lo + hi) >> 1;
  1.1037 +    int comp = strcmp(tbl[mid].tld, key);
  1.1038 +    if (comp < 0) {
  1.1039 +      lo = mid + 1;
  1.1040 +    } else if (comp > 0) {
  1.1041 +      hi = mid;
  1.1042 +    } else {
  1.1043 +      return &tbl[mid];
  1.1044 +    }
  1.1045 +  }
  1.1046 +  return NULL;
  1.1047 +}
  1.1048 +
  1.1049 +
  1.1050 +
  1.1051 +// Trim language tag string to canonical form for each language
  1.1052 +// Input is from GetLangTagsFromHtml(), already lowercased
  1.1053 +string TrimCLDLangTagsHint(const string& langtags) {
  1.1054 +  string retval;
  1.1055 +  if (langtags.empty()) {return retval;}
  1.1056 +  int commas = CountCommas(langtags);
  1.1057 +  if (commas > 4) {return retval;}       // Ignore if too many language tags
  1.1058 +
  1.1059 +  char temp[20];
  1.1060 +  int pos = 0;
  1.1061 +  while (pos < static_cast<int>(langtags.size())) {
  1.1062 +    int comma = langtags.find(',', pos);
  1.1063 +    if (comma == string::npos) {comma = langtags.size();} // fake trailing comma
  1.1064 +    int len = comma - pos;
  1.1065 +    if (len <= 16) {
  1.1066 +      // Short enough to use
  1.1067 +      memcpy(temp, &langtags[pos], len);
  1.1068 +      temp[len] = '\0';
  1.1069 +      const LangTagLookup* entry = DoLangTagLookup(temp,
  1.1070 +                                                   kCLDLangTagsHintTable1,
  1.1071 +                                                   kCLDTable1Size);
  1.1072 +      if (entry != NULL) {
  1.1073 +        // First table hit
  1.1074 +        retval.append(entry->langcode);     // may be "code1,code2"
  1.1075 +        retval.append(1, ',');
  1.1076 +      } else {
  1.1077 +        // Try second table with language code truncated at first hyphen
  1.1078 +        char* hyphen = strchr(temp, '-');
  1.1079 +        if (hyphen != NULL) {*hyphen = '\0';}
  1.1080 +        len = strlen(temp);
  1.1081 +        if (len <= 3) {                 // Short enough to use
  1.1082 +          entry = DoLangTagLookup(temp,
  1.1083 +                                  kCLDLangTagsHintTable2,
  1.1084 +                                  kCLDTable2Size);
  1.1085 +          if (entry != NULL) {
  1.1086 +            // Second table hit
  1.1087 +            retval.append(entry->langcode);     // may be "code1,code2"
  1.1088 +            retval.append(1, ',');
  1.1089 +          }
  1.1090 +        }
  1.1091 +      }
  1.1092 +    }
  1.1093 +    pos = comma + 1;
  1.1094 +  }
  1.1095 +
  1.1096 +  // Remove trainling comma, if any
  1.1097 +  if (!retval.empty()) {retval.resize(retval.size() - 1);}
  1.1098 +  return retval;
  1.1099 +}
  1.1100 +
  1.1101 +
  1.1102 +
  1.1103 +//==============================================================================
  1.1104 +
  1.1105 +// Little state machine to scan insides of language attribute quoted-string.
  1.1106 +// Each language code is lowercased and copied to the output string. Underscore
  1.1107 +// is mapped to minus. Space, tab, and comma are all mapped to comma, and
  1.1108 +// multiple consecutive commas are removed.
  1.1109 +// Each language code in the output list will be followed by a single comma.
  1.1110 +
  1.1111 +// There are three states, and we start in state 1:
  1.1112 +// State 0: After a letter.
  1.1113 +//  Copy all letters/minus[0], copy comma[1]; all others copy comma and skip [2]
// State 1: Just after a comma.
//  Copy letter [0], Ignore subsequent commas[1]. minus and all others copy
//  comma and skip [2]
  1.1116 +// State 2: Skipping.
  1.1117 +//  All characters except comma skip and stay in [2]. comma goes to [1]
  1.1118 +
  1.1119 +// The thing that is copied is kLangCodeRemap[c] when going to state 0,
  1.1120 +// and always comma when going to state 1 or 2. The design depends on copying
  1.1121 +// a comma at the *beginning* of skipping, and in state 2 never doing a copy.
  1.1122 +
  1.1123 +// We pack all this into 8 bits:
  1.1124 +//    +--+---+---+
  1.1125 +//    |78|654|321|
  1.1126 +//    +--+---+---+
  1.1127 +//
  1.1128 +// Shift byte right by 3*state, giving [0] 321, [1] 654, [2] .78
  1.1129 +// where . is always zero
  1.1130 +// Of these 3 bits, low two are next state ss, high bit is copy bit C.
  1.1131 +// If C=1 and ss == 0, copy kLangCodeRemap[c], else copy a comma
  1.1132 +
// Per-state action codes, three bits each: bit 2 (value 4) is the copy flag
// and the low two bits are the next state, matching the (e & 4) and (e & 3)
// tests in CopyOneQuotedString().
#define SKIP0 0
#define SKIP1 1
#define SKIP2 2
#define COPY0 4   // copy kLangCodeRemap[c]
#define COPY1 5   // copy ','
#define COPY2 6   // copy ','

// These combined actions pack three states into one byte.
// Ninth bit must be zero, so all state 2 values must be skips.
// One macro per character class; each column is the action taken when the
// character arrives in that state.
//              state[2]       state[1]      state[0]
#define LTR   ((SKIP2 << 6) + (COPY0 << 3) + COPY0)
#define MINUS ((SKIP2 << 6) + (COPY2 << 3) + COPY0)
#define COMMA ((SKIP1 << 6) + (SKIP1 << 3) + COPY1)
#define Bad   ((SKIP2 << 6) + (COPY2 << 3) + COPY2)
  1.1147 +
// Packed state-machine actions, indexed by input byte.
// Treat as letter: a-z,  A-Z
// Treat as minus:  2D minus,  5F underscore
// Treat as comma:  09 tab,  20 space,  2C comma
// Everything else, including digits and all bytes 80..FF, is Bad.

static const unsigned char kLangCodeAction[256] = {
  // 00..3F: controls, punctuation, digits
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,COMMA,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  COMMA,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,COMMA,MINUS,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,

  // 40..7F: uppercase letters, underscore, lowercase letters
  Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR,  LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,
  LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,  LTR,LTR,LTR,Bad,Bad,Bad,Bad,MINUS,
  Bad,LTR,LTR,LTR,LTR,LTR,LTR,LTR,  LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,
  LTR,LTR,LTR,LTR,LTR,LTR,LTR,LTR,  LTR,LTR,LTR,Bad,Bad,Bad,Bad,Bad,

  // 80..BF
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,

  // C0..FF
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,  Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,
};
  1.1173 +
// Output byte for each input byte, used by CopyOneQuotedString() when the
// state machine copies a character.
// This does lowercasing, maps underscore to minus, and maps tab/space to comma
// Every other byte maps to 0.
static const unsigned char kLangCodeRemap[256] = {
  0,0,0,0,0,0,0,0,  0,',',0,0,0,0,0,0,          // 09 tab
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
  ',',0,0,0,0,0,0,0,  0,0,0,0,',','-',0,0,      // 20 space 2C comma 2D minus
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,

    0,'a','b','c','d','e','f','g',  'h','i','j','k','l','m','n','o',
  'p','q','r','s','t','u','v','w',  'x','y','z',0,0,0,0,'-',  // 5F underscore
    0,'a','b','c','d','e','f','g',  'h','i','j','k','l','m','n','o',
  'p','q','r','s','t','u','v','w',  'x','y','z',0,0,0,0,0,

  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,

  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
};
  1.1196 +
// Retire the character-class macros used to build kLangCodeAction.
#undef LTR
#undef MINUS
#undef COMMA
#undef Bad

// Retire the per-state action codes they were built from.
#undef SKIP0
#undef SKIP1
#undef SKIP2
#undef COPY0
#undef COPY1
#undef COPY2
  1.1208 +
  1.1209 +
  1.1210 +// Find opening '<' for HTML tag
  1.1211 +// Note: this is all somewhat insensitive to mismatched quotes
  1.1212 +int32 FindTagStart(const char* utf8_body, int32 pos, int32 max_pos) {
  1.1213 +  int i = pos;
  1.1214 +  // Advance i by 4 if none of the next 4 bytes are '<'
  1.1215 +  for (i = pos; i < (max_pos - 3); i += 4) {
  1.1216 +    // Fast check for any <
  1.1217 +    const char* p = &utf8_body[i];
  1.1218 +    uint32 s0123 = UNALIGNED_LOAD32(p);
  1.1219 +    uint32 temp = s0123 ^ 0x3c3c3c3c;    // <<<<
  1.1220 +    if (((temp - 0x01010101) & (~temp & 0x80808080)) != 0) {
  1.1221 +      // At least one byte is '<'
  1.1222 +      break;
  1.1223 +    }
  1.1224 +  }
  1.1225 +  // Continue, advancing i by 1
  1.1226 +  for (; i < max_pos; ++i) {
  1.1227 +    if (utf8_body[i] == '<') {return i;}
  1.1228 +  }
  1.1229 +  return -1;
  1.1230 +}
  1.1231 +
  1.1232 +
  1.1233 +// Find closing '>' for HTML tag. Also stop on < and & (simplistic parsing)
  1.1234 +int32 FindTagEnd(const char* utf8_body, int32 pos, int32 max_pos) {
  1.1235 +  // Always outside quotes
  1.1236 +  for (int i = pos; i < max_pos; ++i) {
  1.1237 +    char c = utf8_body[i];
  1.1238 +    if (c == '>') {return i;}
  1.1239 +    if (c == '<') {return i - 1;}
  1.1240 +    if (c == '&') {return i - 1;}
  1.1241 +  }
  1.1242 +  return -1;              // nothing found
  1.1243 +}
  1.1244 +
  1.1245 +// Find opening quote or apostrophe, skipping spaces
  1.1246 +// Note: this is all somewhat insensitive to mismatched quotes
  1.1247 +int32 FindQuoteStart(const char* utf8_body, int32 pos, int32 max_pos) {
  1.1248 +  for (int i = pos; i < max_pos; ++i) {
  1.1249 +    char c = utf8_body[i];
  1.1250 +    if (c == '"') {return i;}
  1.1251 +    if (c == '\'') {return i;}
  1.1252 +    if (c != ' ') {return -1;}
  1.1253 +  }
  1.1254 +  return -1;
  1.1255 +}
  1.1256 +
  1.1257 +// Find closing quot/apos. Also stop on = > < and & (simplistic parsing)
  1.1258 +int32 FindQuoteEnd(const char* utf8_body, int32 pos, int32 max_pos) {
  1.1259 +  // Always outside quotes
  1.1260 +  for (int i = pos; i < max_pos; ++i) {
  1.1261 +    char c = utf8_body[i];
  1.1262 +    if (c == '"') {return i;}
  1.1263 +    if (c == '\'') {return i;}
  1.1264 +    if (c == '>') {return i - 1;}
  1.1265 +    if (c == '=') {return i - 1;}
  1.1266 +    if (c == '<') {return i - 1;}
  1.1267 +    if (c == '&') {return i - 1;}
  1.1268 +  }
  1.1269 +  return -1;              // nothing found
  1.1270 +}
  1.1271 +
  1.1272 +int32 FindEqualSign(const char* utf8_body, int32 pos, int32 max_pos) {
  1.1273 +  // Outside quotes/apostrophes loop
  1.1274 +  for (int i = pos; i < max_pos; ++i) {
  1.1275 +    char c = utf8_body[i];
  1.1276 +    if (c == '=') {       // Found bare equal sign inside tag
  1.1277 +      return i;
  1.1278 +    } else if (c == '"') {
  1.1279 +      // Inside quotes loop
  1.1280 +      int j;
  1.1281 +      for (j = i + 1; j < max_pos; ++j) {
  1.1282 +        if (utf8_body[j] == '"') {
  1.1283 +          break;
  1.1284 +        } else if (utf8_body[j] == '\\') {
  1.1285 +          ++j;
  1.1286 +        }
  1.1287 +      }
  1.1288 +      i = j;
  1.1289 +    } else if (c == '\'') {
  1.1290 +      // Inside apostrophes loop
  1.1291 +      int j;
  1.1292 +      for (j = i + 1; j < max_pos; ++j) {
  1.1293 +        if (utf8_body[j] == '\'') {
  1.1294 +          break;
  1.1295 +        } else if (utf8_body[j] == '\\') {
  1.1296 +          ++j;
  1.1297 +        }
  1.1298 +      }
  1.1299 +      i = j;
  1.1300 +    }
  1.1301 +
  1.1302 +  }
  1.1303 +  return -1;              // nothing found
  1.1304 +}
  1.1305 +
  1.1306 +// Scan backwards for case-insensitive string s in [min_pos..pos)
  1.1307 +// Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f]
  1.1308 +// Cheap lowercase. Control codes will masquerade as 20..3f
  1.1309 +bool FindBefore(const char* utf8_body,
  1.1310 +                 int32 min_pos, int32 pos, const char* s) {
  1.1311 +  int len = strlen(s);
  1.1312 +  if ((pos - min_pos) < len) {return false;}     // Too small to fit s
  1.1313 +
  1.1314 +  // Skip trailing spaces
  1.1315 +  int i = pos;
  1.1316 +  while ((i > (min_pos + len)) && (utf8_body[i - 1] == ' ')) {--i;}
  1.1317 +  i -= len;
  1.1318 +  if (i < min_pos) {return false;}   // pos - min_pos < len, so s can't be found
  1.1319 +
  1.1320 +  const char* p = &utf8_body[i];
  1.1321 +  for (int j = 0; j < len; ++j) {
  1.1322 +    if ((p[j] | 0x20) != s[j])  {return false;}    // Unequal byte
  1.1323 +  }
  1.1324 +  return true;                                     // All bytes equal at i
  1.1325 +}
  1.1326 +
  1.1327 +// Scan forwards for case-insensitive string s in [pos..max_pos)
  1.1328 +// Bytes of s must already be lowercase, i.e. in [20..3f] or [60..7f]
  1.1329 +// Cheap lowercase. Control codes will masquerade as 20..3f
  1.1330 +// Allows but does not require quoted/apostrophe string
  1.1331 +bool FindAfter(const char* utf8_body,
  1.1332 +                 int32 pos, int32 max_pos, const char* s) {
  1.1333 +  int len = strlen(s);
  1.1334 +  if ((max_pos - pos) < len) {return false;}     // Too small to fit s
  1.1335 +
  1.1336 +  // Skip leading spaces, quote, apostrophe
  1.1337 +  int i = pos;
  1.1338 +  while (i < (max_pos - len)) {
  1.1339 +    unsigned char c = utf8_body[i];
  1.1340 +    if ((c == ' ') || (c == '"') || (c == '\'')) {++i;}
  1.1341 +    else {break;}
  1.1342 +  }
  1.1343 +
  1.1344 +  const char* p = &utf8_body[i];
  1.1345 +  for (int j = 0; j < len; ++j) {
  1.1346 +    if ((p[j] | 0x20) != s[j])  {return false;}    // Unequal byte
  1.1347 +  }
  1.1348 +  return true;                                     // All bytes equal
  1.1349 +}
  1.1350 +
  1.1351 +
  1.1352 +
  1.1353 +// Copy attribute value in [pos..max_pos)
  1.1354 +// pos is just after an opening quote/apostrophe and max_pos is the ending one
  1.1355 +// String must all be on a single line.
  1.1356 +// Return slightly-normalized language list, empty or ending in comma
  1.1357 +// Does lowercasing and removes excess punctuation/space
  1.1358 +string CopyOneQuotedString(const char* utf8_body,
  1.1359 +                         int32 pos, int32 max_pos) {
  1.1360 +  string s;
  1.1361 +  int state = 1;        // Front is logically just after a comma
  1.1362 +  for (int i = pos; i < max_pos; ++i) {
  1.1363 +    unsigned char c = utf8_body[i];
  1.1364 +    int e = kLangCodeAction[c] >> (3 * state);
  1.1365 +    state = e & 3;      // Update to next state
  1.1366 +    if ((e & 4) != 0) {
  1.1367 +      // Copy a remapped byte if going to state 0, else copy a comma
  1.1368 +      if (state == 0) {
  1.1369 +        s.append(1, kLangCodeRemap[c]);
  1.1370 +      } else {
  1.1371 +        s.append(1, ',');
  1.1372 +      }
  1.1373 +    }
  1.1374 +  }
  1.1375 +
  1.1376 +  // Add final comma if needed
  1.1377 +  if (state == 0) {
  1.1378 +    s.append(1, ',');
  1.1379 +  }
  1.1380 +  return s;
  1.1381 +}
  1.1382 +
  1.1383 +// Find and copy attribute value: quoted string in [pos..max_pos)
  1.1384 +// Return slightly-normalized language list, empty or ending in comma
  1.1385 +string CopyQuotedString(const char* utf8_body,
  1.1386 +                         int32 pos, int32 max_pos) {
  1.1387 +  int32 start_quote = FindQuoteStart(utf8_body, pos, max_pos);
  1.1388 +  if (start_quote < 0) {return string("");}
  1.1389 +  int32 end_quote = FindQuoteEnd(utf8_body, start_quote + 1, max_pos);
  1.1390 +  if (end_quote < 0) {return string("");}
  1.1391 +
  1.1392 +  return CopyOneQuotedString(utf8_body, start_quote + 1, end_quote);
  1.1393 +}
  1.1394 +
  1.1395 +// Add hints to vector of langpriors
  1.1396 +// Input is from GetLangTagsFromHtml(), already lowercased
  1.1397 +void SetCLDLangTagsHint(const string& langtags, CLDLangPriors* langpriors) {
  1.1398 +  if (langtags.empty()) {return;}
  1.1399 +  int commas = CountCommas(langtags);
  1.1400 +  if (commas > 4) {return;}       // Ignore if too many language tags
  1.1401 +
  1.1402 +  char temp[20];
  1.1403 +  int pos = 0;
  1.1404 +  while (pos < static_cast<int>(langtags.size())) {
  1.1405 +    int comma = langtags.find(',', pos);
  1.1406 +    if (comma == string::npos) {comma = langtags.size();} // fake trailing comma
  1.1407 +    int len = comma - pos;
  1.1408 +    if (len <= 16) {
  1.1409 +      // Short enough to use
  1.1410 +      memcpy(temp, &langtags[pos], len);
  1.1411 +      temp[len] = '\0';
  1.1412 +      const LangTagLookup* entry = DoLangTagLookup(temp,
  1.1413 +                                                   kCLDLangTagsHintTable1,
  1.1414 +                                                   kCLDTable1Size);
  1.1415 +      if (entry != NULL) {
  1.1416 +        // First table hit
  1.1417 +        MergeCLDLangPriorsMax(entry->onelangprior1, langpriors);
  1.1418 +        MergeCLDLangPriorsMax(entry->onelangprior2, langpriors);
  1.1419 +      } else {
  1.1420 +        // Try second table with language code truncated at first hyphen
  1.1421 +        char* hyphen = strchr(temp, '-');
  1.1422 +        if (hyphen != NULL) {*hyphen = '\0';}
  1.1423 +        len = strlen(temp);
  1.1424 +        if (len <= 3) {                 // Short enough to use
  1.1425 +          entry = DoLangTagLookup(temp,
  1.1426 +                                  kCLDLangTagsHintTable2,
  1.1427 +                                  kCLDTable2Size);
  1.1428 +          if (entry != NULL) {
  1.1429 +            // Second table hit
  1.1430 +            MergeCLDLangPriorsMax(entry->onelangprior1, langpriors);
  1.1431 +            MergeCLDLangPriorsMax(entry->onelangprior2, langpriors);
  1.1432 +          }
  1.1433 +        }
  1.1434 +      }
  1.1435 +    }
  1.1436 +    pos = comma + 1;
  1.1437 +  }
  1.1438 +}
  1.1439 +
  1.1440 +// Add hints to vector of langpriors
  1.1441 +// Input is string after HTTP header Content-Language:
  1.1442 +void SetCLDContentLangHint(const char* contentlang, CLDLangPriors* langpriors) {
  1.1443 +  string langtags = CopyOneQuotedString(contentlang, 0, strlen(contentlang));
  1.1444 +  SetCLDLangTagsHint(langtags, langpriors);
  1.1445 +}
  1.1446 +
  1.1447 +// Add hints to vector of langpriors
  1.1448 +// Input is last element of hostname (no dot), e.g. from GetTLD()
  1.1449 +void SetCLDTLDHint(const char* tld, CLDLangPriors* langpriors) {
  1.1450 +  int len = strlen(tld);
  1.1451 +  if (len > 3) {return;}        // Ignore if more than three letters
  1.1452 +  char local_tld[4];
  1.1453 +  strncpy(local_tld, tld, 4);
  1.1454 +  local_tld[3] = '\0';          // Safety move
  1.1455 +  // Lowercase
  1.1456 +  for (int i = 0; i < len; ++i) {local_tld[i] |= 0x20;}
  1.1457 +  const TLDLookup* entry = DoTLDLookup(local_tld,
  1.1458 +                                       kCLDTLDHintTable,
  1.1459 +                                       kCLDTable3Size);
  1.1460 +  if (entry != NULL) {
  1.1461 +    // Table hit
  1.1462 +    MergeCLDLangPriorsBoost(entry->onelangprior1, langpriors);
  1.1463 +    MergeCLDLangPriorsBoost(entry->onelangprior2, langpriors);
  1.1464 +  }
  1.1465 +}
  1.1466 +
  1.1467 +// Add hints to vector of langpriors
  1.1468 +// Input is from DetectEncoding()
  1.1469 +void SetCLDEncodingHint(Encoding enc, CLDLangPriors* langpriors) {
  1.1470 +  OneCLDLangPrior olp;
  1.1471 +  switch (enc) {
  1.1472 +  case CHINESE_GB:
  1.1473 +  case GBK:
  1.1474 +  case GB18030:
  1.1475 +  case ISO_2022_CN:
  1.1476 +  case HZ_GB_2312:
  1.1477 +    olp = PackCLDPriorLangWeight(CHINESE, kCLDPriorEncodingWeight);
  1.1478 +    MergeCLDLangPriorsBoost(olp, langpriors);
  1.1479 +    break;
  1.1480 +  case CHINESE_BIG5:
  1.1481 +  case CHINESE_BIG5_CP950:
  1.1482 +  case BIG5_HKSCS:
  1.1483 +    olp = PackCLDPriorLangWeight(CHINESE_T, kCLDPriorEncodingWeight);
  1.1484 +    MergeCLDLangPriorsBoost(olp, langpriors);
  1.1485 +    break;
  1.1486 +  case JAPANESE_EUC_JP:
  1.1487 +  case JAPANESE_SHIFT_JIS:
  1.1488 +  case JAPANESE_CP932:
  1.1489 +  case JAPANESE_JIS:          // ISO-2022-JP
  1.1490 +    olp = PackCLDPriorLangWeight(JAPANESE, kCLDPriorEncodingWeight);
  1.1491 +    MergeCLDLangPriorsBoost(olp, langpriors);
  1.1492 +    break;
  1.1493 +  case KOREAN_EUC_KR:
  1.1494 +  case ISO_2022_KR:
  1.1495 +    olp = PackCLDPriorLangWeight(KOREAN, kCLDPriorEncodingWeight);
  1.1496 +    MergeCLDLangPriorsBoost(olp, langpriors);
  1.1497 +    break;
  1.1498 +
  1.1499 +  default:
  1.1500 +    break;
  1.1501 +  }
  1.1502 +}
  1.1503 +
  1.1504 +// Add hints to vector of langpriors
  1.1505 +// Input is from random source
  1.1506 +void SetCLDLanguageHint(Language lang, CLDLangPriors* langpriors) {
  1.1507 +  OneCLDLangPrior olp = PackCLDPriorLangWeight(lang, kCLDPriorLanguageWeight);
  1.1508 +  MergeCLDLangPriorsBoost(olp, langpriors);
  1.1509 +}
  1.1510 +
  1.1511 +
  1.1512 +// Make printable string of priors
  1.1513 +string DumpCLDLangPriors(const CLDLangPriors* langpriors) {
  1.1514 +  string retval;
  1.1515 +  for (int i = 0; i < langpriors->n; ++i) {
  1.1516 +    char temp[64];
  1.1517 +    sprintf(temp, "%s.%d ",
  1.1518 +             LanguageCode(GetCLDPriorLang(langpriors->prior[i])),
  1.1519 +             GetCLDPriorWeight(langpriors->prior[i]));
  1.1520 +    retval.append(temp);
  1.1521 +  }
  1.1522 +  return retval;
  1.1523 +}
  1.1524 +
  1.1525 +
  1.1526 +
  1.1527 +
  1.1528 +// Look for
  1.1529 +//  <html lang="en">
  1.1530 +//  <doc xml:lang="en">
  1.1531 +//  <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en-US">
  1.1532 +//  <meta http-equiv="content-language" content="en-GB" />
  1.1533 +//  <meta name="language" content="Srpski">
  1.1534 +//  <meta name="DC.language" scheme="RFC1766" content="en">
  1.1535 +//  <SPAN id="msg1" class="info" lang='en'>
  1.1536 +//
  1.1537 +// Do not trigger on
  1.1538 +//  <!-- lang=french ...-->
  1.1539 +//  <font lang=postscript ...>
  1.1540 +//  <link href="index.fr.html" hreflang="fr-FR" xml:lang="fr-FR" />
  1.1541 +//  <META name="Author" lang="fr" content="Arnaud Le Hors">
  1.1542 +//
  1.1543 +// Stop fairly quickly on mismatched quotes
  1.1544 +//
  1.1545 +// Allowed language characters
  1.1546 +//  a-z A-Z -_ , space\t
  1.1547 +// Think about: GB2312, big5, shift-jis, euc-jp, ksc euc-kr
  1.1548 +//  zh-hans zh-TW cmn-Hani zh_cn.gb18030_CN zh-min-nan zh-yue
  1.1549 +//  de-x-mtfrom-en  zh-tw-x-mtfrom-en  (machine translation)
  1.1550 +// GB2312 => gb
  1.1551 +// Big5 => big
  1.1552 +// zh_CN.gb18030_C => zh-cn
  1.1553 +//
  1.1554 +// Remove duplicates and extra spaces as we go
  1.1555 +// Lowercase as we go.
  1.1556 +
  1.1557 +// Get language tag hints from HTML body
  1.1558 +// Normalize: remove spaces and make lowercase comma list
  1.1559 +
  1.1560 +string GetLangTagsFromHtml(const char* utf8_body, int32 utf8_body_len,
  1.1561 +                           int32 max_scan_bytes) {
  1.1562 +  string retval;
  1.1563 +  if (max_scan_bytes > utf8_body_len) {
  1.1564 +    max_scan_bytes = utf8_body_len;
  1.1565 +  }
  1.1566 +
  1.1567 +  int32 k = 0;
  1.1568 +  while (k < max_scan_bytes) {
  1.1569 +    int32 start_tag = FindTagStart(utf8_body, k, max_scan_bytes);
  1.1570 +    if (start_tag < 0) {break;}
  1.1571 +    int32 end_tag = FindTagEnd(utf8_body, start_tag + 1, max_scan_bytes);
  1.1572 +    // FindTagEnd exits on < > &
  1.1573 +    if (end_tag < 0) {break;}
  1.1574 +
  1.1575 +    // Skip <!--...>
  1.1576 +    // Skip <font ...>
  1.1577 +    // Skip <script ...>
  1.1578 +    // Skip <link ...>
  1.1579 +    // Skip <img ...>
  1.1580 +    // Skip <a ...>
  1.1581 +    if (FindAfter(utf8_body, start_tag + 1, end_tag, "!--") ||
  1.1582 +        FindAfter(utf8_body, start_tag + 1, end_tag, "font ") ||
  1.1583 +        FindAfter(utf8_body, start_tag + 1, end_tag, "script ") ||
  1.1584 +        FindAfter(utf8_body, start_tag + 1, end_tag, "link ") ||
  1.1585 +        FindAfter(utf8_body, start_tag + 1, end_tag, "img ") ||
  1.1586 +        FindAfter(utf8_body, start_tag + 1, end_tag, "a ")) {
  1.1587 +      k = end_tag + 1;
  1.1588 +      continue;
  1.1589 +    }
  1.1590 +
  1.1591 +    // Remember <meta ...>
  1.1592 +    bool in_meta = false;
  1.1593 +    if (FindAfter(utf8_body, start_tag + 1, end_tag, "meta ")) {
  1.1594 +      in_meta = true;
  1.1595 +    }
  1.1596 +
  1.1597 +    // Scan for each equal sign inside tag
  1.1598 +    bool content_is_lang = false;
  1.1599 +    int32 kk = start_tag + 1;
  1.1600 +    int32 equal_sign;
  1.1601 +    while ((equal_sign = FindEqualSign(utf8_body, kk, end_tag)) >= 0) {
  1.1602 +      // eq exits on < > &
  1.1603 +
  1.1604 +      // Look inside a meta tag
  1.1605 +      // <meta ... http-equiv="content-language" ...>
  1.1606 +      // <meta ... name="language" ...>
  1.1607 +      // <meta ... name="dc.language" ...>
  1.1608 +      if (in_meta) {
  1.1609 +        if (FindBefore(utf8_body, kk, equal_sign, " http-equiv") &&
  1.1610 +            FindAfter(utf8_body, equal_sign + 1, end_tag,
  1.1611 +                      "content-language ")) {
  1.1612 +          content_is_lang = true;
  1.1613 +        } else if (FindBefore(utf8_body, kk, equal_sign, " name") &&
  1.1614 +                   (FindAfter(utf8_body, equal_sign + 1, end_tag,
  1.1615 +                              "dc.language ") ||
  1.1616 +                    FindAfter(utf8_body, equal_sign + 1, end_tag,
  1.1617 +                              "language "))) {
  1.1618 +          content_is_lang = true;
  1.1619 +        }
  1.1620 +      }
  1.1621 +
  1.1622 +      // Look inside any tag
  1.1623 +      // <meta ... content="lang-list" ...>
  1.1624 +      // <... lang="lang-list" ...>
  1.1625 +      // <... xml:lang="lang-list" ...>
  1.1626 +      if ((content_is_lang && FindBefore(utf8_body, kk, equal_sign,
  1.1627 +                                         " content")) ||
  1.1628 +          FindBefore(utf8_body, kk, equal_sign, " lang") ||
  1.1629 +          FindBefore(utf8_body, kk, equal_sign, ":lang")) {
  1.1630 +        string temp = CopyQuotedString(utf8_body, equal_sign + 1, end_tag);
  1.1631 +
  1.1632 +        // Append new lang tag(s) if not a duplicate
  1.1633 +        if (!temp.empty() && (retval.find(temp) == string::npos)) {
  1.1634 +          retval.append(temp);
  1.1635 +        }
  1.1636 +      }
  1.1637 +
  1.1638 +      kk = equal_sign + 1;
  1.1639 +    }
  1.1640 +    k = end_tag + 1;
  1.1641 +  }
  1.1642 +
  1.1643 +  // Strip last comma
  1.1644 +  if (retval.size() > 1) {
  1.1645 +    retval.erase(retval.size() - 1);
  1.1646 +  }
  1.1647 +  return retval;
  1.1648 +}
  1.1649 +
  1.1650 +}       // End namespace CLD2
  1.1651 +
  1.1652 +//==============================================================================
  1.1653 +
  1.1654 +

mercurial