Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
michael@0 | 1 | // Copyright 2013 Google Inc. All Rights Reserved. |
michael@0 | 2 | // |
michael@0 | 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
michael@0 | 4 | // you may not use this file except in compliance with the License. |
michael@0 | 5 | // You may obtain a copy of the License at |
michael@0 | 6 | // |
michael@0 | 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
michael@0 | 8 | // |
michael@0 | 9 | // Unless required by applicable law or agreed to in writing, software |
michael@0 | 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
michael@0 | 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
michael@0 | 12 | // See the License for the specific language governing permissions and |
michael@0 | 13 | // limitations under the License. |
michael@0 | 14 | |
michael@0 | 15 | // |
michael@0 | 16 | // File: lang_script.cc |
michael@0 | 17 | // ================ |
michael@0 | 18 | // |
michael@0 | 19 | // Author: dsites@google.com (Dick Sites) |
michael@0 | 20 | // |
michael@0 | 21 | // This file declares language and script numbers and names for CLD2 |
michael@0 | 22 | // |
michael@0 | 23 | |
michael@0 | 24 | #include "lang_script.h" |
michael@0 | 25 | |
michael@0 | 26 | #include <stdlib.h> |
michael@0 | 27 | #include <string.h> |
michael@0 | 28 | |
michael@0 | 29 | #include "generated_language.h" |
michael@0 | 30 | #include "generated_ulscript.h" |
michael@0 | 31 | |
michael@0 | 32 | namespace CLD2 { |
michael@0 | 33 | |
michael@0 | 34 | // Language tables |
michael@0 | 35 | // Subscripted by enum Language |
michael@0 | 36 | extern const int kLanguageToNameSize; |
michael@0 | 37 | extern const char* const kLanguageToName[]; |
michael@0 | 38 | extern const int kLanguageToCodeSize; |
michael@0 | 39 | extern const char* const kLanguageToCode[]; |
michael@0 | 40 | extern const int kLanguageToCNameSize; |
michael@0 | 41 | extern const char* const kLanguageToCName[]; |
michael@0 | 42 | extern const int kLanguageToScriptsSize; |
michael@0 | 43 | extern const FourScripts kLanguageToScripts[]; |
michael@0 | 44 | |
michael@0 | 45 | // Subscripted by Language |
michael@0 | 46 | extern const int kLanguageToPLangSize; |
michael@0 | 47 | extern const uint8 kLanguageToPLang[]; |
michael@0 | 48 | // Subscripted by per-script language |
michael@0 | 49 | extern const uint16 kPLangToLanguageLatn[]; |
michael@0 | 50 | extern const uint16 kPLangToLanguageOthr[]; |
michael@0 | 51 | |
michael@0 | 52 | // Alphabetical order for binary search |
michael@0 | 53 | extern const int kNameToLanguageSize; |
michael@0 | 54 | extern const CharIntPair kNameToLanguage[]; |
michael@0 | 55 | extern const int kCodeToLanguageSize; |
michael@0 | 56 | extern const CharIntPair kCodeToLanguage[]; |
michael@0 | 57 | |
michael@0 | 58 | // ULScript tables |
michael@0 | 59 | // Subscripted by enum ULScript |
michael@0 | 60 | extern const int kULScriptToNameSize; |
michael@0 | 61 | extern const char* const kULScriptToName[]; |
michael@0 | 62 | extern const int kULScriptToCodeSize; |
michael@0 | 63 | extern const char* const kULScriptToCode[]; |
michael@0 | 64 | extern const int kULScriptToCNameSize; |
michael@0 | 65 | extern const char* const kULScriptToCName[]; |
michael@0 | 66 | extern const int kULScriptToRtypeSize; |
michael@0 | 67 | extern const ULScriptRType kULScriptToRtype[]; |
michael@0 | 68 | extern const int kULScriptToDefaultLangSize; |
michael@0 | 69 | extern const Language kULScriptToDefaultLang[]; |
michael@0 | 70 | |
michael@0 | 71 | // Alphabetical order for binary search |
michael@0 | 72 | extern const int kNameToULScriptSize; |
michael@0 | 73 | extern const CharIntPair kNameToULScript[]; |
michael@0 | 74 | extern const int kCodeToULScriptSize; |
michael@0 | 75 | extern const CharIntPair kCodeToULScript[]; |
michael@0 | 76 | |
michael@0 | 77 | |
michael@0 | 78 | // |
michael@0 | 79 | // File: lang_script.h |
michael@0 | 80 | // ================ |
michael@0 | 81 | // |
michael@0 | 82 | // Author: dsites@google.com (Dick Sites) |
michael@0 | 83 | // |
michael@0 | 84 | // This file declares language and script numbers and names for CLD2 |
michael@0 | 85 | // |
michael@0 | 86 | |
michael@0 | 87 | |
michael@0 | 88 | // NOTE: The script numbers and language numbers here are not guaranteed to be |
michael@0 | 89 | // stable. If you want to record a result for posterity, save the ISO codes |
michael@0 | 90 | // as character strings. |
michael@0 | 91 | // |
michael@0 | 92 | // |
michael@0 | 93 | // The Unicode scripts recognized by CLD2 are numbered almost arbitrarily, |
michael@0 | 94 | // specified in an enum. Each script has human-readable script name and a |
michael@0 | 95 | // 4-letter ISO 15924 script code. Each has a C name (largely for use by |
michael@0 | 96 | // programs that generate declarations in cld2_generated_scripts.h). Each |
michael@0 | 97 | // also has a recognition type |
michael@0 | 98 | // r_type: 0 script-only, 1 nilgrams, 2 quadgrams, 3 CJK |
michael@0 | 99 | // |
michael@0 | 100 | // The declarations for a particular version of Unicode are machine-generated in |
michael@0 | 101 | // cld2_generated_scripts.h |
michael@0 | 102 | // |
michael@0 | 103 | // This file includes that one and declares the access routines. The type |
michael@0 | 104 | // involved is called "ULScript" to signify Unicode Letters-Marks Scripts, |
michael@0 | 105 | // which are not quite Unicode Scripts. In particular, the CJK scripts are |
michael@0 | 106 | // merged into a single number because CLD2 recognizes the CJK languages from |
michael@0 | 107 | // four scripts intermixed: Hani (both Hans and Hant), Hangul, Hiragana, and |
michael@0 | 108 | // Katakana. |
michael@0 | 109 | |
michael@0 | 110 | // Each script has one of these four recognition types. |
michael@0 | 111 | // RTypeNone: There is no language associated with this script. In extended |
michael@0 | 112 | // language recognition calls, return a fake language number that maps to |
michael@0 | 113 | // xx-Cham, with literally "xx" for the language code, and with the script |
michael@0 | 114 | // code instead of "Cham". In non-extended calls, return UNKNOWN_LANGUAGE. |
michael@0 | 115 | // RTypeOne: The script maps 1:1 to a single language. No letters are examined |
michael@0 | 116 | // during recognition and no lookups done. |
michael@0 | 117 | // RTypeMany: The usual quadgram + delta-octagram + distinctive-words scoring |
michael@0 | 118 | // is done to determine the languages involved. |
michael@0 | 119 | // RTypeCJK: The CJK unigram + delta-bigram scoring is done to determine the |
michael@0 | 120 | // languages involved. |
michael@0 | 121 | // |
michael@0 | 122 | // Note that the choice of recognition type is a function of script, not |
michael@0 | 123 | // language. In particular, some languages are recognized in multiple scripts |
michael@0 | 124 | // and those have different recognition types (Mongolian mn-Latn vs. mn-Mong |
michael@0 | 125 | // for example). |
michael@0 | 126 | |
michael@0 | 127 | //----------------------------------------------------------------------------// |
michael@0 | 128 | // Functions of ULScript // |
michael@0 | 129 | //----------------------------------------------------------------------------// |
michael@0 | 130 | |
michael@0 | 131 | // If the input is out of range or otherwise unrecognized, it is treated |
michael@0 | 132 | // as UNKNOWN_ULSCRIPT (which never participates in language recognition) |
michael@0 | 133 | const char* ULScriptName(ULScript ulscript) { |
michael@0 | 134 | int i_ulscript = ulscript; |
michael@0 | 135 | if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;} |
michael@0 | 136 | if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;} |
michael@0 | 137 | return kULScriptToName[i_ulscript]; |
michael@0 | 138 | } |
michael@0 | 139 | |
michael@0 | 140 | const char* ULScriptCode(ULScript ulscript) { |
michael@0 | 141 | int i_ulscript = ulscript; |
michael@0 | 142 | if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;} |
michael@0 | 143 | if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;} |
michael@0 | 144 | return kULScriptToCode[i_ulscript]; |
michael@0 | 145 | } |
michael@0 | 146 | |
michael@0 | 147 | const char* ULScriptDeclaredName(ULScript ulscript) { |
michael@0 | 148 | int i_ulscript = ulscript; |
michael@0 | 149 | if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;} |
michael@0 | 150 | if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;} |
michael@0 | 151 | return kULScriptToCName[i_ulscript]; |
michael@0 | 152 | } |
michael@0 | 153 | |
michael@0 | 154 | ULScriptRType ULScriptRecognitionType(ULScript ulscript) { |
michael@0 | 155 | int i_ulscript = ulscript; |
michael@0 | 156 | if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;} |
michael@0 | 157 | if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;} |
michael@0 | 158 | return kULScriptToRtype[i_ulscript]; |
michael@0 | 159 | } |
michael@0 | 160 | |
michael@0 | 161 | |
michael@0 | 162 | |
michael@0 | 163 | // The languages recognized by CLD2 are numbered almost arbitrarily, |
michael@0 | 164 | // specified in an enum. Each language has human-readable language name and a |
michael@0 | 165 | // 2- or 3-letter ISO 639 language code. Each has a C name (largely for use by |
michael@0 | 166 | // programs that generate declarations in cld2_generated_languages.h). |
michael@0 | 167 | // Each has a list of up to four scripts in which it is currently recognized. |
michael@0 | 168 | // |
michael@0 | 169 | // The declarations for a particular set of recognized languages are |
michael@0 | 170 | // machine-generated in |
michael@0 | 171 | // cld2_generated_languages.h |
michael@0 | 172 | // |
michael@0 | 173 | // The Language enum is intended to match the internal Google Language enum |
michael@0 | 174 | // in i18n/languages/proto/languages.proto up to NUM_LANGUAGES, with additional |
michael@0 | 175 | // languages assigned above that. Over time, some languages may be renumbered |
michael@0 | 176 | // if they are moved into the Language enum. |
michael@0 | 177 | // |
michael@0 | 178 | // The Language enum includes the fake language numbers for RTypeNone above. |
michael@0 | 179 | // |
michael@0 | 180 | // In an open-source environment, the Google-specific Language enum is not |
michael@0 | 181 | // available. Language decouples the two environments while maintaining |
michael@0 | 182 | // internal compatibility. |
michael@0 | 183 | |
michael@0 | 184 | |
michael@0 | 185 | // If the input is out of range or otherwise unrecognized, it is treated |
michael@0 | 186 | // as UNKNOWN_LANGUAGE |
michael@0 | 187 | // |
michael@0 | 188 | // LanguageCode |
michael@0 | 189 | // ------------ |
michael@0 | 190 | // Given the Language, return the language code, e.g. "ko" |
michael@0 | 191 | // This is determined by |
michael@0 | 192 | // the following (in order of preference): |
michael@0 | 193 | // - ISO-639-1 two-letter language code |
michael@0 | 194 | // (all except those mentioned below) |
michael@0 | 195 | // - ISO-639-2 three-letter bibliographic language code |
michael@0 | 196 | // (Tibetan, Dhivehi, Cherokee, Syriac) |
michael@0 | 197 | // - Google-specific language code |
michael@0 | 198 | // (ChineseT ("zh-TW"), Teragram Unknown, Unknown, |
michael@0 | 199 | // Portuguese-Portugal, Portuguese-Brazil, Limbu) |
michael@0 | 200 | // - Fake RTypeNone names. |
michael@0 | 201 | |
michael@0 | 202 | //----------------------------------------------------------------------------// |
michael@0 | 203 | // Functions of Language // |
michael@0 | 204 | //----------------------------------------------------------------------------// |
michael@0 | 205 | |
michael@0 | 206 | const char* LanguageName(Language lang) { |
michael@0 | 207 | int i_lang = lang; |
michael@0 | 208 | if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;} |
michael@0 | 209 | if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;} |
michael@0 | 210 | return kLanguageToName[i_lang]; |
michael@0 | 211 | } |
michael@0 | 212 | const char* LanguageCode(Language lang) { |
michael@0 | 213 | int i_lang = lang; |
michael@0 | 214 | if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;} |
michael@0 | 215 | if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;} |
michael@0 | 216 | return kLanguageToCode[i_lang]; |
michael@0 | 217 | } |
michael@0 | 218 | |
michael@0 | 219 | const char* LanguageDeclaredName(Language lang) { |
michael@0 | 220 | int i_lang = lang; |
michael@0 | 221 | if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;} |
michael@0 | 222 | if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;} |
michael@0 | 223 | return kLanguageToCName[i_lang]; |
michael@0 | 224 | } |
michael@0 | 225 | |
michael@0 | 226 | // n is in 0..3. Trailing entries are filled with |
michael@0 | 227 | // UNKNOWN_LANGUAGE (which never participates in language recognition) |
michael@0 | 228 | ULScript LanguageRecognizedScript(Language lang, int n) { |
michael@0 | 229 | int i_lang = lang; |
michael@0 | 230 | if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;} |
michael@0 | 231 | if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;} |
michael@0 | 232 | return static_cast<ULScript>(kLanguageToScripts[i_lang][n]); |
michael@0 | 233 | } |
michael@0 | 234 | |
michael@0 | 235 | // Given the Language, returns its string name used as the output by |
michael@0 | 236 | // the lang/enc identifier, e.g. "Korean" |
michael@0 | 237 | // "invalid_language" if the input is invalid. |
michael@0 | 238 | // TG_UNKNOWN_LANGUAGE is used as a placeholder for the "ignore me" language, |
michael@0 | 239 | // used to subtract out HTML, link farms, DNA strings, and alittle English porn |
michael@0 | 240 | const char* ExtLanguageName(const Language lang) { |
michael@0 | 241 | return LanguageName(lang); |
michael@0 | 242 | } |
michael@0 | 243 | |
michael@0 | 244 | // Given the Language, return the language code, e.g. "ko" |
michael@0 | 245 | const char* ExtLanguageCode(const Language lang) { |
michael@0 | 246 | return LanguageCode(lang); |
michael@0 | 247 | } |
michael@0 | 248 | |
michael@0 | 249 | |
michael@0 | 250 | // Given the Language, returns its Language enum spelling, for use by |
michael@0 | 251 | // programs that create C declarations, e.g. "KOREAN" |
michael@0 | 252 | // "UNKNOWN_LANGUAGE" if the input is invalid. |
michael@0 | 253 | const char* ExtLanguageDeclaredName(const Language lang) { |
michael@0 | 254 | return LanguageDeclaredName(lang); |
michael@0 | 255 | } |
michael@0 | 256 | |
michael@0 | 257 | |
michael@0 | 258 | extern const int kCloseSetSize = 10; |
michael@0 | 259 | |
michael@0 | 260 | // Returns which set of statistically-close languages lang is in. 0 means none. |
michael@0 | 261 | int LanguageCloseSet(Language lang) { |
michael@0 | 262 | // Scaffolding |
michael@0 | 263 | // id ms # INDONESIAN MALAY coef=0.4698 Problematic w/o extra words |
michael@0 | 264 | // bo dz # TIBETAN DZONGKHA coef=0.4571 |
michael@0 | 265 | // cs sk # CZECH SLOVAK coef=0.4273 |
michael@0 | 266 | // zu xh # ZULU XHOSA coef=0.3716 |
michael@0 | 267 | // |
michael@0 | 268 | // bs hr sr srm # BOSNIAN CROATIAN SERBIAN MONTENEGRIN |
michael@0 | 269 | // hi mr bh ne # HINDI MARATHI BIHARI NEPALI |
michael@0 | 270 | // no nn da # NORWEGIAN NORWEGIAN_N DANISH |
michael@0 | 271 | // gl es pt # GALICIAN SPANISH PORTUGUESE |
michael@0 | 272 | // rw rn # KINYARWANDA RUNDI |
michael@0 | 273 | |
michael@0 | 274 | if (lang == INDONESIAN) {return 1;} |
michael@0 | 275 | if (lang == MALAY) {return 1;} |
michael@0 | 276 | |
michael@0 | 277 | if (lang == TIBETAN) {return 2;} |
michael@0 | 278 | if (lang == DZONGKHA) {return 2;} |
michael@0 | 279 | |
michael@0 | 280 | if (lang == CZECH) {return 3;} |
michael@0 | 281 | if (lang == SLOVAK) {return 3;} |
michael@0 | 282 | |
michael@0 | 283 | if (lang == ZULU) {return 4;} |
michael@0 | 284 | if (lang == XHOSA) {return 4;} |
michael@0 | 285 | |
michael@0 | 286 | if (lang == BOSNIAN) {return 5;} |
michael@0 | 287 | if (lang == CROATIAN) {return 5;} |
michael@0 | 288 | if (lang == SERBIAN) {return 5;} |
michael@0 | 289 | if (lang == MONTENEGRIN) {return 5;} |
michael@0 | 290 | |
michael@0 | 291 | if (lang == HINDI) {return 6;} |
michael@0 | 292 | if (lang == MARATHI) {return 6;} |
michael@0 | 293 | if (lang == BIHARI) {return 6;} |
michael@0 | 294 | if (lang == NEPALI) {return 6;} |
michael@0 | 295 | |
michael@0 | 296 | if (lang == NORWEGIAN) {return 7;} |
michael@0 | 297 | if (lang == NORWEGIAN_N) {return 7;} |
michael@0 | 298 | if (lang == DANISH) {return 7;} |
michael@0 | 299 | |
michael@0 | 300 | if (lang == GALICIAN) {return 8;} |
michael@0 | 301 | if (lang == SPANISH) {return 8;} |
michael@0 | 302 | if (lang == PORTUGUESE) {return 8;} |
michael@0 | 303 | |
michael@0 | 304 | if (lang == KINYARWANDA) {return 9;} |
michael@0 | 305 | if (lang == RUNDI) {return 9;} |
michael@0 | 306 | |
michael@0 | 307 | return 0; |
michael@0 | 308 | } |
michael@0 | 309 | |
michael@0 | 310 | //----------------------------------------------------------------------------// |
michael@0 | 311 | // Functions of ULScript and Language // |
michael@0 | 312 | //----------------------------------------------------------------------------// |
michael@0 | 313 | |
michael@0 | 314 | Language DefaultLanguage(ULScript ulscript) { |
michael@0 | 315 | if (ulscript < 0) {return UNKNOWN_LANGUAGE;} |
michael@0 | 316 | if (ulscript >= NUM_ULSCRIPTS) {return UNKNOWN_LANGUAGE;} |
michael@0 | 317 | return kULScriptToDefaultLang[ulscript]; |
michael@0 | 318 | } |
michael@0 | 319 | |
michael@0 | 320 | uint8 PerScriptNumber(ULScript ulscript, Language lang) { |
michael@0 | 321 | if (ulscript < 0) {return 0;} |
michael@0 | 322 | if (ulscript >= NUM_ULSCRIPTS) {return 0;} |
michael@0 | 323 | if (kULScriptToRtype[ulscript] == RTypeNone) {return 1;} |
michael@0 | 324 | if (lang >= kLanguageToPLangSize) {return 0;} |
michael@0 | 325 | return kLanguageToPLang[lang]; |
michael@0 | 326 | } |
michael@0 | 327 | |
michael@0 | 328 | Language FromPerScriptNumber(ULScript ulscript, uint8 perscript_number) { |
michael@0 | 329 | if (ulscript < 0) {return UNKNOWN_LANGUAGE;} |
michael@0 | 330 | if (ulscript >= NUM_ULSCRIPTS) {return UNKNOWN_LANGUAGE;} |
michael@0 | 331 | if ((kULScriptToRtype[ulscript] == RTypeNone) || |
michael@0 | 332 | (kULScriptToRtype[ulscript] == RTypeOne)) { |
michael@0 | 333 | return kULScriptToDefaultLang[ulscript]; |
michael@0 | 334 | } |
michael@0 | 335 | |
michael@0 | 336 | if (ulscript == ULScript_Latin) { |
michael@0 | 337 | return static_cast<Language>(kPLangToLanguageLatn[perscript_number]); |
michael@0 | 338 | } else { |
michael@0 | 339 | return static_cast<Language>(kPLangToLanguageOthr[perscript_number]); |
michael@0 | 340 | } |
michael@0 | 341 | } |
michael@0 | 342 | |
michael@0 | 343 | // Return true if language can be in the Latin script |
michael@0 | 344 | bool IsLatnLanguage(Language lang) { |
michael@0 | 345 | if (lang >= kLanguageToPLangSize) {return false;} |
michael@0 | 346 | return (lang == kPLangToLanguageLatn[kLanguageToPLang[lang]]); |
michael@0 | 347 | } |
michael@0 | 348 | |
michael@0 | 349 | // Return true if language can be in a non-Latin script |
michael@0 | 350 | bool IsOthrLanguage(Language lang) { |
michael@0 | 351 | if (lang >= kLanguageToPLangSize) {return false;} |
michael@0 | 352 | return (lang == kPLangToLanguageOthr[kLanguageToPLang[lang]]); |
michael@0 | 353 | } |
michael@0 | 354 | |
michael@0 | 355 | |
michael@0 | 356 | //----------------------------------------------------------------------------// |
michael@0 | 357 | // Other // |
michael@0 | 358 | //----------------------------------------------------------------------------// |
michael@0 | 359 | |
michael@0 | 360 | // Returns mid if key found in lo <= mid < hi, else -1 |
michael@0 | 361 | int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair) { |
michael@0 | 362 | // binary search |
michael@0 | 363 | while (lo < hi) { |
michael@0 | 364 | int mid = (lo + hi) >> 1; |
michael@0 | 365 | if (strcmp(key, cipair[mid].s) < 0) { |
michael@0 | 366 | hi = mid; |
michael@0 | 367 | } else if (strcmp(key, cipair[mid].s) > 0) { |
michael@0 | 368 | lo = mid + 1; |
michael@0 | 369 | } else { |
michael@0 | 370 | return mid; |
michael@0 | 371 | } |
michael@0 | 372 | } |
michael@0 | 373 | return -1; |
michael@0 | 374 | } |
michael@0 | 375 | |
michael@0 | 376 | Language MakeLang(int i) {return static_cast<Language>(i);} |
michael@0 | 377 | |
michael@0 | 378 | // Name can be either full name or ISO code, or can be ISO code embedded in |
michael@0 | 379 | // a language-script combination such as "ABKHAZIAN", "en", "en-Latn-GB" |
michael@0 | 380 | Language GetLanguageFromName(const char* src) { |
michael@0 | 381 | const char* hyphen1 = strchr(src, '-'); |
michael@0 | 382 | const char* hyphen2 = NULL; |
michael@0 | 383 | if (hyphen1 != NULL) {hyphen2 = strchr(hyphen1 + 1, '-');} |
michael@0 | 384 | |
michael@0 | 385 | int match = -1; |
michael@0 | 386 | if (hyphen1 == NULL) { |
michael@0 | 387 | // Bare name. Look at full name, then code |
michael@0 | 388 | match = BinarySearch(src, 0, kNameToLanguageSize, kNameToLanguage); |
michael@0 | 389 | if (match >= 0) {return MakeLang(kNameToLanguage[match].i);} // aa |
michael@0 | 390 | match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage); |
michael@0 | 391 | if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa |
michael@0 | 392 | return UNKNOWN_LANGUAGE; |
michael@0 | 393 | } |
michael@0 | 394 | |
michael@0 | 395 | if (hyphen2 == NULL) { |
michael@0 | 396 | // aa-bb. Not a full name; must be code-something. Try zh-TW then bare zh |
michael@0 | 397 | match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage); |
michael@0 | 398 | if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb |
michael@0 | 399 | |
michael@0 | 400 | int len = strlen(src); |
michael@0 | 401 | if (len >= 16) {return UNKNOWN_LANGUAGE;} // Real codes are shorter |
michael@0 | 402 | |
michael@0 | 403 | char temp[16]; |
michael@0 | 404 | int hyphen1_offset = hyphen1 - src; |
michael@0 | 405 | // Take off part after hyphen1 |
michael@0 | 406 | memcpy(temp, src, len); |
michael@0 | 407 | temp[hyphen1_offset] = '\0'; |
michael@0 | 408 | match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage); |
michael@0 | 409 | if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa |
michael@0 | 410 | |
michael@0 | 411 | return UNKNOWN_LANGUAGE; |
michael@0 | 412 | } |
michael@0 | 413 | |
michael@0 | 414 | // aa-bb-cc. Must be code-something. Try en-Latn-US, en-Latn, en-US, en |
michael@0 | 415 | match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage); |
michael@0 | 416 | if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb-cc |
michael@0 | 417 | |
michael@0 | 418 | |
michael@0 | 419 | int len = strlen(src); |
michael@0 | 420 | if (len >= 16) {return UNKNOWN_LANGUAGE;} // Real codes are shorter |
michael@0 | 421 | |
michael@0 | 422 | char temp[16]; |
michael@0 | 423 | int hyphen1_offset = hyphen1 - src; |
michael@0 | 424 | int hyphen2_offset = hyphen2 - src; |
michael@0 | 425 | // Take off part after hyphen2 |
michael@0 | 426 | memcpy(temp, src, len); |
michael@0 | 427 | temp[hyphen2_offset] = '\0'; |
michael@0 | 428 | match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage); |
michael@0 | 429 | if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb |
michael@0 | 430 | |
michael@0 | 431 | |
michael@0 | 432 | // Take off part between hyphen1 and hyphen2 |
michael@0 | 433 | int len2 = len - hyphen2_offset; |
michael@0 | 434 | memcpy(temp, src, len); |
michael@0 | 435 | memcpy(&temp[hyphen1_offset], hyphen2, len2); |
michael@0 | 436 | temp[hyphen1_offset + len2] = '\0'; |
michael@0 | 437 | match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage); |
michael@0 | 438 | if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-cc |
michael@0 | 439 | |
michael@0 | 440 | |
michael@0 | 441 | // Take off everything after hyphen1 |
michael@0 | 442 | memcpy(temp, src, len); |
michael@0 | 443 | temp[hyphen1_offset] = '\0'; |
michael@0 | 444 | match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage); |
michael@0 | 445 | if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa |
michael@0 | 446 | |
michael@0 | 447 | |
michael@0 | 448 | return UNKNOWN_LANGUAGE; |
michael@0 | 449 | } |
michael@0 | 450 | |
michael@0 | 451 | |
michael@0 | 452 | // Name can be either full name or ISO code, or can be ISO code embedded in |
michael@0 | 453 | // a language-script combination such as "en-Latn-GB" |
michael@0 | 454 | // MORE WORK to do here. also kLanguageToScripts [4] is bogus |
michael@0 | 455 | // if bare language name, no script, want zh, ja, ko to Hani, pt to Latn, etc. |
michael@0 | 456 | // Something like map code to Language, then Language to kLanguageToScripts[x][0] |
michael@0 | 457 | // ADD BIAS: kLanguageToScripts lists default script first |
michael@0 | 458 | // If total mismatch, return Latn |
michael@0 | 459 | // if (strcmp(src, "nd") == 0) {return NDEBELE;} // [nd was wrong] |
michael@0 | 460 | // if (strcmp(src, "sit-NP-Limb") == 0) {return ULScript_Limbu;} |
michael@0 | 461 | |
michael@0 | 462 | ULScript MakeULScr(int i) {return static_cast<ULScript>(i);} |
michael@0 | 463 | |
michael@0 | 464 | ULScript GetULScriptFromName(const char* src) { |
michael@0 | 465 | const char* hyphen1 = strchr(src, '-'); |
michael@0 | 466 | const char* hyphen2 = NULL; |
michael@0 | 467 | if (hyphen1 != NULL) {hyphen2 = strchr(hyphen1 + 1, '-');} |
michael@0 | 468 | |
michael@0 | 469 | int match = -1; |
michael@0 | 470 | if (hyphen1 == NULL) { |
michael@0 | 471 | // Bare name. Look at full name, then code, then try backmapping as Language |
michael@0 | 472 | match = BinarySearch(src, 0, kNameToULScriptSize, kNameToULScript); |
michael@0 | 473 | if (match >= 0) {return MakeULScr(kNameToULScript[match].i);} // aa |
michael@0 | 474 | match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript); |
michael@0 | 475 | if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa |
michael@0 | 476 | |
michael@0 | 477 | Language backmap_me = GetLanguageFromName(src); |
michael@0 | 478 | if (backmap_me != UNKNOWN_LANGUAGE) { |
michael@0 | 479 | return static_cast<ULScript>(kLanguageToScripts[backmap_me][0]); |
michael@0 | 480 | } |
michael@0 | 481 | return ULScript_Latin; |
michael@0 | 482 | } |
michael@0 | 483 | |
michael@0 | 484 | if (hyphen2 == NULL) { |
michael@0 | 485 | // aa-bb. Not a full name; must be code-something. Try en-Latn, bare Latn |
michael@0 | 486 | if (strcmp(src, "zh-TW") == 0) {return ULScript_Hani;} |
michael@0 | 487 | if (strcmp(src, "zh-CN") == 0) {return ULScript_Hani;} |
michael@0 | 488 | if (strcmp(src, "sit-NP") == 0) {return ULScript_Limbu;} |
michael@0 | 489 | if (strcmp(src, "sit-Limb") == 0) {return ULScript_Limbu;} |
michael@0 | 490 | if (strcmp(src, "sr-ME") == 0) {return ULScript_Latin;} |
michael@0 | 491 | match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript); |
michael@0 | 492 | if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa-bb |
michael@0 | 493 | |
michael@0 | 494 | int len = strlen(src); |
michael@0 | 495 | if (len >= 16) {return ULScript_Latin;} // Real codes are shorter |
michael@0 | 496 | |
michael@0 | 497 | char temp[16]; |
michael@0 | 498 | int hyphen1_offset = hyphen1 - src; |
michael@0 | 499 | int len1 = len - hyphen1_offset - 1; // Exclude the hyphen |
michael@0 | 500 | // Take off part before hyphen1 |
michael@0 | 501 | memcpy(temp, hyphen1 + 1, len1); |
michael@0 | 502 | temp[len1] = '\0'; |
michael@0 | 503 | match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript); |
michael@0 | 504 | if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // bb |
michael@0 | 505 | |
michael@0 | 506 | // Take off part after hyphen1 |
michael@0 | 507 | memcpy(temp, src, len); |
michael@0 | 508 | temp[hyphen1_offset] = '\0'; |
michael@0 | 509 | match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript); |
michael@0 | 510 | if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa |
michael@0 | 511 | |
michael@0 | 512 | return ULScript_Latin; |
michael@0 | 513 | } |
michael@0 | 514 | |
michael@0 | 515 | // aa-bb-cc. Must be code-something. Try en-Latn-US, en-Latn, en-US, en |
michael@0 | 516 | if (strcmp(src, "sit-NP-Limb") == 0) {return ULScript_Limbu;} |
michael@0 | 517 | if (strcmp(src, "sr-ME-Latn") == 0) {return ULScript_Latin;} |
michael@0 | 518 | if (strcmp(src, "sr-ME-Cyrl") == 0) {return ULScript_Cyrillic;} |
michael@0 | 519 | match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript); |
michael@0 | 520 | if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa-bb-cc |
michael@0 | 521 | |
michael@0 | 522 | int len = strlen(src); |
michael@0 | 523 | if (len >= 16) {return ULScript_Latin;} // Real codes are shorter |
michael@0 | 524 | |
michael@0 | 525 | char temp[16]; |
michael@0 | 526 | int hyphen1_offset = hyphen1 - src; |
michael@0 | 527 | int hyphen2_offset = hyphen2 - src; |
michael@0 | 528 | int len2 = len - hyphen2_offset - 1; // Exclude the hyphen |
michael@0 | 529 | int lenmid = hyphen2_offset - hyphen1_offset - 1; // Exclude the hyphen |
michael@0 | 530 | // Keep part between hyphen1 and hyphen2 |
michael@0 | 531 | memcpy(temp, hyphen1 + 1, lenmid); |
michael@0 | 532 | temp[lenmid] = '\0'; |
michael@0 | 533 | match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript); |
michael@0 | 534 | if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // bb |
michael@0 | 535 | |
michael@0 | 536 | // Keep part after hyphen2 |
michael@0 | 537 | memcpy(temp, hyphen2 + 1, len2); |
michael@0 | 538 | temp[len2] = '\0'; |
michael@0 | 539 | match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript); |
michael@0 | 540 | if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // cc |
michael@0 | 541 | |
michael@0 | 542 | // Keep part before hyphen1 |
michael@0 | 543 | memcpy(temp, src, len); |
michael@0 | 544 | temp[hyphen1_offset] = '\0'; |
michael@0 | 545 | match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript); |
michael@0 | 546 | if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa |
michael@0 | 547 | |
michael@0 | 548 | return ULScript_Latin; |
michael@0 | 549 | } |
michael@0 | 550 | |
michael@0 | 551 | // Map script into Latin, Cyrillic, Arabic, Other |
michael@0 | 552 | int LScript4(ULScript ulscript) { |
michael@0 | 553 | if (ulscript == ULScript_Latin) {return 0;} |
michael@0 | 554 | if (ulscript == ULScript_Cyrillic) {return 1;} |
michael@0 | 555 | if (ulscript == ULScript_Arabic) {return 2;} |
michael@0 | 556 | return 3; |
michael@0 | 557 | } |
michael@0 | 558 | |
michael@0 | 559 | } // namespace CLD2 |
michael@0 | 560 |