browser/components/translation/cld2/internal/lang_script.cc

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 // Copyright 2013 Google Inc. All Rights Reserved.
michael@0 2 //
michael@0 3 // Licensed under the Apache License, Version 2.0 (the "License");
michael@0 4 // you may not use this file except in compliance with the License.
michael@0 5 // You may obtain a copy of the License at
michael@0 6 //
michael@0 7 // http://www.apache.org/licenses/LICENSE-2.0
michael@0 8 //
michael@0 9 // Unless required by applicable law or agreed to in writing, software
michael@0 10 // distributed under the License is distributed on an "AS IS" BASIS,
michael@0 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
michael@0 12 // See the License for the specific language governing permissions and
michael@0 13 // limitations under the License.
michael@0 14
michael@0 15 //
michael@0 16 // File: lang_script.cc
michael@0 17 // ================
michael@0 18 //
michael@0 19 // Author: dsites@google.com (Dick Sites)
michael@0 20 //
michael@0 21 // This file declares language and script numbers and names for CLD2
michael@0 22 //
michael@0 23
michael@0 24 #include "lang_script.h"
michael@0 25
michael@0 26 #include <stdlib.h>
michael@0 27 #include <string.h>
michael@0 28
michael@0 29 #include "generated_language.h"
michael@0 30 #include "generated_ulscript.h"
michael@0 31
michael@0 32 namespace CLD2 {
michael@0 33
michael@0 34 // Language tables
michael@0 35 // Subscripted by enum Language
michael@0 36 extern const int kLanguageToNameSize;
michael@0 37 extern const char* const kLanguageToName[];
michael@0 38 extern const int kLanguageToCodeSize;
michael@0 39 extern const char* const kLanguageToCode[];
michael@0 40 extern const int kLanguageToCNameSize;
michael@0 41 extern const char* const kLanguageToCName[];
michael@0 42 extern const int kLanguageToScriptsSize;
michael@0 43 extern const FourScripts kLanguageToScripts[];
michael@0 44
michael@0 45 // Subscripted by Language
michael@0 46 extern const int kLanguageToPLangSize;
michael@0 47 extern const uint8 kLanguageToPLang[];
michael@0 48 // Subscripted by per-script language
michael@0 49 extern const uint16 kPLangToLanguageLatn[];
michael@0 50 extern const uint16 kPLangToLanguageOthr[];
michael@0 51
michael@0 52 // Alphabetical order for binary search
michael@0 53 extern const int kNameToLanguageSize;
michael@0 54 extern const CharIntPair kNameToLanguage[];
michael@0 55 extern const int kCodeToLanguageSize;
michael@0 56 extern const CharIntPair kCodeToLanguage[];
michael@0 57
michael@0 58 // ULScript tables
michael@0 59 // Subscripted by enum ULScript
michael@0 60 extern const int kULScriptToNameSize;
michael@0 61 extern const char* const kULScriptToName[];
michael@0 62 extern const int kULScriptToCodeSize;
michael@0 63 extern const char* const kULScriptToCode[];
michael@0 64 extern const int kULScriptToCNameSize;
michael@0 65 extern const char* const kULScriptToCName[];
michael@0 66 extern const int kULScriptToRtypeSize;
michael@0 67 extern const ULScriptRType kULScriptToRtype[];
michael@0 68 extern const int kULScriptToDefaultLangSize;
michael@0 69 extern const Language kULScriptToDefaultLang[];
michael@0 70
michael@0 71 // Alphabetical order for binary search
michael@0 72 extern const int kNameToULScriptSize;
michael@0 73 extern const CharIntPair kNameToULScript[];
michael@0 74 extern const int kCodeToULScriptSize;
michael@0 75 extern const CharIntPair kCodeToULScript[];
michael@0 76
michael@0 77
michael@0 78 //
michael@0 79 // File: lang_script.h
michael@0 80 // ================
michael@0 81 //
michael@0 82 // Author: dsites@google.com (Dick Sites)
michael@0 83 //
michael@0 84 // This file declares language and script numbers and names for CLD2
michael@0 85 //
michael@0 86
michael@0 87
michael@0 88 // NOTE: The script numbers and language numbers here are not guaranteed to be
michael@0 89 // stable. If you want to record a result for posterity, save the ISO codes
michael@0 90 // as character strings.
michael@0 91 //
michael@0 92 //
michael@0 93 // The Unicode scripts recognized by CLD2 are numbered almost arbitrarily,
michael@0 94 // specified in an enum. Each script has human-readable script name and a
michael@0 95 // 4-letter ISO 15924 script code. Each has a C name (largely for use by
michael@0 96 // programs that generate declarations in cld2_generated_scripts.h). Each
michael@0 97 // also has a recognition type
michael@0 98 // r_type: 0 script-only, 1 nilgrams, 2 quadgrams, 3 CJK
michael@0 99 //
michael@0 100 // The declarations for a particular version of Unicode are machine-generated in
michael@0 101 // cld2_generated_scripts.h
michael@0 102 //
michael@0 103 // This file includes that one and declares the access routines. The type
michael@0 104 // involved is called "ULScript" to signify Unicode Letters-Marks Scripts,
michael@0 105 // which are not quite Unicode Scripts. In particular, the CJK scripts are
michael@0 106 // merged into a single number because CLD2 recognizes the CJK languages from
michael@0 107 // four scripts intermixed: Hani (both Hans and Hant), Hangul, Hiragana, and
michael@0 108 // Katakana.
michael@0 109
michael@0 110 // Each script has one of these four recognition types.
michael@0 111 // RTypeNone: There is no language associated with this script. In extended
michael@0 112 // language recognition calls, return a fake language number that maps to
michael@0 113 // xx-Cham, with literally "xx" for the language code, and with the script
michael@0 114 // code instead of "Cham". In non-extended calls, return UNKNOWN_LANGUAGE.
michael@0 115 // RTypeOne: The script maps 1:1 to a single language. No letters are examined
michael@0 116 // during recognition and no lookups done.
michael@0 117 // RTypeMany: The usual quadgram + delta-octagram + distinctive-words scoring
michael@0 118 // is done to determine the languages involved.
michael@0 119 // RTypeCJK: The CJK unigram + delta-bigram scoring is done to determine the
michael@0 120 // languages involved.
michael@0 121 //
michael@0 122 // Note that the choice of recognition type is a function of script, not
michael@0 123 // language. In particular, some languages are recognized in multiple scripts
michael@0 124 // and those have different recognition types (Mongolian mn-Latn vs. mn-Mong
michael@0 125 // for example).
michael@0 126
michael@0 127 //----------------------------------------------------------------------------//
michael@0 128 // Functions of ULScript //
michael@0 129 //----------------------------------------------------------------------------//
michael@0 130
michael@0 131 // If the input is out of range or otherwise unrecognized, it is treated
michael@0 132 // as UNKNOWN_ULSCRIPT (which never participates in language recognition)
michael@0 133 const char* ULScriptName(ULScript ulscript) {
michael@0 134 int i_ulscript = ulscript;
michael@0 135 if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;}
michael@0 136 if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;}
michael@0 137 return kULScriptToName[i_ulscript];
michael@0 138 }
michael@0 139
michael@0 140 const char* ULScriptCode(ULScript ulscript) {
michael@0 141 int i_ulscript = ulscript;
michael@0 142 if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;}
michael@0 143 if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;}
michael@0 144 return kULScriptToCode[i_ulscript];
michael@0 145 }
michael@0 146
michael@0 147 const char* ULScriptDeclaredName(ULScript ulscript) {
michael@0 148 int i_ulscript = ulscript;
michael@0 149 if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;}
michael@0 150 if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;}
michael@0 151 return kULScriptToCName[i_ulscript];
michael@0 152 }
michael@0 153
michael@0 154 ULScriptRType ULScriptRecognitionType(ULScript ulscript) {
michael@0 155 int i_ulscript = ulscript;
michael@0 156 if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;}
michael@0 157 if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;}
michael@0 158 return kULScriptToRtype[i_ulscript];
michael@0 159 }
michael@0 160
michael@0 161
michael@0 162
michael@0 163 // The languages recognized by CLD2 are numbered almost arbitrarily,
michael@0 164 // specified in an enum. Each language has human-readable language name and a
michael@0 165 // 2- or 3-letter ISO 639 language code. Each has a C name (largely for use by
michael@0 166 // programs that generate declarations in cld2_generated_languages.h).
michael@0 167 // Each has a list of up to four scripts in which it is currently recognized.
michael@0 168 //
michael@0 169 // The declarations for a particular set of recognized languages are
michael@0 170 // machine-generated in
michael@0 171 // cld2_generated_languages.h
michael@0 172 //
michael@0 173 // The Language enum is intended to match the internal Google Language enum
michael@0 174 // in i18n/languages/proto/languages.proto up to NUM_LANGUAGES, with additional
michael@0 175 // languages assigned above that. Over time, some languages may be renumbered
michael@0 176 // if they are moved into the Language enum.
michael@0 177 //
michael@0 178 // The Language enum includes the fake language numbers for RTypeNone above.
michael@0 179 //
michael@0 180 // In an open-source environment, the Google-specific Language enum is not
michael@0 181 // available. Language decouples the two environments while maintaining
michael@0 182 // internal compatibility.
michael@0 183
michael@0 184
michael@0 185 // If the input is out of range or otherwise unrecognized, it is treated
michael@0 186 // as UNKNOWN_LANGUAGE
michael@0 187 //
michael@0 188 // LanguageCode
michael@0 189 // ------------
michael@0 190 // Given the Language, return the language code, e.g. "ko"
michael@0 191 // This is determined by
michael@0 192 // the following (in order of preference):
michael@0 193 // - ISO-639-1 two-letter language code
michael@0 194 // (all except those mentioned below)
michael@0 195 // - ISO-639-2 three-letter bibliographic language code
michael@0 196 // (Tibetan, Dhivehi, Cherokee, Syriac)
michael@0 197 // - Google-specific language code
michael@0 198 // (ChineseT ("zh-TW"), Teragram Unknown, Unknown,
michael@0 199 // Portuguese-Portugal, Portuguese-Brazil, Limbu)
michael@0 200 // - Fake RTypeNone names.
michael@0 201
michael@0 202 //----------------------------------------------------------------------------//
michael@0 203 // Functions of Language //
michael@0 204 //----------------------------------------------------------------------------//
michael@0 205
michael@0 206 const char* LanguageName(Language lang) {
michael@0 207 int i_lang = lang;
michael@0 208 if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;}
michael@0 209 if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;}
michael@0 210 return kLanguageToName[i_lang];
michael@0 211 }
michael@0 212 const char* LanguageCode(Language lang) {
michael@0 213 int i_lang = lang;
michael@0 214 if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;}
michael@0 215 if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;}
michael@0 216 return kLanguageToCode[i_lang];
michael@0 217 }
michael@0 218
michael@0 219 const char* LanguageDeclaredName(Language lang) {
michael@0 220 int i_lang = lang;
michael@0 221 if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;}
michael@0 222 if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;}
michael@0 223 return kLanguageToCName[i_lang];
michael@0 224 }
michael@0 225
michael@0 226 // n is in 0..3. Trailing entries are filled with
michael@0 227 // UNKNOWN_LANGUAGE (which never participates in language recognition)
michael@0 228 ULScript LanguageRecognizedScript(Language lang, int n) {
michael@0 229 int i_lang = lang;
michael@0 230 if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;}
michael@0 231 if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;}
michael@0 232 return static_cast<ULScript>(kLanguageToScripts[i_lang][n]);
michael@0 233 }
michael@0 234
michael@0 235 // Given the Language, returns its string name used as the output by
michael@0 236 // the lang/enc identifier, e.g. "Korean"
michael@0 237 // "invalid_language" if the input is invalid.
michael@0 238 // TG_UNKNOWN_LANGUAGE is used as a placeholder for the "ignore me" language,
michael@0 239 // used to subtract out HTML, link farms, DNA strings, and alittle English porn
michael@0 240 const char* ExtLanguageName(const Language lang) {
michael@0 241 return LanguageName(lang);
michael@0 242 }
michael@0 243
michael@0 244 // Given the Language, return the language code, e.g. "ko"
michael@0 245 const char* ExtLanguageCode(const Language lang) {
michael@0 246 return LanguageCode(lang);
michael@0 247 }
michael@0 248
michael@0 249
michael@0 250 // Given the Language, returns its Language enum spelling, for use by
michael@0 251 // programs that create C declarations, e.g. "KOREAN"
michael@0 252 // "UNKNOWN_LANGUAGE" if the input is invalid.
michael@0 253 const char* ExtLanguageDeclaredName(const Language lang) {
michael@0 254 return LanguageDeclaredName(lang);
michael@0 255 }
michael@0 256
michael@0 257
michael@0 258 extern const int kCloseSetSize = 10;
michael@0 259
michael@0 260 // Returns which set of statistically-close languages lang is in. 0 means none.
michael@0 261 int LanguageCloseSet(Language lang) {
michael@0 262 // Scaffolding
michael@0 263 // id ms # INDONESIAN MALAY coef=0.4698 Problematic w/o extra words
michael@0 264 // bo dz # TIBETAN DZONGKHA coef=0.4571
michael@0 265 // cs sk # CZECH SLOVAK coef=0.4273
michael@0 266 // zu xh # ZULU XHOSA coef=0.3716
michael@0 267 //
michael@0 268 // bs hr sr srm # BOSNIAN CROATIAN SERBIAN MONTENEGRIN
michael@0 269 // hi mr bh ne # HINDI MARATHI BIHARI NEPALI
michael@0 270 // no nn da # NORWEGIAN NORWEGIAN_N DANISH
michael@0 271 // gl es pt # GALICIAN SPANISH PORTUGUESE
michael@0 272 // rw rn # KINYARWANDA RUNDI
michael@0 273
michael@0 274 if (lang == INDONESIAN) {return 1;}
michael@0 275 if (lang == MALAY) {return 1;}
michael@0 276
michael@0 277 if (lang == TIBETAN) {return 2;}
michael@0 278 if (lang == DZONGKHA) {return 2;}
michael@0 279
michael@0 280 if (lang == CZECH) {return 3;}
michael@0 281 if (lang == SLOVAK) {return 3;}
michael@0 282
michael@0 283 if (lang == ZULU) {return 4;}
michael@0 284 if (lang == XHOSA) {return 4;}
michael@0 285
michael@0 286 if (lang == BOSNIAN) {return 5;}
michael@0 287 if (lang == CROATIAN) {return 5;}
michael@0 288 if (lang == SERBIAN) {return 5;}
michael@0 289 if (lang == MONTENEGRIN) {return 5;}
michael@0 290
michael@0 291 if (lang == HINDI) {return 6;}
michael@0 292 if (lang == MARATHI) {return 6;}
michael@0 293 if (lang == BIHARI) {return 6;}
michael@0 294 if (lang == NEPALI) {return 6;}
michael@0 295
michael@0 296 if (lang == NORWEGIAN) {return 7;}
michael@0 297 if (lang == NORWEGIAN_N) {return 7;}
michael@0 298 if (lang == DANISH) {return 7;}
michael@0 299
michael@0 300 if (lang == GALICIAN) {return 8;}
michael@0 301 if (lang == SPANISH) {return 8;}
michael@0 302 if (lang == PORTUGUESE) {return 8;}
michael@0 303
michael@0 304 if (lang == KINYARWANDA) {return 9;}
michael@0 305 if (lang == RUNDI) {return 9;}
michael@0 306
michael@0 307 return 0;
michael@0 308 }
michael@0 309
michael@0 310 //----------------------------------------------------------------------------//
michael@0 311 // Functions of ULScript and Language //
michael@0 312 //----------------------------------------------------------------------------//
michael@0 313
michael@0 314 Language DefaultLanguage(ULScript ulscript) {
michael@0 315 if (ulscript < 0) {return UNKNOWN_LANGUAGE;}
michael@0 316 if (ulscript >= NUM_ULSCRIPTS) {return UNKNOWN_LANGUAGE;}
michael@0 317 return kULScriptToDefaultLang[ulscript];
michael@0 318 }
michael@0 319
michael@0 320 uint8 PerScriptNumber(ULScript ulscript, Language lang) {
michael@0 321 if (ulscript < 0) {return 0;}
michael@0 322 if (ulscript >= NUM_ULSCRIPTS) {return 0;}
michael@0 323 if (kULScriptToRtype[ulscript] == RTypeNone) {return 1;}
michael@0 324 if (lang >= kLanguageToPLangSize) {return 0;}
michael@0 325 return kLanguageToPLang[lang];
michael@0 326 }
michael@0 327
michael@0 328 Language FromPerScriptNumber(ULScript ulscript, uint8 perscript_number) {
michael@0 329 if (ulscript < 0) {return UNKNOWN_LANGUAGE;}
michael@0 330 if (ulscript >= NUM_ULSCRIPTS) {return UNKNOWN_LANGUAGE;}
michael@0 331 if ((kULScriptToRtype[ulscript] == RTypeNone) ||
michael@0 332 (kULScriptToRtype[ulscript] == RTypeOne)) {
michael@0 333 return kULScriptToDefaultLang[ulscript];
michael@0 334 }
michael@0 335
michael@0 336 if (ulscript == ULScript_Latin) {
michael@0 337 return static_cast<Language>(kPLangToLanguageLatn[perscript_number]);
michael@0 338 } else {
michael@0 339 return static_cast<Language>(kPLangToLanguageOthr[perscript_number]);
michael@0 340 }
michael@0 341 }
michael@0 342
michael@0 343 // Return true if language can be in the Latin script
michael@0 344 bool IsLatnLanguage(Language lang) {
michael@0 345 if (lang >= kLanguageToPLangSize) {return false;}
michael@0 346 return (lang == kPLangToLanguageLatn[kLanguageToPLang[lang]]);
michael@0 347 }
michael@0 348
michael@0 349 // Return true if language can be in a non-Latin script
michael@0 350 bool IsOthrLanguage(Language lang) {
michael@0 351 if (lang >= kLanguageToPLangSize) {return false;}
michael@0 352 return (lang == kPLangToLanguageOthr[kLanguageToPLang[lang]]);
michael@0 353 }
michael@0 354
michael@0 355
michael@0 356 //----------------------------------------------------------------------------//
michael@0 357 // Other //
michael@0 358 //----------------------------------------------------------------------------//
michael@0 359
michael@0 360 // Returns mid if key found in lo <= mid < hi, else -1
michael@0 361 int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair) {
michael@0 362 // binary search
michael@0 363 while (lo < hi) {
michael@0 364 int mid = (lo + hi) >> 1;
michael@0 365 if (strcmp(key, cipair[mid].s) < 0) {
michael@0 366 hi = mid;
michael@0 367 } else if (strcmp(key, cipair[mid].s) > 0) {
michael@0 368 lo = mid + 1;
michael@0 369 } else {
michael@0 370 return mid;
michael@0 371 }
michael@0 372 }
michael@0 373 return -1;
michael@0 374 }
michael@0 375
michael@0 376 Language MakeLang(int i) {return static_cast<Language>(i);}
michael@0 377
michael@0 378 // Name can be either full name or ISO code, or can be ISO code embedded in
michael@0 379 // a language-script combination such as "ABKHAZIAN", "en", "en-Latn-GB"
michael@0 380 Language GetLanguageFromName(const char* src) {
michael@0 381 const char* hyphen1 = strchr(src, '-');
michael@0 382 const char* hyphen2 = NULL;
michael@0 383 if (hyphen1 != NULL) {hyphen2 = strchr(hyphen1 + 1, '-');}
michael@0 384
michael@0 385 int match = -1;
michael@0 386 if (hyphen1 == NULL) {
michael@0 387 // Bare name. Look at full name, then code
michael@0 388 match = BinarySearch(src, 0, kNameToLanguageSize, kNameToLanguage);
michael@0 389 if (match >= 0) {return MakeLang(kNameToLanguage[match].i);} // aa
michael@0 390 match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage);
michael@0 391 if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa
michael@0 392 return UNKNOWN_LANGUAGE;
michael@0 393 }
michael@0 394
michael@0 395 if (hyphen2 == NULL) {
michael@0 396 // aa-bb. Not a full name; must be code-something. Try zh-TW then bare zh
michael@0 397 match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage);
michael@0 398 if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb
michael@0 399
michael@0 400 int len = strlen(src);
michael@0 401 if (len >= 16) {return UNKNOWN_LANGUAGE;} // Real codes are shorter
michael@0 402
michael@0 403 char temp[16];
michael@0 404 int hyphen1_offset = hyphen1 - src;
michael@0 405 // Take off part after hyphen1
michael@0 406 memcpy(temp, src, len);
michael@0 407 temp[hyphen1_offset] = '\0';
michael@0 408 match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage);
michael@0 409 if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa
michael@0 410
michael@0 411 return UNKNOWN_LANGUAGE;
michael@0 412 }
michael@0 413
michael@0 414 // aa-bb-cc. Must be code-something. Try en-Latn-US, en-Latn, en-US, en
michael@0 415 match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage);
michael@0 416 if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb-cc
michael@0 417
michael@0 418
michael@0 419 int len = strlen(src);
michael@0 420 if (len >= 16) {return UNKNOWN_LANGUAGE;} // Real codes are shorter
michael@0 421
michael@0 422 char temp[16];
michael@0 423 int hyphen1_offset = hyphen1 - src;
michael@0 424 int hyphen2_offset = hyphen2 - src;
michael@0 425 // Take off part after hyphen2
michael@0 426 memcpy(temp, src, len);
michael@0 427 temp[hyphen2_offset] = '\0';
michael@0 428 match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage);
michael@0 429 if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb
michael@0 430
michael@0 431
michael@0 432 // Take off part between hyphen1 and hyphen2
michael@0 433 int len2 = len - hyphen2_offset;
michael@0 434 memcpy(temp, src, len);
michael@0 435 memcpy(&temp[hyphen1_offset], hyphen2, len2);
michael@0 436 temp[hyphen1_offset + len2] = '\0';
michael@0 437 match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage);
michael@0 438 if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-cc
michael@0 439
michael@0 440
michael@0 441 // Take off everything after hyphen1
michael@0 442 memcpy(temp, src, len);
michael@0 443 temp[hyphen1_offset] = '\0';
michael@0 444 match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage);
michael@0 445 if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa
michael@0 446
michael@0 447
michael@0 448 return UNKNOWN_LANGUAGE;
michael@0 449 }
michael@0 450
michael@0 451
michael@0 452 // Name can be either full name or ISO code, or can be ISO code embedded in
michael@0 453 // a language-script combination such as "en-Latn-GB"
michael@0 454 // MORE WORK to do here. also kLanguageToScripts [4] is bogus
michael@0 455 // if bare language name, no script, want zh, ja, ko to Hani, pt to Latn, etc.
michael@0 456 // Something like map code to Language, then Language to kLanguageToScripts[x][0]
michael@0 457 // ADD BIAS: kLanguageToScripts lists default script first
michael@0 458 // If total mismatch, return Latn
michael@0 459 // if (strcmp(src, "nd") == 0) {return NDEBELE;} // [nd was wrong]
michael@0 460 // if (strcmp(src, "sit-NP-Limb") == 0) {return ULScript_Limbu;}
michael@0 461
michael@0 462 ULScript MakeULScr(int i) {return static_cast<ULScript>(i);}
michael@0 463
michael@0 464 ULScript GetULScriptFromName(const char* src) {
michael@0 465 const char* hyphen1 = strchr(src, '-');
michael@0 466 const char* hyphen2 = NULL;
michael@0 467 if (hyphen1 != NULL) {hyphen2 = strchr(hyphen1 + 1, '-');}
michael@0 468
michael@0 469 int match = -1;
michael@0 470 if (hyphen1 == NULL) {
michael@0 471 // Bare name. Look at full name, then code, then try backmapping as Language
michael@0 472 match = BinarySearch(src, 0, kNameToULScriptSize, kNameToULScript);
michael@0 473 if (match >= 0) {return MakeULScr(kNameToULScript[match].i);} // aa
michael@0 474 match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript);
michael@0 475 if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa
michael@0 476
michael@0 477 Language backmap_me = GetLanguageFromName(src);
michael@0 478 if (backmap_me != UNKNOWN_LANGUAGE) {
michael@0 479 return static_cast<ULScript>(kLanguageToScripts[backmap_me][0]);
michael@0 480 }
michael@0 481 return ULScript_Latin;
michael@0 482 }
michael@0 483
michael@0 484 if (hyphen2 == NULL) {
michael@0 485 // aa-bb. Not a full name; must be code-something. Try en-Latn, bare Latn
michael@0 486 if (strcmp(src, "zh-TW") == 0) {return ULScript_Hani;}
michael@0 487 if (strcmp(src, "zh-CN") == 0) {return ULScript_Hani;}
michael@0 488 if (strcmp(src, "sit-NP") == 0) {return ULScript_Limbu;}
michael@0 489 if (strcmp(src, "sit-Limb") == 0) {return ULScript_Limbu;}
michael@0 490 if (strcmp(src, "sr-ME") == 0) {return ULScript_Latin;}
michael@0 491 match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript);
michael@0 492 if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa-bb
michael@0 493
michael@0 494 int len = strlen(src);
michael@0 495 if (len >= 16) {return ULScript_Latin;} // Real codes are shorter
michael@0 496
michael@0 497 char temp[16];
michael@0 498 int hyphen1_offset = hyphen1 - src;
michael@0 499 int len1 = len - hyphen1_offset - 1; // Exclude the hyphen
michael@0 500 // Take off part before hyphen1
michael@0 501 memcpy(temp, hyphen1 + 1, len1);
michael@0 502 temp[len1] = '\0';
michael@0 503 match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
michael@0 504 if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // bb
michael@0 505
michael@0 506 // Take off part after hyphen1
michael@0 507 memcpy(temp, src, len);
michael@0 508 temp[hyphen1_offset] = '\0';
michael@0 509 match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
michael@0 510 if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa
michael@0 511
michael@0 512 return ULScript_Latin;
michael@0 513 }
michael@0 514
michael@0 515 // aa-bb-cc. Must be code-something. Try en-Latn-US, en-Latn, en-US, en
michael@0 516 if (strcmp(src, "sit-NP-Limb") == 0) {return ULScript_Limbu;}
michael@0 517 if (strcmp(src, "sr-ME-Latn") == 0) {return ULScript_Latin;}
michael@0 518 if (strcmp(src, "sr-ME-Cyrl") == 0) {return ULScript_Cyrillic;}
michael@0 519 match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript);
michael@0 520 if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa-bb-cc
michael@0 521
michael@0 522 int len = strlen(src);
michael@0 523 if (len >= 16) {return ULScript_Latin;} // Real codes are shorter
michael@0 524
michael@0 525 char temp[16];
michael@0 526 int hyphen1_offset = hyphen1 - src;
michael@0 527 int hyphen2_offset = hyphen2 - src;
michael@0 528 int len2 = len - hyphen2_offset - 1; // Exclude the hyphen
michael@0 529 int lenmid = hyphen2_offset - hyphen1_offset - 1; // Exclude the hyphen
michael@0 530 // Keep part between hyphen1 and hyphen2
michael@0 531 memcpy(temp, hyphen1 + 1, lenmid);
michael@0 532 temp[lenmid] = '\0';
michael@0 533 match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
michael@0 534 if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // bb
michael@0 535
michael@0 536 // Keep part after hyphen2
michael@0 537 memcpy(temp, hyphen2 + 1, len2);
michael@0 538 temp[len2] = '\0';
michael@0 539 match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
michael@0 540 if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // cc
michael@0 541
michael@0 542 // Keep part before hyphen1
michael@0 543 memcpy(temp, src, len);
michael@0 544 temp[hyphen1_offset] = '\0';
michael@0 545 match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript);
michael@0 546 if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa
michael@0 547
michael@0 548 return ULScript_Latin;
michael@0 549 }
michael@0 550
michael@0 551 // Map script into Latin, Cyrillic, Arabic, Other
michael@0 552 int LScript4(ULScript ulscript) {
michael@0 553 if (ulscript == ULScript_Latin) {return 0;}
michael@0 554 if (ulscript == ULScript_Cyrillic) {return 1;}
michael@0 555 if (ulscript == ULScript_Arabic) {return 2;}
michael@0 556 return 3;
michael@0 557 }
michael@0 558
michael@0 559 } // namespace CLD2
michael@0 560

mercurial