1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/browser/components/translation/cld2/internal/lang_script.cc Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,560 @@ 1.4 +// Copyright 2013 Google Inc. All Rights Reserved. 1.5 +// 1.6 +// Licensed under the Apache License, Version 2.0 (the "License"); 1.7 +// you may not use this file except in compliance with the License. 1.8 +// You may obtain a copy of the License at 1.9 +// 1.10 +// http://www.apache.org/licenses/LICENSE-2.0 1.11 +// 1.12 +// Unless required by applicable law or agreed to in writing, software 1.13 +// distributed under the License is distributed on an "AS IS" BASIS, 1.14 +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1.15 +// See the License for the specific language governing permissions and 1.16 +// limitations under the License. 1.17 + 1.18 +// 1.19 +// File: lang_script.cc 1.20 +// ================ 1.21 +// 1.22 +// Author: dsites@google.com (Dick Sites) 1.23 +// 1.24 +// This file declares language and script numbers and names for CLD2 1.25 +// 1.26 + 1.27 +#include "lang_script.h" 1.28 + 1.29 +#include <stdlib.h> 1.30 +#include <string.h> 1.31 + 1.32 +#include "generated_language.h" 1.33 +#include "generated_ulscript.h" 1.34 + 1.35 +namespace CLD2 { 1.36 + 1.37 +// Language tables 1.38 +// Subscripted by enum Language 1.39 +extern const int kLanguageToNameSize; 1.40 +extern const char* const kLanguageToName[]; 1.41 +extern const int kLanguageToCodeSize; 1.42 +extern const char* const kLanguageToCode[]; 1.43 +extern const int kLanguageToCNameSize; 1.44 +extern const char* const kLanguageToCName[]; 1.45 +extern const int kLanguageToScriptsSize; 1.46 +extern const FourScripts kLanguageToScripts[]; 1.47 + 1.48 +// Subscripted by Language 1.49 +extern const int kLanguageToPLangSize; 1.50 +extern const uint8 kLanguageToPLang[]; 1.51 +// Subscripted by per-script language 1.52 +extern const uint16 kPLangToLanguageLatn[]; 1.53 +extern const uint16 
kPLangToLanguageOthr[]; 1.54 + 1.55 +// Alphabetical order for binary search 1.56 +extern const int kNameToLanguageSize; 1.57 +extern const CharIntPair kNameToLanguage[]; 1.58 +extern const int kCodeToLanguageSize; 1.59 +extern const CharIntPair kCodeToLanguage[]; 1.60 + 1.61 +// ULScript tables 1.62 +// Subscripted by enum ULScript 1.63 +extern const int kULScriptToNameSize; 1.64 +extern const char* const kULScriptToName[]; 1.65 +extern const int kULScriptToCodeSize; 1.66 +extern const char* const kULScriptToCode[]; 1.67 +extern const int kULScriptToCNameSize; 1.68 +extern const char* const kULScriptToCName[]; 1.69 +extern const int kULScriptToRtypeSize; 1.70 +extern const ULScriptRType kULScriptToRtype[]; 1.71 +extern const int kULScriptToDefaultLangSize; 1.72 +extern const Language kULScriptToDefaultLang[]; 1.73 + 1.74 +// Alphabetical order for binary search 1.75 +extern const int kNameToULScriptSize; 1.76 +extern const CharIntPair kNameToULScript[]; 1.77 +extern const int kCodeToULScriptSize; 1.78 +extern const CharIntPair kCodeToULScript[]; 1.79 + 1.80 + 1.81 +// 1.82 +// File: lang_script.h 1.83 +// ================ 1.84 +// 1.85 +// Author: dsites@google.com (Dick Sites) 1.86 +// 1.87 +// This file declares language and script numbers and names for CLD2 1.88 +// 1.89 + 1.90 + 1.91 +// NOTE: The script numbers and language numbers here are not guaranteed to be 1.92 +// stable. If you want to record a result for posterity, save the ISO codes 1.93 +// as character strings. 1.94 +// 1.95 +// 1.96 +// The Unicode scripts recognized by CLD2 are numbered almost arbitrarily, 1.97 +// specified in an enum. Each script has human-readable script name and a 1.98 +// 4-letter ISO 15924 script code. Each has a C name (largely for use by 1.99 +// programs that generate declarations in cld2_generated_scripts.h). 
Each 1.100 +// also has a recognition type 1.101 +// r_type: 0 script-only, 1 nilgrams, 2 quadgrams, 3 CJK 1.102 +// 1.103 +// The declarations for a particular version of Unicode are machine-generated in 1.104 +// cld2_generated_scripts.h 1.105 +// 1.106 +// This file includes that one and declares the access routines. The type 1.107 +// involved is called "ULScript" to signify Unicode Letters-Marks Scripts, 1.108 +// which are not quite Unicode Scripts. In particular, the CJK scripts are 1.109 +// merged into a single number because CLD2 recognizes the CJK languages from 1.110 +// four scripts intermixed: Hani (both Hans and Hant), Hangul, Hiragana, and 1.111 +// Katakana. 1.112 + 1.113 +// Each script has one of these four recognition types. 1.114 +// RTypeNone: There is no language associated with this script. In extended 1.115 +// language recognition calls, return a fake language number that maps to 1.116 +// xx-Cham, with literally "xx" for the language code, and with the script 1.117 +// code instead of "Cham". In non-extended calls, return UNKNOWN_LANGUAGE. 1.118 +// RTypeOne: The script maps 1:1 to a single language. No letters are examined 1.119 +// during recognition and no lookups done. 1.120 +// RTypeMany: The usual quadgram + delta-octagram + distinctive-words scoring 1.121 +// is done to determine the languages involved. 1.122 +// RTypeCJK: The CJK unigram + delta-bigram scoring is done to determine the 1.123 +// languages involved. 1.124 +// 1.125 +// Note that the choice of recognition type is a function of script, not 1.126 +// language. In particular, some languages are recognized in multiple scripts 1.127 +// and those have different recognition types (Mongolian mn-Latn vs. mn-Mong 1.128 +// for example).
1.129 + 1.130 +//----------------------------------------------------------------------------// 1.131 +// Functions of ULScript // 1.132 +//----------------------------------------------------------------------------// 1.133 + 1.134 +// If the input is out of range or otherwise unrecognized, it is treated 1.135 +// as UNKNOWN_ULSCRIPT (which never participates in language recognition) 1.136 +const char* ULScriptName(ULScript ulscript) { 1.137 + int i_ulscript = ulscript; 1.138 + if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;} 1.139 + if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;} 1.140 + return kULScriptToName[i_ulscript]; 1.141 +} 1.142 + 1.143 +const char* ULScriptCode(ULScript ulscript) { 1.144 + int i_ulscript = ulscript; 1.145 + if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;} 1.146 + if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;} 1.147 + return kULScriptToCode[i_ulscript]; 1.148 +} 1.149 + 1.150 +const char* ULScriptDeclaredName(ULScript ulscript) { 1.151 + int i_ulscript = ulscript; 1.152 + if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;} 1.153 + if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;} 1.154 + return kULScriptToCName[i_ulscript]; 1.155 +} 1.156 + 1.157 +ULScriptRType ULScriptRecognitionType(ULScript ulscript) { 1.158 + int i_ulscript = ulscript; 1.159 + if (i_ulscript < 0) {i_ulscript = UNKNOWN_ULSCRIPT;} 1.160 + if (i_ulscript >= NUM_ULSCRIPTS) {i_ulscript = UNKNOWN_ULSCRIPT;} 1.161 + return kULScriptToRtype[i_ulscript]; 1.162 +} 1.163 + 1.164 + 1.165 + 1.166 +// The languages recognized by CLD2 are numbered almost arbitrarily, 1.167 +// specified in an enum. Each language has human-readable language name and a 1.168 +// 2- or 3-letter ISO 639 language code. Each has a C name (largely for use by 1.169 +// programs that generate declarations in cld2_generated_languagess.h). 1.170 +// Each has a list of up to four scripts in which it is currently recognized. 
1.171 +// 1.172 +// The declarations for a particular set of recognized languages are 1.173 +// machine-generated in 1.174 +// cld2_generated_languages.h 1.175 +// 1.176 +// The Language enum is intended to match the internal Google Language enum 1.177 +// in i18n/languages/proto/languages.proto up to NUM_LANGUAGES, with additional 1.178 +// languages assigned above that. Over time, some languages may be renumbered 1.179 +// if they are moved into the Language enum. 1.180 +// 1.181 +// The Language enum includes the fake language numbers for RTypeNone above. 1.182 +// 1.183 +// In an open-source environment, the Google-specific Language enum is not 1.184 +// available. Language decouples the two environments while maintaining 1.185 +// internal compatibility. 1.186 + 1.187 + 1.188 +// If the input is out of range or otherwise unrecognized, it is treated 1.189 +// as UNKNOWN_LANGUAGE 1.190 +// 1.191 +// LanguageCode 1.192 +// ------------ 1.193 +// Given the Language, return the language code, e.g. "ko" 1.194 +// This is determined by 1.195 +// the following (in order of preference): 1.196 +// - ISO-639-1 two-letter language code 1.197 +// (all except those mentioned below) 1.198 +// - ISO-639-2 three-letter bibliographic language code 1.199 +// (Tibetan, Dhivehi, Cherokee, Syriac) 1.200 +// - Google-specific language code 1.201 +// (ChineseT ("zh-TW"), Teragram Unknown, Unknown, 1.202 +// Portuguese-Portugal, Portuguese-Brazil, Limbu) 1.203 +// - Fake RTypeNone names. 
1.204 + 1.205 +//----------------------------------------------------------------------------// 1.206 +// Functions of Language // 1.207 +//----------------------------------------------------------------------------// 1.208 + 1.209 +const char* LanguageName(Language lang) { 1.210 + int i_lang = lang; 1.211 + if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;} 1.212 + if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;} 1.213 + return kLanguageToName[i_lang]; 1.214 +} 1.215 +const char* LanguageCode(Language lang) { 1.216 + int i_lang = lang; 1.217 + if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;} 1.218 + if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;} 1.219 + return kLanguageToCode[i_lang]; 1.220 +} 1.221 + 1.222 +const char* LanguageDeclaredName(Language lang) { 1.223 + int i_lang = lang; 1.224 + if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;} 1.225 + if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;} 1.226 + return kLanguageToCName[i_lang]; 1.227 +} 1.228 + 1.229 +// n is in 0..3. Trailing entries are filled with 1.230 +// UNKNOWN_LANGUAGE (which never participates in language recognition) 1.231 +ULScript LanguageRecognizedScript(Language lang, int n) { 1.232 + int i_lang = lang; 1.233 + if (i_lang < 0) {i_lang = UNKNOWN_LANGUAGE;} 1.234 + if (i_lang >= NUM_LANGUAGES) {i_lang = UNKNOWN_LANGUAGE;} 1.235 + return static_cast<ULScript>(kLanguageToScripts[i_lang][n]); 1.236 +} 1.237 + 1.238 +// Given the Language, returns its string name used as the output by 1.239 +// the lang/enc identifier, e.g. "Korean" 1.240 +// "invalid_language" if the input is invalid. 1.241 +// TG_UNKNOWN_LANGUAGE is used as a placeholder for the "ignore me" language, 1.242 +// used to subtract out HTML, link farms, DNA strings, and alittle English porn 1.243 +const char* ExtLanguageName(const Language lang) { 1.244 + return LanguageName(lang); 1.245 +} 1.246 + 1.247 +// Given the Language, return the language code, e.g. 
"ko" 1.248 +const char* ExtLanguageCode(const Language lang) { 1.249 + return LanguageCode(lang); 1.250 +} 1.251 + 1.252 + 1.253 +// Given the Language, returns its Language enum spelling, for use by 1.254 +// programs that create C declarations, e.g. "KOREAN" 1.255 +// "UNKNOWN_LANGUAGE" if the input is invalid. 1.256 +const char* ExtLanguageDeclaredName(const Language lang) { 1.257 + return LanguageDeclaredName(lang); 1.258 +} 1.259 + 1.260 + 1.261 +extern const int kCloseSetSize = 10; 1.262 + 1.263 +// Returns which set of statistically-close languages lang is in. 0 means none. 1.264 +int LanguageCloseSet(Language lang) { 1.265 + // Scaffolding 1.266 + // id ms # INDONESIAN MALAY coef=0.4698 Problematic w/o extra words 1.267 + // bo dz # TIBETAN DZONGKHA coef=0.4571 1.268 + // cs sk # CZECH SLOVAK coef=0.4273 1.269 + // zu xh # ZULU XHOSA coef=0.3716 1.270 + // 1.271 + // bs hr sr srm # BOSNIAN CROATIAN SERBIAN MONTENEGRIN 1.272 + // hi mr bh ne # HINDI MARATHI BIHARI NEPALI 1.273 + // no nn da # NORWEGIAN NORWEGIAN_N DANISH 1.274 + // gl es pt # GALICIAN SPANISH PORTUGUESE 1.275 + // rw rn # KINYARWANDA RUNDI 1.276 + 1.277 + if (lang == INDONESIAN) {return 1;} 1.278 + if (lang == MALAY) {return 1;} 1.279 + 1.280 + if (lang == TIBETAN) {return 2;} 1.281 + if (lang == DZONGKHA) {return 2;} 1.282 + 1.283 + if (lang == CZECH) {return 3;} 1.284 + if (lang == SLOVAK) {return 3;} 1.285 + 1.286 + if (lang == ZULU) {return 4;} 1.287 + if (lang == XHOSA) {return 4;} 1.288 + 1.289 + if (lang == BOSNIAN) {return 5;} 1.290 + if (lang == CROATIAN) {return 5;} 1.291 + if (lang == SERBIAN) {return 5;} 1.292 + if (lang == MONTENEGRIN) {return 5;} 1.293 + 1.294 + if (lang == HINDI) {return 6;} 1.295 + if (lang == MARATHI) {return 6;} 1.296 + if (lang == BIHARI) {return 6;} 1.297 + if (lang == NEPALI) {return 6;} 1.298 + 1.299 + if (lang == NORWEGIAN) {return 7;} 1.300 + if (lang == NORWEGIAN_N) {return 7;} 1.301 + if (lang == DANISH) {return 7;} 1.302 + 1.303 + if (lang == 
GALICIAN) {return 8;} 1.304 + if (lang == SPANISH) {return 8;} 1.305 + if (lang == PORTUGUESE) {return 8;} 1.306 + 1.307 + if (lang == KINYARWANDA) {return 9;} 1.308 + if (lang == RUNDI) {return 9;} 1.309 + 1.310 + return 0; 1.311 +} 1.312 + 1.313 +//----------------------------------------------------------------------------// 1.314 +// Functions of ULScript and Language // 1.315 +//----------------------------------------------------------------------------// 1.316 + 1.317 +Language DefaultLanguage(ULScript ulscript) { 1.318 + if (ulscript < 0) {return UNKNOWN_LANGUAGE;} 1.319 + if (ulscript >= NUM_ULSCRIPTS) {return UNKNOWN_LANGUAGE;} 1.320 + return kULScriptToDefaultLang[ulscript]; 1.321 +} 1.322 + 1.323 +uint8 PerScriptNumber(ULScript ulscript, Language lang) { 1.324 + if (ulscript < 0) {return 0;} 1.325 + if (ulscript >= NUM_ULSCRIPTS) {return 0;} 1.326 + if (kULScriptToRtype[ulscript] == RTypeNone) {return 1;} 1.327 + if (lang >= kLanguageToPLangSize) {return 0;} 1.328 + return kLanguageToPLang[lang]; 1.329 +} 1.330 + 1.331 +Language FromPerScriptNumber(ULScript ulscript, uint8 perscript_number) { 1.332 + if (ulscript < 0) {return UNKNOWN_LANGUAGE;} 1.333 + if (ulscript >= NUM_ULSCRIPTS) {return UNKNOWN_LANGUAGE;} 1.334 + if ((kULScriptToRtype[ulscript] == RTypeNone) || 1.335 + (kULScriptToRtype[ulscript] == RTypeOne)) { 1.336 + return kULScriptToDefaultLang[ulscript]; 1.337 + } 1.338 + 1.339 + if (ulscript == ULScript_Latin) { 1.340 + return static_cast<Language>(kPLangToLanguageLatn[perscript_number]); 1.341 + } else { 1.342 + return static_cast<Language>(kPLangToLanguageOthr[perscript_number]); 1.343 + } 1.344 +} 1.345 + 1.346 +// Return true if language can be in the Latin script 1.347 +bool IsLatnLanguage(Language lang) { 1.348 + if (lang >= kLanguageToPLangSize) {return false;} 1.349 + return (lang == kPLangToLanguageLatn[kLanguageToPLang[lang]]); 1.350 +} 1.351 + 1.352 +// Return true if language can be in a non-Latin script 1.353 +bool 
IsOthrLanguage(Language lang) { 1.354 + if (lang >= kLanguageToPLangSize) {return false;} 1.355 + return (lang == kPLangToLanguageOthr[kLanguageToPLang[lang]]); 1.356 +} 1.357 + 1.358 + 1.359 +//----------------------------------------------------------------------------// 1.360 +// Other // 1.361 +//----------------------------------------------------------------------------// 1.362 + 1.363 +// Returns mid if key found in lo <= mid < hi, else -1 1.364 +int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair) { 1.365 + // binary search 1.366 + while (lo < hi) { 1.367 + int mid = (lo + hi) >> 1; 1.368 + if (strcmp(key, cipair[mid].s) < 0) { 1.369 + hi = mid; 1.370 + } else if (strcmp(key, cipair[mid].s) > 0) { 1.371 + lo = mid + 1; 1.372 + } else { 1.373 + return mid; 1.374 + } 1.375 + } 1.376 + return -1; 1.377 +} 1.378 + 1.379 +Language MakeLang(int i) {return static_cast<Language>(i);} 1.380 + 1.381 +// Name can be either full name or ISO code, or can be ISO code embedded in 1.382 +// a language-script combination such as "ABKHAZIAN", "en", "en-Latn-GB" 1.383 +Language GetLanguageFromName(const char* src) { 1.384 + const char* hyphen1 = strchr(src, '-'); 1.385 + const char* hyphen2 = NULL; 1.386 + if (hyphen1 != NULL) {hyphen2 = strchr(hyphen1 + 1, '-');} 1.387 + 1.388 + int match = -1; 1.389 + if (hyphen1 == NULL) { 1.390 + // Bare name. Look at full name, then code 1.391 + match = BinarySearch(src, 0, kNameToLanguageSize, kNameToLanguage); 1.392 + if (match >= 0) {return MakeLang(kNameToLanguage[match].i);} // aa 1.393 + match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage); 1.394 + if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa 1.395 + return UNKNOWN_LANGUAGE; 1.396 + } 1.397 + 1.398 + if (hyphen2 == NULL) { 1.399 + // aa-bb. Not a full name; must be code-something. 
Try zh-TW then bare zh 1.400 + match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage); 1.401 + if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb 1.402 + 1.403 + int len = strlen(src); 1.404 + if (len >= 16) {return UNKNOWN_LANGUAGE;} // Real codes are shorter 1.405 + 1.406 + char temp[16]; 1.407 + int hyphen1_offset = hyphen1 - src; 1.408 + // Take off part after hyphen1 1.409 + memcpy(temp, src, len); 1.410 + temp[hyphen1_offset] = '\0'; 1.411 + match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage); 1.412 + if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa 1.413 + 1.414 + return UNKNOWN_LANGUAGE; 1.415 + } 1.416 + 1.417 + // aa-bb-cc. Must be code-something. Try en-Latn-US, en-Latn, en-US, en 1.418 + match = BinarySearch(src, 0, kCodeToLanguageSize, kCodeToLanguage); 1.419 + if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb-cc 1.420 + 1.421 + 1.422 + int len = strlen(src); 1.423 + if (len >= 16) {return UNKNOWN_LANGUAGE;} // Real codes are shorter 1.424 + 1.425 + char temp[16]; 1.426 + int hyphen1_offset = hyphen1 - src; 1.427 + int hyphen2_offset = hyphen2 - src; 1.428 + // Take off part after hyphen2 1.429 + memcpy(temp, src, len); 1.430 + temp[hyphen2_offset] = '\0'; 1.431 + match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage); 1.432 + if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-bb 1.433 + 1.434 + 1.435 + // Take off part between hyphen1 and hyphen2 1.436 + int len2 = len - hyphen2_offset; 1.437 + memcpy(temp, src, len); 1.438 + memcpy(&temp[hyphen1_offset], hyphen2, len2); 1.439 + temp[hyphen1_offset + len2] = '\0'; 1.440 + match = BinarySearch(temp, 0, kCodeToLanguageSize, kCodeToLanguage); 1.441 + if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa-cc 1.442 + 1.443 + 1.444 + // Take off everything after hyphen1 1.445 + memcpy(temp, src, len); 1.446 + temp[hyphen1_offset] = '\0'; 1.447 + match = BinarySearch(temp, 0, 
kCodeToLanguageSize, kCodeToLanguage); 1.448 + if (match >= 0) {return MakeLang(kCodeToLanguage[match].i);} // aa 1.449 + 1.450 + 1.451 + return UNKNOWN_LANGUAGE; 1.452 +} 1.453 + 1.454 + 1.455 +// Name can be either full name or ISO code, or can be ISO code embedded in 1.456 +// a language-script combination such as "en-Latn-GB" 1.457 +// MORE WORK to do here. also kLanguageToScripts [4] is bogus 1.458 +// if bare language name, no script, want zh, ja, ko to Hani, pt to Latn, etc. 1.459 +// Something like map code to Language, then Language to kLanguageToScripts[x][0] 1.460 +// ADD BIAS: kLanguageToScripts lists default script first 1.461 +// If total mismatch, reutrn Latn 1.462 +// if (strcmp(src, "nd") == 0) {return NDEBELE;} // [nd was wrong] 1.463 +// if (strcmp(src, "sit-NP-Limb") == 0) {return ULScript_Limbu;} 1.464 + 1.465 +ULScript MakeULScr(int i) {return static_cast<ULScript>(i);} 1.466 + 1.467 +ULScript GetULScriptFromName(const char* src) { 1.468 + const char* hyphen1 = strchr(src, '-'); 1.469 + const char* hyphen2 = NULL; 1.470 + if (hyphen1 != NULL) {hyphen2 = strchr(hyphen1 + 1, '-');} 1.471 + 1.472 + int match = -1; 1.473 + if (hyphen1 == NULL) { 1.474 + // Bare name. Look at full name, then code, then try backmapping as Language 1.475 + match = BinarySearch(src, 0, kNameToULScriptSize, kNameToULScript); 1.476 + if (match >= 0) {return MakeULScr(kNameToULScript[match].i);} // aa 1.477 + match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript); 1.478 + if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa 1.479 + 1.480 + Language backmap_me = GetLanguageFromName(src); 1.481 + if (backmap_me != UNKNOWN_LANGUAGE) { 1.482 + return static_cast<ULScript>(kLanguageToScripts[backmap_me][0]); 1.483 + } 1.484 + return ULScript_Latin; 1.485 + } 1.486 + 1.487 + if (hyphen2 == NULL) { 1.488 + // aa-bb. Not a full name; must be code-something. 
Try en-Latn, bare Latn 1.489 + if (strcmp(src, "zh-TW") == 0) {return ULScript_Hani;} 1.490 + if (strcmp(src, "zh-CN") == 0) {return ULScript_Hani;} 1.491 + if (strcmp(src, "sit-NP") == 0) {return ULScript_Limbu;} 1.492 + if (strcmp(src, "sit-Limb") == 0) {return ULScript_Limbu;} 1.493 + if (strcmp(src, "sr-ME") == 0) {return ULScript_Latin;} 1.494 + match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript); 1.495 + if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa-bb 1.496 + 1.497 + int len = strlen(src); 1.498 + if (len >= 16) {return ULScript_Latin;} // Real codes are shorter 1.499 + 1.500 + char temp[16]; 1.501 + int hyphen1_offset = hyphen1 - src; 1.502 + int len1 = len - hyphen1_offset - 1; // Exclude the hyphen 1.503 + // Take off part before hyphen1 1.504 + memcpy(temp, hyphen1 + 1, len1); 1.505 + temp[len1] = '\0'; 1.506 + match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript); 1.507 + if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // bb 1.508 + 1.509 + // Take off part after hyphen1 1.510 + memcpy(temp, src, len); 1.511 + temp[hyphen1_offset] = '\0'; 1.512 + match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript); 1.513 + if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa 1.514 + 1.515 + return ULScript_Latin; 1.516 + } 1.517 + 1.518 + // aa-bb-cc. Must be code-something. 
Try en-Latn-US, en-Latn, en-US, en 1.519 + if (strcmp(src, "sit-NP-Limb") == 0) {return ULScript_Limbu;} 1.520 + if (strcmp(src, "sr-ME-Latn") == 0) {return ULScript_Latin;} 1.521 + if (strcmp(src, "sr-ME-Cyrl") == 0) {return ULScript_Cyrillic;} 1.522 + match = BinarySearch(src, 0, kCodeToULScriptSize, kCodeToULScript); 1.523 + if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa-bb-cc 1.524 + 1.525 + int len = strlen(src); 1.526 + if (len >= 16) {return ULScript_Latin;} // Real codes are shorter 1.527 + 1.528 + char temp[16]; 1.529 + int hyphen1_offset = hyphen1 - src; 1.530 + int hyphen2_offset = hyphen2 - src; 1.531 + int len2 = len - hyphen2_offset - 1; // Exclude the hyphen 1.532 + int lenmid = hyphen2_offset - hyphen1_offset - 1; // Exclude the hyphen 1.533 + // Keep part between hyphen1 and hyphen2 1.534 + memcpy(temp, hyphen1 + 1, lenmid); 1.535 + temp[lenmid] = '\0'; 1.536 + match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript); 1.537 + if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // bb 1.538 + 1.539 + // Keep part after hyphen2 1.540 + memcpy(temp, hyphen2 + 1, len2); 1.541 + temp[len2] = '\0'; 1.542 + match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript); 1.543 + if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // cc 1.544 + 1.545 + // Keep part before hyphen1 1.546 + memcpy(temp, src, len); 1.547 + temp[hyphen1_offset] = '\0'; 1.548 + match = BinarySearch(temp, 0, kCodeToULScriptSize, kCodeToULScript); 1.549 + if (match >= 0) {return MakeULScr(kCodeToULScript[match].i);} // aa 1.550 + 1.551 + return ULScript_Latin; 1.552 +} 1.553 + 1.554 +// Map script into Latin, Cyrillic, Arabic, Other 1.555 +int LScript4(ULScript ulscript) { 1.556 + if (ulscript == ULScript_Latin) {return 0;} 1.557 + if (ulscript == ULScript_Cyrillic) {return 1;} 1.558 + if (ulscript == ULScript_Arabic) {return 2;} 1.559 + return 3; 1.560 +} 1.561 + 1.562 +} // namespace CLD2 1.563 +